Tweak sync timeouts
* increase the EWMA default and decay
* increase the block download retries
* increase the request and block download timeouts
* increase the sync timeout
This commit is contained in:
parent
ce12d4dadc
commit
e6e859dce2
|
|
@ -8,10 +8,14 @@ use crate::protocol::external::types::*;
|
||||||
use zebra_chain::parameters::NetworkUpgrade;
|
use zebra_chain::parameters::NetworkUpgrade;
|
||||||
|
|
||||||
/// The buffer size for the peer set.
|
/// The buffer size for the peer set.
|
||||||
|
///
|
||||||
|
/// We assume that Zebra nodes have at least 10 Mbps bandwidth. Therefore, a
|
||||||
|
/// maximum-sized block will take 2 seconds to download. Based on the current
|
||||||
|
/// `BLOCK_DOWNLOAD_TIMEOUT`, this is the largest buffer size we can support.
|
||||||
pub const PEERSET_BUFFER_SIZE: usize = 10;
|
pub const PEERSET_BUFFER_SIZE: usize = 10;
|
||||||
|
|
||||||
/// The timeout for requests made to a remote peer.
|
/// The timeout for requests made to a remote peer.
|
||||||
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(20);
|
||||||
|
|
||||||
/// The timeout for handshakes when connecting to new peers.
|
/// The timeout for handshakes when connecting to new peers.
|
||||||
pub const HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(4);
|
pub const HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(4);
|
||||||
|
|
@ -27,7 +31,7 @@ pub const HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(4);
|
||||||
/// This avoids explicit synchronization, but relies on the peer
|
/// This avoids explicit synchronization, but relies on the peer
|
||||||
/// connector actually setting up channels and these heartbeats in a
|
/// connector actually setting up channels and these heartbeats in a
|
||||||
/// specific manner that matches up with this math.
|
/// specific manner that matches up with this math.
|
||||||
pub const LIVE_PEER_DURATION: Duration = Duration::from_secs(60 + 10 + 10 + 10);
|
pub const LIVE_PEER_DURATION: Duration = Duration::from_secs(60 + 20 + 20 + 20);
|
||||||
|
|
||||||
/// Regular interval for sending keepalive `Ping` messages to each
|
/// Regular interval for sending keepalive `Ping` messages to each
|
||||||
/// connected peer.
|
/// connected peer.
|
||||||
|
|
@ -66,10 +70,20 @@ pub const CURRENT_VERSION: Version = Version(170_012);
|
||||||
pub const MIN_NETWORK_UPGRADE: NetworkUpgrade = NetworkUpgrade::Heartwood;
|
pub const MIN_NETWORK_UPGRADE: NetworkUpgrade = NetworkUpgrade::Heartwood;
|
||||||
|
|
||||||
/// The default RTT estimate for peer responses.
|
/// The default RTT estimate for peer responses.
|
||||||
pub const EWMA_DEFAULT_RTT: Duration = Duration::from_secs(1);
|
///
|
||||||
|
/// We choose a high value for the default RTT, so that new peers must prove they
|
||||||
|
/// are fast, before we prefer them to other peers. This is particularly
|
||||||
|
/// important on testnet, which has a small number of peers, which are often
|
||||||
|
/// slow.
|
||||||
|
///
|
||||||
|
/// Make the default RTT one second higher than the response timeout.
|
||||||
|
pub const EWMA_DEFAULT_RTT: Duration = Duration::from_secs(20 + 1);
|
||||||
|
|
||||||
/// The decay time for the EWMA response time metric used for load balancing.
|
/// The decay time for the EWMA response time metric used for load balancing.
|
||||||
pub const EWMA_DECAY_TIME: Duration = Duration::from_secs(60);
|
///
|
||||||
|
/// This should be much larger than the `SYNC_RESTART_TIMEOUT`, so we choose
|
||||||
|
/// better peers when we restart the sync.
|
||||||
|
pub const EWMA_DECAY_TIME: Duration = Duration::from_secs(120);
|
||||||
|
|
||||||
/// Magic numbers used to identify different Zcash networks.
|
/// Magic numbers used to identify different Zcash networks.
|
||||||
pub mod magics {
|
pub mod magics {
|
||||||
|
|
@ -95,4 +109,18 @@ mod tests {
|
||||||
|
|
||||||
assert_eq!(LIVE_PEER_DURATION, constructed_live_peer_duration);
|
assert_eq!(LIVE_PEER_DURATION, constructed_live_peer_duration);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Make sure that the timeout values are consistent with each other.
|
||||||
|
#[test]
|
||||||
|
fn ensure_timeouts_consistent() {
|
||||||
|
assert!(HANDSHAKE_TIMEOUT <= REQUEST_TIMEOUT,
|
||||||
|
"Handshakes are requests, so the handshake timeout can't be longer than the timeout for all requests.");
|
||||||
|
// This check is particularly important on testnet, which has a small
|
||||||
|
// number of peers, which are often slow.
|
||||||
|
assert!(EWMA_DEFAULT_RTT > REQUEST_TIMEOUT,
|
||||||
|
"The default EWMA RTT should be higher than the request timeout, so new peers are required to prove they are fast, before we prefer them to other peers.");
|
||||||
|
|
||||||
|
assert!(EWMA_DECAY_TIME > REQUEST_TIMEOUT,
|
||||||
|
"The EWMA decay time should be higher than the request timeout, so timed out peers are penalised by the EWMA.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -16,11 +16,29 @@ use zebra_consensus::parameters;
|
||||||
use zebra_network::{self as zn, RetryLimit};
|
use zebra_network::{self as zn, RetryLimit};
|
||||||
use zebra_state as zs;
|
use zebra_state as zs;
|
||||||
|
|
||||||
|
/// Controls the number of peers used for each ObtainTips and ExtendTips request.
|
||||||
// XXX in the future, we may not be able to access the checkpoint module.
|
// XXX in the future, we may not be able to access the checkpoint module.
|
||||||
const FANOUT: usize = checkpoint::MAX_QUEUED_BLOCKS_PER_HEIGHT;
|
const FANOUT: usize = checkpoint::MAX_QUEUED_BLOCKS_PER_HEIGHT;
|
||||||
|
/// Controls how many times we will retry each block download.
|
||||||
|
///
|
||||||
|
/// If all the retries fail, then the syncer will reset, and start downloading
|
||||||
|
/// blocks from the verified tip in the state, including blocks which previously
|
||||||
|
/// downloaded successfully.
|
||||||
|
///
|
||||||
|
/// But if a node is on a slow or unreliable network, sync restarts can result
|
||||||
|
/// in a flood of download requests, making future syncs more likely to fail.
|
||||||
|
/// So it's much faster to retry each block multiple times.
|
||||||
|
///
|
||||||
|
/// When we implement a peer reputation system, we can reduce the number of
|
||||||
|
/// retries, because we will be more likely to choose a good peer.
|
||||||
|
const BLOCK_DOWNLOAD_RETRY_LIMIT: usize = 5;
|
||||||
|
|
||||||
/// Controls how far ahead of the chain tip the syncer tries to download before
|
/// Controls how far ahead of the chain tip the syncer tries to download before
|
||||||
/// waiting for queued verifications to complete. Set to twice the maximum
|
/// waiting for queued verifications to complete. Set to twice the maximum
|
||||||
/// checkpoint distance.
|
/// checkpoint distance.
|
||||||
|
///
|
||||||
|
/// Some checkpoints contain larger blocks, so the maximum checkpoint gap can
|
||||||
|
/// represent multiple gigabytes of data.
|
||||||
const LOOKAHEAD_LIMIT: usize = checkpoint::MAX_CHECKPOINT_HEIGHT_GAP * 2;
|
const LOOKAHEAD_LIMIT: usize = checkpoint::MAX_CHECKPOINT_HEIGHT_GAP * 2;
|
||||||
|
|
||||||
/// Controls how long we wait for a tips response to return.
|
/// Controls how long we wait for a tips response to return.
|
||||||
|
|
@ -53,7 +71,23 @@ const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(MAX_CHECKPOINT_DOWNLO
|
||||||
/// their connection state.
|
/// their connection state.
|
||||||
const TIPS_RETRY_TIMEOUT: Duration = Duration::from_secs(60);
|
const TIPS_RETRY_TIMEOUT: Duration = Duration::from_secs(60);
|
||||||
/// Controls how long we wait to restart syncing after finishing a sync run.
|
/// Controls how long we wait to restart syncing after finishing a sync run.
|
||||||
const SYNC_RESTART_TIMEOUT: Duration = Duration::from_secs(20);
|
///
|
||||||
|
/// This timeout should be long enough to:
|
||||||
|
/// - allow pending downloads and verifies to complete or time out.
|
||||||
|
/// Sync restarts don't cancel downloads, so quick restarts can overload
|
||||||
|
/// network-bound nodes with lots of peers, leading to further failures.
|
||||||
|
/// (The total number of requests being processed by peers is only
|
||||||
|
/// constrained by the number of peers.)
|
||||||
|
/// - allow zcashd peers to process pending requests. If the node only has a
|
||||||
|
/// few peers, we want to clear as much peer state as possible. In
|
||||||
|
/// particular, zcashd sends "next block range" hints, based on zcashd's
|
||||||
|
/// internal model of our sync progress. But we want to discard these hints,
|
||||||
|
/// so they don't get confused with ObtainTips and ExtendTips responses.
|
||||||
|
///
|
||||||
|
/// Make sure each sync run can download an entire checkpoint, even on instances
|
||||||
|
/// with slow or unreliable networks. This is particularly important on testnet,
|
||||||
|
/// which has a small number of slow peers.
|
||||||
|
const SYNC_RESTART_TIMEOUT: Duration = Duration::from_secs(60);
|
||||||
|
|
||||||
/// Helps work around defects in the bitcoin protocol by checking whether
|
/// Helps work around defects in the bitcoin protocol by checking whether
|
||||||
/// the returned hashes actually extend a chain tip.
|
/// the returned hashes actually extend a chain tip.
|
||||||
|
|
@ -642,3 +676,32 @@ where
|
||||||
|
|
||||||
type Error = Box<dyn std::error::Error + Send + Sync + 'static>;
|
type Error = Box<dyn std::error::Error + Send + Sync + 'static>;
|
||||||
type ReportAndHash = (Report, block::Hash);
|
type ReportAndHash = (Report, block::Hash);
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
/// Make sure the timeout values are consistent with each other.
|
||||||
|
#[test]
|
||||||
|
fn ensure_timeouts_consistent() {
|
||||||
|
let max_download_retry_time =
|
||||||
|
BLOCK_DOWNLOAD_TIMEOUT.as_secs() * (BLOCK_DOWNLOAD_RETRY_LIMIT as u64);
|
||||||
|
assert!(
|
||||||
|
max_download_retry_time < BLOCK_VERIFY_TIMEOUT.as_secs(),
|
||||||
|
"Verify timeout should allow for previous block download retries"
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
BLOCK_DOWNLOAD_TIMEOUT.as_secs() * 2 < SYNC_RESTART_TIMEOUT.as_secs(),
|
||||||
|
"Sync restart should allow for pending and buffered requests to complete"
|
||||||
|
);
|
||||||
|
|
||||||
|
assert!(
|
||||||
|
TIPS_RETRY_TIMEOUT < BLOCK_VERIFY_TIMEOUT,
|
||||||
|
"Verify timeout should allow for retrying tips"
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
SYNC_RESTART_TIMEOUT < BLOCK_VERIFY_TIMEOUT,
|
||||||
|
"Verify timeout should allow for a sync restart"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue