From e6e859dce29268137ac7cb8609d5739effb4b93a Mon Sep 17 00:00:00 2001 From: teor Date: Wed, 2 Sep 2020 11:20:32 +1000 Subject: [PATCH] Tweak sync timeouts * increase the EWMA default and decay * increase the block download retries * increase the request and block download timeouts * increase the sync timeout --- zebra-network/src/constants.rs | 36 +++++++++++++++-- zebrad/src/commands/start/sync.rs | 65 ++++++++++++++++++++++++++++++- 2 files changed, 96 insertions(+), 5 deletions(-) diff --git a/zebra-network/src/constants.rs b/zebra-network/src/constants.rs index 4afd9275..69323ae1 100644 --- a/zebra-network/src/constants.rs +++ b/zebra-network/src/constants.rs @@ -8,10 +8,14 @@ use crate::protocol::external::types::*; use zebra_chain::parameters::NetworkUpgrade; /// The buffer size for the peer set. +/// +/// We assume that Zebra nodes have at least 10 Mbps bandwidth. Therefore, a +/// maximum-sized block will take 2 seconds to download. Based on the current +/// `BLOCK_DOWNLOAD_TIMEOUT`, this is the largest buffer size we can support. pub const PEERSET_BUFFER_SIZE: usize = 10; /// The timeout for requests made to a remote peer. -pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); +pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(20); /// The timeout for handshakes when connecting to new peers. pub const HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(4); @@ -27,7 +31,7 @@ pub const HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(4); /// This avoids explicit synchronization, but relies on the peer /// connector actually setting up channels and these heartbeats in a /// specific manner that matches up with this math. -pub const LIVE_PEER_DURATION: Duration = Duration::from_secs(60 + 10 + 10 + 10); +pub const LIVE_PEER_DURATION: Duration = Duration::from_secs(60 + 20 + 20 + 20); /// Regular interval for sending keepalive `Ping` messages to each /// connected peer. @@ -66,10 +70,20 @@ pub const CURRENT_VERSION: Version = Version(170_012); pub const MIN_NETWORK_UPGRADE: NetworkUpgrade = NetworkUpgrade::Heartwood; /// The default RTT estimate for peer responses. -pub const EWMA_DEFAULT_RTT: Duration = Duration::from_secs(1); +/// +/// We choose a high value for the default RTT, so that new peers must prove they +/// are fast, before we prefer them to other peers. This is particularly +/// important on testnet, which has a small number of peers, which are often +/// slow. +/// +/// Make the default RTT one second higher than the response timeout. +pub const EWMA_DEFAULT_RTT: Duration = Duration::from_secs(20 + 1); /// The decay time for the EWMA response time metric used for load balancing. -pub const EWMA_DECAY_TIME: Duration = Duration::from_secs(60); +/// +/// This should be much larger than the `SYNC_RESTART_TIMEOUT`, so we choose +/// better peers when we restart the sync. +pub const EWMA_DECAY_TIME: Duration = Duration::from_secs(120); /// Magic numbers used to identify different Zcash networks. pub mod magics { @@ -95,4 +109,18 @@ mod tests { assert_eq!(LIVE_PEER_DURATION, constructed_live_peer_duration); } + + /// Make sure that the timeout values are consistent with each other. + #[test] + fn ensure_timeouts_consistent() { + assert!(HANDSHAKE_TIMEOUT <= REQUEST_TIMEOUT, + "Handshakes are requests, so the handshake timeout can't be longer than the timeout for all requests."); + // This check is particularly important on testnet, which has a small + // number of peers, which are often slow. + assert!(EWMA_DEFAULT_RTT > REQUEST_TIMEOUT, + "The default EWMA RTT should be higher than the request timeout, so new peers are required to prove they are fast, before we prefer them to other peers."); + + assert!(EWMA_DECAY_TIME > REQUEST_TIMEOUT, + "The EWMA decay time should be higher than the request timeout, so timed out peers are penalised by the EWMA."); + } } diff --git a/zebrad/src/commands/start/sync.rs b/zebrad/src/commands/start/sync.rs index f240665b..ecc0db45 100644 --- a/zebrad/src/commands/start/sync.rs +++ b/zebrad/src/commands/start/sync.rs @@ -16,11 +16,29 @@ use zebra_consensus::parameters; use zebra_network::{self as zn, RetryLimit}; use zebra_state as zs; +/// Controls the number of peers used for each ObtainTips and ExtendTips request. // XXX in the future, we may not be able to access the checkpoint module. const FANOUT: usize = checkpoint::MAX_QUEUED_BLOCKS_PER_HEIGHT; +/// Controls how many times we will retry each block download. +/// +/// If all the retries fail, then the syncer will reset, and start downloading +/// blocks from the verified tip in the state, including blocks which previously +/// downloaded successfully. +/// +/// But if a node is on a slow or unreliable network, sync restarts can result +/// in a flood of download requests, making future syncs more likely to fail. +/// So it's much faster to retry each block multiple times. +/// +/// When we implement a peer reputation system, we can reduce the number of +/// retries, because we will be more likely to choose a good peer. +const BLOCK_DOWNLOAD_RETRY_LIMIT: usize = 5; + /// Controls how far ahead of the chain tip the syncer tries to download before /// waiting for queued verifications to complete. Set to twice the maximum /// checkpoint distance. +/// +/// Some checkpoints contain larger blocks, so the maximum checkpoint gap can +/// represent multiple gigabytes of data. const LOOKAHEAD_LIMIT: usize = checkpoint::MAX_CHECKPOINT_HEIGHT_GAP * 2; /// Controls how long we wait for a tips response to return. @@ -53,7 +71,23 @@ const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(MAX_CHECKPOINT_DOWNLO /// their connection state. const TIPS_RETRY_TIMEOUT: Duration = Duration::from_secs(60); /// Controls how long we wait to restart syncing after finishing a sync run. -const SYNC_RESTART_TIMEOUT: Duration = Duration::from_secs(20); +/// +/// This timeout should be long enough to: +/// - allow pending downloads and verifies to complete or time out. +/// Sync restarts don't cancel downloads, so quick restarts can overload +/// network-bound nodes with lots of peers, leading to further failures. +/// (The total number of requests being processed by peers is only +/// constrained by the number of peers.) +/// - allow zcashd peers to process pending requests. If the node only has a +/// few peers, we want to clear as much peer state as possible. In +/// particular, zcashd sends "next block range" hints, based on zcashd's +/// internal model of our sync progress. But we want to discard these hints, +/// so they don't get confused with ObtainTips and ExtendTips responses. +/// +/// Make sure each sync run can download an entire checkpoint, even on instances +/// with slow or unreliable networks. This is particularly important on testnet, +/// which has a small number of slow peers. +const SYNC_RESTART_TIMEOUT: Duration = Duration::from_secs(60); /// Helps work around defects in the bitcoin protocol by checking whether /// the returned hashes actually extend a chain tip. @@ -642,3 +676,32 @@ where type Error = Box; type ReportAndHash = (Report, block::Hash); + +#[cfg(test)] +mod test { + use super::*; + + /// Make sure the timeout values are consistent with each other. + #[test] + fn ensure_timeouts_consistent() { + let max_download_retry_time = + BLOCK_DOWNLOAD_TIMEOUT.as_secs() * (BLOCK_DOWNLOAD_RETRY_LIMIT as u64); + assert!( + max_download_retry_time < BLOCK_VERIFY_TIMEOUT.as_secs(), + "Verify timeout should allow for previous block download retries" + ); + assert!( + BLOCK_DOWNLOAD_TIMEOUT.as_secs() * 2 < SYNC_RESTART_TIMEOUT.as_secs(), + "Sync restart should allow for pending and buffered requests to complete" + ); + + assert!( + TIPS_RETRY_TIMEOUT < BLOCK_VERIFY_TIMEOUT, + "Verify timeout should allow for retrying tips" + ); + assert!( + SYNC_RESTART_TIMEOUT < BLOCK_VERIFY_TIMEOUT, + "Verify timeout should allow for a sync restart" + ); + } +}