From e6e859dce29268137ac7cb8609d5739effb4b93a Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Wed, 2 Sep 2020 11:20:32 +1000
Subject: [PATCH] Tweak sync timeouts

* increase the EWMA default and decay
* increase the block download retries
* increase the request and block download timeouts
* increase the sync timeout
---
 zebra-network/src/constants.rs    | 36 +++++++++++++++--
 zebrad/src/commands/start/sync.rs | 65 ++++++++++++++++++++++++++++++-
 2 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/zebra-network/src/constants.rs b/zebra-network/src/constants.rs
index 4afd9275..69323ae1 100644
--- a/zebra-network/src/constants.rs
+++ b/zebra-network/src/constants.rs
@@ -8,10 +8,14 @@ use crate::protocol::external::types::*;
 use zebra_chain::parameters::NetworkUpgrade;
 
 /// The buffer size for the peer set.
+///
+/// We assume that Zebra nodes have at least 10 Mbps bandwidth. Therefore, a
+/// maximum-sized block will take 2 seconds to download. Based on the current
+/// `BLOCK_DOWNLOAD_TIMEOUT`, this is the largest buffer size we can support.
 pub const PEERSET_BUFFER_SIZE: usize = 10;
 
 /// The timeout for requests made to a remote peer.
-pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
+pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(20);
 
 /// The timeout for handshakes when connecting to new peers.
 pub const HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(4);
@@ -27,7 +31,7 @@ pub const HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(4);
 /// This avoids explicit synchronization, but relies on the peer
 /// connector actually setting up channels and these heartbeats in a
 /// specific manner that matches up with this math.
-pub const LIVE_PEER_DURATION: Duration = Duration::from_secs(60 + 10 + 10 + 10);
+pub const LIVE_PEER_DURATION: Duration = Duration::from_secs(60 + 20 + 20 + 20);
 
 /// Regular interval for sending keepalive `Ping` messages to each
 /// connected peer.
@@ -66,10 +70,20 @@ pub const CURRENT_VERSION: Version = Version(170_012);
 pub const MIN_NETWORK_UPGRADE: NetworkUpgrade = NetworkUpgrade::Heartwood;
 
 /// The default RTT estimate for peer responses.
-pub const EWMA_DEFAULT_RTT: Duration = Duration::from_secs(1);
+///
+/// We choose a high value for the default RTT, so that new peers must prove they
+/// are fast, before we prefer them to other peers. This is particularly
+/// important on testnet, which has a small number of peers, which are often
+/// slow.
+///
+/// Make the default RTT one second higher than the response timeout.
+pub const EWMA_DEFAULT_RTT: Duration = Duration::from_secs(20 + 1);
 
 /// The decay time for the EWMA response time metric used for load balancing.
-pub const EWMA_DECAY_TIME: Duration = Duration::from_secs(60);
+///
+/// This should be much larger than the `SYNC_RESTART_TIMEOUT`, so we choose
+/// better peers when we restart the sync.
+pub const EWMA_DECAY_TIME: Duration = Duration::from_secs(120);
 
 /// Magic numbers used to identify different Zcash networks.
 pub mod magics {
@@ -95,4 +109,18 @@ mod tests {
 
         assert_eq!(LIVE_PEER_DURATION, constructed_live_peer_duration);
     }
+
+    /// Make sure that the timeout values are consistent with each other.
+    #[test]
+    fn ensure_timeouts_consistent() {
+        assert!(HANDSHAKE_TIMEOUT <= REQUEST_TIMEOUT,
+                "Handshakes are requests, so the handshake timeout can't be longer than the timeout for all requests.");
+        // This check is particularly important on testnet, which has a small
+        // number of peers, which are often slow.
+        assert!(EWMA_DEFAULT_RTT > REQUEST_TIMEOUT,
+                "The default EWMA RTT should be higher than the request timeout, so new peers are required to prove they are fast, before we prefer them to other peers.");
+
+        assert!(EWMA_DECAY_TIME > REQUEST_TIMEOUT,
+                "The EWMA decay time should be higher than the request timeout, so timed out peers are penalised by the EWMA.");
+    }
 }
diff --git a/zebrad/src/commands/start/sync.rs b/zebrad/src/commands/start/sync.rs
index f240665b..ecc0db45 100644
--- a/zebrad/src/commands/start/sync.rs
+++ b/zebrad/src/commands/start/sync.rs
@@ -16,11 +16,29 @@ use zebra_consensus::parameters;
 use zebra_network::{self as zn, RetryLimit};
 use zebra_state as zs;
 
+/// Controls the number of peers used for each ObtainTips and ExtendTips request.
 // XXX in the future, we may not be able to access the checkpoint module.
 const FANOUT: usize = checkpoint::MAX_QUEUED_BLOCKS_PER_HEIGHT;
+/// Controls how many times we will retry each block download.
+///
+/// If all the retries fail, then the syncer will reset, and start downloading
+/// blocks from the verified tip in the state, including blocks which previously
+/// downloaded successfully.
+///
+/// But if a node is on a slow or unreliable network, sync restarts can result
+/// in a flood of download requests, making future syncs more likely to fail.
+/// So it's much faster to retry each block multiple times.
+///
+/// When we implement a peer reputation system, we can reduce the number of
+/// retries, because we will be more likely to choose a good peer.
+const BLOCK_DOWNLOAD_RETRY_LIMIT: usize = 5;
+
 /// Controls how far ahead of the chain tip the syncer tries to download before
 /// waiting for queued verifications to complete. Set to twice the maximum
 /// checkpoint distance.
+///
+/// Some checkpoints contain larger blocks, so the maximum checkpoint gap can
+/// represent multiple gigabytes of data.
 const LOOKAHEAD_LIMIT: usize = checkpoint::MAX_CHECKPOINT_HEIGHT_GAP * 2;
 
 /// Controls how long we wait for a tips response to return.
@@ -53,7 +71,23 @@ const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(MAX_CHECKPOINT_DOWNLO
 /// their connection state.
 const TIPS_RETRY_TIMEOUT: Duration = Duration::from_secs(60);
 /// Controls how long we wait to restart syncing after finishing a sync run.
-const SYNC_RESTART_TIMEOUT: Duration = Duration::from_secs(20);
+///
+/// This timeout should be long enough to:
+///   - allow pending downloads and verifies to complete or time out.
+///     Sync restarts don't cancel downloads, so quick restarts can overload
+///     network-bound nodes with lots of peers, leading to further failures.
+///     (The total number of requests being processed by peers is only
+///     constrained by the number of peers.)
+///   - allow zcashd peers to process pending requests. If the node only has a
+///     few peers, we want to clear as much peer state as possible. In
+///     particular, zcashd sends "next block range" hints, based on zcashd's
+///     internal model of our sync progress. But we want to discard these hints,
+///     so they don't get confused with ObtainTips and ExtendTips responses.
+///
+/// Make sure each sync run can download an entire checkpoint, even on instances
+/// with slow or unreliable networks. This is particularly important on testnet,
+/// which has a small number of slow peers.
+const SYNC_RESTART_TIMEOUT: Duration = Duration::from_secs(60);
 
 /// Helps work around defects in the bitcoin protocol by checking whether
 /// the returned hashes actually extend a chain tip.
@@ -642,3 +676,32 @@ where
 
 type Error = Box<dyn std::error::Error + Send + Sync + 'static>;
 type ReportAndHash = (Report, block::Hash);
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    /// Make sure the timeout values are consistent with each other.
+    #[test]
+    fn ensure_timeouts_consistent() {
+        let max_download_retry_time =
+            BLOCK_DOWNLOAD_TIMEOUT.as_secs() * (BLOCK_DOWNLOAD_RETRY_LIMIT as u64);
+        assert!(
+            max_download_retry_time < BLOCK_VERIFY_TIMEOUT.as_secs(),
+            "Verify timeout should allow for previous block download retries"
+        );
+        assert!(
+            BLOCK_DOWNLOAD_TIMEOUT.as_secs() * 2 < SYNC_RESTART_TIMEOUT.as_secs(),
+            "Sync restart should allow for pending and buffered requests to complete"
+        );
+
+        assert!(
+            TIPS_RETRY_TIMEOUT < BLOCK_VERIFY_TIMEOUT,
+            "Verify timeout should allow for retrying tips"
+        );
+        assert!(
+            SYNC_RESTART_TIMEOUT < BLOCK_VERIFY_TIMEOUT,
+            "Verify timeout should allow for a sync restart"
+        );
+    }
+}