From 2a68ef5acb96006ea3522031e64043435e59c146 Mon Sep 17 00:00:00 2001 From: teor Date: Tue, 8 Sep 2020 20:04:01 +1000 Subject: [PATCH] Update the peerset buffer size and sync timeout Also add a bunch of comments and documentation for network-constrained nodes, and for testnet. --- zebra-network/src/config.rs | 18 ++++++++++++++++++ zebra-network/src/constants.rs | 15 ++++++++++++--- zebrad/src/commands/start/sync.rs | 18 ++++++++++++------ 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/zebra-network/src/config.rs b/zebra-network/src/config.rs index 30ef5b98..3bcab18d 100644 --- a/zebra-network/src/config.rs +++ b/zebra-network/src/config.rs @@ -26,6 +26,10 @@ pub struct Config { pub initial_testnet_peers: HashSet, /// The initial target size for the peer set. + /// + /// If you have a slow network connection, and Zebra is having trouble + /// syncing, try reducing the peer set size. You can also reduce the peer + /// set size to reduce Zebra's bandwidth usage. pub peerset_initial_target_size: usize, /// How frequently we attempt to connect to a new peer. @@ -79,6 +83,20 @@ impl Default for Config { initial_mainnet_peers: mainnet_peers, initial_testnet_peers: testnet_peers, new_peer_interval: Duration::from_secs(60), + + // The default peerset target size should be large enough to ensure + // nodes have a reliable set of peers. But it should also be limited + // to a reasonable size, to avoid queueing too many in-flight block + // downloads. A large queue of in-flight block downloads can choke a + // constrained local network connection. + // + // We assume that Zebra nodes have at least 10 Mbps bandwidth. + // Therefore, a maximum-sized block can take up to 2 seconds to + // download. So a full default peer set adds up to 100 seconds worth + // of blocks to the queue. + // + // But the peer set for slow nodes is typically much smaller, due to + // the handshake RTT timeout. 
peerset_initial_target_size: 50, } } diff --git a/zebra-network/src/constants.rs b/zebra-network/src/constants.rs index 69323ae1..5077288d 100644 --- a/zebra-network/src/constants.rs +++ b/zebra-network/src/constants.rs @@ -9,15 +9,24 @@ use zebra_chain::parameters::NetworkUpgrade; /// The buffer size for the peer set. /// +/// This should be greater than 1 to avoid sender contention, but also reasonably +/// small, to avoid queueing too many in-flight block downloads. (A large queue +/// of in-flight block downloads can choke a constrained local network +/// connection, or a small peer set on testnet.) +/// /// We assume that Zebra nodes have at least 10 Mbps bandwidth. Therefore, a -/// maximum-sized block will take 2 seconds to download. Based on the current -/// `BLOCK_DOWNLOAD_TIMEOUT`, this is the largest buffer size we can support. -pub const PEERSET_BUFFER_SIZE: usize = 10; +/// maximum-sized block can take up to 2 seconds to download. So the peer set +/// buffer adds up to 6 seconds worth of blocks to the queue. +pub const PEERSET_BUFFER_SIZE: usize = 3; /// The timeout for requests made to a remote peer. pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(20); /// The timeout for handshakes when connecting to new peers. +/// +/// This timeout should remain small, because it helps stop slow peers getting +/// into the peer set. This is particularly important for network-constrained +/// nodes, and on testnet. pub const HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(4); /// We expect to receive a message from a live peer at least once in this time duration. diff --git a/zebrad/src/commands/start/sync.rs b/zebrad/src/commands/start/sync.rs index 644484e8..fa50bbea 100644 --- a/zebrad/src/commands/start/sync.rs +++ b/zebrad/src/commands/start/sync.rs @@ -76,18 +76,24 @@ const TIPS_RETRY_TIMEOUT: Duration = Duration::from_secs(60); /// - allow pending downloads and verifies to complete or time out. 
/// Sync restarts don't cancel downloads, so quick restarts can overload /// network-bound nodes with lots of peers, leading to further failures. -/// (The total number of requests being processed by peers is only -/// constrained by the number of peers.) +/// (The total number of requests being processed by peers is the sum of +/// the number of peers and the peer request buffer size.) +/// +/// We assume that Zebra nodes have at least 10 Mbps bandwidth. So a +/// maximum-sized block can take up to 2 seconds to download. Therefore, we +/// set this timeout (in seconds) to twice the default number of peers. (The +/// peer request buffer size is small enough that any buffered requests will +/// overlap with the post-restart ObtainTips.) +/// /// - allow zcashd peers to process pending requests. If the node only has a /// few peers, we want to clear as much peer state as possible. In /// particular, zcashd sends "next block range" hints, based on zcashd's /// internal model of our sync progress. But we want to discard these hints, /// so they don't get confused with ObtainTips and ExtendTips responses. /// -/// Make sure each sync run can download an entire checkpoint, even on instances -/// with slow or unreliable networks. This is particularly important on testnet, -/// which has a small number of slow peers. -const SYNC_RESTART_TIMEOUT: Duration = Duration::from_secs(60); +/// This timeout is particularly important on instances with slow or unreliable +/// networks, and on testnet, which has a small number of slow peers. +const SYNC_RESTART_TIMEOUT: Duration = Duration::from_secs(100); /// Helps work around defects in the bitcoin protocol by checking whether /// the returned hashes actually extend a chain tip.