diff --git a/zebra-consensus/src/checkpoint.rs b/zebra-consensus/src/checkpoint.rs index f4d3937f..858f422a 100644 --- a/zebra-consensus/src/checkpoint.rs +++ b/zebra-consensus/src/checkpoint.rs @@ -88,6 +88,14 @@ pub const MAX_QUEUED_BLOCKS_PER_HEIGHT: usize = 4; /// bugs. So the most efficient gap is slightly less than 500 blocks. pub const MAX_CHECKPOINT_HEIGHT_GAP: usize = 400; +/// We limit the memory usage and download contention for each checkpoint, +/// based on the cumulative size of the serialized blocks in the chain. +/// +/// Deserialized blocks (in memory) are slightly larger than serialized blocks +/// (on the network or disk). But they should be within a constant factor of the +/// serialized size. +pub const MAX_CHECKPOINT_BYTE_COUNT: u64 = 32 * 1024 * 1024; + /// A checkpointing block verifier. /// /// Verifies blocks using a supplied list of checkpoints. There must be at diff --git a/zebra-consensus/src/lib.rs b/zebra-consensus/src/lib.rs index f18397d2..37bf747a 100644 --- a/zebra-consensus/src/lib.rs +++ b/zebra-consensus/src/lib.rs @@ -53,6 +53,7 @@ mod transaction; pub mod chain; pub mod error; +pub use checkpoint::MAX_CHECKPOINT_BYTE_COUNT; pub use checkpoint::MAX_CHECKPOINT_HEIGHT_GAP; pub use config::Config; diff --git a/zebra-utils/src/bin/zebra-checkpoints/main.rs b/zebra-utils/src/bin/zebra-checkpoints/main.rs index e30187fc..679249f0 100644 --- a/zebra-utils/src/bin/zebra-checkpoints/main.rs +++ b/zebra-utils/src/bin/zebra-checkpoints/main.rs @@ -24,12 +24,6 @@ use std::os::unix::process::ExitStatusExt; mod args; -/// We limit the memory usage for each checkpoint, based on the cumulative size of -/// the serialized blocks in the chain. Deserialized blocks are slightly larger -/// than serialized blocks, but they should be within a constant factor of the -/// serialized size. -const MAX_CHECKPOINT_BYTE_COUNT: u64 = 32 * 1024 * 1024; - /// Initialise tracing using its defaults. fn init_tracing() { tracing_subscriber::Registry::default() @@ -135,7 +129,7 @@ fn main() -> Result<()> { // check if checkpoint if height == block::Height(0) - || cumulative_bytes >= MAX_CHECKPOINT_BYTE_COUNT + || cumulative_bytes >= zebra_consensus::MAX_CHECKPOINT_BYTE_COUNT || height_gap.0 >= zebra_consensus::MAX_CHECKPOINT_HEIGHT_GAP as u32 { // print to output diff --git a/zebrad/src/components/sync.rs b/zebrad/src/components/sync.rs index ca68ea8b..fe1c2b50 100644 --- a/zebrad/src/components/sync.rs +++ b/zebrad/src/components/sync.rs @@ -88,16 +88,22 @@ const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(180); /// Controls how long we wait to restart syncing after finishing a sync run. /// -/// This timeout should be long enough to: +/// This delay should be long enough to: /// - allow zcashd peers to process pending requests. If the node only has a /// few peers, we want to clear as much peer state as possible. In /// particular, zcashd sends "next block range" hints, based on zcashd's /// internal model of our sync progress. But we want to discard these hints, /// so they don't get confused with ObtainTips and ExtendTips responses. /// -/// This timeout is particularly important on instances with slow or unreliable +/// This delay is particularly important on instances with slow or unreliable /// networks, and on testnet, which has a small number of slow peers. -const SYNC_RESTART_TIMEOUT: Duration = Duration::from_secs(45); +/// +/// ## Correctness +/// +/// If this delay is removed (or set too low), the syncer will +/// sometimes get stuck in a failure loop, due to leftover downloads from +/// previous sync runs. +const SYNC_RESTART_DELAY: Duration = Duration::from_secs(61); type BoxError = Box; @@ -178,7 +184,7 @@ where AlwaysHedge, 20, 0.95, - 2 * SYNC_RESTART_TIMEOUT, + 2 * SYNC_RESTART_DELAY, ); // We apply a timeout to the verifier to avoid hangs due to missing earlier blocks. let verifier = Timeout::new(verifier, BLOCK_VERIFY_TIMEOUT); @@ -204,11 +210,11 @@ where 'sync: loop { if started_once { - tracing::info!(timeout = ?SYNC_RESTART_TIMEOUT, "waiting to restart sync"); + tracing::info!(timeout = ?SYNC_RESTART_DELAY, "waiting to restart sync"); self.prospective_tips = HashSet::new(); self.downloads.cancel_all(); self.update_metrics(); - sleep(SYNC_RESTART_TIMEOUT).await; + sleep(SYNC_RESTART_DELAY).await; } else { started_once = true; } @@ -613,6 +619,8 @@ where #[cfg(test)] mod test { + use zebra_chain::parameters::NetworkUpgrade; + use super::*; /// Make sure the timeout values are consistent with each other. @@ -620,9 +628,45 @@ mod test { fn ensure_timeouts_consistent() { zebra_test::init(); + // This constraint clears the download pipeline during a restart assert!( - BLOCK_DOWNLOAD_TIMEOUT.as_secs() * 2 < SYNC_RESTART_TIMEOUT.as_secs(), + SYNC_RESTART_DELAY.as_secs() > 2 * BLOCK_DOWNLOAD_TIMEOUT.as_secs(), "Sync restart should allow for pending and buffered requests to complete" ); + + // This constraint avoids spurious failures due to block retries timing out. + // We multiply by 2, because the Hedge can wait up to BLOCK_DOWNLOAD_TIMEOUT + // seconds before retrying. + const BLOCK_DOWNLOAD_HEDGE_TIMEOUT: u64 = + 2 * BLOCK_DOWNLOAD_RETRY_LIMIT as u64 * BLOCK_DOWNLOAD_TIMEOUT.as_secs(); + assert!( + SYNC_RESTART_DELAY.as_secs() > BLOCK_DOWNLOAD_HEDGE_TIMEOUT, + "Sync restart should allow for block downloads to time out on every retry" + ); + + // This constraint avoids spurious failures due to block download timeouts + assert!( + BLOCK_VERIFY_TIMEOUT.as_secs() + > SYNC_RESTART_DELAY.as_secs() + + BLOCK_DOWNLOAD_HEDGE_TIMEOUT + + BLOCK_DOWNLOAD_TIMEOUT.as_secs(), + "Block verify should allow for a block timeout, a sync restart, and some block fetches" + ); + + // The minimum recommended network speed for Zebra, in bytes per second. + const MIN_NETWORK_SPEED_BYTES_PER_SEC: u64 = 10 * 1024 * 1024 / 8; + + // This constraint avoids spurious failures when restarting large checkpoints + assert!( + BLOCK_VERIFY_TIMEOUT.as_secs() > SYNC_RESTART_DELAY.as_secs() + 2 * zebra_consensus::MAX_CHECKPOINT_BYTE_COUNT / MIN_NETWORK_SPEED_BYTES_PER_SEC, + "Block verify should allow for a full checkpoint download, a sync restart, then a full checkpoint re-download" + ); + + // This constraint avoids spurious failures after checkpointing has finished + assert!( + BLOCK_VERIFY_TIMEOUT.as_secs() + > 2 * NetworkUpgrade::Blossom.target_spacing().num_seconds() as u64, + "Block verify should allow for at least one new block to be generated and distributed" + ); } }