From c75a68e655b773ae205fa948c33e31d5e9cf5fbe Mon Sep 17 00:00:00 2001 From: teor Date: Thu, 23 Jun 2022 04:17:21 +1000 Subject: [PATCH] fix(sync): change default sync config to improve reliability (#4670) * Decrease the default lookahead limit to 400 * Increase the block verification timeout to 10 minutes * Halve the default concurrent downloads config * Try to run the spawned download task before queueing the next download * Allow verification to be cancelled if the verifier is busy --- zebrad/src/components/sync.rs | 10 +++++--- zebrad/src/components/sync/downloads.rs | 32 ++++++++++++++++++++++--- zebrad/src/config.rs | 15 ++++++++---- 3 files changed, 46 insertions(+), 11 deletions(-) diff --git a/zebrad/src/components/sync.rs b/zebrad/src/components/sync.rs index 6d49c0df..118808f1 100644 --- a/zebrad/src/components/sync.rs +++ b/zebrad/src/components/sync.rs @@ -57,7 +57,7 @@ const BLOCK_DOWNLOAD_RETRY_LIMIT: usize = 3; /// A lower bound on the user-specified lookahead limit. /// -/// Set to the maximum checkpoint interval, so the pipeline holds at least one checkpoint's +/// Set to the maximum checkpoint interval, so the pipeline holds around a checkpoint's /// worth of blocks. /// /// ## Security @@ -79,7 +79,9 @@ pub const MIN_LOOKAHEAD_LIMIT: usize = zebra_consensus::MAX_CHECKPOINT_HEIGHT_GA /// The default for the user-specified lookahead limit. /// /// See [`MIN_LOOKAHEAD_LIMIT`] for details. -pub const DEFAULT_LOOKAHEAD_LIMIT: usize = zebra_consensus::MAX_CHECKPOINT_HEIGHT_GAP * 5; +/// +/// TODO: increase to `MAX_CHECKPOINT_HEIGHT_GAP * 5`, after we implement orchard batching +pub const DEFAULT_LOOKAHEAD_LIMIT: usize = MIN_LOOKAHEAD_LIMIT; /// The expected maximum number of hashes in an ObtainTips or ExtendTips response. /// @@ -141,7 +143,9 @@ pub(super) const BLOCK_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(15); /// /// If this timeout is set too low, the syncer will sometimes get stuck in a /// failure loop. -pub(super) const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(6 * 60); +/// +/// TODO: reduce to `6 * 60`, after we implement orchard batching? +pub(super) const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(10 * 60); /// Controls how long we wait to restart syncing after finishing a sync run. /// diff --git a/zebrad/src/components/sync/downloads.rs b/zebrad/src/components/sync/downloads.rs index cc1477cd..02cf168f 100644 --- a/zebrad/src/components/sync/downloads.rs +++ b/zebrad/src/components/sync/downloads.rs @@ -118,6 +118,15 @@ pub enum BlockDownloadVerifyError { #[error("block download & verification was cancelled during download: {hash:?}")] CancelledDuringDownload { hash: block::Hash }, + #[error( + "block download & verification was cancelled while waiting for the verifier service: \ + to become ready: {height:?} {hash:?}" + )] + CancelledAwaitingVerifierReadiness { + height: block::Height, + hash: block::Hash, + }, + #[error( "block download & verification was cancelled during verification: {height:?} {hash:?}" )] @@ -282,6 +291,7 @@ where let task = tokio::spawn( async move { + // Download the block. // Prefer the cancel handle if both are ready. let rsp = tokio::select! { biased; @@ -393,12 +403,24 @@ where Err(BlockDownloadVerifyError::BehindTipHeightLimit { height: block_height, hash })?; } + // Wait for the verifier service to be ready. + let readiness = verifier.ready(); + // Prefer the cancel handle if both are ready. + let verifier = tokio::select! { + biased; + _ = &mut cancel_rx => { + trace!("task cancelled waiting for verifier service readiness"); + metrics::counter!("sync.cancelled.verify.ready.count", 1); + return Err(BlockDownloadVerifyError::CancelledAwaitingVerifierReadiness { height: block_height, hash }) + } + verifier = readiness => verifier, + }; + + // Verify the block. let rsp = verifier - .ready() - .await .map_err(|error| BlockDownloadVerifyError::VerifierServiceError { error })? .call(block); - // Prefer the cancel handle if both are ready. + let verification = tokio::select! { biased; _ = &mut cancel_rx => { @@ -408,6 +430,7 @@ where } verification = rsp => verification, }; + if verification.is_ok() { metrics::counter!("sync.verified.block.count", 1); } @@ -425,6 +448,9 @@ where .map_err(move |e| (e, hash)), ); + // Try to start the spawned task before queueing the next block request + tokio::task::yield_now().await; + self.pending.push(task); assert!( self.cancel_handles.insert(hash, cancel_tx).is_none(), diff --git a/zebrad/src/config.rs b/zebrad/src/config.rs index 910a4d7d..1087b05e 100644 --- a/zebrad/src/config.rs +++ b/zebrad/src/config.rs @@ -166,7 +166,7 @@ impl Default for MetricsSection { #[derive(Clone, Debug, Deserialize, Serialize)] #[serde(deny_unknown_fields, default)] pub struct SyncSection { - /// The maximum number of concurrent block requests during sync. + /// The maximum number of concurrent block download requests during sync. /// /// This is set to a low value by default, to avoid task and /// network contention. Increasing this value may improve @@ -178,22 +178,27 @@ pub struct SyncSection { /// download before waiting for queued verifications to complete. /// /// Increasing this limit increases the buffer size, so it reduces - /// the impact of an individual block request failing. The block - /// size limit is 2MB, so in theory, this could represent multiple + /// the impact of an individual block request failing. However, it + /// also increases memory and CPU usage if block validation stalls, + /// or there are some large blocks in the pipeline. + /// + /// The block size limit is 2MB, so in theory, this could represent multiple /// gigabytes of data, if we downloaded arbitrary blocks. However, /// because we randomly load balance outbound requests, and separate /// block download from obtaining block hashes, an adversary would /// have to control a significant fraction of our peers to lead us /// astray. /// - /// This value is clamped to an implementation-defined lower bound. + /// For reliable checkpoint syncing, Zebra enforces a + /// [`MIN_LOOKAHEAD_LIMIT`](sync::MIN_LOOKAHEAD_LIMIT). pub lookahead_limit: usize, } impl Default for SyncSection { fn default() -> Self { Self { - max_concurrent_block_requests: 50, + // TODO: increase to 50, after we implement orchard batching + max_concurrent_block_requests: 25, lookahead_limit: sync::DEFAULT_LOOKAHEAD_LIMIT, } }