Add some additional sync correctness constraints
And adjust the sync restart delay as a consequence.
This commit is contained in:
parent
cef0a492d8
commit
3699bbdae6
|
|
@ -88,6 +88,14 @@ pub const MAX_QUEUED_BLOCKS_PER_HEIGHT: usize = 4;
|
||||||
/// bugs. So the most efficient gap is slightly less than 500 blocks.
|
/// bugs. So the most efficient gap is slightly less than 500 blocks.
|
||||||
pub const MAX_CHECKPOINT_HEIGHT_GAP: usize = 400;
|
pub const MAX_CHECKPOINT_HEIGHT_GAP: usize = 400;
|
||||||
|
|
||||||
|
/// We limit the memory usage and download contention for each checkpoint,
|
||||||
|
/// based on the cumulative size of the serialized blocks in the chain.
|
||||||
|
///
|
||||||
|
/// Deserialized blocks (in memory) are slightly larger than serialized blocks
|
||||||
|
/// (on the network or disk). But they should be within a constant factor of the
|
||||||
|
/// serialized size.
|
||||||
|
pub const MAX_CHECKPOINT_BYTE_COUNT: u64 = 32 * 1024 * 1024;
|
||||||
|
|
||||||
/// A checkpointing block verifier.
|
/// A checkpointing block verifier.
|
||||||
///
|
///
|
||||||
/// Verifies blocks using a supplied list of checkpoints. There must be at
|
/// Verifies blocks using a supplied list of checkpoints. There must be at
|
||||||
|
|
|
||||||
|
|
@ -53,6 +53,7 @@ mod transaction;
|
||||||
pub mod chain;
|
pub mod chain;
|
||||||
pub mod error;
|
pub mod error;
|
||||||
|
|
||||||
|
pub use checkpoint::MAX_CHECKPOINT_BYTE_COUNT;
|
||||||
pub use checkpoint::MAX_CHECKPOINT_HEIGHT_GAP;
|
pub use checkpoint::MAX_CHECKPOINT_HEIGHT_GAP;
|
||||||
pub use config::Config;
|
pub use config::Config;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -24,12 +24,6 @@ use std::os::unix::process::ExitStatusExt;
|
||||||
|
|
||||||
mod args;
|
mod args;
|
||||||
|
|
||||||
/// We limit the memory usage for each checkpoint, based on the cumulative size of
|
|
||||||
/// the serialized blocks in the chain. Deserialized blocks are slightly larger
|
|
||||||
/// than serialized blocks, but they should be within a constant factor of the
|
|
||||||
/// serialized size.
|
|
||||||
const MAX_CHECKPOINT_BYTE_COUNT: u64 = 32 * 1024 * 1024;
|
|
||||||
|
|
||||||
/// Initialise tracing using its defaults.
|
/// Initialise tracing using its defaults.
|
||||||
fn init_tracing() {
|
fn init_tracing() {
|
||||||
tracing_subscriber::Registry::default()
|
tracing_subscriber::Registry::default()
|
||||||
|
|
@ -135,7 +129,7 @@ fn main() -> Result<()> {
|
||||||
|
|
||||||
// check if checkpoint
|
// check if checkpoint
|
||||||
if height == block::Height(0)
|
if height == block::Height(0)
|
||||||
|| cumulative_bytes >= MAX_CHECKPOINT_BYTE_COUNT
|
|| cumulative_bytes >= zebra_consensus::MAX_CHECKPOINT_BYTE_COUNT
|
||||||
|| height_gap.0 >= zebra_consensus::MAX_CHECKPOINT_HEIGHT_GAP as u32
|
|| height_gap.0 >= zebra_consensus::MAX_CHECKPOINT_HEIGHT_GAP as u32
|
||||||
{
|
{
|
||||||
// print to output
|
// print to output
|
||||||
|
|
|
||||||
|
|
@ -88,16 +88,22 @@ const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(180);
|
||||||
|
|
||||||
/// Controls how long we wait to restart syncing after finishing a sync run.
|
/// Controls how long we wait to restart syncing after finishing a sync run.
|
||||||
///
|
///
|
||||||
/// This timeout should be long enough to:
|
/// This delay should be long enough to:
|
||||||
/// - allow zcashd peers to process pending requests. If the node only has a
|
/// - allow zcashd peers to process pending requests. If the node only has a
|
||||||
/// few peers, we want to clear as much peer state as possible. In
|
/// few peers, we want to clear as much peer state as possible. In
|
||||||
/// particular, zcashd sends "next block range" hints, based on zcashd's
|
/// particular, zcashd sends "next block range" hints, based on zcashd's
|
||||||
/// internal model of our sync progress. But we want to discard these hints,
|
/// internal model of our sync progress. But we want to discard these hints,
|
||||||
/// so they don't get confused with ObtainTips and ExtendTips responses.
|
/// so they don't get confused with ObtainTips and ExtendTips responses.
|
||||||
///
|
///
|
||||||
/// This timeout is particularly important on instances with slow or unreliable
|
/// This delay is particularly important on instances with slow or unreliable
|
||||||
/// networks, and on testnet, which has a small number of slow peers.
|
/// networks, and on testnet, which has a small number of slow peers.
|
||||||
const SYNC_RESTART_TIMEOUT: Duration = Duration::from_secs(45);
|
///
|
||||||
|
/// ## Correctness
|
||||||
|
///
|
||||||
|
/// If this delay is removed (or set too low), the syncer will
|
||||||
|
/// sometimes get stuck in a failure loop, due to leftover downloads from
|
||||||
|
/// previous sync runs.
|
||||||
|
const SYNC_RESTART_DELAY: Duration = Duration::from_secs(61);
|
||||||
|
|
||||||
type BoxError = Box<dyn std::error::Error + Send + Sync + 'static>;
|
type BoxError = Box<dyn std::error::Error + Send + Sync + 'static>;
|
||||||
|
|
||||||
|
|
@ -178,7 +184,7 @@ where
|
||||||
AlwaysHedge,
|
AlwaysHedge,
|
||||||
20,
|
20,
|
||||||
0.95,
|
0.95,
|
||||||
2 * SYNC_RESTART_TIMEOUT,
|
2 * SYNC_RESTART_DELAY,
|
||||||
);
|
);
|
||||||
// We apply a timeout to the verifier to avoid hangs due to missing earlier blocks.
|
// We apply a timeout to the verifier to avoid hangs due to missing earlier blocks.
|
||||||
let verifier = Timeout::new(verifier, BLOCK_VERIFY_TIMEOUT);
|
let verifier = Timeout::new(verifier, BLOCK_VERIFY_TIMEOUT);
|
||||||
|
|
@ -204,11 +210,11 @@ where
|
||||||
|
|
||||||
'sync: loop {
|
'sync: loop {
|
||||||
if started_once {
|
if started_once {
|
||||||
tracing::info!(timeout = ?SYNC_RESTART_TIMEOUT, "waiting to restart sync");
|
tracing::info!(timeout = ?SYNC_RESTART_DELAY, "waiting to restart sync");
|
||||||
self.prospective_tips = HashSet::new();
|
self.prospective_tips = HashSet::new();
|
||||||
self.downloads.cancel_all();
|
self.downloads.cancel_all();
|
||||||
self.update_metrics();
|
self.update_metrics();
|
||||||
sleep(SYNC_RESTART_TIMEOUT).await;
|
sleep(SYNC_RESTART_DELAY).await;
|
||||||
} else {
|
} else {
|
||||||
started_once = true;
|
started_once = true;
|
||||||
}
|
}
|
||||||
|
|
@ -613,6 +619,8 @@ where
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
|
use zebra_chain::parameters::NetworkUpgrade;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
/// Make sure the timeout values are consistent with each other.
|
/// Make sure the timeout values are consistent with each other.
|
||||||
|
|
@ -620,9 +628,45 @@ mod test {
|
||||||
fn ensure_timeouts_consistent() {
|
fn ensure_timeouts_consistent() {
|
||||||
zebra_test::init();
|
zebra_test::init();
|
||||||
|
|
||||||
|
// This constraint clears the download pipeline during a restart
|
||||||
assert!(
|
assert!(
|
||||||
BLOCK_DOWNLOAD_TIMEOUT.as_secs() * 2 < SYNC_RESTART_TIMEOUT.as_secs(),
|
SYNC_RESTART_DELAY.as_secs() > 2 * BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
|
||||||
"Sync restart should allow for pending and buffered requests to complete"
|
"Sync restart should allow for pending and buffered requests to complete"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// This constraint avoids spurious failures due to block retries timing out.
|
||||||
|
// We multiply by 2, because the Hedge can wait up to BLOCK_DOWNLOAD_TIMEOUT
|
||||||
|
// seconds before retrying.
|
||||||
|
const BLOCK_DOWNLOAD_HEDGE_TIMEOUT: u64 =
|
||||||
|
2 * BLOCK_DOWNLOAD_RETRY_LIMIT as u64 * BLOCK_DOWNLOAD_TIMEOUT.as_secs();
|
||||||
|
assert!(
|
||||||
|
SYNC_RESTART_DELAY.as_secs() > BLOCK_DOWNLOAD_HEDGE_TIMEOUT,
|
||||||
|
"Sync restart should allow for block downloads to time out on every retry"
|
||||||
|
);
|
||||||
|
|
||||||
|
// This constraint avoids spurious failures due to block download timeouts
|
||||||
|
assert!(
|
||||||
|
BLOCK_VERIFY_TIMEOUT.as_secs()
|
||||||
|
> SYNC_RESTART_DELAY.as_secs()
|
||||||
|
+ BLOCK_DOWNLOAD_HEDGE_TIMEOUT
|
||||||
|
+ BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
|
||||||
|
"Block verify should allow for a block timeout, a sync restart, and some block fetches"
|
||||||
|
);
|
||||||
|
|
||||||
|
// The minimum recommended network speed for Zebra, in bytes per second.
|
||||||
|
const MIN_NETWORK_SPEED_BYTES_PER_SEC: u64 = 10 * 1024 * 1024 / 8;
|
||||||
|
|
||||||
|
// This constraint avoids spurious failures when restarting large checkpoints
|
||||||
|
assert!(
|
||||||
|
BLOCK_VERIFY_TIMEOUT.as_secs() > SYNC_RESTART_DELAY.as_secs() + 2 * zebra_consensus::MAX_CHECKPOINT_BYTE_COUNT / MIN_NETWORK_SPEED_BYTES_PER_SEC,
|
||||||
|
"Block verify should allow for a full checkpoint download, a sync restart, then a full checkpoint re-download"
|
||||||
|
);
|
||||||
|
|
||||||
|
// This constraint avoids spurious failures after checkpointing has finished
|
||||||
|
assert!(
|
||||||
|
BLOCK_VERIFY_TIMEOUT.as_secs()
|
||||||
|
> 2 * NetworkUpgrade::Blossom.target_spacing().num_seconds() as u64,
|
||||||
|
"Block verify should allow for at least one new block to be generated and distributed"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue