From 5c60528b843bb9aae7876f67b3af601d75ff85b8 Mon Sep 17 00:00:00 2001 From: teor Date: Fri, 18 Feb 2022 12:00:24 +1000 Subject: [PATCH] fix(test): make full sync test more accurate (#3555) * feat(log): log current height when logging sync progress * fix(test): log the specific error when full sync tests fail * doc(start): remove an obsolete TODO We can't decrease this log level, because the tests rely on it. * fix(test): wait until mempool activates in full sync tests Changes the log message and log test so that the test only finishes when the mempool has activated. There is still a race condition here, between the log timer and mempool activation. But it should be very rare, because the mempool is activated immediately when `is_close_to_tip()` becomes true. * fix(test): warn when Zebra stalls below the maximum checkpoint height This also improves the full sync tests, because the warning is checked before logging a successful sync. * feat(log): warn when sync stalls downloading the genesis block * fix(test): warn when the state hasn't committed a block for a long time This also improves the full sync tests, because the warning is checked before logging a successful sync. * doc(test): update some sync acceptance test comments * fix(log): use Display formatting to log chrono::Duration Debug formatting is complicated and hard to read. * fix(log): stop saying that we've activated the mempool without checking it We're not checking if the mempool is active, so we can't say that. * fix(log): minor tidying and TODOs * fix(doc): fix a typo in the tests * fix(log): explain the post-checkpoint blocks in progress warning calculations * fix(doc): explain what could happen if we don't wait for extra blocks * fix(log): add a percent symbol to a percent log Co-authored-by: Janito Vaqueiro Ferreira Filho Co-authored-by: Janito Vaqueiro Ferreira Filho Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- Cargo.lock | 1 + zebra-consensus/src/checkpoint.rs | 2 +- zebra-consensus/src/checkpoint/list.rs | 2 +- zebra-consensus/src/lib.rs | 4 +- zebrad/Cargo.toml | 1 + zebrad/src/commands/start.rs | 188 ++++++++++++++++++++++--- zebrad/tests/acceptance.rs | 31 ++-- 7 files changed, 200 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 349e94bf..ecaf502d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5734,6 +5734,7 @@ dependencies = [ "lazy_static", "metrics", "metrics-exporter-prometheus", + "num-integer", "once_cell", "pin-project 1.0.7", "proptest", diff --git a/zebra-consensus/src/checkpoint.rs b/zebra-consensus/src/checkpoint.rs index 0905bb09..7895e09d 100644 --- a/zebra-consensus/src/checkpoint.rs +++ b/zebra-consensus/src/checkpoint.rs @@ -42,7 +42,7 @@ mod types; #[cfg(test)] mod tests; -pub(crate) use list::CheckpointList; +pub use list::CheckpointList; use types::{Progress, Progress::*}; use types::{TargetHeight, TargetHeight::*}; diff --git a/zebra-consensus/src/checkpoint/list.rs b/zebra-consensus/src/checkpoint/list.rs index 17729d4f..48f66302 100644 --- a/zebra-consensus/src/checkpoint/list.rs +++ b/zebra-consensus/src/checkpoint/list.rs @@ -53,7 +53,7 @@ const TESTNET_CHECKPOINTS: &str = include_str!("test-checkpoints.txt"); /// This is actually a bijective map, but since it is read-only, we use a /// BTreeMap, and do the value uniqueness check on initialisation. #[derive(Clone, Debug, Eq, Hash, PartialEq)] -pub(crate) struct CheckpointList(BTreeMap); +pub struct CheckpointList(BTreeMap); impl FromStr for CheckpointList { type Err = BoxError; diff --git a/zebra-consensus/src/lib.rs b/zebra-consensus/src/lib.rs index 86a0e0f9..bbfb366e 100644 --- a/zebra-consensus/src/lib.rs +++ b/zebra-consensus/src/lib.rs @@ -48,7 +48,9 @@ pub mod chain; pub mod error; pub use block::VerifyBlockError; -pub use checkpoint::{VerifyCheckpointError, MAX_CHECKPOINT_BYTE_COUNT, MAX_CHECKPOINT_HEIGHT_GAP}; +pub use checkpoint::{ + CheckpointList, VerifyCheckpointError, MAX_CHECKPOINT_BYTE_COUNT, MAX_CHECKPOINT_HEIGHT_GAP, +}; pub use config::Config; pub use error::BlockError; pub use primitives::groth16; diff --git a/zebrad/Cargo.toml b/zebrad/Cargo.toml index 48707fab..213b5d82 100644 --- a/zebrad/Cargo.toml +++ b/zebrad/Cargo.toml @@ -48,6 +48,7 @@ atty = "0.2.14" sentry = { version = "0.23.0", default-features = false, features = ["backtrace", "contexts", "reqwest", "rustls"] } sentry-tracing = "0.23.0" +num-integer = "0.1.44" rand = "0.8.5" [build-dependencies] diff --git a/zebrad/src/commands/start.rs b/zebrad/src/commands/start.rs index ed652afd..fab8bcd7 100644 --- a/zebrad/src/commands/start.rs +++ b/zebrad/src/commands/start.rs @@ -54,17 +54,23 @@ //! * runs in the background and gossips newly added mempool transactions //! to peers -use std::{cmp::max, time::Duration}; +use std::{cmp::max, ops::Add, time::Duration}; use abscissa_core::{config, Command, FrameworkError, Options, Runnable}; use chrono::Utc; use color_eyre::eyre::{eyre, Report}; use futures::FutureExt; +use num_integer::div_ceil; use tokio::{pin, select, sync::oneshot}; use tower::{builder::ServiceBuilder, util::BoxService}; use tracing_futures::Instrument; -use zebra_chain::{chain_tip::ChainTip, parameters::Network}; +use zebra_chain::{ + block::Height, + chain_tip::ChainTip, + parameters::{Network, NetworkUpgrade, POST_BLOSSOM_POW_TARGET_SPACING}, +}; +use zebra_consensus::CheckpointList; use crate::{ components::{ @@ -315,9 +321,71 @@ impl StartCmd { // TODO: replace with `MAX_CLOSE_TO_TIP_BLOCKS` after fixing slow syncing near tip (#3375) const MIN_SYNC_WARNING_BLOCKS: i32 = 60; + // The number of fractional digits in sync percentages. + const SYNC_PERCENT_FRAC_DIGITS: usize = 3; + + // The minimum number of extra blocks mined between updating a checkpoint list, + // and running an automated test that depends on that list. + // + // Makes sure that the block finalization code always runs in sync tests, + // even if the miner or test node clock is wrong by a few minutes. + // + // This is an estimate based on the time it takes to: + // - get the tip height from `zcashd`, + // - run `zebra-checkpoints` to update the checkpoint list, + // - submit a pull request, and + // - run a CI test that logs progress based on the new checkpoint height. + // + // We might add tests that sync from a cached tip state, + // so we only allow a few extra blocks here. + const MIN_BLOCKS_MINED_AFTER_CHECKPOINT_UPDATE: i32 = 10; + + // The minimum number of extra blocks after the highest checkpoint, based on: + // - the non-finalized state limit, and + // - the minimum number of extra blocks mined between a checkpoint update, + // and the automated tests for that update. + let min_after_checkpoint_blocks = i32::try_from(zebra_state::MAX_BLOCK_REORG_HEIGHT) + .expect("constant fits in i32") + + MIN_BLOCKS_MINED_AFTER_CHECKPOINT_UPDATE; + + // The minimum height of the valid best chain, based on: + // - the hard-coded checkpoint height, + // - the minimum number of blocks after the highest checkpoint. + let after_checkpoint_height = CheckpointList::new(network) + .max_height() + .add(min_after_checkpoint_blocks) + .expect("hard-coded checkpoint height is far below Height::MAX"); + + let target_block_spacing = NetworkUpgrade::target_spacing_for_height(network, Height::MAX); + let max_block_spacing = + NetworkUpgrade::minimum_difficulty_spacing_for_height(network, Height::MAX); + + // We expect the state height to increase at least once in this interval. + // + // Most chain forks are 1-7 blocks long. + // + // TODO: remove the target_block_spacing multiplier, + // after fixing slow syncing near tip (#3375) + let min_state_block_interval = max_block_spacing.unwrap_or(target_block_spacing * 4) * 2; + + // Formatted string for logging. + let max_block_spacing = max_block_spacing + .map(|duration| duration.to_string()) + .unwrap_or_else(|| "None".to_string()); + + // The last time we downloaded and verified at least one block. + // + // Initialized to the start time to simplify the code. + let mut last_state_change_time = Utc::now(); + + // The state tip height, when we last downloaded and verified at least one block. + // + // Initialized to the genesis height to simplify the code. + let mut last_state_change_height = Height(0); + loop { let now = Utc::now(); - let is_close_to_tip = sync_status.is_close_to_tip(); + let is_syncer_stopped = sync_status.is_close_to_tip(); if let Some(estimated_height) = latest_chain_tip.estimate_network_chain_tip_height(network, now) @@ -328,49 +396,133 @@ impl StartCmd { .best_tip_height() .expect("unexpected empty state: estimate requires a block height"); + // Work out the sync progress towards the estimated tip. let sync_progress = f64::from(current_height.0) / f64::from(estimated_height.0); - let sync_percent = format!("{:.3}", sync_progress * 100.0); + let sync_percent = format!( + "{:.frac$} %", + sync_progress * 100.0, + frac = SYNC_PERCENT_FRAC_DIGITS, + ); let remaining_sync_blocks = estimated_height - current_height; - // TODO: - // - estimate the remaining sync time - // - log progress, remaining blocks, and remaining time to next network upgrade - // - also add this info to the metrics + // Work out how long it has been since the state height has increased. + // + // Non-finalized forks can decrease the height, we only want to track increases. + if current_height > last_state_change_height { + last_state_change_height = current_height; + last_state_change_time = now; + } - if is_close_to_tip && remaining_sync_blocks > MIN_SYNC_WARNING_BLOCKS { + let time_since_last_state_block = last_state_change_time.signed_duration_since(now); + + // TODO: + // - log progress, remaining blocks, and remaining time to next network upgrade + // - add some of this info to the metrics + + if time_since_last_state_block > min_state_block_interval { + // The state tip height hasn't increased for a long time. + // + // Block verification can fail if the local node's clock is wrong. + warn!( + %sync_percent, + ?current_height, + %time_since_last_state_block, + %target_block_spacing, + %max_block_spacing, + ?is_syncer_stopped, + "chain updates have stalled, \ + state height has not increased for {} minutes. \ + Hint: check your network connection, \ + and your computer clock and time zone", + time_since_last_state_block.num_minutes(), + ); + } else if is_syncer_stopped && remaining_sync_blocks > MIN_SYNC_WARNING_BLOCKS { // We've stopped syncing blocks, but we estimate we're a long way from the tip. // // TODO: warn after fixing slow syncing near tip (#3375) info!( %sync_percent, + ?current_height, ?remaining_sync_blocks, - "initial sync might have stalled, or estimated tip is wrong. \ - Hint: check that your computer's clock and time zone are correct" + ?after_checkpoint_height, + %time_since_last_state_block, + "initial sync is very slow, or estimated tip is wrong. \ + Hint: check your network connection, \ + and your computer clock and time zone", ); - } else if is_close_to_tip { + } else if is_syncer_stopped && current_height <= after_checkpoint_height { + // We've stopped syncing blocks, + // but we're below the minimum height estimated from our checkpoints. + let min_minutes_after_checkpoint_update: i64 = div_ceil( + i64::from(MIN_BLOCKS_MINED_AFTER_CHECKPOINT_UPDATE) + * POST_BLOSSOM_POW_TARGET_SPACING, + 60, + ); + + warn!( + %sync_percent, + ?current_height, + ?remaining_sync_blocks, + ?after_checkpoint_height, + %time_since_last_state_block, + "initial sync is very slow, and state is below the highest checkpoint. \ + Hint: check your network connection, \ + and your computer clock and time zone. \ + Dev Hint: were the checkpoints updated in the last {} minutes?", + min_minutes_after_checkpoint_update, + ); + } else if is_syncer_stopped { // We've stayed near the tip for a while, and we've stopped syncing lots of blocks. - // - // TODO: reduce this log level after full sync tests are merged, - // and have been stable for a while (#1592) + // So we're mostly using gossiped blocks now. info!( %sync_percent, + ?current_height, ?remaining_sync_blocks, - "finished initial sync to chain tip" + %time_since_last_state_block, + "finished initial sync to chain tip, using gossiped blocks", ); } else if remaining_sync_blocks <= MAX_CLOSE_TO_TIP_BLOCKS { // We estimate we're near the tip, but we have been syncing lots of blocks recently. + // We might also be using some gossiped blocks. info!( %sync_percent, + ?current_height, ?remaining_sync_blocks, - "finished initial sync to chain tip" + %time_since_last_state_block, + "close to finishing initial sync, \ + confirming using syncer and gossiped blocks", ); } else { // We estimate we're far from the tip, and we've been syncing lots of blocks. info!( %sync_percent, + ?current_height, ?remaining_sync_blocks, - "estimated progress to chain tip" + %time_since_last_state_block, + "estimated progress to chain tip", + ); + } + } else { + let sync_percent = format!("{:.frac$} %", 0.0f64, frac = SYNC_PERCENT_FRAC_DIGITS,); + + if is_syncer_stopped { + // We've stopped syncing blocks, + // but we haven't downloaded and verified the genesis block. + warn!( + %sync_percent, + current_height = %"None", + "initial sync can't download and verify the genesis block. \ + Hint: check your network connection, \ + and your computer clock and time zone", + ); + } else { + // We're waiting for the genesis block to be committed to the state, + // before we can estimate the best chain tip. + info!( + %sync_percent, + current_height = %"None", + "initial sync is waiting to download the genesis block", ); } } diff --git a/zebrad/tests/acceptance.rs b/zebrad/tests/acceptance.rs index 3750f0ac..e24ecdff 100644 --- a/zebrad/tests/acceptance.rs +++ b/zebrad/tests/acceptance.rs @@ -739,9 +739,14 @@ const LARGE_CHECKPOINT_TEST_HEIGHT: Height = const STOP_AT_HEIGHT_REGEX: &str = "stopping at configured height"; -/// The text that should be logged when synchronization finishes and reaches the estimated chain -/// tip. -const SYNC_FINISHED_REGEX: &str = "finished initial sync to chain tip"; +/// The text that should be logged when the initial sync finishes at the estimated chain tip. +/// +/// This message is only logged if: +/// - we have reached the estimated chain tip, +/// - we have synced all known checkpoints, +/// - the syncer has stopped downloading lots of blocks, and +/// - we are regularly downloading some blocks via the syncer or block gossip. +const SYNC_FINISHED_REGEX: &str = "finished initial sync to chain tip, using gossiped blocks"; /// The maximum amount of time Zebra should take to reload after shutting down. /// @@ -903,7 +908,7 @@ fn sync_large_checkpoints_mempool_mainnet() -> Result<()> { #[test] #[ignore] fn full_sync_mainnet() { - assert!(full_sync_test(Mainnet, "FULL_SYNC_MAINNET_TIMEOUT_MINUTES").is_ok()); + full_sync_test(Mainnet, "FULL_SYNC_MAINNET_TIMEOUT_MINUTES").expect("unexpected test failure"); } /// Test if `zebrad` can fully sync the chain on testnet. @@ -914,7 +919,7 @@ fn full_sync_mainnet() { #[test] #[ignore] fn full_sync_testnet() { - assert!(full_sync_test(Testnet, "FULL_SYNC_TESTNET_TIMEOUT_MINUTES").is_ok()); + full_sync_test(Testnet, "FULL_SYNC_TESTNET_TIMEOUT_MINUTES").expect("unexpected test failure"); } /// Sync `network` until the chain tip is reached, or a timeout elapses. @@ -939,6 +944,13 @@ fn full_sync_test(network: Network, timeout_argument_name: &'static str) -> Resu ) .map(|_| ()) } else { + tracing::info!( + ?network, + "skipped full sync test, \ + set the {:?} environmental variable to run the test", + timeout_argument_name, + ); + Ok(()) } } @@ -950,9 +962,8 @@ fn full_sync_test(network: Network, timeout_argument_name: &'static str) -> Resu /// If `check_legacy_chain` is true, /// make sure the logs contain the legacy chain check. /// -/// If `enable_mempool_at_height` is `Some(Height(_))`, -/// configure `zebrad` to debug-enable the mempool at that height. -/// Then check the logs for the mempool being enabled. +/// Configure `zebrad` to debug-enable the mempool based on `mempool_behavior`, +/// then check the logs for the expected `mempool_behavior`. /// /// If `stop_regex` is encountered before the process exits, kills the /// process, and mark the test as successful, even if `height` has not @@ -1115,6 +1126,10 @@ fn create_cached_database(network: Network) -> Result<()> { true, |test_child: &mut TestChild| { // make sure pre-cached databases finish before the mandatory checkpoint + // + // TODO: this check passes even if we reach the mandatory checkpoint, + // because we sync finalized state, then non-finalized state. + // Instead, fail if we see "best non-finalized chain root" in the logs. test_child.expect_stdout_line_matches("CommitFinalized request")?; Ok(()) },