Run a block sync in CI with 2 large checkpoints (#1193)

* Run large checkpoint sync tests in CI
* Improve test child output match error context
* Add a debug_stop_at_height config
* Use stop at height in acceptance tests

And add some restart acceptance tests, to make sure the stop at
height feature works correctly.
This commit is contained in:
teor 2020-10-27 19:25:29 +10:00 committed by GitHub
parent 83c844abb5
commit ea510b7d41
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 266 additions and 36 deletions

View File

@ -37,6 +37,14 @@ jobs:
with:
command: test
args: --verbose --all
# Explicitly run any tests that are usually #[ignored]
- name: Run zebrad large sync tests
env:
RUST_BACKTRACE: full
uses: actions-rs/cargo@v1
with:
command: test
args: --verbose --manifest-path zebrad/Cargo.toml -- --ignored
build-chain-no-features:
name: Build zebra-chain w/o features on ubuntu-latest

View File

@ -38,6 +38,11 @@ pub struct Config {
///
/// [`cache_dir`]: struct.Config.html#structfield.cache_dir
pub ephemeral: bool,
/// Commit blocks to the finalized state up to this height, then exit Zebra.
///
/// If `None`, continue syncing indefinitely.
pub debug_stop_at_height: Option<u32>,
}
impl Config {
@ -79,6 +84,7 @@ impl Default for Config {
cache_dir,
memory_cache_bytes: 512 * 1024 * 1024,
ephemeral: false,
debug_stop_at_height: None,
}
}
}

View File

@ -44,6 +44,8 @@ pub struct FinalizedState {
// sapling_nullifiers: sled::Tree,
// sprout_anchors: sled::Tree,
// sapling_anchors: sled::Tree,
/// Commit blocks to the finalized state up to this height, then exit Zebra.
debug_stop_at_height: Option<block::Height>,
}
/// Helper trait for inserting (Key, Value) pairs into sled when both the key and
@ -116,11 +118,20 @@ impl SledDeserialize for sled::Tree {
}
}
/// Where is the stop check being performed?
///
/// Passed to `FinalizedState::stop_if_at_height_limit`, which uses it for
/// logging, and to decide whether the sled trees are flushed before exiting
/// and whether a height above the stop height is tolerated.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
enum StopCheckContext {
/// Checking when the state is loaded
///
/// Used by `FinalizedState::new`, after the sled trees have been opened.
OnLoad,
/// Checking when a block is committed
///
/// Used after a block's commit transaction has succeeded.
OnCommit,
}
impl FinalizedState {
pub fn new(config: &Config, network: Network) -> Self {
let db = config.sled_config(network).open().unwrap();
Self {
let new_state = Self {
queued_by_prev_hash: HashMap::new(),
hash_by_height: db.open_tree(b"hash_by_height").unwrap(),
height_by_hash: db.open_tree(b"height_by_hash").unwrap(),
@ -129,7 +140,97 @@ impl FinalizedState {
utxo_by_outpoint: db.open_tree(b"utxo_by_outpoint").unwrap(),
// sprout_nullifiers: db.open_tree(b"sprout_nullifiers").unwrap(),
// sapling_nullifiers: db.open_tree(b"sapling_nullifiers").unwrap(),
debug_stop_at_height: config.debug_stop_at_height.map(block::Height),
};
if let Some(tip_height) = new_state.finalized_tip_height() {
new_state.stop_if_at_height_limit(
StopCheckContext::OnLoad,
tip_height,
new_state.finalized_tip_hash(),
);
}
new_state
}
/// Flushes every open sled tree to disk, synchronously.
///
/// Returns the total number of bytes flushed across all trees during this
/// call, or the first error returned by a tree (later trees are not flushed
/// in that case). See sled's `Tree.flush` for more details.
pub fn flush(&self) -> sled::Result<usize> {
    // Trees that are still disabled (tx_by_hash, sprout_nullifiers,
    // sapling_nullifiers) must be added here when they are enabled.
    let open_trees = [
        &self.hash_by_height,
        &self.height_by_hash,
        &self.block_by_height,
        &self.utxo_by_outpoint,
    ];

    open_trees
        .iter()
        .try_fold(0, |flushed_so_far, tree| {
            tree.flush().map(|flushed| flushed_so_far + flushed)
        })
}
/// If `block_height` is greater than or equal to the configured stop height,
/// stop the process.
///
/// Flushes sled trees before exiting.
///
/// `called_from` and `block_hash` are used for assertions and logging.
fn stop_if_at_height_limit(
&self,
called_from: StopCheckContext,
block_height: block::Height,
block_hash: block::Hash,
) {
let debug_stop_at_height = match self.debug_stop_at_height {
Some(debug_stop_at_height) => debug_stop_at_height,
None => return,
};
if block_height < debug_stop_at_height {
return;
}
// this error is expected on load, but unexpected on commit
if block_height > debug_stop_at_height {
if called_from == StopCheckContext::OnLoad {
tracing::error!(
?debug_stop_at_height,
?called_from,
?block_height,
?block_hash,
"previous state height is greater than the stop height",
);
} else {
unreachable!("committed blocks must be committed in order");
}
}
// Don't sync when the trees have just been opened
if called_from == StopCheckContext::OnCommit {
if let Err(e) = self.flush() {
tracing::error!(
?e,
?debug_stop_at_height,
?called_from,
?block_height,
?block_hash,
"error flushing sled state before stopping"
);
}
}
tracing::info!(
?debug_stop_at_height,
?called_from,
?block_height,
?block_hash,
"stopping at configured height"
);
std::process::exit(0);
}
/// Queue a finalized block to be committed to the state.
@ -184,7 +285,7 @@ impl FinalizedState {
trace!(?height, "Finalized block");
(
let result = (
&self.hash_by_height,
&self.height_by_hash,
&self.block_by_height,
@ -222,8 +323,13 @@ impl FinalizedState {
// for some reason type inference fails here
Ok::<_, sled::transaction::ConflictableTransactionError>(hash)
},
)
.map_err(Into::into)
);
if result.is_ok() {
self.stop_if_at_height_limit(StopCheckContext::OnCommit, height, hash);
}
result.map_err(Into::into)
}
/// Commit a finalized block to the state.

View File

@ -149,7 +149,7 @@ impl TestStatus {
#[derive(Debug)]
pub struct TestChild<T> {
dir: T,
pub dir: T,
pub cmd: String,
pub child: Child,
pub stdout: Option<Lines<BufReader<ChildStdout>>>,
@ -239,7 +239,8 @@ impl<T> TestChild<T> {
}
let report = eyre!("stdout of command did not contain any matches for the given regex")
.context_from(self);
.context_from(self)
.with_section(|| format!("{:?}", regex).header("Match Regex:"));
Err(report)
}
@ -296,6 +297,7 @@ impl<T> TestOutput<T> {
"stdout of command did not contain any matches for the given regex"
))
.context_from(self)
.with_section(|| format!("{:?}", regex).header("Match Regex:"))
}
#[instrument(skip(self))]
@ -306,7 +308,9 @@ impl<T> TestOutput<T> {
return Ok(self);
}
Err(eyre!("stdout of command is not equal the given string")).context_from(self)
Err(eyre!("stdout of command is not equal the given string"))
.context_from(self)
.with_section(|| format!("{:?}", s).header("Match String:"))
}
#[instrument(skip(self))]
@ -318,7 +322,9 @@ impl<T> TestOutput<T> {
return Ok(self);
}
Err(eyre!("stdout of command is not equal to the given regex")).context_from(self)
Err(eyre!("stdout of command is not equal to the given regex"))
.context_from(self)
.with_section(|| format!("{:?}", regex).header("Match Regex:"))
}
/// Returns Ok if the program was killed, Err(Report) if exit was by another

View File

@ -13,6 +13,7 @@
#![warn(warnings, missing_docs, trivial_casts, unused_qualifications)]
#![forbid(unsafe_code)]
#![allow(clippy::try_err)]
use color_eyre::eyre::Result;
use eyre::WrapErr;
@ -20,7 +21,10 @@ use tempdir::TempDir;
use std::{borrow::Borrow, env, fs, io::Write, time::Duration};
use zebra_chain::parameters::Network::{self, *};
use zebra_chain::{
block::Height,
parameters::Network::{self, *},
};
use zebra_test::{command::TestDirExt, prelude::*};
use zebrad::config::ZebradConfig;
@ -57,6 +61,10 @@ where
/// Add the given config to the test directory and use it for all
/// subsequently spawned processes.
fn with_config(self, config: ZebradConfig) -> Result<Self>;
/// Overwrite any existing config in the test directory and use it for all
/// subsequently spawned processes.
fn replace_config(self, config: ZebradConfig) -> Result<Self>;
}
impl<T> ZebradTestDirExt for T
@ -97,6 +105,31 @@ where
Ok(self)
}
/// Writes `config` to `zebrad.toml` in the test directory, replacing any
/// config file that is already there.
///
/// Unless the state is ephemeral, the config's state cache directory is
/// pointed at a `state` directory inside the test directory, which is
/// created if it does not exist yet.
///
/// Returns an error if the state directory can't be created, or if the
/// config can't be serialized or written.
fn replace_config(self, mut config: ZebradConfig) -> Result<Self> {
    let test_dir = self.borrow().path();

    if !config.state.ephemeral {
        let state_dir = test_dir.join("state");

        // A state directory left over from a previous run is fine,
        // any other failure is an error
        if let Err(error) = fs::create_dir(&state_dir) {
            if error.kind() != std::io::ErrorKind::AlreadyExists {
                return Err(error.into());
            }
        }

        config.state.cache_dir = state_dir;
    }

    let config_path = test_dir.join("zebrad.toml");

    // Best-effort removal of the old config: it might not exist
    let _ = fs::remove_file(&config_path);

    let mut config_file = fs::File::create(config_path)?;
    let serialized_config = toml::to_string(&config)?;
    config_file.write_all(serialized_config.as_bytes())?;

    Ok(self)
}
}
#[test]
@ -440,16 +473,29 @@ fn valid_generated_config(command: &str, expected_output: &str) -> Result<()> {
Ok(())
}
/// The stop height for the large checkpoint sync tests: twice the maximum
/// checkpoint height gap.
const LARGE_CHECKPOINT_TEST_HEIGHT: Height =
Height((zebra_consensus::MAX_CHECKPOINT_HEIGHT_GAP * 2) as u32);
/// Matches the message `zebrad` logs when it stops at the configured
/// `debug_stop_at_height`.
const STOP_AT_HEIGHT_REGEX: &str = "stopping at configured height";
/// How long to wait for a restarted `zebrad` to stop when it loads an
/// existing state.
const STOP_ON_LOAD_TIMEOUT: Duration = Duration::from_secs(5);
// usually it's much shorter than this
/// How long to wait for the single-checkpoint sync tests.
const SMALL_CHECKPOINT_TIMEOUT: Duration = Duration::from_secs(30);
/// How long to wait for the large checkpoint sync tests.
const LARGE_CHECKPOINT_TIMEOUT: Duration = Duration::from_secs(180);
/// Test if `zebrad` can sync the first checkpoint on mainnet.
///
/// The first checkpoint contains a single genesis block.
#[test]
fn sync_one_checkpoint_mainnet() -> Result<()> {
sync_until(
"verified checkpoint range",
Height(0),
Mainnet,
Duration::from_secs(20),
STOP_AT_HEIGHT_REGEX,
SMALL_CHECKPOINT_TIMEOUT,
None,
)
.map(|_tempdir| ())
}
/// Test if `zebrad` can sync the first checkpoint on testnet.
@ -458,73 +504,131 @@ fn sync_one_checkpoint_mainnet() -> Result<()> {
#[test]
fn sync_one_checkpoint_testnet() -> Result<()> {
sync_until(
"verified checkpoint range",
Height(0),
Testnet,
Duration::from_secs(20),
STOP_AT_HEIGHT_REGEX,
SMALL_CHECKPOINT_TIMEOUT,
None,
)
.map(|_tempdir| ())
}
/// Test if `zebrad` can sync the second checkpoint on mainnet.
/// Test if `zebrad` stops on load when it is restarted on an already synced
/// state.
///
/// First syncs the first checkpoint on mainnet with a stop height of zero,
/// then restarts `zebrad` in the same directory with the same stop height,
/// and expects the stop check to be triggered while loading the state.
#[test]
fn restart_stop_at_height() -> Result<()> {
    let synced_tempdir = sync_until(
        Height(0),
        Mainnet,
        STOP_AT_HEIGHT_REGEX,
        SMALL_CHECKPOINT_TIMEOUT,
        None,
    )?;

    // if stopping corrupts the sled database, zebrad might hang here
    // if stopping does not sync the sled database, the logs will contain OnCommit
    sync_until(
        Height(0),
        Mainnet,
        "called_from=OnLoad",
        STOP_ON_LOAD_TIMEOUT,
        Some(synced_tempdir),
    )
    .map(|_tempdir| ())
}
/// Test if `zebrad` can sync some larger checkpoints on mainnet.
///
/// The second checkpoint contains a large number of blocks.
/// This test might fail or timeout on slow or unreliable networks,
/// so we don't run it by default. It also takes a lot longer than
/// our 10 second target time for default tests.
#[test]
#[ignore]
fn sync_two_checkpoints_mainnet() -> Result<()> {
sync_until(
"verified checkpoint range block_count=2000",
fn sync_large_checkpoints_mainnet() -> Result<()> {
let reuse_tempdir = sync_until(
LARGE_CHECKPOINT_TEST_HEIGHT,
Mainnet,
Duration::from_secs(120),
)
STOP_AT_HEIGHT_REGEX,
LARGE_CHECKPOINT_TIMEOUT,
None,
)?;
// if this sync fails, see the failure notes in `restart_stop_at_height`
sync_until(
(LARGE_CHECKPOINT_TEST_HEIGHT - 1).unwrap(),
Mainnet,
"previous state height is greater than the stop height",
STOP_ON_LOAD_TIMEOUT,
Some(reuse_tempdir),
)?;
Ok(())
}
/// Test if `zebrad` can sync the second checkpoint on testnet.
/// Test if `zebrad` can sync some larger checkpoints on testnet.
///
/// This test does not run by default, see `sync_two_checkpoints_mainnet`
/// This test does not run by default, see `sync_large_checkpoints_mainnet`
/// for details.
#[test]
#[ignore]
fn sync_two_checkpoints_testnet() -> Result<()> {
fn sync_large_checkpoints_testnet() -> Result<()> {
sync_until(
"verified checkpoint range block_count=2000",
LARGE_CHECKPOINT_TEST_HEIGHT,
Testnet,
Duration::from_secs(120),
STOP_AT_HEIGHT_REGEX,
LARGE_CHECKPOINT_TIMEOUT,
None,
)
.map(|_tempdir| ())
}
/// Sync `network` until `zebrad` outputs `regex`.
/// Returns an error if `timeout` elapses before `regex` is output.
/// Sync `network` until `zebrad` reaches `height`, and ensure that
/// the output contains `stop_regex`. If `reuse_tempdir` is supplied,
/// use it as the test's temporary directory.
///
/// If `stop_regex` is encountered before the process exits, kills the
/// process, and marks the test as successful, even if `height` has not
/// been reached.
///
/// On success, returns the associated `TempDir`. Returns an error if
/// the child exits or `timeout` elapses before `stop_regex` is found.
///
/// If your test environment does not have network access, skip
/// this test by setting the `ZEBRA_SKIP_NETWORK_TESTS` env var.
fn sync_until(regex: &str, network: Network, timeout: Duration) -> Result<()> {
fn sync_until(
height: Height,
network: Network,
stop_regex: &str,
timeout: Duration,
reuse_tempdir: Option<TempDir>,
) -> Result<TempDir> {
zebra_test::init();
if env::var_os("ZEBRA_SKIP_NETWORK_TESTS").is_some() {
// This message is captured by the test runner, use
// `cargo test -- --nocapture` to see it.
eprintln!("Skipping network test because '$ZEBRA_SKIP_NETWORK_TESTS' is set.");
return Ok(());
return Ok(testdir()?);
}
// Use a persistent state, so we can handle large syncs
let mut config = persistent_test_config()?;
// TODO: add a convenience method?
// TODO: add convenience methods?
config.network.network = network;
config.state.debug_stop_at_height = Some(height.0);
let mut child = testdir()?
.with_config(config)?
.spawn_child(&["start"])?
.with_timeout(timeout);
let tempdir = if let Some(reuse_tempdir) = reuse_tempdir {
reuse_tempdir.replace_config(config)?
} else {
testdir()?.with_config(config)?
};
let mut child = tempdir.spawn_child(&["start"])?.with_timeout(timeout);
// TODO: is there a way to check for testnet or mainnet here?
// For example: "network=Mainnet" or "network=Testnet"
child.expect_stdout(regex)?;
child.expect_stdout(stop_regex)?;
child.kill()?;
Ok(())
Ok(child.dir)
}
#[tokio::test]