Run a block sync in CI with 2 large checkpoints (#1193)

* Run large checkpoint sync tests in CI
* Improve test child output match error context
* Add a debug_stop_at_height config
* Use stop at height in acceptance tests

And add some restart acceptance tests, to make sure the stop at height feature works correctly.
2020-10-27 19:25:29 +10:00 · 2020-10-27 19:25:29 +10:00 · ea510b7d41
parent 83c844abb5
commit ea510b7d41
5 changed files with 266 additions and 36 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -37,6 +37,14 @@ jobs:
        with:
          command: test
          args: --verbose --all
+      # Explicitly run any tests that are usually #[ignored]
+      - name: Run zebrad large sync tests
+        env:
+          RUST_BACKTRACE: full
+        uses: actions-rs/cargo@v1
+        with:
+          command: test
+          args: --verbose --manifest-path zebrad/Cargo.toml -- --ignored

  build-chain-no-features:
    name: Build zebra-chain w/o features on ubuntu-latest
--- a/zebra-state/src/config.rs
+++ b/zebra-state/src/config.rs
@ -38,6 +38,11 @@ pub struct Config {
    ///
    /// [`cache_dir`]: struct.Config.html#structfield.cache_dir
    pub ephemeral: bool,
+
+    /// Commit blocks to the finalized state up to this height, then exit Zebra.
+    ///
+    /// If `None`, continue syncing indefinitely.
+    pub debug_stop_at_height: Option<u32>,
 }

 impl Config {
@ -79,6 +84,7 @@ impl Default for Config {
            cache_dir,
            memory_cache_bytes: 512 * 1024 * 1024,
            ephemeral: false,
+            debug_stop_at_height: None,
        }
    }
 }
--- a/zebra-state/src/sled_state.rs
+++ b/zebra-state/src/sled_state.rs
@ -44,6 +44,8 @@ pub struct FinalizedState {
    // sapling_nullifiers: sled::Tree,
    // sprout_anchors: sled::Tree,
    // sapling_anchors: sled::Tree,
+    /// Commit blocks to the finalized state up to this height, then exit Zebra.
+    debug_stop_at_height: Option<block::Height>,
 }

 /// Helper trait for inserting (Key, Value) pairs into sled when both the key and
@ -116,11 +118,20 @@ impl SledDeserialize for sled::Tree {
    }
 }

+/// Where is the stop check being performed?
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+enum StopCheckContext {
+    /// Checking when the state is loaded
+    OnLoad,
+    /// Checking when a block is committed
+    OnCommit,
+}
+
 impl FinalizedState {
    pub fn new(config: &Config, network: Network) -> Self {
        let db = config.sled_config(network).open().unwrap();

-        Self {
+        let new_state = Self {
            queued_by_prev_hash: HashMap::new(),
            hash_by_height: db.open_tree(b"hash_by_height").unwrap(),
            height_by_hash: db.open_tree(b"height_by_hash").unwrap(),
@ -129,7 +140,97 @@ impl FinalizedState {
            utxo_by_outpoint: db.open_tree(b"utxo_by_outpoint").unwrap(),
            // sprout_nullifiers: db.open_tree(b"sprout_nullifiers").unwrap(),
            // sapling_nullifiers: db.open_tree(b"sapling_nullifiers").unwrap(),
+            debug_stop_at_height: config.debug_stop_at_height.map(block::Height),
+        };
+
+        if let Some(tip_height) = new_state.finalized_tip_height() {
+            new_state.stop_if_at_height_limit(
+                StopCheckContext::OnLoad,
+                tip_height,
+                new_state.finalized_tip_hash(),
+            );
        }
+
+        new_state
+    }
+
+    /// Synchronously flushes all dirty IO buffers and calls fsync.
+    ///
+    /// Returns the number of bytes flushed during this call.
+    /// See sled's `Tree.flush` for more details.
+    pub fn flush(&self) -> sled::Result<usize> {
+        let mut total_flushed = 0;
+
+        total_flushed += self.hash_by_height.flush()?;
+        total_flushed += self.height_by_hash.flush()?;
+        total_flushed += self.block_by_height.flush()?;
+        // total_flushed += self.tx_by_hash.flush()?;
+        total_flushed += self.utxo_by_outpoint.flush()?;
+        // total_flushed += self.sprout_nullifiers.flush()?;
+        // total_flushed += self.sapling_nullifiers.flush()?;
+
+        Ok(total_flushed)
+    }
+
+    /// If `block_height` is greater than or equal to the configured stop height,
+    /// stop the process.
+    ///
+    /// Flushes sled trees before exiting.
+    ///
+    /// `called_from` and `block_hash` are used for assertions and logging.
+    fn stop_if_at_height_limit(
+        &self,
+        called_from: StopCheckContext,
+        block_height: block::Height,
+        block_hash: block::Hash,
+    ) {
+        let debug_stop_at_height = match self.debug_stop_at_height {
+            Some(debug_stop_at_height) => debug_stop_at_height,
+            None => return,
+        };
+
+        if block_height < debug_stop_at_height {
+            return;
+        }
+
+        // this error is expected on load, but unexpected on commit
+        if block_height > debug_stop_at_height {
+            if called_from == StopCheckContext::OnLoad {
+                tracing::error!(
+                    ?debug_stop_at_height,
+                    ?called_from,
+                    ?block_height,
+                    ?block_hash,
+                    "previous state height is greater than the stop height",
+                );
+            } else {
+                unreachable!("committed blocks must be committed in order");
+            }
+        }
+
+        // Don't flush when the trees have just been opened
+        if called_from == StopCheckContext::OnCommit {
+            if let Err(e) = self.flush() {
+                tracing::error!(
+                    ?e,
+                    ?debug_stop_at_height,
+                    ?called_from,
+                    ?block_height,
+                    ?block_hash,
+                    "error flushing sled state before stopping"
+                );
+            }
+        }
+
+        tracing::info!(
+            ?debug_stop_at_height,
+            ?called_from,
+            ?block_height,
+            ?block_hash,
+            "stopping at configured height"
+        );
+
+        std::process::exit(0);
    }

    /// Queue a finalized block to be committed to the state.
@ -184,7 +285,7 @@ impl FinalizedState {

        trace!(?height, "Finalized block");

-        (
+        let result = (
            &self.hash_by_height,
            &self.height_by_hash,
            &self.block_by_height,
@ -222,8 +323,13 @@ impl FinalizedState {
                    // for some reason type inference fails here
                    Ok::<_, sled::transaction::ConflictableTransactionError>(hash)
                },
-            )
-            .map_err(Into::into)
+            );
+
+        if result.is_ok() {
+            self.stop_if_at_height_limit(StopCheckContext::OnCommit, height, hash);
+        }
+
+        result.map_err(Into::into)
    }

    /// Commit a finalized block to the state.
--- a/zebra-test/src/command.rs
+++ b/zebra-test/src/command.rs
@ -149,7 +149,7 @@ impl TestStatus {

 #[derive(Debug)]
 pub struct TestChild<T> {
-    dir: T,
+    pub dir: T,
    pub cmd: String,
    pub child: Child,
    pub stdout: Option<Lines<BufReader<ChildStdout>>>,
@ -239,7 +239,8 @@ impl<T> TestChild<T> {
        }

        let report = eyre!("stdout of command did not contain any matches for the given regex")
-            .context_from(self);
+            .context_from(self)
+            .with_section(|| format!("{:?}", regex).header("Match Regex:"));

        Err(report)
    }
@ -296,6 +297,7 @@ impl<T> TestOutput<T> {
            "stdout of command did not contain any matches for the given regex"
        ))
        .context_from(self)
+        .with_section(|| format!("{:?}", regex).header("Match Regex:"))
    }

    #[instrument(skip(self))]
@ -306,7 +308,9 @@ impl<T> TestOutput<T> {
            return Ok(self);
        }

-        Err(eyre!("stdout of command is not equal the given string")).context_from(self)
+        Err(eyre!("stdout of command is not equal the given string"))
+            .context_from(self)
+            .with_section(|| format!("{:?}", s).header("Match String:"))
    }

    #[instrument(skip(self))]
@ -318,7 +322,9 @@ impl<T> TestOutput<T> {
            return Ok(self);
        }

-        Err(eyre!("stdout of command is not equal to the given regex")).context_from(self)
+        Err(eyre!("stdout of command is not equal to the given regex"))
+            .context_from(self)
+            .with_section(|| format!("{:?}", regex).header("Match Regex:"))
    }

    /// Returns Ok if the program was killed, Err(Report) if exit was by another
--- a/zebrad/tests/acceptance.rs
+++ b/zebrad/tests/acceptance.rs
@ -13,6 +13,7 @@

 #![warn(warnings, missing_docs, trivial_casts, unused_qualifications)]
 #![forbid(unsafe_code)]
+#![allow(clippy::try_err)]

 use color_eyre::eyre::Result;
 use eyre::WrapErr;
@ -20,7 +21,10 @@ use tempdir::TempDir;

 use std::{borrow::Borrow, env, fs, io::Write, time::Duration};

-use zebra_chain::parameters::Network::{self, *};
+use zebra_chain::{
+    block::Height,
+    parameters::Network::{self, *},
+};
 use zebra_test::{command::TestDirExt, prelude::*};
 use zebrad::config::ZebradConfig;

@ -57,6 +61,10 @@ where
    /// Add the given config to the test directory and use it for all
    /// subsequently spawned processes.
    fn with_config(self, config: ZebradConfig) -> Result<Self>;
+
+    /// Overwrite any existing config in the test directory and use it for all
+    /// subsequently spawned processes.
+    fn replace_config(self, config: ZebradConfig) -> Result<Self>;
 }

 impl<T> ZebradTestDirExt for T
@ -97,6 +105,31 @@ where

        Ok(self)
    }
+
+    fn replace_config(self, mut config: ZebradConfig) -> Result<Self> {
+        let dir = self.borrow().path();
+
+        if !config.state.ephemeral {
+            let cache_dir = dir.join("state");
+
+            // Create dir, ignoring existing directories
+            match fs::create_dir(&cache_dir) {
+                Ok(_) => {}
+                Err(e) if (e.kind() == std::io::ErrorKind::AlreadyExists) => {}
+                Err(e) => Err(e)?,
+            };
+
+            config.state.cache_dir = cache_dir;
+        }
+
+        let config_file = dir.join("zebrad.toml");
+
+        // Remove any existing config before writing a new one
+        let _ = fs::remove_file(config_file.clone());
+        fs::File::create(config_file)?.write_all(toml::to_string(&config)?.as_bytes())?;
+
+        Ok(self)
+    }
 }

 #[test]
@ -440,16 +473,29 @@ fn valid_generated_config(command: &str, expected_output: &str) -> Result<()> {
    Ok(())
 }

+const LARGE_CHECKPOINT_TEST_HEIGHT: Height =
+    Height((zebra_consensus::MAX_CHECKPOINT_HEIGHT_GAP * 2) as u32);
+
+const STOP_AT_HEIGHT_REGEX: &str = "stopping at configured height";
+
+const STOP_ON_LOAD_TIMEOUT: Duration = Duration::from_secs(5);
+// usually it's much shorter than this
+const SMALL_CHECKPOINT_TIMEOUT: Duration = Duration::from_secs(30);
+const LARGE_CHECKPOINT_TIMEOUT: Duration = Duration::from_secs(180);
+
 /// Test if `zebrad` can sync the first checkpoint on mainnet.
 ///
 /// The first checkpoint contains a single genesis block.
 #[test]
 fn sync_one_checkpoint_mainnet() -> Result<()> {
    sync_until(
-        "verified checkpoint range",
+        Height(0),
        Mainnet,
-        Duration::from_secs(20),
+        STOP_AT_HEIGHT_REGEX,
+        SMALL_CHECKPOINT_TIMEOUT,
+        None,
    )
+    .map(|_tempdir| ())
 }

 /// Test if `zebrad` can sync the first checkpoint on testnet.
@ -458,73 +504,131 @@ fn sync_one_checkpoint_mainnet() -> Result<()> {
 #[test]
 fn sync_one_checkpoint_testnet() -> Result<()> {
    sync_until(
-        "verified checkpoint range",
+        Height(0),
        Testnet,
-        Duration::from_secs(20),
+        STOP_AT_HEIGHT_REGEX,
+        SMALL_CHECKPOINT_TIMEOUT,
+        None,
    )
+    .map(|_tempdir| ())
 }

-/// Test if `zebrad` can sync the second checkpoint on mainnet.
+/// Test if `zebrad` can sync the first checkpoint, restart, and stop on load.
+#[test]
+fn restart_stop_at_height() -> Result<()> {
+    let reuse_tempdir = sync_until(
+        Height(0),
+        Mainnet,
+        STOP_AT_HEIGHT_REGEX,
+        SMALL_CHECKPOINT_TIMEOUT,
+        None,
+    )?;
+    // if stopping corrupts the sled database, zebrad might hang here
+    // if stopping does not sync the sled database, the logs will contain OnCommit
+    sync_until(
+        Height(0),
+        Mainnet,
+        "called_from=OnLoad",
+        STOP_ON_LOAD_TIMEOUT,
+        Some(reuse_tempdir),
+    )?;
+
+    Ok(())
+}
+
+/// Test if `zebrad` can sync some larger checkpoints on mainnet.
 ///
-/// The second checkpoint contains a large number of blocks.
 /// This test might fail or timeout on slow or unreliable networks,
 /// so we don't run it by default. It also takes a lot longer than
 /// our 10 second target time for default tests.
 #[test]
 #[ignore]
-fn sync_two_checkpoints_mainnet() -> Result<()> {
-    sync_until(
-        "verified checkpoint range block_count=2000",
+fn sync_large_checkpoints_mainnet() -> Result<()> {
+    let reuse_tempdir = sync_until(
+        LARGE_CHECKPOINT_TEST_HEIGHT,
        Mainnet,
-        Duration::from_secs(120),
-    )
+        STOP_AT_HEIGHT_REGEX,
+        LARGE_CHECKPOINT_TIMEOUT,
+        None,
+    )?;
+    // if this sync fails, see the failure notes in `restart_stop_at_height`
+    sync_until(
+        (LARGE_CHECKPOINT_TEST_HEIGHT - 1).unwrap(),
+        Mainnet,
+        "previous state height is greater than the stop height",
+        STOP_ON_LOAD_TIMEOUT,
+        Some(reuse_tempdir),
+    )?;
+
+    Ok(())
 }

-/// Test if `zebrad` can sync the second checkpoint on testnet.
+/// Test if `zebrad` can sync some larger checkpoints on testnet.
 ///
-/// This test does not run by default, see `sync_two_checkpoints_mainnet`
+/// This test does not run by default, see `sync_large_checkpoints_mainnet`
 /// for details.
 #[test]
 #[ignore]
-fn sync_two_checkpoints_testnet() -> Result<()> {
+fn sync_large_checkpoints_testnet() -> Result<()> {
    sync_until(
-        "verified checkpoint range block_count=2000",
+        LARGE_CHECKPOINT_TEST_HEIGHT,
        Testnet,
-        Duration::from_secs(120),
+        STOP_AT_HEIGHT_REGEX,
+        LARGE_CHECKPOINT_TIMEOUT,
+        None,
    )
+    .map(|_tempdir| ())
 }

-/// Sync `network` until `zebrad` outputs `regex`.
-/// Returns an error if `timeout` elapses before `regex` is output.
+/// Sync `network` until `zebrad` reaches `height`, and ensure that
+/// the output contains `stop_regex`. If `reuse_tempdir` is supplied,
+/// use it as the test's temporary directory.
+///
+/// If `stop_regex` is encountered before the process exits, kill the
+/// process and mark the test as successful, even if `height` has not
+/// been reached.
+///
+/// On success, returns the associated `TempDir`. Returns an error if
+/// the child exits or `timeout` elapses before `stop_regex` is found.
 ///
 /// If your test environment does not have network access, skip
 /// this test by setting the `ZEBRA_SKIP_NETWORK_TESTS` env var.
-fn sync_until(regex: &str, network: Network, timeout: Duration) -> Result<()> {
+fn sync_until(
+    height: Height,
+    network: Network,
+    stop_regex: &str,
+    timeout: Duration,
+    reuse_tempdir: Option<TempDir>,
+) -> Result<TempDir> {
    zebra_test::init();

    if env::var_os("ZEBRA_SKIP_NETWORK_TESTS").is_some() {
        // This message is captured by the test runner, use
        // `cargo test -- --nocapture` to see it.
        eprintln!("Skipping network test because '$ZEBRA_SKIP_NETWORK_TESTS' is set.");
-        return Ok(());
+        return Ok(testdir()?);
    }

    // Use a persistent state, so we can handle large syncs
    let mut config = persistent_test_config()?;
-    // TODO: add a convenience method?
+    // TODO: add convenience methods?
    config.network.network = network;
+    config.state.debug_stop_at_height = Some(height.0);

-    let mut child = testdir()?
-        .with_config(config)?
-        .spawn_child(&["start"])?
-        .with_timeout(timeout);
+    let tempdir = if let Some(reuse_tempdir) = reuse_tempdir {
+        reuse_tempdir.replace_config(config)?
+    } else {
+        testdir()?.with_config(config)?
+    };
+
+    let mut child = tempdir.spawn_child(&["start"])?.with_timeout(timeout);

    // TODO: is there a way to check for testnet or mainnet here?
    // For example: "network=Mainnet" or "network=Testnet"
-    child.expect_stdout(regex)?;
+    child.expect_stdout(stop_regex)?;
    child.kill()?;

-    Ok(())
+    Ok(child.dir)
 }

 #[tokio::test]