Security: Stop RocksDB or tokio calling unexpected code when zebrad exits (#3392)
* cancel background database tasks in `FinalizedState` destructor * use `shutdown_timeout()` * Log info-level messages while waiting for background tasks to shut down * Cancel background tasks during debug_stop_at_height shutdown This commit moves the database shutdown code into a common function. * Create a constant for the tokio timeout * Add a test script for Zebra shutdown errors * Increase the shutdown timeout to 20 seconds for slower machines * add title to building zebra * use imported duration Co-authored-by: teor <teor@riseup.net> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
This commit is contained in:
parent
aef75ee9ff
commit
cc594f1c58
|
|
@ -500,16 +500,9 @@ impl FinalizedState {
|
||||||
"stopping at configured height, flushing database to disk"
|
"stopping at configured height, flushing database to disk"
|
||||||
);
|
);
|
||||||
|
|
||||||
// We'd like to drop the database here, because that closes the
|
self.shutdown();
|
||||||
// column families and the database. But Rust's ownership rules
|
|
||||||
// make that difficult, so we just flush and delete ephemeral data instead.
|
|
||||||
|
|
||||||
// TODO: remove these extra logs once bugs like #2905 are fixed
|
|
||||||
self.db.flush().expect("flush is successful");
|
|
||||||
tracing::info!("flushed database to disk");
|
|
||||||
|
|
||||||
self.delete_ephemeral();
|
|
||||||
|
|
||||||
|
// TODO: replace with a graceful shutdown (#1678)
|
||||||
Self::exit_process();
|
Self::exit_process();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -519,6 +512,7 @@ impl FinalizedState {
|
||||||
/// Exit the host process.
|
/// Exit the host process.
|
||||||
///
|
///
|
||||||
/// Designed for debugging and tests.
|
/// Designed for debugging and tests.
|
||||||
|
/// TODO: replace with a graceful shutdown (#1678)
|
||||||
fn exit_process() -> ! {
|
fn exit_process() -> ! {
|
||||||
tracing::info!("exiting Zebra");
|
tracing::info!("exiting Zebra");
|
||||||
|
|
||||||
|
|
@ -830,15 +824,59 @@ impl FinalizedState {
|
||||||
|
|
||||||
self.db.write(batch).unwrap();
|
self.db.write(batch).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Shut down the database, cleaning up background tasks and ephemeral data.
|
||||||
|
fn shutdown(&mut self) {
|
||||||
|
// Drop isn't guaranteed to run, such as when we panic, or if the tokio shutdown times out.
|
||||||
|
//
|
||||||
|
// Zebra's data should be fine if we don't clean up, because:
|
||||||
|
// - the database flushes regularly anyway
|
||||||
|
// - Zebra commits each block in a database transaction, any incomplete blocks get rolled back
|
||||||
|
// - ephemeral files are placed in the os temp dir and should be cleaned up automatically eventually
|
||||||
|
tracing::info!("flushing database to disk");
|
||||||
|
self.db.flush().expect("flush is successful");
|
||||||
|
|
||||||
|
// But we should call `cancel_all_background_work` before Zebra exits.
|
||||||
|
// If we don't, we see these kinds of errors:
|
||||||
|
// ```
|
||||||
|
// pthread lock: Invalid argument
|
||||||
|
// pure virtual method called
|
||||||
|
// terminate called without an active exception
|
||||||
|
// pthread destroy mutex: Device or resource busy
|
||||||
|
// Aborted (core dumped)
|
||||||
|
// ```
|
||||||
|
//
|
||||||
|
// The RocksDB wiki says:
|
||||||
|
// > Q: Is it safe to close RocksDB while another thread is issuing read, write or manual compaction requests?
|
||||||
|
// >
|
||||||
|
// > A: No. The users of RocksDB need to make sure all functions have finished before they close RocksDB.
|
||||||
|
// > You can speed up the waiting by calling CancelAllBackgroundWork().
|
||||||
|
//
|
||||||
|
// https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ
|
||||||
|
tracing::info!("stopping background database tasks");
|
||||||
|
self.db.cancel_all_background_work(true);
|
||||||
|
|
||||||
|
// We'd like to drop the database before deleting its files,
|
||||||
|
// because that closes the column families and the database correctly.
|
||||||
|
// But Rust's ownership rules make that difficult,
|
||||||
|
// so we just flush and delete ephemeral data instead.
|
||||||
|
//
|
||||||
|
// The RocksDB wiki says:
|
||||||
|
// > rocksdb::DB instances need to be destroyed before your main function exits.
|
||||||
|
// > RocksDB instances usually depend on some internal static variables.
|
||||||
|
// > Users need to make sure rocksdb::DB instances are destroyed before those static variables.
|
||||||
|
//
|
||||||
|
// https://github.com/facebook/rocksdb/wiki/Known-Issues
|
||||||
|
//
|
||||||
|
// But our current code doesn't seem to cause any issues.
|
||||||
|
// We might want to explicitly drop the database as part of graceful shutdown (#1678).
|
||||||
|
self.delete_ephemeral();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Drop isn't guaranteed to run, such as when we panic, or if someone stored
|
|
||||||
// their FinalizedState in a static, but it should be fine if we don't clean
|
|
||||||
// this up since the files are placed in the os temp dir and should be cleaned
|
|
||||||
// up automatically eventually.
|
|
||||||
impl Drop for FinalizedState {
|
impl Drop for FinalizedState {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
self.delete_ephemeral()
|
self.shutdown();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,63 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# test that Zebra can shut down without errors
|
||||||
|
|
||||||
|
# print every command run by this script
|
||||||
|
#set -x
|
||||||
|
|
||||||
|
# initial shutdown delay
|
||||||
|
# interesting delays are between 1-30 seconds on most machines
|
||||||
|
SHUTDOWN_DELAY=0
|
||||||
|
|
||||||
|
# each test increases the shutdown delay by this much
|
||||||
|
SHUTDOWN_DELAY_INCREMENT=1
|
||||||
|
|
||||||
|
# final shutdown delay
|
||||||
|
# the test stops after a successful run with a delay this long
|
||||||
|
SHUTDOWN_DELAY_LIMIT=30
|
||||||
|
|
||||||
|
echo "Building Zebra"
|
||||||
|
echo
|
||||||
|
|
||||||
|
cargo build --bin zebrad || exit $?
|
||||||
|
|
||||||
|
EXIT_STATUS=0
|
||||||
|
while [ $EXIT_STATUS -eq 0 ] && [ $SHUTDOWN_DELAY -le $SHUTDOWN_DELAY_LIMIT ]; do
|
||||||
|
# remove previously downloaded Zcash parameter files
|
||||||
|
#
|
||||||
|
# if you don't have these downloaded already, the killed downloads will be incomplete,
|
||||||
|
# which causes an error in Zebra
|
||||||
|
# rm -r ~/.zcash-params
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "Running Zebra for $SHUTDOWN_DELAY seconds"
|
||||||
|
echo
|
||||||
|
|
||||||
|
# shut down Zebra if this script exits while it's running,
|
||||||
|
# but ignore "no such job" errors if Zebra has already exited
|
||||||
|
trap "kill %?zebrad 2> /dev/null" EXIT
|
||||||
|
|
||||||
|
target/debug/zebrad start &
|
||||||
|
sleep $SHUTDOWN_DELAY
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "Killing Zebra after $SHUTDOWN_DELAY seconds"
|
||||||
|
echo
|
||||||
|
|
||||||
|
kill %?zebrad
|
||||||
|
wait %?zebrad
|
||||||
|
EXIT_STATUS=$?
|
||||||
|
|
||||||
|
# fix up the exit status caused by 'kill'
|
||||||
|
if [ $EXIT_STATUS -eq 143 ]; then
|
||||||
|
EXIT_STATUS=0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "Killing Zebra after $SHUTDOWN_DELAY seconds exited with $EXIT_STATUS"
|
||||||
|
echo
|
||||||
|
|
||||||
|
SHUTDOWN_DELAY=$[SHUTDOWN_DELAY + SHUTDOWN_DELAY_INCREMENT]
|
||||||
|
done
|
||||||
|
|
||||||
|
exit $EXIT_STATUS
|
||||||
|
|
@ -3,9 +3,12 @@
|
||||||
use crate::prelude::*;
|
use crate::prelude::*;
|
||||||
use abscissa_core::{Application, Component, FrameworkError, Shutdown};
|
use abscissa_core::{Application, Component, FrameworkError, Shutdown};
|
||||||
use color_eyre::Report;
|
use color_eyre::Report;
|
||||||
use std::future::Future;
|
use std::{future::Future, time::Duration};
|
||||||
use tokio::runtime::Runtime;
|
use tokio::runtime::Runtime;
|
||||||
|
|
||||||
|
/// When Zebra is shutting down, wait this long for tokio tasks to finish.
|
||||||
|
const TOKIO_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(20);
|
||||||
|
|
||||||
/// An Abscissa component which owns a Tokio runtime.
|
/// An Abscissa component which owns a Tokio runtime.
|
||||||
///
|
///
|
||||||
/// The runtime is stored as an `Option` so that when it's time to enter an async
|
/// The runtime is stored as an `Option` so that when it's time to enter an async
|
||||||
|
|
@ -40,11 +43,11 @@ async fn shutdown() {
|
||||||
/// Extension trait to centralize entry point for runnable subcommands that
|
/// Extension trait to centralize entry point for runnable subcommands that
|
||||||
/// depend on tokio
|
/// depend on tokio
|
||||||
pub(crate) trait RuntimeRun {
|
pub(crate) trait RuntimeRun {
|
||||||
fn run(&mut self, fut: impl Future<Output = Result<(), Report>>);
|
fn run(self, fut: impl Future<Output = Result<(), Report>>);
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RuntimeRun for Runtime {
|
impl RuntimeRun for Runtime {
|
||||||
fn run(&mut self, fut: impl Future<Output = Result<(), Report>>) {
|
fn run(self, fut: impl Future<Output = Result<(), Report>>) {
|
||||||
let result = self.block_on(async move {
|
let result = self.block_on(async move {
|
||||||
// Always poll the shutdown future first.
|
// Always poll the shutdown future first.
|
||||||
//
|
//
|
||||||
|
|
@ -57,12 +60,16 @@ impl RuntimeRun for Runtime {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Don't wait for long blocking tasks before shutting down
|
||||||
|
tracing::info!(
|
||||||
|
?TOKIO_SHUTDOWN_TIMEOUT,
|
||||||
|
"waiting for async tokio tasks to shut down"
|
||||||
|
);
|
||||||
|
self.shutdown_timeout(TOKIO_SHUTDOWN_TIMEOUT);
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
Ok(()) => {
|
Ok(()) => {
|
||||||
info!("shutting down Zebra");
|
info!("shutting down Zebra");
|
||||||
|
|
||||||
// Don't wait for the runtime to shut down all the tasks.
|
|
||||||
app_writer().shutdown(Shutdown::Graceful);
|
|
||||||
}
|
}
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
warn!(?error, "shutting down Zebra due to an error");
|
warn!(?error, "shutting down Zebra due to an error");
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue