//! Provides low-level access to RocksDB using some database-specific types.
//!
//! This module makes sure that:
//! - all disk writes happen inside a RocksDB transaction ([`WriteBatch`]), and
//! - format-specific invariants are maintained.
//!
//! # Correctness
//!
//! The [`crate::constants::DATABASE_FORMAT_VERSION`] constant must
//! be incremented each time the database format (column families,
//! serialization format, etc.) changes.

use std::{fmt::Debug, path::Path};

use rlimit::increase_nofile_limit;

use zebra_chain::parameters::Network;

use crate::{
    service::finalized_state::disk_format::{FromDisk, IntoDisk},
    Config,
};

#[cfg(any(test, feature = "proptest-impl"))]
mod tests;

/// Wrapper struct to ensure low-level database access goes through the correct API.
pub struct DiskDb {
    /// The inner RocksDB database.
    db: rocksdb::DB,

    /// The configured temporary database setting.
    ///
    /// If true, the database files are deleted on drop.
    ephemeral: bool,
}

/// Wrapper struct to ensure low-level database writes go through the correct API.
///
/// [`rocksdb::WriteBatch`] is a batched set of database updates,
/// which must be written to the database using `DiskDb::write(batch)`.
#[must_use = "batches must be written to the database"]
pub struct DiskWriteBatch {
    /// The inner RocksDB write batch.
    batch: rocksdb::WriteBatch,
}

/// Helper trait for inserting (Key, Value) pairs into rocksdb with a consistently
/// defined format.
pub trait WriteDisk {
    /// Serialize and insert the given key and value into a rocksdb column family,
    /// overwriting any existing `value` for `key`.
    fn zs_insert<K, V>(&mut self, cf: &rocksdb::ColumnFamily, key: K, value: V)
    where
        K: IntoDisk + Debug,
        V: IntoDisk;

    /// Remove the given key from the rocksdb column family, if it exists.
    fn zs_delete<K>(&mut self, cf: &rocksdb::ColumnFamily, key: K)
    where
        K: IntoDisk + Debug;
}

impl WriteDisk for DiskWriteBatch {
    fn zs_insert<K, V>(&mut self, cf: &rocksdb::ColumnFamily, key: K, value: V)
    where
        K: IntoDisk + Debug,
        V: IntoDisk,
    {
        let key_bytes = key.as_bytes();
        let value_bytes = value.as_bytes();
        self.batch.put_cf(cf, key_bytes, value_bytes);
    }

    fn zs_delete<K>(&mut self, cf: &rocksdb::ColumnFamily, key: K)
    where
        K: IntoDisk + Debug,
    {
        let key_bytes = key.as_bytes();
        self.batch.delete_cf(cf, key_bytes);
    }
}
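
// Editor's example: a minimal sketch showing how `WriteDisk` and `DiskDb::write`
// fit together; it is not part of the original module. It assumes
// `block::Height` and `block::Hash` implement `IntoDisk`, as the `disk_format`
// module defines for this database, and it uses the "hash_by_height" column
// family opened in `DiskDb::new`.
#[allow(dead_code)]
fn example_batched_write(
    db: &DiskDb,
    height: zebra_chain::block::Height,
    hash: zebra_chain::block::Hash,
) {
    let hash_by_height = db
        .cf_handle("hash_by_height")
        .expect("the hash_by_height column family is created in DiskDb::new");

    // Collect all the updates into a single batch, so they are committed
    // to the database atomically.
    let mut batch = DiskWriteBatch::new();
    batch.zs_insert(hash_by_height, height, hash);

    // `DiskDb::write` consumes the batch and commits it in one RocksDB transaction.
    db.write(batch).expect("unexpected database write failure");
}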

/// Helper trait for retrieving values from rocksdb column families with a
/// consistently defined format.
pub trait ReadDisk {
    /// Returns the value for `key` in the rocksdb column family `cf`, if present.
    fn zs_get<K, V>(&self, cf: &rocksdb::ColumnFamily, key: &K) -> Option<V>
    where
        K: IntoDisk,
        V: FromDisk;

    /// Check if a rocksdb column family `cf` contains the serialized form of `key`.
    fn zs_contains<K>(&self, cf: &rocksdb::ColumnFamily, key: &K) -> bool
    where
        K: IntoDisk;
}

impl ReadDisk for DiskDb {
    fn zs_get<K, V>(&self, cf: &rocksdb::ColumnFamily, key: &K) -> Option<V>
    where
        K: IntoDisk,
        V: FromDisk,
    {
        let key_bytes = key.as_bytes();

        // We use `get_pinned_cf` to avoid taking ownership of the serialized
        // value, because we're going to deserialize it anyway, which avoids an
        // extra copy.
        //
        // TODO: move disk reads to a blocking thread (#2188)
        let value_bytes = self
            .db
            .get_pinned_cf(cf, key_bytes)
            .expect("expected that disk errors would not occur");

        value_bytes.map(V::from_bytes)
    }

    fn zs_contains<K>(&self, cf: &rocksdb::ColumnFamily, key: &K) -> bool
    where
        K: IntoDisk,
    {
        let key_bytes = key.as_bytes();

        // We use `get_pinned_cf` to avoid taking ownership of the serialized
        // value, because we don't use the value at all. This avoids an extra copy.
        //
        // TODO: move disk reads to a blocking thread (#2188)
        self.db
            .get_pinned_cf(cf, key_bytes)
            .expect("expected that disk errors would not occur")
            .is_some()
    }
}
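
// Editor's example: a minimal sketch of a typed read through `ReadDisk`; it is
// not part of the original module. It assumes `block::Height` implements
// `IntoDisk` and `block::Hash` implements `FromDisk`, as the `disk_format`
// module defines for this database.
#[allow(dead_code)]
fn example_typed_read(
    db: &DiskDb,
    height: zebra_chain::block::Height,
) -> Option<zebra_chain::block::Hash> {
    let hash_by_height = db
        .cf_handle("hash_by_height")
        .expect("the hash_by_height column family is created in DiskDb::new");

    // `zs_get` serializes the key, reads the raw bytes from RocksDB, then
    // deserializes the value, so callers never handle raw bytes directly.
    db.zs_get(hash_by_height, &height)
}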

impl DiskWriteBatch {
    /// Creates and returns a new, empty write batch.
    pub fn new() -> Self {
        DiskWriteBatch {
            batch: rocksdb::WriteBatch::default(),
        }
    }
}

impl DiskDb {
    /// The ideal open file limit for Zebra.
    const IDEAL_OPEN_FILE_LIMIT: u64 = 1024;

    /// The minimum number of open files for Zebra to operate normally. Also used
    /// as the default open file limit, when the OS doesn't tell us how many
    /// files we can use.
    ///
    /// We want 100+ file descriptors for peers, and 100+ for the database.
    ///
    /// On Windows, the default limit is 512 high-level I/O files, and 8192
    /// low-level I/O files:
    /// <https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/setmaxstdio?view=msvc-160#remarks>
    const MIN_OPEN_FILE_LIMIT: u64 = 512;

    /// The number of files used internally by Zebra.
    ///
    /// Zebra uses file descriptors for OS libraries (10+), polling APIs (10+),
    /// stdio (3), and other OS facilities (2+).
    const RESERVED_FILE_COUNT: u64 = 48;

    /// Opens or creates the database at a path based on `config` and `network`,
    /// and returns a low-level wrapper for it.
    ///
    /// # Panics
    ///
    /// If the database cannot be opened.
    pub fn new(config: &Config, network: Network) -> DiskDb {
        let path = config.db_path(network);
        let db_options = DiskDb::options();

        let column_families = vec![
            rocksdb::ColumnFamilyDescriptor::new("hash_by_height", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new("height_by_hash", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new("block_by_height", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new("tx_by_hash", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new("utxo_by_outpoint", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new("sprout_nullifiers", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new("sapling_nullifiers", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new("orchard_nullifiers", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new("sprout_anchors", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new("sapling_anchors", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new("orchard_anchors", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new("sprout_note_commitment_tree", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new(
                "sapling_note_commitment_tree",
                db_options.clone(),
            ),
            rocksdb::ColumnFamilyDescriptor::new(
                "orchard_note_commitment_tree",
                db_options.clone(),
            ),
            rocksdb::ColumnFamilyDescriptor::new("history_tree", db_options.clone()),
            rocksdb::ColumnFamilyDescriptor::new("tip_chain_value_pool", db_options.clone()),
        ];

        // TODO: move opening the database to a blocking thread (#2188)
        let db_result = rocksdb::DB::open_cf_descriptors(&db_options, &path, column_families);

        match db_result {
            Ok(db) => {
                info!("Opened Zebra state cache at {}", path.display());

                let db = DiskDb {
                    db,
                    ephemeral: config.ephemeral,
                };

                db.assert_default_cf_is_empty();

                db
            }

            // TODO: provide a different hint if the disk is full, see #1623
            Err(e) => panic!(
                "Opening database {:?} failed: {:?}. \
                 Hint: Check if another zebrad process is running. \
                 Try changing the state cache_dir in the Zebra config.",
                path, e,
            ),
        }
    }
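
    // Editor's note, not part of the original module: a typical call site looks
    // like this sketch, assuming the `zebra-state` `Config` implements `Default`:
    //
    //     let db = DiskDb::new(&Config::default(), Network::Mainnet);
    //
    // Dropping the returned `DiskDb` calls `shutdown`, which flushes the
    // database and deletes any ephemeral data (see the `Drop` impl below).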

    // Read methods

    /// Returns the `Path` where the files used by this database are located.
    pub fn path(&self) -> &Path {
        self.db.path()
    }

    /// Returns the column family handle for `cf_name`.
    pub fn cf_handle(&self, cf_name: &str) -> Option<&rocksdb::ColumnFamily> {
        self.db.cf_handle(cf_name)
    }

    /// Returns a forward iterator over the items in `cf_handle`, starting from the first key.
    ///
    /// TODO: add an iterator wrapper struct that does disk reads in a blocking thread (#2188)
    pub fn forward_iterator(&self, cf_handle: &rocksdb::ColumnFamily) -> rocksdb::DBIterator {
        self.db.iterator_cf(cf_handle, rocksdb::IteratorMode::Start)
    }

    /// Returns a reverse iterator over the items in `cf_handle`, starting from the last key.
    ///
    /// TODO: add an iterator wrapper struct that does disk reads in a blocking thread (#2188)
    pub fn reverse_iterator(&self, cf_handle: &rocksdb::ColumnFamily) -> rocksdb::DBIterator {
        self.db.iterator_cf(cf_handle, rocksdb::IteratorMode::End)
    }
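
    // Editor's note, not part of the original module: with the rocksdb crate
    // version this module was written against, these iterators yield
    // `(key, value)` pairs of raw bytes, so a column family can be scanned
    // like this sketch:
    //
    //     for (key_bytes, value_bytes) in db.forward_iterator(cf_handle) {
    //         // deserialize with `FromDisk::from_bytes` as needed
    //     }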

    /// Returns true if `cf_handle` does not contain any entries.
    pub fn is_empty(&self, cf_handle: &rocksdb::ColumnFamily) -> bool {
        // Empty column families return invalid iterators.
        !self.forward_iterator(cf_handle).valid()
    }

    // Write methods

    /// Writes `batch` to the database.
    pub fn write(&self, batch: DiskWriteBatch) -> Result<(), rocksdb::Error> {
        // TODO: move writing to the database to a blocking thread (#2188)
        self.db.write(batch.batch)
    }

    // Private methods

    /// Returns the database options for the finalized state database.
    fn options() -> rocksdb::Options {
        let mut opts = rocksdb::Options::default();
        opts.create_if_missing(true);
        opts.create_missing_column_families(true);

        let open_file_limit = DiskDb::increase_open_file_limit();
        let db_file_limit = DiskDb::get_db_open_file_limit(open_file_limit);

        // If the current limit is very large, set the DB limit using the ideal limit
        let ideal_limit = DiskDb::get_db_open_file_limit(DiskDb::IDEAL_OPEN_FILE_LIMIT)
            .try_into()
            .expect("ideal open file limit fits in a c_int");
        let db_file_limit = db_file_limit.try_into().unwrap_or(ideal_limit);
        opts.set_max_open_files(db_file_limit);

        opts
    }

    /// Calculate the database's share of `open_file_limit`.
    fn get_db_open_file_limit(open_file_limit: u64) -> u64 {
        // Reserve Zebra's internal file count, then split the remaining files
        // evenly between the database and the peer connections.
        (open_file_limit - DiskDb::RESERVED_FILE_COUNT) / 2
    }
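
    // Editor's note, not part of the original module: with the constants above,
    // the database's share works out to:
    //
    //     ideal:   (1024 - 48) / 2 = 488 open files
    //     minimum: ( 512 - 48) / 2 = 232 open files
    //
    // The other half of the remaining limit is left for peer connections.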

    /// Increase the open file limit for this process to `IDEAL_OPEN_FILE_LIMIT`.
    /// If the limit can't be changed, assume it is at least `MIN_OPEN_FILE_LIMIT`.
    ///
    /// If the current limit is above `IDEAL_OPEN_FILE_LIMIT`, leaves it
    /// unchanged.
    ///
    /// Returns the current limit, after any successful increases.
    ///
    /// # Panics
    ///
    /// If the open file limit cannot be increased to `MIN_OPEN_FILE_LIMIT`.
    fn increase_open_file_limit() -> u64 {
        // Zebra mainly uses TCP sockets (`zebra-network`) and low-level files
        // (`zebra-state` database).
        //
        // On Unix-based platforms, `increase_nofile_limit` changes the limit for
        // both database files and TCP connections.
        //
        // But it doesn't do anything on Windows in rlimit 0.7.0.
        //
        // On Windows, the default limits are:
        // - 512 high-level stream I/O files (via the C standard functions),
        // - 8192 low-level I/O files (via the Unix C functions), and
        // - 1000 TCP Control Block entries (network connections).
        //
        // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/setmaxstdio?view=msvc-160#remarks
        // http://smallvoid.com/article/winnt-tcpip-max-limit.html
        //
        // `zebra-state`'s `IDEAL_OPEN_FILE_LIMIT` is much less than
        // the Windows low-level I/O file limit.
        //
        // The [`setmaxstdio` and `getmaxstdio`](https://docs.rs/rlimit/latest/rlimit/#windows)
        // functions from the `rlimit` crate only change the high-level I/O file limit.
        //
        // `zebra-network`'s default connection limit is much less than
        // the TCP Control Block limit on Windows.

        // We try to increase the limit to the ideal value; on error, we assume
        // the minimum value.
        let current_limit = match increase_nofile_limit(DiskDb::IDEAL_OPEN_FILE_LIMIT) {
            Ok(current_limit) => current_limit,
            Err(limit_error) => {
                // These errors can happen due to sandboxing or unsupported system calls,
                // even if the file limit is high enough.
                info!(
                    ?limit_error,
                    min_limit = ?DiskDb::MIN_OPEN_FILE_LIMIT,
                    ideal_limit = ?DiskDb::IDEAL_OPEN_FILE_LIMIT,
                    "unable to increase the open file limit, \
                     assuming Zebra can open a minimum number of files"
                );

                return DiskDb::MIN_OPEN_FILE_LIMIT;
            }
        };

        if current_limit < DiskDb::MIN_OPEN_FILE_LIMIT {
            panic!(
                "open file limit too low: \
                 unable to set the number of open files to {}, \
                 the minimum number of files required by Zebra. \
                 Current limit is {:?}. \
                 Hint: Increase the open file limit to {} before launching Zebra",
                DiskDb::MIN_OPEN_FILE_LIMIT,
                current_limit,
                DiskDb::IDEAL_OPEN_FILE_LIMIT
            );
        } else if current_limit < DiskDb::IDEAL_OPEN_FILE_LIMIT {
            warn!(
                ?current_limit,
                min_limit = ?DiskDb::MIN_OPEN_FILE_LIMIT,
                ideal_limit = ?DiskDb::IDEAL_OPEN_FILE_LIMIT,
                "the maximum number of open files is below Zebra's ideal limit. \
                 Hint: Increase the open file limit to {} before launching Zebra",
                DiskDb::IDEAL_OPEN_FILE_LIMIT
            );
        } else if cfg!(windows) {
            info!(
                min_limit = ?DiskDb::MIN_OPEN_FILE_LIMIT,
                ideal_limit = ?DiskDb::IDEAL_OPEN_FILE_LIMIT,
                "assuming the open file limit is high enough for Zebra",
            );
        } else {
            info!(
                ?current_limit,
                min_limit = ?DiskDb::MIN_OPEN_FILE_LIMIT,
                ideal_limit = ?DiskDb::IDEAL_OPEN_FILE_LIMIT,
                "the open file limit is high enough for Zebra",
            );
        }

        current_limit
    }

    /// Shut down the database, cleaning up background tasks and ephemeral data.
    ///
    /// TODO: make private after the stop height check has moved to the syncer (#3442)
    ///       move shutting down the database to a blocking thread (#2188)
    pub(crate) fn shutdown(&mut self) {
        self.assert_default_cf_is_empty();

        // Drop isn't guaranteed to run, such as when we panic, or if the tokio shutdown times out.
        //
        // Zebra's data should be fine if we don't clean up, because:
        // - the database flushes regularly anyway
        // - Zebra commits each block in a database transaction, any incomplete blocks get rolled back
        // - ephemeral files are placed in the OS temp dir and should be cleaned up automatically eventually
        info!("flushing database to disk");
        self.db.flush().expect("flush is successful");

        // But we should call `cancel_all_background_work` before Zebra exits.
        // If we don't, we see these kinds of errors:
        // ```
        // pthread lock: Invalid argument
        // pure virtual method called
        // terminate called without an active exception
        // pthread destroy mutex: Device or resource busy
        // Aborted (core dumped)
        // ```
        //
        // The RocksDB wiki says:
        // > Q: Is it safe to close RocksDB while another thread is issuing read, write or manual compaction requests?
        // >
        // > A: No. The users of RocksDB need to make sure all functions have finished before they close RocksDB.
        // > You can speed up the waiting by calling CancelAllBackgroundWork().
        //
        // https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ
        info!("stopping background database tasks");
        self.db.cancel_all_background_work(true);

        // We'd like to drop the database before deleting its files,
        // because that closes the column families and the database correctly.
        // But Rust's ownership rules make that difficult,
        // so we just flush and delete ephemeral data instead.
        //
        // The RocksDB wiki says:
        // > rocksdb::DB instances need to be destroyed before your main function exits.
        // > RocksDB instances usually depend on some internal static variables.
        // > Users need to make sure rocksdb::DB instances are destroyed before those static variables.
        //
        // https://github.com/facebook/rocksdb/wiki/Known-Issues
        //
        // But our current code doesn't seem to cause any issues.
        // We might want to explicitly drop the database as part of graceful shutdown (#1678).
        self.delete_ephemeral();
    }

    /// If the database is `ephemeral`, delete it.
    fn delete_ephemeral(&self) {
        if self.ephemeral {
            let path = self.path();
            info!(cache_path = ?path, "removing temporary database files");

            // We'd like to use `rocksdb::Env::mem_env` for ephemeral databases,
            // but the Zcash blockchain might not fit in memory. So we just
            // delete the database files instead.
            //
            // We'd like to call `DB::destroy` here, but calling destroy on a
            // live DB is undefined behaviour:
            // https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ#basic-readwrite
            //
            // So we assume that all the database files are under `path`, and
            // delete them using standard filesystem APIs. Deleting open files
            // might cause errors on non-Unix platforms, so we ignore the result.
            // (The OS will delete them eventually anyway.)
            let res = std::fs::remove_dir_all(path);

            // TODO: downgrade to debug once bugs like #2905 are fixed,
            //       but leave any errors at "info" level
            info!(?res, "removed temporary database files");
        }
    }

    /// Check that the "default" column family is empty.
    ///
    /// # Panics
    ///
    /// If Zebra has a bug where it is storing data in the wrong column family.
    fn assert_default_cf_is_empty(&self) {
        if let Some(default_cf) = self.cf_handle("default") {
            assert!(
                self.is_empty(default_cf),
                "Zebra should not store data in the 'default' column family"
            );
        }
    }
}

impl Drop for DiskDb {
    fn drop(&mut self) {
        self.shutdown();
    }
}