//! Provides low-level access to RocksDB using some database-specific types. //! //! This module makes sure that: //! - all disk writes happen inside a RocksDB transaction ([`WriteBatch`]), and //! - format-specific invariants are maintained. //! //! # Correctness //! //! The [`crate::constants::DATABASE_FORMAT_VERSION`] constant must //! be incremented each time the database format (column, serialization, etc) changes. use std::{fmt::Debug, path::Path}; use rlimit::increase_nofile_limit; use zebra_chain::parameters::Network; use crate::{ service::finalized_state::disk_format::{FromDisk, IntoDisk}, Config, }; #[cfg(any(test, feature = "proptest-impl"))] mod tests; /// Wrapper struct to ensure low-level database access goes through the correct API. pub struct DiskDb { /// The inner RocksDB database. db: rocksdb::DB, /// The configured temporary database setting. /// /// If true, the database files are deleted on drop. ephemeral: bool, } /// Wrapper struct to ensure low-level database writes go through the correct API. /// /// [`rocksdb::WriteBatch`] is a batched set of database updates, /// which must be written to the database using `DiskDb::write(batch)`. #[must_use = "batches must be written to the database"] pub struct DiskWriteBatch { /// The inner RocksDB write batch. batch: rocksdb::WriteBatch, } /// Helper trait for inserting (Key, Value) pairs into rocksdb with a consistently /// defined format pub trait WriteDisk { /// Serialize and insert the given key and value into a rocksdb column family, /// overwriting any existing `value` for `key`. fn zs_insert(&mut self, cf: &rocksdb::ColumnFamily, key: K, value: V) where K: IntoDisk + Debug, V: IntoDisk; /// Remove the given key form rocksdb column family if it exists. fn zs_delete(&mut self, cf: &rocksdb::ColumnFamily, key: K) where K: IntoDisk + Debug; } impl WriteDisk for DiskWriteBatch { fn zs_insert(&mut self, cf: &rocksdb::ColumnFamily, key: K, value: V) where K: IntoDisk + Debug, V: IntoDisk, { let key_bytes = key.as_bytes(); let value_bytes = value.as_bytes(); self.batch.put_cf(cf, key_bytes, value_bytes); } fn zs_delete(&mut self, cf: &rocksdb::ColumnFamily, key: K) where K: IntoDisk + Debug, { let key_bytes = key.as_bytes(); self.batch.delete_cf(cf, key_bytes); } } /// Helper trait for retrieving values from rocksdb column familys with a consistently /// defined format pub trait ReadDisk { /// Returns the value for `key` in the rocksdb column family `cf`, if present. fn zs_get(&self, cf: &rocksdb::ColumnFamily, key: &K) -> Option where K: IntoDisk, V: FromDisk; /// Check if a rocksdb column family `cf` contains the serialized form of `key`. fn zs_contains(&self, cf: &rocksdb::ColumnFamily, key: &K) -> bool where K: IntoDisk; } impl ReadDisk for DiskDb { fn zs_get(&self, cf: &rocksdb::ColumnFamily, key: &K) -> Option where K: IntoDisk, V: FromDisk, { let key_bytes = key.as_bytes(); // We use `get_pinned_cf` to avoid taking ownership of the serialized // value, because we're going to deserialize it anyways, which avoids an // extra copy // // TODO: move disk reads to a blocking thread (#2188) let value_bytes = self .db .get_pinned_cf(cf, key_bytes) .expect("expected that disk errors would not occur"); value_bytes.map(V::from_bytes) } fn zs_contains(&self, cf: &rocksdb::ColumnFamily, key: &K) -> bool where K: IntoDisk, { let key_bytes = key.as_bytes(); // We use `get_pinned_cf` to avoid taking ownership of the serialized // value, because we don't use the value at all. This avoids an extra copy. // // TODO: move disk reads to a blocking thread (#2188) self.db .get_pinned_cf(cf, key_bytes) .expect("expected that disk errors would not occur") .is_some() } } impl DiskWriteBatch { pub fn new() -> Self { DiskWriteBatch { batch: rocksdb::WriteBatch::default(), } } } impl DiskDb { /// The ideal open file limit for Zebra const IDEAL_OPEN_FILE_LIMIT: u64 = 1024; /// The minimum number of open files for Zebra to operate normally. Also used /// as the default open file limit, when the OS doesn't tell us how many /// files we can use. /// /// We want 100+ file descriptors for peers, and 100+ for the database. /// /// On Windows, the default limit is 512 high-level I/O files, and 8192 /// low-level I/O files: /// https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/setmaxstdio?view=msvc-160#remarks const MIN_OPEN_FILE_LIMIT: u64 = 512; /// The number of files used internally by Zebra. /// /// Zebra uses file descriptors for OS libraries (10+), polling APIs (10+), /// stdio (3), and other OS facilities (2+). const RESERVED_FILE_COUNT: u64 = 48; pub fn new(config: &Config, network: Network) -> DiskDb { let path = config.db_path(network); let db_options = DiskDb::options(); let column_families = vec![ rocksdb::ColumnFamilyDescriptor::new("hash_by_height", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new("height_by_hash", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new("block_by_height", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new("tx_by_hash", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new("utxo_by_outpoint", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new("sprout_nullifiers", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new("sapling_nullifiers", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new("orchard_nullifiers", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new("sprout_anchors", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new("sapling_anchors", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new("orchard_anchors", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new("sprout_note_commitment_tree", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new( "sapling_note_commitment_tree", db_options.clone(), ), rocksdb::ColumnFamilyDescriptor::new( "orchard_note_commitment_tree", db_options.clone(), ), rocksdb::ColumnFamilyDescriptor::new("history_tree", db_options.clone()), rocksdb::ColumnFamilyDescriptor::new("tip_chain_value_pool", db_options.clone()), ]; // TODO: move opening the database to a blocking thread (#2188) let db_result = rocksdb::DB::open_cf_descriptors(&db_options, &path, column_families); match db_result { Ok(db) => { info!("Opened Zebra state cache at {}", path.display()); let db = DiskDb { db, ephemeral: config.ephemeral, }; db.assert_default_cf_is_empty(); db } // TODO: provide a different hint if the disk is full, see #1623 Err(e) => panic!( "Opening database {:?} failed: {:?}. \ Hint: Check if another zebrad process is running. \ Try changing the state cache_dir in the Zebra config.", path, e, ), } } // Read methods /// Returns the `Path` where the files used by this database are located. pub fn path(&self) -> &Path { self.db.path() } /// Returns the column family handle for `cf_name`. pub fn cf_handle(&self, cf_name: &str) -> Option<&rocksdb::ColumnFamily> { self.db.cf_handle(cf_name) } /// Returns an iterator over the keys in `cf_name`, starting from the first key. /// /// TODO: add an iterator wrapper struct that does disk reads in a blocking thread (#2188) pub fn forward_iterator(&self, cf_handle: &rocksdb::ColumnFamily) -> rocksdb::DBIterator { self.db.iterator_cf(cf_handle, rocksdb::IteratorMode::Start) } /// Returns a reverse iterator over the keys in `cf_name`, starting from the last key. /// /// TODO: add an iterator wrapper struct that does disk reads in a blocking thread (#2188) pub fn reverse_iterator(&self, cf_handle: &rocksdb::ColumnFamily) -> rocksdb::DBIterator { self.db.iterator_cf(cf_handle, rocksdb::IteratorMode::End) } /// Returns true if `cf` does not contain any entries. pub fn is_empty(&self, cf_handle: &rocksdb::ColumnFamily) -> bool { // Empty column families return invalid iterators. !self.forward_iterator(cf_handle).valid() } // Write methods /// Writes `batch` to the database. pub fn write(&self, batch: DiskWriteBatch) -> Result<(), rocksdb::Error> { // TODO: move writing to the database to a blocking thread (#2188) self.db.write(batch.batch) } // Private methods /// Returns the database options for the finalized state database. fn options() -> rocksdb::Options { let mut opts = rocksdb::Options::default(); opts.create_if_missing(true); opts.create_missing_column_families(true); let open_file_limit = DiskDb::increase_open_file_limit(); let db_file_limit = DiskDb::get_db_open_file_limit(open_file_limit); // If the current limit is very large, set the DB limit using the ideal limit let ideal_limit = DiskDb::get_db_open_file_limit(DiskDb::IDEAL_OPEN_FILE_LIMIT) .try_into() .expect("ideal open file limit fits in a c_int"); let db_file_limit = db_file_limit.try_into().unwrap_or(ideal_limit); opts.set_max_open_files(db_file_limit); opts } /// Calculate the database's share of `open_file_limit` fn get_db_open_file_limit(open_file_limit: u64) -> u64 { // Give the DB half the files, and reserve half the files for peers (open_file_limit - DiskDb::RESERVED_FILE_COUNT) / 2 } /// Increase the open file limit for this process to `IDEAL_OPEN_FILE_LIMIT`. /// If that fails, try `MIN_OPEN_FILE_LIMIT`. /// /// If the current limit is above `IDEAL_OPEN_FILE_LIMIT`, leaves it /// unchanged. /// /// Returns the current limit, after any successful increases. /// /// # Panics /// /// If the open file limit can not be increased to `MIN_OPEN_FILE_LIMIT`. fn increase_open_file_limit() -> u64 { // Zebra mainly uses TCP sockets (`zebra-network`) and low-level files // (`zebra-state` database). // // On Unix-based platforms, `increase_nofile_limit` changes the limit for // both database files and TCP connections. // // But it doesn't do anything on Windows in rlimit 0.7.0. // // On Windows, the default limits are: // - 512 high-level stream I/O files (via the C standard functions), // - 8192 low-level I/O files (via the Unix C functions), and // - 1000 TCP Control Block entries (network connections). // // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/setmaxstdio?view=msvc-160#remarks // http://smallvoid.com/article/winnt-tcpip-max-limit.html // // `zebra-state`'s `IDEAL_OPEN_FILE_LIMIT` is much less than // the Windows low-level I/O file limit. // // The [`setmaxstdio` and `getmaxstdio`](https://docs.rs/rlimit/latest/rlimit/#windows) // functions from the `rlimit` crate only change the high-level I/O file limit. // // `zebra-network`'s default connection limit is much less than // the TCP Control Block limit on Windows. // We try setting the ideal limit, then the minimum limit. let current_limit = match increase_nofile_limit(DiskDb::IDEAL_OPEN_FILE_LIMIT) { Ok(current_limit) => current_limit, Err(limit_error) => { // These errors can happen due to sandboxing or unsupported system calls, // even if the file limit is high enough. info!( ?limit_error, min_limit = ?DiskDb::MIN_OPEN_FILE_LIMIT, ideal_limit = ?DiskDb::IDEAL_OPEN_FILE_LIMIT, "unable to increase the open file limit, \ assuming Zebra can open a minimum number of files" ); return DiskDb::MIN_OPEN_FILE_LIMIT; } }; if current_limit < DiskDb::MIN_OPEN_FILE_LIMIT { panic!( "open file limit too low: \ unable to set the number of open files to {}, \ the minimum number of files required by Zebra. \ Current limit is {:?}. \ Hint: Increase the open file limit to {} before launching Zebra", DiskDb::MIN_OPEN_FILE_LIMIT, current_limit, DiskDb::IDEAL_OPEN_FILE_LIMIT ); } else if current_limit < DiskDb::IDEAL_OPEN_FILE_LIMIT { warn!( ?current_limit, min_limit = ?DiskDb::MIN_OPEN_FILE_LIMIT, ideal_limit = ?DiskDb::IDEAL_OPEN_FILE_LIMIT, "the maximum number of open files is below Zebra's ideal limit. \ Hint: Increase the open file limit to {} before launching Zebra", DiskDb::IDEAL_OPEN_FILE_LIMIT ); } else if cfg!(windows) { info!( min_limit = ?DiskDb::MIN_OPEN_FILE_LIMIT, ideal_limit = ?DiskDb::IDEAL_OPEN_FILE_LIMIT, "assuming the open file limit is high enough for Zebra", ); } else { info!( ?current_limit, min_limit = ?DiskDb::MIN_OPEN_FILE_LIMIT, ideal_limit = ?DiskDb::IDEAL_OPEN_FILE_LIMIT, "the open file limit is high enough for Zebra", ); } current_limit } /// Shut down the database, cleaning up background tasks and ephemeral data. /// /// TODO: make private after the stop height check has moved to the syncer (#3442) /// move shutting down the database to a blocking thread (#2188) pub(crate) fn shutdown(&mut self) { self.assert_default_cf_is_empty(); // Drop isn't guaranteed to run, such as when we panic, or if the tokio shutdown times out. // // Zebra's data should be fine if we don't clean up, because: // - the database flushes regularly anyway // - Zebra commits each block in a database transaction, any incomplete blocks get rolled back // - ephemeral files are placed in the os temp dir and should be cleaned up automatically eventually info!("flushing database to disk"); self.db.flush().expect("flush is successful"); // But we should call `cancel_all_background_work` before Zebra exits. // If we don't, we see these kinds of errors: // ``` // pthread lock: Invalid argument // pure virtual method called // terminate called without an active exception // pthread destroy mutex: Device or resource busy // Aborted (core dumped) // ``` // // The RocksDB wiki says: // > Q: Is it safe to close RocksDB while another thread is issuing read, write or manual compaction requests? // > // > A: No. The users of RocksDB need to make sure all functions have finished before they close RocksDB. // > You can speed up the waiting by calling CancelAllBackgroundWork(). // // https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ info!("stopping background database tasks"); self.db.cancel_all_background_work(true); // We'd like to drop the database before deleting its files, // because that closes the column families and the database correctly. // But Rust's ownership rules make that difficult, // so we just flush and delete ephemeral data instead. // // The RocksDB wiki says: // > rocksdb::DB instances need to be destroyed before your main function exits. // > RocksDB instances usually depend on some internal static variables. // > Users need to make sure rocksdb::DB instances are destroyed before those static variables. // // https://github.com/facebook/rocksdb/wiki/Known-Issues // // But our current code doesn't seem to cause any issues. // We might want to explicitly drop the database as part of graceful shutdown (#1678). self.delete_ephemeral(); } /// If the database is `ephemeral`, delete it. fn delete_ephemeral(&self) { if self.ephemeral { let path = self.path(); info!(cache_path = ?path, "removing temporary database files"); // We'd like to use `rocksdb::Env::mem_env` for ephemeral databases, // but the Zcash blockchain might not fit in memory. So we just // delete the database files instead. // // We'd like to call `DB::destroy` here, but calling destroy on a // live DB is undefined behaviour: // https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ#basic-readwrite // // So we assume that all the database files are under `path`, and // delete them using standard filesystem APIs. Deleting open files // might cause errors on non-Unix platforms, so we ignore the result. // (The OS will delete them eventually anyway.) let res = std::fs::remove_dir_all(path); // TODO: downgrade to debug once bugs like #2905 are fixed // but leave any errors at "info" level info!(?res, "removed temporary database files"); } } /// Check that the "default" column family is empty. /// /// # Panics /// /// If Zebra has a bug where it is storing data in the wrong column family. fn assert_default_cf_is_empty(&self) { if let Some(default_cf) = self.cf_handle("default") { assert!( self.is_empty(default_cf), "Zebra should not store data in the 'default' column family" ); } } } impl Drop for DiskDb { fn drop(&mut self) { self.shutdown(); } }