feat(verify): Concurrently verify proof and signature batches (#4776)
* Initialize the rayon threadpool with a new config for CPU-bound threads * Verify proofs and signatures on the rayon thread pool * Only spawn one concurrent batch per verifier, for now * Allow tower-batch to queue multiple batches * Fix up a potentially incorrect comment * Rename some variables for concurrent batches * Spawn multiple batches concurrently, without any limits * Simplify batch worker loop using OptionFuture * Clear pending batches once they finish * Stop accepting new items when we're at the concurrent batch limit * Fail queued requests on drop * Move pending_items and the batch timer into the worker struct * Add worker fields to batch trace logs * Run docker tests on PR series * During full verification, process 20 blocks concurrently * Remove an outdated comment about yielding to other tasks
This commit is contained in:
parent
9b9cd55097
commit
cf4b2f7a67
|
|
@ -7,8 +7,6 @@ name: CI Docker
|
||||||
|
|
||||||
on:
|
on:
|
||||||
pull_request:
|
pull_request:
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
regenerate-stateful-disks:
|
regenerate-stateful-disks:
|
||||||
|
|
|
||||||
|
|
@ -4,8 +4,6 @@ name: CI Docker
|
||||||
# so they can be skipped when the modified files make the actual workflow run.
|
# so they can be skipped when the modified files make the actual workflow run.
|
||||||
on:
|
on:
|
||||||
pull_request:
|
pull_request:
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
paths-ignore:
|
paths-ignore:
|
||||||
# code and tests
|
# code and tests
|
||||||
- '**/*.rs'
|
- '**/*.rs'
|
||||||
|
|
|
||||||
|
|
@ -23,8 +23,6 @@ on:
|
||||||
required: true
|
required: true
|
||||||
|
|
||||||
pull_request:
|
pull_request:
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
paths:
|
paths:
|
||||||
# code and tests
|
# code and tests
|
||||||
- '**/*.rs'
|
- '**/*.rs'
|
||||||
|
|
|
||||||
|
|
@ -3792,9 +3792,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rayon"
|
name = "rayon"
|
||||||
version = "1.5.1"
|
version = "1.5.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90"
|
checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"autocfg 1.1.0",
|
"autocfg 1.1.0",
|
||||||
"crossbeam-deque",
|
"crossbeam-deque",
|
||||||
|
|
@ -3804,14 +3804,13 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rayon-core"
|
name = "rayon-core"
|
||||||
version = "1.9.1"
|
version = "1.9.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e"
|
checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"crossbeam-channel",
|
"crossbeam-channel",
|
||||||
"crossbeam-deque",
|
"crossbeam-deque",
|
||||||
"crossbeam-utils",
|
"crossbeam-utils",
|
||||||
"lazy_static",
|
|
||||||
"num_cpus",
|
"num_cpus",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -5447,6 +5446,7 @@ dependencies = [
|
||||||
"futures-core",
|
"futures-core",
|
||||||
"pin-project 1.0.11",
|
"pin-project 1.0.11",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
|
"rayon",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-test",
|
"tokio-test",
|
||||||
"tokio-util 0.7.3",
|
"tokio-util 0.7.3",
|
||||||
|
|
@ -6368,6 +6368,7 @@ dependencies = [
|
||||||
"proptest-derive",
|
"proptest-derive",
|
||||||
"rand 0.7.3",
|
"rand 0.7.3",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
|
"rayon",
|
||||||
"serde",
|
"serde",
|
||||||
"spandoc",
|
"spandoc",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
@ -6576,6 +6577,7 @@ dependencies = [
|
||||||
"proptest-derive",
|
"proptest-derive",
|
||||||
"prost",
|
"prost",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
|
"rayon",
|
||||||
"regex",
|
"regex",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"semver 1.0.11",
|
"semver 1.0.11",
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ edition = "2021"
|
||||||
futures = "0.3.21"
|
futures = "0.3.21"
|
||||||
futures-core = "0.3.21"
|
futures-core = "0.3.21"
|
||||||
pin-project = "1.0.10"
|
pin-project = "1.0.10"
|
||||||
|
rayon = "1.5.3"
|
||||||
tokio = { version = "1.19.2", features = ["time", "sync", "tracing", "macros"] }
|
tokio = { version = "1.19.2", features = ["time", "sync", "tracing", "macros"] }
|
||||||
tokio-util = "0.7.3"
|
tokio-util = "0.7.3"
|
||||||
tower = { version = "0.4.13", features = ["util", "buffer"] }
|
tower = { version = "0.4.13", features = ["util", "buffer"] }
|
||||||
|
|
@ -24,7 +25,6 @@ tokio = { version = "1.19.2", features = ["full", "tracing", "test-util"] }
|
||||||
tokio-test = "0.4.2"
|
tokio-test = "0.4.2"
|
||||||
tower-fallback = { path = "../tower-fallback/" }
|
tower-fallback = { path = "../tower-fallback/" }
|
||||||
tower-test = "0.4.0"
|
tower-test = "0.4.0"
|
||||||
tracing = "0.1.31"
|
|
||||||
|
|
||||||
zebra-consensus = { path = "../zebra-consensus/" }
|
zebra-consensus = { path = "../zebra-consensus/" }
|
||||||
zebra-test = { path = "../zebra-test/" }
|
zebra-test = { path = "../zebra-test/" }
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,12 @@
|
||||||
use super::{service::Batch, BatchControl};
|
//! Tower service layer for batch processing.
|
||||||
|
|
||||||
use std::{fmt, marker::PhantomData};
|
use std::{fmt, marker::PhantomData};
|
||||||
|
|
||||||
use tower::layer::Layer;
|
use tower::layer::Layer;
|
||||||
use tower::Service;
|
use tower::Service;
|
||||||
|
|
||||||
|
use super::{service::Batch, BatchControl};
|
||||||
|
|
||||||
/// Adds a layer performing batch processing of requests.
|
/// Adds a layer performing batch processing of requests.
|
||||||
///
|
///
|
||||||
/// The default Tokio executor is used to run the given service,
|
/// The default Tokio executor is used to run the given service,
|
||||||
|
|
@ -10,24 +14,31 @@ use tower::Service;
|
||||||
///
|
///
|
||||||
/// See the module documentation for more details.
|
/// See the module documentation for more details.
|
||||||
pub struct BatchLayer<Request> {
|
pub struct BatchLayer<Request> {
|
||||||
max_items: usize,
|
max_items_in_batch: usize,
|
||||||
|
max_batches: Option<usize>,
|
||||||
max_latency: std::time::Duration,
|
max_latency: std::time::Duration,
|
||||||
_p: PhantomData<fn(Request)>,
|
|
||||||
|
// TODO: is the variance correct here?
|
||||||
|
// https://doc.rust-lang.org/1.33.0/nomicon/subtyping.html#variance
|
||||||
|
// https://doc.rust-lang.org/nomicon/phantom-data.html#table-of-phantomdata-patterns
|
||||||
|
_handles_requests: PhantomData<fn(Request)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<Request> BatchLayer<Request> {
|
impl<Request> BatchLayer<Request> {
|
||||||
/// Creates a new `BatchLayer`.
|
/// Creates a new `BatchLayer`.
|
||||||
///
|
///
|
||||||
/// The wrapper is responsible for telling the inner service when to flush a
|
/// The wrapper is responsible for telling the inner service when to flush a
|
||||||
/// batch of requests. Two parameters control this policy:
|
/// batch of requests. See [`Batch::new()`] for details.
|
||||||
///
|
pub fn new(
|
||||||
/// * `max_items` gives the maximum number of items per batch.
|
max_items_in_batch: usize,
|
||||||
/// * `max_latency` gives the maximum latency for a batch item.
|
max_batches: impl Into<Option<usize>>,
|
||||||
pub fn new(max_items: usize, max_latency: std::time::Duration) -> Self {
|
max_latency: std::time::Duration,
|
||||||
|
) -> Self {
|
||||||
BatchLayer {
|
BatchLayer {
|
||||||
max_items,
|
max_items_in_batch,
|
||||||
|
max_batches: max_batches.into(),
|
||||||
max_latency,
|
max_latency,
|
||||||
_p: PhantomData,
|
_handles_requests: PhantomData,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -36,20 +47,27 @@ impl<S, Request> Layer<S> for BatchLayer<Request>
|
||||||
where
|
where
|
||||||
S: Service<BatchControl<Request>> + Send + 'static,
|
S: Service<BatchControl<Request>> + Send + 'static,
|
||||||
S::Future: Send,
|
S::Future: Send,
|
||||||
|
S::Response: Send,
|
||||||
S::Error: Into<crate::BoxError> + Send + Sync,
|
S::Error: Into<crate::BoxError> + Send + Sync,
|
||||||
Request: Send + 'static,
|
Request: Send + 'static,
|
||||||
{
|
{
|
||||||
type Service = Batch<S, Request>;
|
type Service = Batch<S, Request>;
|
||||||
|
|
||||||
fn layer(&self, service: S) -> Self::Service {
|
fn layer(&self, service: S) -> Self::Service {
|
||||||
Batch::new(service, self.max_items, self.max_latency)
|
Batch::new(
|
||||||
|
service,
|
||||||
|
self.max_items_in_batch,
|
||||||
|
self.max_batches,
|
||||||
|
self.max_latency,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<Request> fmt::Debug for BatchLayer<Request> {
|
impl<Request> fmt::Debug for BatchLayer<Request> {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
f.debug_struct("BufferLayer")
|
f.debug_struct("BufferLayer")
|
||||||
.field("max_items", &self.max_items)
|
.field("max_items_in_batch", &self.max_items_in_batch)
|
||||||
|
.field("max_batches", &self.max_batches)
|
||||||
.field("max_latency", &self.max_latency)
|
.field("max_latency", &self.max_latency)
|
||||||
.finish()
|
.finish()
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
//! Wrapper service for batching items to an underlying service.
|
//! Wrapper service for batching items to an underlying service.
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
|
cmp::max,
|
||||||
fmt,
|
fmt,
|
||||||
future::Future,
|
future::Future,
|
||||||
pin::Pin,
|
pin::Pin,
|
||||||
|
|
@ -25,6 +26,11 @@ use super::{
|
||||||
BatchControl,
|
BatchControl,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// The maximum number of batches in the queue.
|
||||||
|
///
|
||||||
|
/// This avoids having very large queues on machines with hundreds or thousands of cores.
|
||||||
|
pub const QUEUE_BATCH_LIMIT: usize = 64;
|
||||||
|
|
||||||
/// Allows batch processing of requests.
|
/// Allows batch processing of requests.
|
||||||
///
|
///
|
||||||
/// See the crate documentation for more details.
|
/// See the crate documentation for more details.
|
||||||
|
|
@ -32,6 +38,8 @@ pub struct Batch<T, Request>
|
||||||
where
|
where
|
||||||
T: Service<BatchControl<Request>>,
|
T: Service<BatchControl<Request>>,
|
||||||
{
|
{
|
||||||
|
// Batch management
|
||||||
|
//
|
||||||
/// A custom-bounded channel for sending requests to the batch worker.
|
/// A custom-bounded channel for sending requests to the batch worker.
|
||||||
///
|
///
|
||||||
/// Note: this actually _is_ bounded, but rather than using Tokio's unbounded
|
/// Note: this actually _is_ bounded, but rather than using Tokio's unbounded
|
||||||
|
|
@ -53,6 +61,8 @@ where
|
||||||
/// A semaphore permit that allows this service to send one message on `tx`.
|
/// A semaphore permit that allows this service to send one message on `tx`.
|
||||||
permit: Option<OwnedSemaphorePermit>,
|
permit: Option<OwnedSemaphorePermit>,
|
||||||
|
|
||||||
|
// Errors
|
||||||
|
//
|
||||||
/// An error handle shared between all service clones for the same worker.
|
/// An error handle shared between all service clones for the same worker.
|
||||||
error_handle: ErrorHandle,
|
error_handle: ErrorHandle,
|
||||||
|
|
||||||
|
|
@ -71,6 +81,7 @@ where
|
||||||
f.debug_struct(name)
|
f.debug_struct(name)
|
||||||
.field("tx", &self.tx)
|
.field("tx", &self.tx)
|
||||||
.field("semaphore", &self.semaphore)
|
.field("semaphore", &self.semaphore)
|
||||||
|
.field("permit", &self.permit)
|
||||||
.field("error_handle", &self.error_handle)
|
.field("error_handle", &self.error_handle)
|
||||||
.field("worker_handle", &self.worker_handle)
|
.field("worker_handle", &self.worker_handle)
|
||||||
.finish()
|
.finish()
|
||||||
|
|
@ -80,26 +91,37 @@ where
|
||||||
impl<T, Request> Batch<T, Request>
|
impl<T, Request> Batch<T, Request>
|
||||||
where
|
where
|
||||||
T: Service<BatchControl<Request>>,
|
T: Service<BatchControl<Request>>,
|
||||||
|
T::Future: Send + 'static,
|
||||||
T::Error: Into<crate::BoxError>,
|
T::Error: Into<crate::BoxError>,
|
||||||
{
|
{
|
||||||
/// Creates a new `Batch` wrapping `service`.
|
/// Creates a new `Batch` wrapping `service`.
|
||||||
///
|
///
|
||||||
/// The wrapper is responsible for telling the inner service when to flush a
|
/// The wrapper is responsible for telling the inner service when to flush a
|
||||||
/// batch of requests. Two parameters control this policy:
|
/// batch of requests. These parameters control this policy:
|
||||||
///
|
///
|
||||||
/// * `max_items` gives the maximum number of items per batch.
|
/// * `max_items_in_batch` gives the maximum number of items per batch.
|
||||||
/// * `max_latency` gives the maximum latency for a batch item.
|
/// * `max_batches` is an upper bound on the number of batches in the queue,
|
||||||
|
/// and the number of concurrently executing batches.
|
||||||
|
/// If this is `None`, we use the current number of [`rayon`] threads.
|
||||||
|
/// The number of batches in the queue is also limited by [`QUEUE_BATCH_LIMIT`].
|
||||||
|
/// * `max_latency` gives the maximum latency for a batch item to start verifying.
|
||||||
///
|
///
|
||||||
/// The default Tokio executor is used to run the given service, which means
|
/// The default Tokio executor is used to run the given service, which means
|
||||||
/// that this method must be called while on the Tokio runtime.
|
/// that this method must be called while on the Tokio runtime.
|
||||||
pub fn new(service: T, max_items: usize, max_latency: std::time::Duration) -> Self
|
pub fn new(
|
||||||
|
service: T,
|
||||||
|
max_items_in_batch: usize,
|
||||||
|
max_batches: impl Into<Option<usize>>,
|
||||||
|
max_latency: std::time::Duration,
|
||||||
|
) -> Self
|
||||||
where
|
where
|
||||||
T: Send + 'static,
|
T: Send + 'static,
|
||||||
T::Future: Send,
|
T::Future: Send,
|
||||||
|
T::Response: Send,
|
||||||
T::Error: Send + Sync,
|
T::Error: Send + Sync,
|
||||||
Request: Send + 'static,
|
Request: Send + 'static,
|
||||||
{
|
{
|
||||||
let (mut batch, worker) = Self::pair(service, max_items, max_latency);
|
let (mut batch, worker) = Self::pair(service, max_items_in_batch, max_batches, max_latency);
|
||||||
|
|
||||||
let span = info_span!("batch worker", kind = std::any::type_name::<T>());
|
let span = info_span!("batch worker", kind = std::any::type_name::<T>());
|
||||||
|
|
||||||
|
|
@ -131,7 +153,8 @@ where
|
||||||
/// `Batch` and the background `Worker` that you can then spawn.
|
/// `Batch` and the background `Worker` that you can then spawn.
|
||||||
pub fn pair(
|
pub fn pair(
|
||||||
service: T,
|
service: T,
|
||||||
max_items: usize,
|
max_items_in_batch: usize,
|
||||||
|
max_batches: impl Into<Option<usize>>,
|
||||||
max_latency: std::time::Duration,
|
max_latency: std::time::Duration,
|
||||||
) -> (Self, Worker<T, Request>)
|
) -> (Self, Worker<T, Request>)
|
||||||
where
|
where
|
||||||
|
|
@ -141,16 +164,32 @@ where
|
||||||
{
|
{
|
||||||
let (tx, rx) = mpsc::unbounded_channel();
|
let (tx, rx) = mpsc::unbounded_channel();
|
||||||
|
|
||||||
|
// Clamp config to sensible values.
|
||||||
|
let max_items_in_batch = max(max_items_in_batch, 1);
|
||||||
|
let max_batches = max_batches
|
||||||
|
.into()
|
||||||
|
.unwrap_or_else(rayon::current_num_threads);
|
||||||
|
let max_batches_in_queue = max_batches.clamp(1, QUEUE_BATCH_LIMIT);
|
||||||
|
|
||||||
// The semaphore bound limits the maximum number of concurrent requests
|
// The semaphore bound limits the maximum number of concurrent requests
|
||||||
// (specifically, requests which got a `Ready` from `poll_ready`, but haven't
|
// (specifically, requests which got a `Ready` from `poll_ready`, but haven't
|
||||||
// used their semaphore reservation in a `call` yet).
|
// used their semaphore reservation in a `call` yet).
|
||||||
// We choose a bound that allows callers to check readiness for every item in
|
//
|
||||||
// a batch, then actually submit those items.
|
// We choose a bound that allows callers to check readiness for one batch per rayon CPU thread.
|
||||||
let semaphore = Semaphore::new(max_items);
|
// This helps keep all CPUs filled with work: there is one batch executing, and another ready to go.
|
||||||
|
// Often there is only one verifier running, when that happens we want it to take all the cores.
|
||||||
|
let semaphore = Semaphore::new(max_items_in_batch * max_batches_in_queue);
|
||||||
let semaphore = PollSemaphore::new(Arc::new(semaphore));
|
let semaphore = PollSemaphore::new(Arc::new(semaphore));
|
||||||
|
|
||||||
let (error_handle, worker) =
|
let (error_handle, worker) = Worker::new(
|
||||||
Worker::new(service, rx, max_items, max_latency, semaphore.clone());
|
service,
|
||||||
|
rx,
|
||||||
|
max_items_in_batch,
|
||||||
|
max_batches,
|
||||||
|
max_latency,
|
||||||
|
semaphore.clone(),
|
||||||
|
);
|
||||||
|
|
||||||
let batch = Batch {
|
let batch = Batch {
|
||||||
tx,
|
tx,
|
||||||
semaphore,
|
semaphore,
|
||||||
|
|
@ -182,6 +221,7 @@ where
|
||||||
impl<T, Request> Service<Request> for Batch<T, Request>
|
impl<T, Request> Service<Request> for Batch<T, Request>
|
||||||
where
|
where
|
||||||
T: Service<BatchControl<Request>>,
|
T: Service<BatchControl<Request>>,
|
||||||
|
T::Future: Send + 'static,
|
||||||
T::Error: Into<crate::BoxError>,
|
T::Error: Into<crate::BoxError>,
|
||||||
{
|
{
|
||||||
type Response = T::Response;
|
type Response = T::Response;
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,11 @@ use std::{
|
||||||
sync::{Arc, Mutex},
|
sync::{Arc, Mutex},
|
||||||
};
|
};
|
||||||
|
|
||||||
use futures::future::TryFutureExt;
|
use futures::{
|
||||||
|
future::{BoxFuture, OptionFuture},
|
||||||
|
stream::FuturesUnordered,
|
||||||
|
FutureExt, StreamExt,
|
||||||
|
};
|
||||||
use pin_project::pin_project;
|
use pin_project::pin_project;
|
||||||
use tokio::{
|
use tokio::{
|
||||||
sync::mpsc,
|
sync::mpsc,
|
||||||
|
|
@ -33,28 +37,52 @@ use super::{
|
||||||
pub struct Worker<T, Request>
|
pub struct Worker<T, Request>
|
||||||
where
|
where
|
||||||
T: Service<BatchControl<Request>>,
|
T: Service<BatchControl<Request>>,
|
||||||
|
T::Future: Send + 'static,
|
||||||
T::Error: Into<crate::BoxError>,
|
T::Error: Into<crate::BoxError>,
|
||||||
{
|
{
|
||||||
|
// Batch management
|
||||||
|
//
|
||||||
/// A semaphore-bounded channel for receiving requests from the batch wrapper service.
|
/// A semaphore-bounded channel for receiving requests from the batch wrapper service.
|
||||||
rx: mpsc::UnboundedReceiver<Message<Request, T::Future>>,
|
rx: mpsc::UnboundedReceiver<Message<Request, T::Future>>,
|
||||||
|
|
||||||
/// The wrapped service that processes batches.
|
/// The wrapped service that processes batches.
|
||||||
service: T,
|
service: T,
|
||||||
|
|
||||||
|
/// The number of pending items sent to `service`, since the last batch flush.
|
||||||
|
pending_items: usize,
|
||||||
|
|
||||||
|
/// The timer for the pending batch, if it has any items.
|
||||||
|
///
|
||||||
|
/// The timer is started when the first entry of a new batch is
|
||||||
|
/// submitted, so that the batch latency of all entries is at most
|
||||||
|
/// self.max_latency. However, we don't keep the timer running unless
|
||||||
|
/// there is a pending request to prevent wakeups on idle services.
|
||||||
|
pending_batch_timer: Option<Pin<Box<Sleep>>>,
|
||||||
|
|
||||||
|
/// The batches that the worker is concurrently executing.
|
||||||
|
concurrent_batches: FuturesUnordered<BoxFuture<'static, Result<T::Response, T::Error>>>,
|
||||||
|
|
||||||
|
// Errors and termination
|
||||||
|
//
|
||||||
/// An error that's populated on permanent service failure.
|
/// An error that's populated on permanent service failure.
|
||||||
failed: Option<ServiceError>,
|
failed: Option<ServiceError>,
|
||||||
|
|
||||||
/// A shared error handle that's populated on permanent service failure.
|
/// A shared error handle that's populated on permanent service failure.
|
||||||
error_handle: ErrorHandle,
|
error_handle: ErrorHandle,
|
||||||
|
|
||||||
/// The maximum number of items allowed in a batch.
|
|
||||||
max_items: usize,
|
|
||||||
|
|
||||||
/// The maximum delay before processing a batch with fewer than `max_items`.
|
|
||||||
max_latency: std::time::Duration,
|
|
||||||
|
|
||||||
/// A cloned copy of the wrapper service's semaphore, used to close the semaphore.
|
/// A cloned copy of the wrapper service's semaphore, used to close the semaphore.
|
||||||
close: PollSemaphore,
|
close: PollSemaphore,
|
||||||
|
|
||||||
|
// Config
|
||||||
|
//
|
||||||
|
/// The maximum number of items allowed in a batch.
|
||||||
|
max_items_in_batch: usize,
|
||||||
|
|
||||||
|
/// The maximum number of batches that are allowed to run concurrently.
|
||||||
|
max_concurrent_batches: usize,
|
||||||
|
|
||||||
|
/// The maximum delay before processing a batch with fewer than `max_items_in_batch`.
|
||||||
|
max_latency: std::time::Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the error out
|
/// Get the error out
|
||||||
|
|
@ -66,12 +94,17 @@ pub(crate) struct ErrorHandle {
|
||||||
impl<T, Request> Worker<T, Request>
|
impl<T, Request> Worker<T, Request>
|
||||||
where
|
where
|
||||||
T: Service<BatchControl<Request>>,
|
T: Service<BatchControl<Request>>,
|
||||||
|
T::Future: Send + 'static,
|
||||||
T::Error: Into<crate::BoxError>,
|
T::Error: Into<crate::BoxError>,
|
||||||
{
|
{
|
||||||
|
/// Creates a new batch worker.
|
||||||
|
///
|
||||||
|
/// See [`Service::new()`](crate::Service::new) for details.
|
||||||
pub(crate) fn new(
|
pub(crate) fn new(
|
||||||
service: T,
|
service: T,
|
||||||
rx: mpsc::UnboundedReceiver<Message<Request, T::Future>>,
|
rx: mpsc::UnboundedReceiver<Message<Request, T::Future>>,
|
||||||
max_items: usize,
|
max_items_in_batch: usize,
|
||||||
|
max_concurrent_batches: usize,
|
||||||
max_latency: std::time::Duration,
|
max_latency: std::time::Duration,
|
||||||
close: PollSemaphore,
|
close: PollSemaphore,
|
||||||
) -> (ErrorHandle, Worker<T, Request>) {
|
) -> (ErrorHandle, Worker<T, Request>) {
|
||||||
|
|
@ -82,134 +115,184 @@ where
|
||||||
let worker = Worker {
|
let worker = Worker {
|
||||||
rx,
|
rx,
|
||||||
service,
|
service,
|
||||||
error_handle: error_handle.clone(),
|
pending_items: 0,
|
||||||
|
pending_batch_timer: None,
|
||||||
|
concurrent_batches: FuturesUnordered::new(),
|
||||||
failed: None,
|
failed: None,
|
||||||
max_items,
|
error_handle: error_handle.clone(),
|
||||||
max_latency,
|
|
||||||
close,
|
close,
|
||||||
|
max_items_in_batch,
|
||||||
|
max_concurrent_batches,
|
||||||
|
max_latency,
|
||||||
};
|
};
|
||||||
|
|
||||||
(error_handle, worker)
|
(error_handle, worker)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Process a single worker request.
|
||||||
async fn process_req(&mut self, req: Request, tx: message::Tx<T::Future>) {
|
async fn process_req(&mut self, req: Request, tx: message::Tx<T::Future>) {
|
||||||
if let Some(ref failed) = self.failed {
|
if let Some(ref error) = self.failed {
|
||||||
tracing::trace!("notifying caller about worker failure");
|
tracing::trace!(
|
||||||
let _ = tx.send(Err(failed.clone()));
|
?error,
|
||||||
} else {
|
"notifying batch request caller about worker failure",
|
||||||
match self.service.ready().await {
|
);
|
||||||
Ok(svc) => {
|
let _ = tx.send(Err(error.clone()));
|
||||||
let rsp = svc.call(req.into());
|
return;
|
||||||
let _ = tx.send(Ok(rsp));
|
}
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
self.failed(e.into());
|
|
||||||
let _ = tx.send(Err(self
|
|
||||||
.failed
|
|
||||||
.as_ref()
|
|
||||||
.expect("Worker::failed did not set self.failed?")
|
|
||||||
.clone()));
|
|
||||||
|
|
||||||
// Wake any tasks waiting on channel capacity.
|
match self.service.ready().await {
|
||||||
tracing::debug!("waking pending tasks");
|
Ok(svc) => {
|
||||||
self.close.close();
|
let rsp = svc.call(req.into());
|
||||||
}
|
let _ = tx.send(Ok(rsp));
|
||||||
|
|
||||||
|
self.pending_items += 1;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
self.failed(e.into());
|
||||||
|
let _ = tx.send(Err(self
|
||||||
|
.failed
|
||||||
|
.as_ref()
|
||||||
|
.expect("Worker::failed did not set self.failed?")
|
||||||
|
.clone()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Tell the inner service to flush the current batch.
|
||||||
|
///
|
||||||
|
/// Waits until the inner service is ready,
|
||||||
|
/// then stores a future which resolves when the batch finishes.
|
||||||
async fn flush_service(&mut self) {
|
async fn flush_service(&mut self) {
|
||||||
if let Err(e) = self
|
if self.failed.is_some() {
|
||||||
.service
|
tracing::trace!("worker failure: skipping flush");
|
||||||
.ready()
|
return;
|
||||||
.and_then(|svc| svc.call(BatchControl::Flush))
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
self.failed(e.into());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Correctness: allow other tasks to run at the end of every batch.
|
match self.service.ready().await {
|
||||||
tokio::task::yield_now().await;
|
Ok(ready_service) => {
|
||||||
|
let flush_future = ready_service.call(BatchControl::Flush);
|
||||||
|
self.concurrent_batches.push(flush_future.boxed());
|
||||||
|
|
||||||
|
// Now we have an empty batch.
|
||||||
|
self.pending_items = 0;
|
||||||
|
self.pending_batch_timer = None;
|
||||||
|
}
|
||||||
|
Err(error) => {
|
||||||
|
self.failed(error.into());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Is the current number of concurrent batches above the configured limit?
|
||||||
|
fn can_spawn_new_batches(&self) -> bool {
|
||||||
|
self.concurrent_batches.len() < self.max_concurrent_batches
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run loop for batch requests, which implements the batch policies.
|
||||||
|
///
|
||||||
|
/// See [`Service::new()`](crate::Service::new) for details.
|
||||||
pub async fn run(mut self) {
|
pub async fn run(mut self) {
|
||||||
// The timer is started when the first entry of a new batch is
|
|
||||||
// submitted, so that the batch latency of all entries is at most
|
|
||||||
// self.max_latency. However, we don't keep the timer running unless
|
|
||||||
// there is a pending request to prevent wakeups on idle services.
|
|
||||||
let mut timer: Option<Pin<Box<Sleep>>> = None;
|
|
||||||
let mut pending_items = 0usize;
|
|
||||||
loop {
|
loop {
|
||||||
match timer.as_mut() {
|
// Wait on either a new message or the batch timer.
|
||||||
None => match self.rx.recv().await {
|
//
|
||||||
// The first message in a new batch.
|
// If both are ready, end the batch now, because the timer has elapsed.
|
||||||
|
// If the timer elapses, any pending messages are preserved:
|
||||||
|
// https://docs.rs/tokio/latest/tokio/sync/mpsc/struct.UnboundedReceiver.html#cancel-safety
|
||||||
|
tokio::select! {
|
||||||
|
biased;
|
||||||
|
|
||||||
|
batch_result = self.concurrent_batches.next(), if !self.concurrent_batches.is_empty() => match batch_result.expect("only returns None when empty") {
|
||||||
|
Ok(_response) => {
|
||||||
|
tracing::trace!(
|
||||||
|
pending_items = self.pending_items,
|
||||||
|
batch_deadline = ?self.pending_batch_timer.as_ref().map(|sleep| sleep.deadline()),
|
||||||
|
running_batches = self.concurrent_batches.len(),
|
||||||
|
"batch finished executing",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(error) => {
|
||||||
|
let error = error.into();
|
||||||
|
tracing::trace!(?error, "batch execution failed");
|
||||||
|
self.failed(error);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
Some(()) = OptionFuture::from(self.pending_batch_timer.as_mut()), if self.pending_batch_timer.as_ref().is_some() => {
|
||||||
|
tracing::trace!(
|
||||||
|
pending_items = self.pending_items,
|
||||||
|
batch_deadline = ?self.pending_batch_timer.as_ref().map(|sleep| sleep.deadline()),
|
||||||
|
running_batches = self.concurrent_batches.len(),
|
||||||
|
"batch timer expired",
|
||||||
|
);
|
||||||
|
|
||||||
|
// TODO: use a batch-specific span to instrument this future.
|
||||||
|
self.flush_service().await;
|
||||||
|
},
|
||||||
|
|
||||||
|
maybe_msg = self.rx.recv(), if self.can_spawn_new_batches() => match maybe_msg {
|
||||||
Some(msg) => {
|
Some(msg) => {
|
||||||
|
tracing::trace!(
|
||||||
|
pending_items = self.pending_items,
|
||||||
|
batch_deadline = ?self.pending_batch_timer.as_ref().map(|sleep| sleep.deadline()),
|
||||||
|
running_batches = self.concurrent_batches.len(),
|
||||||
|
"batch message received",
|
||||||
|
);
|
||||||
|
|
||||||
let span = msg.span;
|
let span = msg.span;
|
||||||
|
|
||||||
self.process_req(msg.request, msg.tx)
|
self.process_req(msg.request, msg.tx)
|
||||||
// Apply the provided span to request processing
|
// Apply the provided span to request processing.
|
||||||
.instrument(span)
|
.instrument(span)
|
||||||
.await;
|
.await;
|
||||||
timer = Some(Box::pin(sleep(self.max_latency)));
|
|
||||||
pending_items = 1;
|
|
||||||
}
|
|
||||||
// No more messages, ever.
|
|
||||||
None => return,
|
|
||||||
},
|
|
||||||
Some(sleep) => {
|
|
||||||
// Wait on either a new message or the batch timer.
|
|
||||||
//
|
|
||||||
// If both are ready, end the batch now, because the timer has elapsed.
|
|
||||||
// If the timer elapses, any pending messages are preserved:
|
|
||||||
// https://docs.rs/tokio/latest/tokio/sync/mpsc/struct.UnboundedReceiver.html#cancel-safety
|
|
||||||
tokio::select! {
|
|
||||||
biased;
|
|
||||||
|
|
||||||
// The batch timer elapsed.
|
// Check whether we have too many pending items.
|
||||||
() = sleep => {
|
if self.pending_items >= self.max_items_in_batch {
|
||||||
|
tracing::trace!(
|
||||||
|
pending_items = self.pending_items,
|
||||||
|
batch_deadline = ?self.pending_batch_timer.as_ref().map(|sleep| sleep.deadline()),
|
||||||
|
running_batches = self.concurrent_batches.len(),
|
||||||
|
"batch is full",
|
||||||
|
);
|
||||||
|
|
||||||
// TODO: use a batch-specific span to instrument this future.
|
// TODO: use a batch-specific span to instrument this future.
|
||||||
self.flush_service().await;
|
self.flush_service().await;
|
||||||
|
} else if self.pending_items == 1 {
|
||||||
|
tracing::trace!(
|
||||||
|
pending_items = self.pending_items,
|
||||||
|
batch_deadline = ?self.pending_batch_timer.as_ref().map(|sleep| sleep.deadline()),
|
||||||
|
running_batches = self.concurrent_batches.len(),
|
||||||
|
"batch is new, starting timer",
|
||||||
|
);
|
||||||
|
|
||||||
// Now we have an empty batch.
|
// The first message in a new batch.
|
||||||
timer = None;
|
self.pending_batch_timer = Some(Box::pin(sleep(self.max_latency)));
|
||||||
pending_items = 0;
|
} else {
|
||||||
|
tracing::trace!(
|
||||||
|
pending_items = self.pending_items,
|
||||||
|
batch_deadline = ?self.pending_batch_timer.as_ref().map(|sleep| sleep.deadline()),
|
||||||
|
running_batches = self.concurrent_batches.len(),
|
||||||
|
"waiting for full batch or batch timer",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
maybe_msg = self.rx.recv() => match maybe_msg {
|
|
||||||
Some(msg) => {
|
|
||||||
let span = msg.span;
|
|
||||||
self.process_req(msg.request, msg.tx)
|
|
||||||
// Apply the provided span to request processing.
|
|
||||||
.instrument(span)
|
|
||||||
.await;
|
|
||||||
pending_items += 1;
|
|
||||||
// Check whether we have too many pending items.
|
|
||||||
if pending_items >= self.max_items {
|
|
||||||
// TODO: use a batch-specific span to instrument this future.
|
|
||||||
self.flush_service().await;
|
|
||||||
|
|
||||||
// Now we have an empty batch.
|
|
||||||
timer = None;
|
|
||||||
pending_items = 0;
|
|
||||||
} else {
|
|
||||||
// The timer is still running.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
// No more messages, ever.
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
}
|
None => {
|
||||||
|
tracing::trace!("batch channel closed and emptied, exiting worker task");
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Register an inner service failure.
|
||||||
|
///
|
||||||
|
/// The underlying service failed when we called `poll_ready` on it with the given `error`. We
|
||||||
|
/// need to communicate this to all the `Buffer` handles. To do so, we wrap up the error in
|
||||||
|
/// an `Arc`, send that `Arc<E>` to all pending requests, and store it so that subsequent
|
||||||
|
/// requests will also fail with the same error.
|
||||||
fn failed(&mut self, error: crate::BoxError) {
|
fn failed(&mut self, error: crate::BoxError) {
|
||||||
// The underlying service failed when we called `poll_ready` on it with the given `error`. We
|
tracing::debug!(?error, "batch worker error");
|
||||||
// need to communicate this to all the `Buffer` handles. To do so, we wrap up the error in
|
|
||||||
// an `Arc`, send that `Arc<E>` to all pending requests, and store it so that subsequent
|
|
||||||
// requests will also fail with the same error.
|
|
||||||
|
|
||||||
// Note that we need to handle the case where some error_handle is concurrently trying to send us
|
// Note that we need to handle the case where some error_handle is concurrently trying to send us
|
||||||
// a request. We need to make sure that *either* the send of the request fails *or* it
|
// a request. We need to make sure that *either* the send of the request fails *or* it
|
||||||
|
|
@ -222,15 +305,23 @@ where
|
||||||
|
|
||||||
let mut inner = self.error_handle.inner.lock().unwrap();
|
let mut inner = self.error_handle.inner.lock().unwrap();
|
||||||
|
|
||||||
|
// Ignore duplicate failures
|
||||||
if inner.is_some() {
|
if inner.is_some() {
|
||||||
// Future::poll was called after we've already errored out!
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
*inner = Some(error.clone());
|
*inner = Some(error.clone());
|
||||||
drop(inner);
|
drop(inner);
|
||||||
|
|
||||||
|
tracing::trace!(
|
||||||
|
?error,
|
||||||
|
"worker failure: waking pending requests so they can be failed",
|
||||||
|
);
|
||||||
self.rx.close();
|
self.rx.close();
|
||||||
|
self.close.close();
|
||||||
|
|
||||||
|
// We don't schedule any batches on an errored service
|
||||||
|
self.pending_batch_timer = None;
|
||||||
|
|
||||||
// By closing the mpsc::Receiver, we know that that the run() loop will
|
// By closing the mpsc::Receiver, we know that that the run() loop will
|
||||||
// drain all pending requests. We just need to make sure that any
|
// drain all pending requests. We just need to make sure that any
|
||||||
|
|
@ -263,16 +354,39 @@ impl Clone for ErrorHandle {
|
||||||
impl<T, Request> PinnedDrop for Worker<T, Request>
|
impl<T, Request> PinnedDrop for Worker<T, Request>
|
||||||
where
|
where
|
||||||
T: Service<BatchControl<Request>>,
|
T: Service<BatchControl<Request>>,
|
||||||
|
T::Future: Send + 'static,
|
||||||
T::Error: Into<crate::BoxError>,
|
T::Error: Into<crate::BoxError>,
|
||||||
{
|
{
|
||||||
fn drop(mut self: Pin<&mut Self>) {
|
fn drop(mut self: Pin<&mut Self>) {
|
||||||
|
tracing::trace!(
|
||||||
|
pending_items = self.pending_items,
|
||||||
|
batch_deadline = ?self.pending_batch_timer.as_ref().map(|sleep| sleep.deadline()),
|
||||||
|
running_batches = self.concurrent_batches.len(),
|
||||||
|
error = ?self.failed,
|
||||||
|
"dropping batch worker",
|
||||||
|
);
|
||||||
|
|
||||||
// Fail pending tasks
|
// Fail pending tasks
|
||||||
self.failed(Closed::new().into());
|
self.failed(Closed::new().into());
|
||||||
|
|
||||||
// Clear queued requests
|
// Fail queued requests
|
||||||
while self.rx.try_recv().is_ok() {}
|
while let Ok(msg) = self.rx.try_recv() {
|
||||||
|
let _ = msg
|
||||||
|
.tx
|
||||||
|
.send(Err(self.failed.as_ref().expect("just set failed").clone()));
|
||||||
|
}
|
||||||
|
|
||||||
// Stop accepting reservations
|
// Clear any finished batches, ignoring any errors.
|
||||||
self.close.close();
|
// Ignore any batches that are still executing, because we can't cancel them.
|
||||||
|
//
|
||||||
|
// now_or_never() can stop futures waking up, but that's ok here,
|
||||||
|
// because we're manually polling, then dropping the stream.
|
||||||
|
while let Some(Some(_)) = self
|
||||||
|
.as_mut()
|
||||||
|
.project()
|
||||||
|
.concurrent_batches
|
||||||
|
.next()
|
||||||
|
.now_or_never()
|
||||||
|
{}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -61,7 +61,7 @@ async fn batch_flushes_on_max_items() -> Result<(), Report> {
|
||||||
// flushing is happening based on hitting max_items.
|
// flushing is happening based on hitting max_items.
|
||||||
//
|
//
|
||||||
// Create our own verifier, so we don't shut down a shared verifier used by other tests.
|
// Create our own verifier, so we don't shut down a shared verifier used by other tests.
|
||||||
let verifier = Batch::new(Ed25519Verifier::default(), 10, Duration::from_secs(1000));
|
let verifier = Batch::new(Ed25519Verifier::default(), 10, 5, Duration::from_secs(1000));
|
||||||
timeout(Duration::from_secs(1), sign_and_verify(verifier, 100, None))
|
timeout(Duration::from_secs(1), sign_and_verify(verifier, 100, None))
|
||||||
.await
|
.await
|
||||||
.map_err(|e| eyre!(e))?
|
.map_err(|e| eyre!(e))?
|
||||||
|
|
@ -79,7 +79,12 @@ async fn batch_flushes_on_max_latency() -> Result<(), Report> {
|
||||||
// flushing is happening based on hitting max_latency.
|
// flushing is happening based on hitting max_latency.
|
||||||
//
|
//
|
||||||
// Create our own verifier, so we don't shut down a shared verifier used by other tests.
|
// Create our own verifier, so we don't shut down a shared verifier used by other tests.
|
||||||
let verifier = Batch::new(Ed25519Verifier::default(), 100, Duration::from_millis(500));
|
let verifier = Batch::new(
|
||||||
|
Ed25519Verifier::default(),
|
||||||
|
100,
|
||||||
|
10,
|
||||||
|
Duration::from_millis(500),
|
||||||
|
);
|
||||||
timeout(Duration::from_secs(1), sign_and_verify(verifier, 10, None))
|
timeout(Duration::from_secs(1), sign_and_verify(verifier, 10, None))
|
||||||
.await
|
.await
|
||||||
.map_err(|e| eyre!(e))?
|
.map_err(|e| eyre!(e))?
|
||||||
|
|
@ -94,7 +99,12 @@ async fn fallback_verification() -> Result<(), Report> {
|
||||||
|
|
||||||
// Create our own verifier, so we don't shut down a shared verifier used by other tests.
|
// Create our own verifier, so we don't shut down a shared verifier used by other tests.
|
||||||
let verifier = Fallback::new(
|
let verifier = Fallback::new(
|
||||||
Batch::new(Ed25519Verifier::default(), 10, Duration::from_millis(100)),
|
Batch::new(
|
||||||
|
Ed25519Verifier::default(),
|
||||||
|
10,
|
||||||
|
1,
|
||||||
|
Duration::from_millis(100),
|
||||||
|
),
|
||||||
tower::service_fn(|item: Ed25519Item| async move { item.verify_single() }),
|
tower::service_fn(|item: Ed25519Item| async move { item.verify_single() }),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ async fn wakes_pending_waiters_on_close() {
|
||||||
|
|
||||||
let (service, mut handle) = mock::pair::<_, ()>();
|
let (service, mut handle) = mock::pair::<_, ()>();
|
||||||
|
|
||||||
let (mut service, worker) = Batch::pair(service, 1, Duration::from_secs(1));
|
let (mut service, worker) = Batch::pair(service, 1, 1, Duration::from_secs(1));
|
||||||
let mut worker = task::spawn(worker.run());
|
let mut worker = task::spawn(worker.run());
|
||||||
|
|
||||||
// // keep the request in the worker
|
// // keep the request in the worker
|
||||||
|
|
@ -72,7 +72,7 @@ async fn wakes_pending_waiters_on_failure() {
|
||||||
|
|
||||||
let (service, mut handle) = mock::pair::<_, ()>();
|
let (service, mut handle) = mock::pair::<_, ()>();
|
||||||
|
|
||||||
let (mut service, worker) = Batch::pair(service, 1, Duration::from_secs(1));
|
let (mut service, worker) = Batch::pair(service, 1, 1, Duration::from_secs(1));
|
||||||
let mut worker = task::spawn(worker.run());
|
let mut worker = task::spawn(worker.run());
|
||||||
|
|
||||||
// keep the request in the worker
|
// keep the request in the worker
|
||||||
|
|
|
||||||
|
|
@ -13,10 +13,10 @@ proptest-impl = ["proptest", "proptest-derive", "zebra-chain/proptest-impl", "ze
|
||||||
blake2b_simd = "1.0.0"
|
blake2b_simd = "1.0.0"
|
||||||
bellman = "0.13.0"
|
bellman = "0.13.0"
|
||||||
bls12_381 = "0.7.0"
|
bls12_381 = "0.7.0"
|
||||||
|
halo2 = { package = "halo2_proofs", version = "0.2.0" }
|
||||||
jubjub = "0.9.0"
|
jubjub = "0.9.0"
|
||||||
rand = { version = "0.8.5", package = "rand" }
|
rand = { version = "0.8.5", package = "rand" }
|
||||||
|
rayon = "1.5.3"
|
||||||
halo2 = { package = "halo2_proofs", version = "0.2.0" }
|
|
||||||
|
|
||||||
chrono = "0.4.19"
|
chrono = "0.4.19"
|
||||||
dirs = "4.0.0"
|
dirs = "4.0.0"
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ use futures::{future::BoxFuture, FutureExt};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use rand::thread_rng;
|
use rand::thread_rng;
|
||||||
|
|
||||||
|
use rayon::prelude::*;
|
||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
use tower::{util::ServiceFn, Service};
|
use tower::{util::ServiceFn, Service};
|
||||||
use tower_batch::{Batch, BatchControl};
|
use tower_batch::{Batch, BatchControl};
|
||||||
|
|
@ -48,6 +49,7 @@ pub static VERIFIER: Lazy<
|
||||||
Batch::new(
|
Batch::new(
|
||||||
Verifier::default(),
|
Verifier::default(),
|
||||||
super::MAX_BATCH_SIZE,
|
super::MAX_BATCH_SIZE,
|
||||||
|
None,
|
||||||
super::MAX_BATCH_LATENCY,
|
super::MAX_BATCH_LATENCY,
|
||||||
),
|
),
|
||||||
// We want to fallback to individual verification if batch verification fails,
|
// We want to fallback to individual verification if batch verification fails,
|
||||||
|
|
@ -106,38 +108,48 @@ impl Verifier {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush the batch using a thread pool, and return the result via the channel.
|
/// Flush the batch using a thread pool, and return the result via the channel.
|
||||||
/// This function blocks until the batch is completed on the thread pool.
|
/// This returns immediately, usually before the batch is completed.
|
||||||
fn flush_blocking(&mut self) {
|
fn flush_blocking(&mut self) {
|
||||||
let (batch, tx) = self.take();
|
let (batch, tx) = self.take();
|
||||||
|
|
||||||
// # Correctness
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
//
|
||||||
// Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// We don't care about execution order here, because this method is only called on drop.
|
||||||
//
|
tokio::task::block_in_place(|| rayon::spawn_fifo(|| Self::verify(batch, tx)));
|
||||||
// TODO: replace with the rayon thread pool
|
|
||||||
tokio::task::block_in_place(|| Self::verify(batch, tx));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush the batch using a thread pool, and return the result via the channel.
|
/// Flush the batch using a thread pool, and return the result via the channel.
|
||||||
/// This function returns a future that becomes ready when the batch is completed.
|
/// This function returns a future that becomes ready when the batch is completed.
|
||||||
fn flush_spawning(batch: BatchVerifier, tx: Sender) -> impl Future<Output = ()> {
|
fn flush_spawning(batch: BatchVerifier, tx: Sender) -> impl Future<Output = ()> {
|
||||||
// # Correctness
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
tokio::task::spawn_blocking(|| {
|
||||||
// Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// TODO:
|
||||||
//
|
// - spawn batches so rayon executes them in FIFO order
|
||||||
// TODO: spawn on the rayon thread pool inside spawn_blocking
|
// possible implementation: return a closure in a Future,
|
||||||
tokio::task::spawn_blocking(|| Self::verify(batch, tx))
|
// then run it using scope_fifo() in the worker task,
|
||||||
.map(|join_result| join_result.expect("panic in ed25519 batch verifier"))
|
// limiting the number of concurrent batches to the number of rayon threads
|
||||||
|
rayon::scope_fifo(|s| s.spawn_fifo(|_s| Self::verify(batch, tx)))
|
||||||
|
})
|
||||||
|
.map(|join_result| join_result.expect("panic in ed25519 batch verifier"))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Verify a single item using a thread pool, and return the result.
|
/// Verify a single item using a thread pool, and return the result.
|
||||||
/// This function returns a future that becomes ready when the item is completed.
|
/// This function returns a future that becomes ready when the item is completed.
|
||||||
fn verify_single_spawning(item: Item) -> impl Future<Output = VerifyResult> {
|
fn verify_single_spawning(item: Item) -> impl Future<Output = VerifyResult> {
|
||||||
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
tokio::task::spawn_blocking(|| {
|
||||||
// TODO: spawn on the rayon thread pool inside spawn_blocking
|
// Rayon doesn't have a spawn function that returns a value,
|
||||||
tokio::task::spawn_blocking(|| item.verify_single())
|
// so we use a parallel iterator instead.
|
||||||
.map(|join_result| join_result.expect("panic in ed25519 fallback verifier"))
|
//
|
||||||
|
// TODO:
|
||||||
|
// - when a batch fails, spawn all its individual items into rayon using Vec::par_iter()
|
||||||
|
// - spawn fallback individual verifications so rayon executes them in FIFO order,
|
||||||
|
// if possible
|
||||||
|
rayon::iter::once(item)
|
||||||
|
.map(|item| item.verify_single())
|
||||||
|
.collect()
|
||||||
|
})
|
||||||
|
.map(|join_result| join_result.expect("panic in ed25519 fallback verifier"))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -192,9 +204,7 @@ impl Service<BatchControl<Item>> for Verifier {
|
||||||
impl Drop for Verifier {
|
impl Drop for Verifier {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
// We need to flush the current batch in case there are still any pending futures.
|
// We need to flush the current batch in case there are still any pending futures.
|
||||||
// This blocks the current thread and any futures running on it, until the batch is complete.
|
// This returns immediately, usually before the batch is completed.
|
||||||
//
|
|
||||||
// TODO: move the batch onto the rayon thread pool, then drop the verifier immediately.
|
|
||||||
self.flush_blocking();
|
self.flush_blocking();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -44,7 +44,7 @@ async fn batch_flushes_on_max_items() -> Result<()> {
|
||||||
|
|
||||||
// Use a very long max_latency and a short timeout to check that
|
// Use a very long max_latency and a short timeout to check that
|
||||||
// flushing is happening based on hitting max_items.
|
// flushing is happening based on hitting max_items.
|
||||||
let verifier = Batch::new(Verifier::default(), 10, Duration::from_secs(1000));
|
let verifier = Batch::new(Verifier::default(), 10, 5, Duration::from_secs(1000));
|
||||||
timeout(Duration::from_secs(5), sign_and_verify(verifier, 100))
|
timeout(Duration::from_secs(5), sign_and_verify(verifier, 100))
|
||||||
.await?
|
.await?
|
||||||
.map_err(|e| eyre!(e))?;
|
.map_err(|e| eyre!(e))?;
|
||||||
|
|
@ -63,7 +63,7 @@ async fn batch_flushes_on_max_latency() -> Result<()> {
|
||||||
|
|
||||||
// Use a very high max_items and a short timeout to check that
|
// Use a very high max_items and a short timeout to check that
|
||||||
// flushing is happening based on hitting max_latency.
|
// flushing is happening based on hitting max_latency.
|
||||||
let verifier = Batch::new(Verifier::default(), 100, Duration::from_millis(500));
|
let verifier = Batch::new(Verifier::default(), 100, 10, Duration::from_millis(500));
|
||||||
timeout(Duration::from_secs(5), sign_and_verify(verifier, 10))
|
timeout(Duration::from_secs(5), sign_and_verify(verifier, 10))
|
||||||
.await?
|
.await?
|
||||||
.map_err(|e| eyre!(e))?;
|
.map_err(|e| eyre!(e))?;
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,8 @@ use bls12_381::Bls12;
|
||||||
use futures::{future::BoxFuture, FutureExt};
|
use futures::{future::BoxFuture, FutureExt};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use rand::thread_rng;
|
use rand::thread_rng;
|
||||||
|
|
||||||
|
use rayon::prelude::*;
|
||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
use tower::{util::ServiceFn, Service};
|
use tower::{util::ServiceFn, Service};
|
||||||
|
|
||||||
|
|
@ -78,6 +80,7 @@ pub static SPEND_VERIFIER: Lazy<
|
||||||
Batch::new(
|
Batch::new(
|
||||||
Verifier::new(&GROTH16_PARAMETERS.sapling.spend.vk),
|
Verifier::new(&GROTH16_PARAMETERS.sapling.spend.vk),
|
||||||
super::MAX_BATCH_SIZE,
|
super::MAX_BATCH_SIZE,
|
||||||
|
None,
|
||||||
super::MAX_BATCH_LATENCY,
|
super::MAX_BATCH_LATENCY,
|
||||||
),
|
),
|
||||||
// We want to fallback to individual verification if batch verification fails,
|
// We want to fallback to individual verification if batch verification fails,
|
||||||
|
|
@ -116,6 +119,7 @@ pub static OUTPUT_VERIFIER: Lazy<
|
||||||
Batch::new(
|
Batch::new(
|
||||||
Verifier::new(&GROTH16_PARAMETERS.sapling.output.vk),
|
Verifier::new(&GROTH16_PARAMETERS.sapling.output.vk),
|
||||||
super::MAX_BATCH_SIZE,
|
super::MAX_BATCH_SIZE,
|
||||||
|
None,
|
||||||
super::MAX_BATCH_LATENCY,
|
super::MAX_BATCH_LATENCY,
|
||||||
),
|
),
|
||||||
// We want to fallback to individual verification if batch verification
|
// We want to fallback to individual verification if batch verification
|
||||||
|
|
@ -401,16 +405,14 @@ impl Verifier {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush the batch using a thread pool, and return the result via the channel.
|
/// Flush the batch using a thread pool, and return the result via the channel.
|
||||||
/// This function blocks until the batch is completed on the thread pool.
|
/// This returns immediately, usually before the batch is completed.
|
||||||
fn flush_blocking(&mut self) {
|
fn flush_blocking(&mut self) {
|
||||||
let (batch, vk, tx) = self.take();
|
let (batch, vk, tx) = self.take();
|
||||||
|
|
||||||
// # Correctness
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
//
|
||||||
// Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// We don't care about execution order here, because this method is only called on drop.
|
||||||
//
|
tokio::task::block_in_place(|| rayon::spawn_fifo(|| Self::verify(batch, vk, tx)));
|
||||||
// TODO: replace with the rayon thread pool
|
|
||||||
tokio::task::block_in_place(|| Self::verify(batch, vk, tx));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush the batch using a thread pool, and return the result via the channel.
|
/// Flush the batch using a thread pool, and return the result via the channel.
|
||||||
|
|
@ -420,13 +422,16 @@ impl Verifier {
|
||||||
vk: &'static BatchVerifyingKey,
|
vk: &'static BatchVerifyingKey,
|
||||||
tx: Sender,
|
tx: Sender,
|
||||||
) -> impl Future<Output = ()> {
|
) -> impl Future<Output = ()> {
|
||||||
// # Correctness
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
tokio::task::spawn_blocking(move || {
|
||||||
// Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// TODO:
|
||||||
//
|
// - spawn batches so rayon executes them in FIFO order
|
||||||
// TODO: spawn on the rayon thread pool inside spawn_blocking
|
// possible implementation: return a closure in a Future,
|
||||||
tokio::task::spawn_blocking(move || Self::verify(batch, vk, tx))
|
// then run it using scope_fifo() in the worker task,
|
||||||
.map(|join_result| join_result.expect("panic in groth16 batch verifier"))
|
// limiting the number of concurrent batches to the number of rayon threads
|
||||||
|
rayon::scope_fifo(move |s| s.spawn_fifo(move |_s| Self::verify(batch, vk, tx)))
|
||||||
|
})
|
||||||
|
.map(|join_result| join_result.expect("panic in groth16 batch verifier"))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Verify a single item using a thread pool, and return the result.
|
/// Verify a single item using a thread pool, and return the result.
|
||||||
|
|
@ -436,10 +441,19 @@ impl Verifier {
|
||||||
pvk: &'static ItemVerifyingKey,
|
pvk: &'static ItemVerifyingKey,
|
||||||
) -> impl Future<Output = VerifyResult> {
|
) -> impl Future<Output = VerifyResult> {
|
||||||
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
tokio::task::spawn_blocking(move || {
|
||||||
// TODO: spawn on the rayon thread pool inside spawn_blocking
|
// Rayon doesn't have a spawn function that returns a value,
|
||||||
tokio::task::spawn_blocking(move || item.verify_single(pvk))
|
// so we use a parallel iterator instead.
|
||||||
.map(|join_result| join_result.expect("panic in groth16 fallback verifier"))
|
//
|
||||||
|
// TODO:
|
||||||
|
// - when a batch fails, spawn all its individual items into rayon using Vec::par_iter()
|
||||||
|
// - spawn fallback individual verifications so rayon executes them in FIFO order,
|
||||||
|
// if possible
|
||||||
|
rayon::iter::once(item)
|
||||||
|
.map(move |item| item.verify_single(pvk))
|
||||||
|
.collect()
|
||||||
|
})
|
||||||
|
.map(|join_result| join_result.expect("panic in groth16 fallback verifier"))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -510,9 +524,7 @@ impl Service<BatchControl<Item>> for Verifier {
|
||||||
impl Drop for Verifier {
|
impl Drop for Verifier {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
// We need to flush the current batch in case there are still any pending futures.
|
// We need to flush the current batch in case there are still any pending futures.
|
||||||
// This blocks the current thread and any futures running on it, until the batch is complete.
|
// This returns immediately, usually before the batch is completed.
|
||||||
//
|
|
||||||
// TODO: move the batch onto the rayon thread pool, then drop the verifier immediately.
|
|
||||||
self.flush_blocking()
|
self.flush_blocking()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -75,6 +75,7 @@ async fn verify_sapling_groth16() {
|
||||||
Batch::new(
|
Batch::new(
|
||||||
Verifier::new(&GROTH16_PARAMETERS.sapling.spend.vk),
|
Verifier::new(&GROTH16_PARAMETERS.sapling.spend.vk),
|
||||||
crate::primitives::MAX_BATCH_SIZE,
|
crate::primitives::MAX_BATCH_SIZE,
|
||||||
|
None,
|
||||||
crate::primitives::MAX_BATCH_LATENCY,
|
crate::primitives::MAX_BATCH_LATENCY,
|
||||||
),
|
),
|
||||||
tower::service_fn(
|
tower::service_fn(
|
||||||
|
|
@ -87,6 +88,7 @@ async fn verify_sapling_groth16() {
|
||||||
Batch::new(
|
Batch::new(
|
||||||
Verifier::new(&GROTH16_PARAMETERS.sapling.output.vk),
|
Verifier::new(&GROTH16_PARAMETERS.sapling.output.vk),
|
||||||
crate::primitives::MAX_BATCH_SIZE,
|
crate::primitives::MAX_BATCH_SIZE,
|
||||||
|
None,
|
||||||
crate::primitives::MAX_BATCH_LATENCY,
|
crate::primitives::MAX_BATCH_LATENCY,
|
||||||
),
|
),
|
||||||
tower::service_fn(
|
tower::service_fn(
|
||||||
|
|
@ -179,6 +181,7 @@ async fn correctly_err_on_invalid_output_proof() {
|
||||||
Batch::new(
|
Batch::new(
|
||||||
Verifier::new(&GROTH16_PARAMETERS.sapling.output.vk),
|
Verifier::new(&GROTH16_PARAMETERS.sapling.output.vk),
|
||||||
crate::primitives::MAX_BATCH_SIZE,
|
crate::primitives::MAX_BATCH_SIZE,
|
||||||
|
None,
|
||||||
crate::primitives::MAX_BATCH_LATENCY,
|
crate::primitives::MAX_BATCH_LATENCY,
|
||||||
),
|
),
|
||||||
tower::service_fn(
|
tower::service_fn(
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,8 @@ use futures::{future::BoxFuture, FutureExt};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use orchard::circuit::VerifyingKey;
|
use orchard::circuit::VerifyingKey;
|
||||||
use rand::{thread_rng, CryptoRng, RngCore};
|
use rand::{thread_rng, CryptoRng, RngCore};
|
||||||
|
|
||||||
|
use rayon::prelude::*;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
use tower::{util::ServiceFn, Service};
|
use tower::{util::ServiceFn, Service};
|
||||||
|
|
@ -203,6 +205,7 @@ pub static VERIFIER: Lazy<
|
||||||
Batch::new(
|
Batch::new(
|
||||||
Verifier::new(&VERIFYING_KEY),
|
Verifier::new(&VERIFYING_KEY),
|
||||||
HALO2_MAX_BATCH_SIZE,
|
HALO2_MAX_BATCH_SIZE,
|
||||||
|
None,
|
||||||
super::MAX_BATCH_LATENCY,
|
super::MAX_BATCH_LATENCY,
|
||||||
),
|
),
|
||||||
// We want to fallback to individual verification if batch verification fails,
|
// We want to fallback to individual verification if batch verification fails,
|
||||||
|
|
@ -269,16 +272,14 @@ impl Verifier {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush the batch using a thread pool, and return the result via the channel.
|
/// Flush the batch using a thread pool, and return the result via the channel.
|
||||||
/// This function blocks until the batch is completed on the thread pool.
|
/// This returns immediately, usually before the batch is completed.
|
||||||
fn flush_blocking(&mut self) {
|
fn flush_blocking(&mut self) {
|
||||||
let (batch, vk, tx) = self.take();
|
let (batch, vk, tx) = self.take();
|
||||||
|
|
||||||
// # Correctness
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
//
|
||||||
// Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// We don't care about execution order here, because this method is only called on drop.
|
||||||
//
|
tokio::task::block_in_place(|| rayon::spawn_fifo(|| Self::verify(batch, vk, tx)));
|
||||||
// TODO: replace with the rayon thread pool
|
|
||||||
tokio::task::block_in_place(|| Self::verify(batch, vk, tx));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush the batch using a thread pool, and return the result via the channel.
|
/// Flush the batch using a thread pool, and return the result via the channel.
|
||||||
|
|
@ -288,13 +289,16 @@ impl Verifier {
|
||||||
vk: &'static BatchVerifyingKey,
|
vk: &'static BatchVerifyingKey,
|
||||||
tx: Sender,
|
tx: Sender,
|
||||||
) -> impl Future<Output = ()> {
|
) -> impl Future<Output = ()> {
|
||||||
// # Correctness
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
tokio::task::spawn_blocking(move || {
|
||||||
// Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// TODO:
|
||||||
//
|
// - spawn batches so rayon executes them in FIFO order
|
||||||
// TODO: spawn on the rayon thread pool inside spawn_blocking
|
// possible implementation: return a closure in a Future,
|
||||||
tokio::task::spawn_blocking(move || Self::verify(batch, vk, tx))
|
// then run it using scope_fifo() in the worker task,
|
||||||
.map(|join_result| join_result.expect("panic in halo2 batch verifier"))
|
// limiting the number of concurrent batches to the number of rayon threads
|
||||||
|
rayon::scope_fifo(move |s| s.spawn_fifo(move |_s| Self::verify(batch, vk, tx)))
|
||||||
|
})
|
||||||
|
.map(|join_result| join_result.expect("panic in halo2 batch verifier"))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Verify a single item using a thread pool, and return the result.
|
/// Verify a single item using a thread pool, and return the result.
|
||||||
|
|
@ -304,10 +308,19 @@ impl Verifier {
|
||||||
pvk: &'static ItemVerifyingKey,
|
pvk: &'static ItemVerifyingKey,
|
||||||
) -> impl Future<Output = VerifyResult> {
|
) -> impl Future<Output = VerifyResult> {
|
||||||
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
tokio::task::spawn_blocking(move || {
|
||||||
// TODO: spawn on the rayon thread pool inside spawn_blocking
|
// Rayon doesn't have a spawn function that returns a value,
|
||||||
tokio::task::spawn_blocking(move || item.verify_single(pvk).map_err(Halo2Error::from))
|
// so we use a parallel iterator instead.
|
||||||
.map(|join_result| join_result.expect("panic in halo2 fallback verifier"))
|
//
|
||||||
|
// TODO:
|
||||||
|
// - when a batch fails, spawn all its individual items into rayon using Vec::par_iter()
|
||||||
|
// - spawn fallback individual verifications so rayon executes them in FIFO order,
|
||||||
|
// if possible
|
||||||
|
rayon::iter::once(item)
|
||||||
|
.map(move |item| item.verify_single(pvk).map_err(Halo2Error::from))
|
||||||
|
.collect()
|
||||||
|
})
|
||||||
|
.map(|join_result| join_result.expect("panic in halo2 fallback verifier"))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -377,9 +390,7 @@ impl Service<BatchControl<Item>> for Verifier {
|
||||||
impl Drop for Verifier {
|
impl Drop for Verifier {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
// We need to flush the current batch in case there are still any pending futures.
|
// We need to flush the current batch in case there are still any pending futures.
|
||||||
// This blocks the current thread and any futures running on it, until the batch is complete.
|
// This returns immediately, usually before the batch is completed.
|
||||||
//
|
|
||||||
// TODO: move the batch onto the rayon thread pool, then drop the verifier immediately.
|
|
||||||
self.flush_blocking()
|
self.flush_blocking()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -151,6 +151,7 @@ async fn verify_generated_halo2_proofs() {
|
||||||
Batch::new(
|
Batch::new(
|
||||||
Verifier::new(&VERIFYING_KEY),
|
Verifier::new(&VERIFYING_KEY),
|
||||||
crate::primitives::MAX_BATCH_SIZE,
|
crate::primitives::MAX_BATCH_SIZE,
|
||||||
|
None,
|
||||||
crate::primitives::MAX_BATCH_LATENCY,
|
crate::primitives::MAX_BATCH_LATENCY,
|
||||||
),
|
),
|
||||||
tower::service_fn(
|
tower::service_fn(
|
||||||
|
|
@ -217,6 +218,7 @@ async fn correctly_err_on_invalid_halo2_proofs() {
|
||||||
Batch::new(
|
Batch::new(
|
||||||
Verifier::new(&VERIFYING_KEY),
|
Verifier::new(&VERIFYING_KEY),
|
||||||
crate::primitives::MAX_BATCH_SIZE,
|
crate::primitives::MAX_BATCH_SIZE,
|
||||||
|
None,
|
||||||
crate::primitives::MAX_BATCH_LATENCY,
|
crate::primitives::MAX_BATCH_LATENCY,
|
||||||
),
|
),
|
||||||
tower::service_fn(
|
tower::service_fn(
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ use futures::{future::BoxFuture, FutureExt};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use rand::thread_rng;
|
use rand::thread_rng;
|
||||||
|
|
||||||
|
use rayon::prelude::*;
|
||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
use tower::{util::ServiceFn, Service};
|
use tower::{util::ServiceFn, Service};
|
||||||
use tower_batch::{Batch, BatchControl};
|
use tower_batch::{Batch, BatchControl};
|
||||||
|
|
@ -49,6 +50,7 @@ pub static VERIFIER: Lazy<
|
||||||
Batch::new(
|
Batch::new(
|
||||||
Verifier::default(),
|
Verifier::default(),
|
||||||
super::MAX_BATCH_SIZE,
|
super::MAX_BATCH_SIZE,
|
||||||
|
None,
|
||||||
super::MAX_BATCH_LATENCY,
|
super::MAX_BATCH_LATENCY,
|
||||||
),
|
),
|
||||||
// We want to fallback to individual verification if batch verification fails,
|
// We want to fallback to individual verification if batch verification fails,
|
||||||
|
|
@ -107,38 +109,48 @@ impl Verifier {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush the batch using a thread pool, and return the result via the channel.
|
/// Flush the batch using a thread pool, and return the result via the channel.
|
||||||
/// This function blocks until the batch is completed on the thread pool.
|
/// This returns immediately, usually before the batch is completed.
|
||||||
fn flush_blocking(&mut self) {
|
fn flush_blocking(&mut self) {
|
||||||
let (batch, tx) = self.take();
|
let (batch, tx) = self.take();
|
||||||
|
|
||||||
// # Correctness
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
//
|
||||||
// Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// We don't care about execution order here, because this method is only called on drop.
|
||||||
//
|
tokio::task::block_in_place(|| rayon::spawn_fifo(|| Self::verify(batch, tx)));
|
||||||
// TODO: replace with the rayon thread pool
|
|
||||||
tokio::task::block_in_place(|| Self::verify(batch, tx));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush the batch using a thread pool, and return the result via the channel.
|
/// Flush the batch using a thread pool, and return the result via the channel.
|
||||||
/// This function returns a future that becomes ready when the batch is completed.
|
/// This function returns a future that becomes ready when the batch is completed.
|
||||||
fn flush_spawning(batch: BatchVerifier, tx: Sender) -> impl Future<Output = ()> {
|
fn flush_spawning(batch: BatchVerifier, tx: Sender) -> impl Future<Output = ()> {
|
||||||
// # Correctness
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
tokio::task::spawn_blocking(|| {
|
||||||
// Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// TODO:
|
||||||
//
|
// - spawn batches so rayon executes them in FIFO order
|
||||||
// TODO: spawn on the rayon thread pool inside spawn_blocking
|
// possible implementation: return a closure in a Future,
|
||||||
tokio::task::spawn_blocking(|| Self::verify(batch, tx))
|
// then run it using scope_fifo() in the worker task,
|
||||||
.map(|join_result| join_result.expect("panic in redjubjub batch verifier"))
|
// limiting the number of concurrent batches to the number of rayon threads
|
||||||
|
rayon::scope_fifo(|s| s.spawn_fifo(|_s| Self::verify(batch, tx)))
|
||||||
|
})
|
||||||
|
.map(|join_result| join_result.expect("panic in redjubjub batch verifier"))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Verify a single item using a thread pool, and return the result.
|
/// Verify a single item using a thread pool, and return the result.
|
||||||
/// This function returns a future that becomes ready when the item is completed.
|
/// This function returns a future that becomes ready when the item is completed.
|
||||||
fn verify_single_spawning(item: Item) -> impl Future<Output = VerifyResult> {
|
fn verify_single_spawning(item: Item) -> impl Future<Output = VerifyResult> {
|
||||||
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
tokio::task::spawn_blocking(|| {
|
||||||
// TODO: spawn on the rayon thread pool inside spawn_blocking
|
// Rayon doesn't have a spawn function that returns a value,
|
||||||
tokio::task::spawn_blocking(|| item.verify_single())
|
// so we use a parallel iterator instead.
|
||||||
.map(|join_result| join_result.expect("panic in redjubjub fallback verifier"))
|
//
|
||||||
|
// TODO:
|
||||||
|
// - when a batch fails, spawn all its individual items into rayon using Vec::par_iter()
|
||||||
|
// - spawn fallback individual verifications so rayon executes them in FIFO order,
|
||||||
|
// if possible
|
||||||
|
rayon::iter::once(item)
|
||||||
|
.map(|item| item.verify_single())
|
||||||
|
.collect()
|
||||||
|
})
|
||||||
|
.map(|join_result| join_result.expect("panic in redjubjub fallback verifier"))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -194,9 +206,7 @@ impl Service<BatchControl<Item>> for Verifier {
|
||||||
impl Drop for Verifier {
|
impl Drop for Verifier {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
// We need to flush the current batch in case there are still any pending futures.
|
// We need to flush the current batch in case there are still any pending futures.
|
||||||
// This blocks the current thread and any futures running on it, until the batch is complete.
|
// This returns immediately, usually before the batch is completed.
|
||||||
//
|
|
||||||
// TODO: move the batch onto the rayon thread pool, then drop the verifier immediately.
|
|
||||||
self.flush_blocking();
|
self.flush_blocking();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -56,7 +56,7 @@ async fn batch_flushes_on_max_items() -> Result<()> {
|
||||||
|
|
||||||
// Use a very long max_latency and a short timeout to check that
|
// Use a very long max_latency and a short timeout to check that
|
||||||
// flushing is happening based on hitting max_items.
|
// flushing is happening based on hitting max_items.
|
||||||
let verifier = Batch::new(Verifier::default(), 10, Duration::from_secs(1000));
|
let verifier = Batch::new(Verifier::default(), 10, 5, Duration::from_secs(1000));
|
||||||
timeout(Duration::from_secs(5), sign_and_verify(verifier, 100))
|
timeout(Duration::from_secs(5), sign_and_verify(verifier, 100))
|
||||||
.await?
|
.await?
|
||||||
.map_err(|e| eyre!(e))?;
|
.map_err(|e| eyre!(e))?;
|
||||||
|
|
@ -75,7 +75,7 @@ async fn batch_flushes_on_max_latency() -> Result<()> {
|
||||||
|
|
||||||
// Use a very high max_items and a short timeout to check that
|
// Use a very high max_items and a short timeout to check that
|
||||||
// flushing is happening based on hitting max_latency.
|
// flushing is happening based on hitting max_latency.
|
||||||
let verifier = Batch::new(Verifier::default(), 100, Duration::from_millis(500));
|
let verifier = Batch::new(Verifier::default(), 100, 10, Duration::from_millis(500));
|
||||||
timeout(Duration::from_secs(5), sign_and_verify(verifier, 10))
|
timeout(Duration::from_secs(5), sign_and_verify(verifier, 10))
|
||||||
.await?
|
.await?
|
||||||
.map_err(|e| eyre!(e))?;
|
.map_err(|e| eyre!(e))?;
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,8 @@ use std::{
|
||||||
use futures::{future::BoxFuture, FutureExt};
|
use futures::{future::BoxFuture, FutureExt};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use rand::thread_rng;
|
use rand::thread_rng;
|
||||||
|
|
||||||
|
use rayon::prelude::*;
|
||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
use tower::{util::ServiceFn, Service};
|
use tower::{util::ServiceFn, Service};
|
||||||
use tower_batch::{Batch, BatchControl};
|
use tower_batch::{Batch, BatchControl};
|
||||||
|
|
@ -48,6 +50,7 @@ pub static VERIFIER: Lazy<
|
||||||
Batch::new(
|
Batch::new(
|
||||||
Verifier::default(),
|
Verifier::default(),
|
||||||
super::MAX_BATCH_SIZE,
|
super::MAX_BATCH_SIZE,
|
||||||
|
None,
|
||||||
super::MAX_BATCH_LATENCY,
|
super::MAX_BATCH_LATENCY,
|
||||||
),
|
),
|
||||||
// We want to fallback to individual verification if batch verification fails,
|
// We want to fallback to individual verification if batch verification fails,
|
||||||
|
|
@ -106,38 +109,48 @@ impl Verifier {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush the batch using a thread pool, and return the result via the channel.
|
/// Flush the batch using a thread pool, and return the result via the channel.
|
||||||
/// This function blocks until the batch is completed on the thread pool.
|
/// This returns immediately, usually before the batch is completed.
|
||||||
fn flush_blocking(&mut self) {
|
fn flush_blocking(&mut self) {
|
||||||
let (batch, tx) = self.take();
|
let (batch, tx) = self.take();
|
||||||
|
|
||||||
// # Correctness
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
//
|
||||||
// Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// We don't care about execution order here, because this method is only called on drop.
|
||||||
//
|
tokio::task::block_in_place(|| rayon::spawn_fifo(|| Self::verify(batch, tx)));
|
||||||
// TODO: replace with the rayon thread pool
|
|
||||||
tokio::task::block_in_place(|| Self::verify(batch, tx));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush the batch using a thread pool, and return the result via the channel.
|
/// Flush the batch using a thread pool, and return the result via the channel.
|
||||||
/// This function returns a future that becomes ready when the batch is completed.
|
/// This function returns a future that becomes ready when the batch is completed.
|
||||||
fn flush_spawning(batch: BatchVerifier, tx: Sender) -> impl Future<Output = ()> {
|
fn flush_spawning(batch: BatchVerifier, tx: Sender) -> impl Future<Output = ()> {
|
||||||
// # Correctness
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
tokio::task::spawn_blocking(|| {
|
||||||
// Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// TODO:
|
||||||
//
|
// - spawn batches so rayon executes them in FIFO order
|
||||||
// TODO: spawn on the rayon thread pool inside spawn_blocking
|
// possible implementation: return a closure in a Future,
|
||||||
tokio::task::spawn_blocking(|| Self::verify(batch, tx))
|
// then run it using scope_fifo() in the worker task,
|
||||||
.map(|join_result| join_result.expect("panic in redpallas batch verifier"))
|
// limiting the number of concurrent batches to the number of rayon threads
|
||||||
|
rayon::scope_fifo(|s| s.spawn_fifo(|_s| Self::verify(batch, tx)))
|
||||||
|
})
|
||||||
|
.map(|join_result| join_result.expect("panic in ed25519 batch verifier"))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Verify a single item using a thread pool, and return the result.
|
/// Verify a single item using a thread pool, and return the result.
|
||||||
/// This function returns a future that becomes ready when the item is completed.
|
/// This function returns a future that becomes ready when the item is completed.
|
||||||
fn verify_single_spawning(item: Item) -> impl Future<Output = VerifyResult> {
|
fn verify_single_spawning(item: Item) -> impl Future<Output = VerifyResult> {
|
||||||
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
// Correctness: Do CPU-intensive work on a dedicated thread, to avoid blocking other futures.
|
||||||
//
|
tokio::task::spawn_blocking(|| {
|
||||||
// TODO: spawn on the rayon thread pool inside spawn_blocking
|
// Rayon doesn't have a spawn function that returns a value,
|
||||||
tokio::task::spawn_blocking(|| item.verify_single())
|
// so we use a parallel iterator instead.
|
||||||
.map(|join_result| join_result.expect("panic in redpallas fallback verifier"))
|
//
|
||||||
|
// TODO:
|
||||||
|
// - when a batch fails, spawn all its individual items into rayon using Vec::par_iter()
|
||||||
|
// - spawn fallback individual verifications so rayon executes them in FIFO order,
|
||||||
|
// if possible
|
||||||
|
rayon::iter::once(item)
|
||||||
|
.map(|item| item.verify_single())
|
||||||
|
.collect()
|
||||||
|
})
|
||||||
|
.map(|join_result| join_result.expect("panic in redpallas fallback verifier"))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -192,9 +205,7 @@ impl Service<BatchControl<Item>> for Verifier {
|
||||||
impl Drop for Verifier {
|
impl Drop for Verifier {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
// We need to flush the current batch in case there are still any pending futures.
|
// We need to flush the current batch in case there are still any pending futures.
|
||||||
// This blocks the current thread and any futures running on it, until the batch is complete.
|
// This returns immediately, usually before the batch is completed.
|
||||||
//
|
|
||||||
// TODO: move the batch onto the rayon thread pool, then drop the verifier immediately.
|
|
||||||
self.flush_blocking();
|
self.flush_blocking();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,7 @@ async fn batch_flushes_on_max_items() -> Result<()> {
|
||||||
|
|
||||||
// Use a very long max_latency and a short timeout to check that
|
// Use a very long max_latency and a short timeout to check that
|
||||||
// flushing is happening based on hitting max_items.
|
// flushing is happening based on hitting max_items.
|
||||||
let verifier = Batch::new(Verifier::default(), 10, Duration::from_secs(1000));
|
let verifier = Batch::new(Verifier::default(), 10, 5, Duration::from_secs(1000));
|
||||||
timeout(Duration::from_secs(5), sign_and_verify(verifier, 100))
|
timeout(Duration::from_secs(5), sign_and_verify(verifier, 100))
|
||||||
.await?
|
.await?
|
||||||
.map_err(|e| eyre!(e))?;
|
.map_err(|e| eyre!(e))?;
|
||||||
|
|
@ -65,7 +65,7 @@ async fn batch_flushes_on_max_latency() -> Result<()> {
|
||||||
|
|
||||||
// Use a very high max_items and a short timeout to check that
|
// Use a very high max_items and a short timeout to check that
|
||||||
// flushing is happening based on hitting max_latency.
|
// flushing is happening based on hitting max_latency.
|
||||||
let verifier = Batch::new(Verifier::default(), 100, Duration::from_millis(500));
|
let verifier = Batch::new(Verifier::default(), 100, 10, Duration::from_millis(500));
|
||||||
timeout(Duration::from_secs(5), sign_and_verify(verifier, 10))
|
timeout(Duration::from_secs(5), sign_and_verify(verifier, 10))
|
||||||
.await?
|
.await?
|
||||||
.map_err(|e| eyre!(e))?;
|
.map_err(|e| eyre!(e))?;
|
||||||
|
|
|
||||||
|
|
@ -86,6 +86,7 @@ serde = { version = "1.0.137", features = ["serde_derive"] }
|
||||||
toml = "0.5.9"
|
toml = "0.5.9"
|
||||||
|
|
||||||
futures = "0.3.21"
|
futures = "0.3.21"
|
||||||
|
rayon = "1.5.3"
|
||||||
tokio = { version = "1.19.2", features = ["time", "rt-multi-thread", "macros", "tracing", "signal"] }
|
tokio = { version = "1.19.2", features = ["time", "rt-multi-thread", "macros", "tracing", "signal"] }
|
||||||
tower = { version = "0.4.13", features = ["hedge", "limit"] }
|
tower = { version = "0.4.13", features = ["hedge", "limit"] }
|
||||||
pin-project = "1.0.10"
|
pin-project = "1.0.10"
|
||||||
|
|
|
||||||
|
|
@ -343,6 +343,19 @@ impl Application for ZebradApp {
|
||||||
}
|
}
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
// Apply the configured number of threads to the thread pool.
|
||||||
|
//
|
||||||
|
// TODO:
|
||||||
|
// - set rayon panic handler to a function that takes `Box<dyn Any + Send + 'static>`,
|
||||||
|
// which forwards to sentry. If possible, use eyre's panic report for formatting.
|
||||||
|
// - do we also need to call this code in `zebra_consensus::init()`,
|
||||||
|
// when that crate is being used by itself?
|
||||||
|
rayon::ThreadPoolBuilder::new()
|
||||||
|
.num_threads(config.sync.parallel_cpu_threads)
|
||||||
|
.thread_name(|thread_index| format!("rayon {}", thread_index))
|
||||||
|
.build_global()
|
||||||
|
.expect("unable to initialize rayon thread pool");
|
||||||
|
|
||||||
self.config = Some(config);
|
self.config = Some(config);
|
||||||
|
|
||||||
let cfg_ref = self
|
let cfg_ref = self
|
||||||
|
|
@ -389,6 +402,11 @@ impl Application for ZebradApp {
|
||||||
// leak the global span, to make sure it stays active
|
// leak the global span, to make sure it stays active
|
||||||
std::mem::forget(global_guard);
|
std::mem::forget(global_guard);
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
num_threads = rayon::current_num_threads(),
|
||||||
|
"initialized rayon thread pool for CPU-bound tasks",
|
||||||
|
);
|
||||||
|
|
||||||
// Launch network and async endpoints only for long-running commands.
|
// Launch network and async endpoints only for long-running commands.
|
||||||
if is_server {
|
if is_server {
|
||||||
components.push(Box::new(TokioComponent::new()?));
|
components.push(Box::new(TokioComponent::new()?));
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,20 @@
|
||||||
//! A component owning the Tokio runtime.
|
//! A component owning the Tokio runtime.
|
||||||
|
//!
|
||||||
|
//! The tokio runtime is used for:
|
||||||
|
//! - non-blocking async tasks, via [`Future`]s and
|
||||||
|
//! - blocking network and file tasks, via [`spawn_blocking`](tokio::task::spawn_blocking).
|
||||||
|
//!
|
||||||
|
//! The rayon thread pool is used for:
|
||||||
|
//! - long-running CPU-bound tasks like cryptography, via [`rayon::spawn_fifo`].
|
||||||
|
|
||||||
|
use std::{future::Future, time::Duration};
|
||||||
|
|
||||||
use crate::prelude::*;
|
|
||||||
use abscissa_core::{Application, Component, FrameworkError, Shutdown};
|
use abscissa_core::{Application, Component, FrameworkError, Shutdown};
|
||||||
use color_eyre::Report;
|
use color_eyre::Report;
|
||||||
use std::{future::Future, time::Duration};
|
|
||||||
use tokio::runtime::Runtime;
|
use tokio::runtime::Runtime;
|
||||||
|
|
||||||
|
use crate::prelude::*;
|
||||||
|
|
||||||
/// When Zebra is shutting down, wait this long for tokio tasks to finish.
|
/// When Zebra is shutting down, wait this long for tokio tasks to finish.
|
||||||
const TOKIO_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(20);
|
const TOKIO_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(20);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -209,6 +209,12 @@ pub struct SyncSection {
|
||||||
/// This is set to a low value by default, to avoid verification timeouts on large blocks.
|
/// This is set to a low value by default, to avoid verification timeouts on large blocks.
|
||||||
/// Increasing this value may improve performance on machines with many cores.
|
/// Increasing this value may improve performance on machines with many cores.
|
||||||
pub full_verify_concurrency_limit: usize,
|
pub full_verify_concurrency_limit: usize,
|
||||||
|
|
||||||
|
/// The number of threads used to verify signatures, proofs, and other CPU-intensive code.
|
||||||
|
///
|
||||||
|
/// Set to `0` by default, which uses one thread per available CPU core.
|
||||||
|
/// For details, see [the rayon documentation](https://docs.rs/rayon/latest/rayon/struct.ThreadPoolBuilder.html#method.num_threads).
|
||||||
|
pub parallel_cpu_threads: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for SyncSection {
|
impl Default for SyncSection {
|
||||||
|
|
@ -223,11 +229,20 @@ impl Default for SyncSection {
|
||||||
// This default is deliberately very low, so Zebra can verify a few large blocks in under 60 seconds,
|
// This default is deliberately very low, so Zebra can verify a few large blocks in under 60 seconds,
|
||||||
// even on machines with only a few cores.
|
// even on machines with only a few cores.
|
||||||
//
|
//
|
||||||
// This lets users see the committed block height changing in every progress log.
|
// This lets users see the committed block height changing in every progress log,
|
||||||
|
// and avoids hangs due to out-of-order verifications flooding the CPUs.
|
||||||
//
|
//
|
||||||
// TODO: when we implement orchard proof batching, try increasing to 20 or more
|
// TODO:
|
||||||
// limit full verification concurrency based on block transaction counts?
|
// - limit full verification concurrency based on block transaction counts?
|
||||||
full_verify_concurrency_limit: 5,
|
// - move more disk work to blocking tokio threads,
|
||||||
|
// and CPU work to the rayon thread pool inside blocking tokio threads
|
||||||
|
full_verify_concurrency_limit: 20,
|
||||||
|
|
||||||
|
// Use one thread per CPU.
|
||||||
|
//
|
||||||
|
// If this causes tokio executor starvation, move CPU-intensive tasks to rayon threads,
|
||||||
|
// or reserve a few cores for tokio threads, based on `num_cpus()`.
|
||||||
|
parallel_cpu_threads: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue