network: propagate tracing Spans through peer connection

2020-06-22 19:19:27 -07:00 · 2020-06-22 19:19:27 -07:00 · 217c25ef07
parent ff4e722cd7
commit 217c25ef07
4 changed files with 128 additions and 65 deletions
--- a/zebra-network/src/peer/client.rs
+++ b/zebra-network/src/peer/client.rs
@ -16,21 +16,22 @@ use super::{ErrorSlot, SharedPeerError};

 /// The "client" duplex half of a peer connection.
 pub struct Client {
-    pub(super) span: tracing::Span,
    pub(super) server_tx: mpsc::Sender<ClientRequest>,
    pub(super) error_slot: ErrorSlot,
 }

-/// A message from the `peer::Client` to the `peer::Server`, containing both a
-/// request and a return message channel. The reason the return channel is
-/// included is because `peer::Client::call` returns a future that may be moved
-/// around before it resolves, so the future must have ownership of the channel
-/// on which it receives the response.
+/// A message from the `peer::Client` to the `peer::Server`.
 #[derive(Debug)]
-pub(super) struct ClientRequest(
-    pub(super) Request,
-    pub(super) oneshot::Sender<Result<Response, SharedPeerError>>,
-);
+pub(super) struct ClientRequest {
+    /// The actual request.
+    pub request: Request,
+    /// The return message channel, included because `peer::Client::call` returns a
+    /// future that may be moved around before it resolves.
+    pub tx: oneshot::Sender<Result<Response, SharedPeerError>>,
+    /// The tracing context for the request, so that work the connection task does
+    /// processing messages in the context of this request will have correct context.
+    pub span: tracing::Span,
+}

 impl Service<Request> for Client {
    type Response = Response;
@ -49,19 +50,23 @@ impl Service<Request> for Client {
        }
    }

-    fn call(&mut self, req: Request) -> Self::Future {
+    fn call(&mut self, request: Request) -> Self::Future {
        use futures::future::FutureExt;
-        use tracing_futures::Instrument;

        let (tx, rx) = oneshot::channel();
-        match self.server_tx.try_send(ClientRequest(req, tx)) {
+        // get the current Span to propagate it to the peer connection task.
+        // this allows the peer connection to enter the correct tracing context
+        // when it's handling messages in the context of processing this
+        // request.
+        let span = tracing::Span::current();
+
+        match self.server_tx.try_send(ClientRequest { request, span, tx }) {
            Err(e) => {
                if e.is_disconnected() {
                    future::ready(Err(self
                        .error_slot
                        .try_get_error()
                        .expect("failed servers must set their error slot")))
-                    .instrument(self.span.clone())
                    .boxed()
                } else {
                    // sending fails when there's not enough
@ -75,7 +80,6 @@ impl Service<Request> for Client {
                    oneshot_recv_result
                        .expect("ClientRequest oneshot sender must not be dropped before send")
                })
-                .instrument(self.span.clone())
                .boxed()
            }
        }
--- a/zebra-network/src/peer/connection.rs
+++ b/zebra-network/src/peer/connection.rs
@ -9,6 +9,7 @@ use futures::{
 };
 use tokio::time::{delay_for, Delay};
 use tower::Service;
+use tracing_futures::Instrument;

 use zebra_chain::{
    block::{Block, BlockHeaderHash},
@ -46,7 +47,6 @@ impl Handler {
    /// contents to responses without additional copies.  If the message is not
    /// interpretable as a response, we return ownership to the caller.
    fn process_message(&mut self, msg: Message) -> Option<Message> {
-        trace!(?msg);
        // This function is where we statefully interpret Bitcoin/Zcash messages
        // into responses to messages in the internal request/response protocol.
        // This conversion is done by a sequence of (request, message) match arms,
@ -93,6 +93,7 @@ impl Handler {
            ))),
            // By default, messages are not responses.
            (state, msg) => {
+                trace!(?msg, "did not interpret message as response");
                ignored_msg = Some(msg);
                state
            }
@ -106,7 +107,11 @@ pub(super) enum State {
    /// Awaiting a client request or a peer message.
    AwaitingRequest,
    /// Awaiting a peer message we can interpret as a client request.
-    AwaitingResponse(Handler, oneshot::Sender<Result<Response, SharedPeerError>>),
+    AwaitingResponse {
+        handler: Handler,
+        tx: oneshot::Sender<Result<Response, SharedPeerError>>,
+        span: tracing::Span,
+    },
    /// A failure has occurred and we are shutting down the connection.
    Failed,
 }
@ -171,18 +176,26 @@ where
                        Either::Right((None, _)) => {
                            self.fail_with(PeerError::DeadClient);
                        }
-                        Either::Right((Some(req), _)) => self.handle_client_request(req).await,
+                        Either::Right((Some(req), _)) => {
+                            let span = req.span.clone();
+                            self.handle_client_request(req).instrument(span).await
+                        }
                    }
                }
                // We're awaiting a response to a client request,
                // so wait on either a peer message, or on a request timeout.
-                State::AwaitingResponse { .. } => {
-                    trace!("awaiting response to client request");
+                State::AwaitingResponse { ref span, .. } => {
+                    // we have to get rid of the span reference so we can tamper with the state
+                    let span = span.clone();
+                    trace!(parent: &span, "awaiting response to client request");
                    let timer_ref = self
                        .request_timer
                        .as_mut()
                        .expect("timeout must be set while awaiting response");
-                    match future::select(peer_rx.next(), timer_ref).await {
+                    match future::select(peer_rx.next(), timer_ref)
+                        .instrument(span.clone())
+                        .await
+                    {
                        Either::Left((None, _)) => self.fail_with(PeerError::ConnectionClosed),
                        Either::Left((Some(Err(e)), _)) => self.fail_with(e.into()),
                        Either::Left((Some(Ok(peer_msg)), _timer)) => {
@ -194,39 +207,48 @@ where
                            // factor the state required for inbound and
                            // outbound requests.
                            let request_msg = match self.state {
-                                State::AwaitingResponse(ref mut handler, _) => {
-                                    handler.process_message(peer_msg)
-                                }
+                                State::AwaitingResponse {
+                                    ref mut handler, ..
+                                } => span.in_scope(|| handler.process_message(peer_msg)),
                                _ => unreachable!(),
                            };
                            // If the message was not consumed, check whether it
                            // should be handled as a request.
                            if let Some(msg) = request_msg {
+                                // do NOT instrument with the request span, this is
+                                // independent work
                                self.handle_message_as_request(msg).await;
                            } else {
                                // Otherwise, check whether the handler is finished
                                // processing messages and update the state.
                                self.state = match self.state {
-                                    State::AwaitingResponse(Handler::Finished(response), tx) => {
+                                    State::AwaitingResponse {
+                                        handler: Handler::Finished(response),
+                                        tx,
+                                        ..
+                                    } => {
                                        let _ = tx.send(response);
                                        State::AwaitingRequest
                                    }
-                                    pending @ State::AwaitingResponse(_, _) => pending,
+                                    pending @ State::AwaitingResponse { .. } => pending,
                                    _ => unreachable!(),
                                };
                            }
                        }
                        Either::Right(((), _peer_fut)) => {
-                            trace!("client request timed out");
+                            trace!(parent: &span, "client request timed out");
                            let e = PeerError::ClientRequestTimeout;
                            self.state = match self.state {
                                // Special case: ping timeouts fail the connection.
-                                State::AwaitingResponse(Handler::Ping(_), _) => {
+                                State::AwaitingResponse {
+                                    handler: Handler::Ping(_),
+                                    ..
+                                } => {
                                    self.fail_with(e);
                                    State::Failed
                                }
                                // Other request timeouts fail the request.
-                                State::AwaitingResponse(_, tx) => {
+                                State::AwaitingResponse { tx, .. } => {
                                    let _ = tx.send(Err(e.into()));
                                    State::AwaitingRequest
                                }
@ -239,7 +261,11 @@ where
                // requests before we can return and complete the future.
                State::Failed => {
                    match self.client_rx.next().await {
-                        Some(ClientRequest(_, tx)) => {
+                        Some(ClientRequest { tx, span, .. }) => {
+                            trace!(
+                                parent: &span,
+                                "erroring pending request to failed connection"
+                            );
                            let e = self
                                .error_slot
                                .try_get_error()
@ -278,7 +304,7 @@ where
        // we need to deal with it first if it exists.
        self.client_rx.close();
        let old_state = std::mem::replace(&mut self.state, State::Failed);
-        if let State::AwaitingResponse(_, tx) = old_state {
+        if let State::AwaitingResponse { tx, .. } = old_state {
            // We know the slot has Some(e) because we just set it above,
            // and the error slot is never unset.
            let e = self.error_slot.try_get_error().unwrap();
@ -288,15 +314,19 @@ where

    /// Handle an incoming client request, possibly generating outgoing messages to the
    /// remote peer.
-    async fn handle_client_request(&mut self, msg: ClientRequest) {
-        trace!(?msg);
+    ///
+    /// NOTE: the caller should use .instrument(msg.span) to instrument the function.
+    async fn handle_client_request(&mut self, req: ClientRequest) {
+        trace!(?req.request);
        use Request::*;
        use State::*;
-        let ClientRequest(req, tx) = msg;
+        let ClientRequest { request, tx, span } = req;
+
+        // XXX(hdevalence) this is truly horrible, but let's fix it later

        // Inner match returns Result with the new state or an error.
        // Outer match updates state or fails.
-        match match (&self.state, req) {
+        match match (&self.state, request) {
            (Failed, _) => panic!("failed connection cannot handle requests"),
            (AwaitingResponse { .. }, _) => panic!("tried to update pending request"),
            (AwaitingRequest, Peers) => self
@ -304,13 +334,21 @@ where
                .send(Message::GetAddr)
                .await
                .map_err(|e| e.into())
-                .map(|()| AwaitingResponse(Handler::GetPeers, tx)),
+                .map(|()| AwaitingResponse {
+                    handler: Handler::GetPeers,
+                    tx,
+                    span,
+                }),
            (AwaitingRequest, Ping(nonce)) => self
                .peer_tx
                .send(Message::Ping(nonce))
                .await
                .map_err(|e| e.into())
-                .map(|()| AwaitingResponse(Handler::Ping(nonce), tx)),
+                .map(|()| AwaitingResponse {
+                    handler: Handler::Ping(nonce),
+                    tx,
+                    span,
+                }),
            (AwaitingRequest, BlocksByHash(hashes)) => self
                .peer_tx
                .send(Message::GetData(
@ -318,14 +356,13 @@ where
                ))
                .await
                .map_err(|e| e.into())
-                .map(|()| {
-                    AwaitingResponse(
-                        Handler::GetBlocksByHash {
-                            blocks: Vec::with_capacity(hashes.len()),
-                            hashes,
-                        },
-                        tx,
-                    )
+                .map(|()| AwaitingResponse {
+                    handler: Handler::GetBlocksByHash {
+                        blocks: Vec::with_capacity(hashes.len()),
+                        hashes,
+                    },
+                    tx,
+                    span,
                }),
            (AwaitingRequest, FindBlocks { known_blocks, stop }) => self
                .peer_tx
@ -335,7 +372,11 @@ where
                })
                .await
                .map_err(|e| e.into())
-                .map(|()| AwaitingResponse(Handler::FindBlocks, tx)),
+                .map(|()| AwaitingResponse {
+                    handler: Handler::FindBlocks,
+                    tx,
+                    span,
+                }),
        } {
            Ok(new_state) => {
                self.state = new_state;
@ -345,6 +386,9 @@ where
        }
    }

+    // This function has its own span, because we're creating a new work
+    // context (namely, the work of processing the inbound msg as a request)
+    #[instrument(skip(self))]
    async fn handle_message_as_request(&mut self, msg: Message) {
        trace!(?msg);
        // These messages are transport-related, handle them separately:
@ -358,6 +402,7 @@ where
                return;
            }
            Message::Ping(nonce) => {
+                trace!(?nonce, "responding to heartbeat");
                match self.peer_tx.send(Message::Pong(nonce)).await {
                    Ok(()) => {}
                    Err(e) => self.fail_with(e.into()),
--- a/zebra-network/src/peer/handshake.rs
+++ b/zebra-network/src/peer/handshake.rs
@ -212,7 +212,6 @@ where
            let slot = ErrorSlot::default();

            let client = Client {
-                span: connection_span.clone(),
                server_tx: server_tx.clone(),
                error_slot: slot.clone(),
            };
@ -268,31 +267,46 @@ where
                request_timer: None,
            };

-            tokio::spawn(server.run(peer_rx).instrument(connection_span).boxed());
+            tokio::spawn(
+                server
+                    .run(peer_rx)
+                    .instrument(connection_span.clone())
+                    .boxed(),
+            );

-            tokio::spawn(async move {
-                use futures::channel::oneshot;
+            let heartbeat_span = tracing::debug_span!(parent: connection_span, "heartbeat");
+            tokio::spawn(
+                async move {
+                    use futures::channel::oneshot;

-                use super::client::ClientRequest;
+                    use super::client::ClientRequest;

-                let mut server_tx = server_tx;
+                    let mut server_tx = server_tx;

-                let mut interval_stream = tokio::time::interval(constants::HEARTBEAT_INTERVAL);
+                    let mut interval_stream = tokio::time::interval(constants::HEARTBEAT_INTERVAL);

-                loop {
-                    interval_stream.tick().await;
+                    loop {
+                        interval_stream.tick().await;

-                    // We discard the server handle because our
-                    // heartbeat `Ping`s are a special case, and we
-                    // don't actually care about the response here.
-                    let (request_tx, _) = oneshot::channel();
-                    let msg = ClientRequest(Request::Ping(Nonce::default()), request_tx);
-
-                    if server_tx.send(msg).await.is_err() {
-                        return;
+                        // We discard the server handle because our
+                        // heartbeat `Ping`s are a special case, and we
+                        // don't actually care about the response here.
+                        let (request_tx, _) = oneshot::channel();
+                        if server_tx
+                            .send(ClientRequest {
+                                request: Request::Ping(Nonce::default()),
+                                tx: request_tx,
+                                span: tracing::Span::current(),
+                            })
+                            .await
+                            .is_err()
+                        {
+                            return;
+                        }
                    }
                }
-            });
+                .instrument(heartbeat_span),
+            );

            Ok(client)
        };
--- a/zebra-network/src/peer_set/initialize.rs
+++ b/zebra-network/src/peer_set/initialize.rs
@ -110,9 +110,9 @@ where
    // `addr` message per connection, and if we only have one initial peer we
    // need to ensure that its `addr` message is used by the crawler.
    // XXX this should go in CandidateSet::new, but we need init() -> Result<_,_>
-    let _ = candidates.update().await;

    info!("Sending initial request for peers");
+    let _ = candidates.update().await;

    for _ in 0..config.peerset_initial_target_size {
        let _ = demand_tx.try_send(());