Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
82794761
Unverified
Commit
82794761
authored
Mar 23, 2026
by
Biswa Panda
Committed by
GitHub
Mar 23, 2026
Browse files
feat(perf): add pipeline instrumentation and feature gated NVTX markers (#6746)
parent
ba9a8a9f
Changes
27
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
79 additions
and
7 deletions
+79
-7
lib/runtime/src/pipeline/network/egress/nats_client.rs
lib/runtime/src/pipeline/network/egress/nats_client.rs
+4
-0
lib/runtime/src/pipeline/network/egress/push_router.rs
lib/runtime/src/pipeline/network/egress/push_router.rs
+13
-4
lib/runtime/src/pipeline/network/egress/tcp_client.rs
lib/runtime/src/pipeline/network/egress/tcp_client.rs
+9
-1
lib/runtime/src/pipeline/network/ingress/push_handler.rs
lib/runtime/src/pipeline/network/ingress/push_handler.rs
+15
-0
lib/runtime/src/pipeline/network/ingress/shared_tcp_endpoint.rs
...ntime/src/pipeline/network/ingress/shared_tcp_endpoint.rs
+14
-0
lib/runtime/src/pipeline/network/tcp/server.rs
lib/runtime/src/pipeline/network/tcp/server.rs
+15
-2
lib/runtime/src/runtime.rs
lib/runtime/src/runtime.rs
+9
-0
No files found.
lib/runtime/src/pipeline/network/egress/nats_client.rs
View file @
82794761
...
@@ -8,6 +8,7 @@
...
@@ -8,6 +8,7 @@
use
super
::
unified_client
::{
ClientStats
,
Headers
,
RequestPlaneClient
};
use
super
::
unified_client
::{
ClientStats
,
Headers
,
RequestPlaneClient
};
use
crate
::
error
::{
DynamoError
,
ErrorType
};
use
crate
::
error
::{
DynamoError
,
ErrorType
};
use
crate
::
metrics
::
transport_metrics
::
NATS_ERRORS_TOTAL
;
use
anyhow
::
Result
;
use
anyhow
::
Result
;
use
async_trait
::
async_trait
;
use
async_trait
::
async_trait
;
use
bytes
::
Bytes
;
use
bytes
::
Bytes
;
...
@@ -51,6 +52,9 @@ impl RequestPlaneClient for NatsRequestClient {
...
@@ -51,6 +52,9 @@ impl RequestPlaneClient for NatsRequestClient {
.request_with_headers
(
address
.clone
(),
nats_headers
,
payload
)
.request_with_headers
(
address
.clone
(),
nats_headers
,
payload
)
.await
.await
.map_err
(|
e
|
{
.map_err
(|
e
|
{
NATS_ERRORS_TOTAL
.with_label_values
(
&
[
"request_failed"
])
.inc
();
anyhow
::
anyhow!
(
anyhow
::
anyhow!
(
DynamoError
::
builder
()
DynamoError
::
builder
()
.error_type
(
ErrorType
::
CannotConnect
)
.error_type
(
ErrorType
::
CannotConnect
)
...
...
lib/runtime/src/pipeline/network/egress/push_router.rs
View file @
82794761
...
@@ -16,7 +16,9 @@ fn is_inhibited(err: &(dyn std::error::Error + 'static)) -> bool {
...
@@ -16,7 +16,9 @@ fn is_inhibited(err: &(dyn std::error::Error + 'static)) -> bool {
}
}
use
crate
::{
use
crate
::{
component
::{
Client
,
Endpoint
},
component
::{
Client
,
Endpoint
},
dynamo_nvtx_range
,
engine
::{
AsyncEngine
,
Data
},
engine
::{
AsyncEngine
,
Data
},
metrics
::
frontend_perf
::
STAGE_DURATION_SECONDS
,
pipeline
::{
pipeline
::{
AddressedPushRouter
,
AddressedRequest
,
Error
,
ManyOut
,
SingleIn
,
AddressedPushRouter
,
AddressedRequest
,
Error
,
ManyOut
,
SingleIn
,
error
::{
PipelineError
,
PipelineErrorExt
},
error
::{
PipelineError
,
PipelineErrorExt
},
...
@@ -34,6 +36,7 @@ use std::{
...
@@ -34,6 +36,7 @@ use std::{
Arc
,
Arc
,
atomic
::{
AtomicU64
,
Ordering
},
atomic
::{
AtomicU64
,
Ordering
},
},
},
time
::
Instant
,
};
};
use
tokio_stream
::
StreamExt
;
use
tokio_stream
::
StreamExt
;
use
tracing
::
Instrument
;
use
tracing
::
Instrument
;
...
@@ -324,6 +327,7 @@ where
...
@@ -324,6 +327,7 @@ where
instance_id
:
u64
,
instance_id
:
u64
,
request
:
SingleIn
<
T
>
,
request
:
SingleIn
<
T
>
,
)
->
anyhow
::
Result
<
ManyOut
<
U
>>
{
)
->
anyhow
::
Result
<
ManyOut
<
U
>>
{
let
route_start
=
Instant
::
now
();
let
request_id
=
request
.id
()
.to_string
();
let
request_id
=
request
.id
()
.to_string
();
let
route_span
=
if
matches!
(
self
.router_mode
,
RouterMode
::
KV
)
{
let
route_span
=
if
matches!
(
self
.router_mode
,
RouterMode
::
KV
)
{
tracing
::
Span
::
none
()
tracing
::
Span
::
none
()
...
@@ -357,7 +361,7 @@ where
...
@@ -357,7 +361,7 @@ where
}
}
// Get the address based on discovered transport type
// Get the address based on discovered transport type
let
address
=
{
let
(
address
,
_
transport_kind
)
=
{
use
crate
::
component
::
TransportType
;
use
crate
::
component
::
TransportType
;
// Get the instance and use its actual transport type
// Get the instance and use its actual transport type
...
@@ -376,7 +380,7 @@ where
...
@@ -376,7 +380,7 @@ where
http_endpoint
=
%
http_endpoint
,
http_endpoint
=
%
http_endpoint
,
"Using HTTP transport for instance"
"Using HTTP transport for instance"
);
);
http_endpoint
.clone
()
(
http_endpoint
.clone
()
,
"transport.http.request"
)
}
}
TransportType
::
Tcp
(
tcp_endpoint
)
=>
{
TransportType
::
Tcp
(
tcp_endpoint
)
=>
{
tracing
::
debug!
(
tracing
::
debug!
(
...
@@ -384,7 +388,7 @@ where
...
@@ -384,7 +388,7 @@ where
tcp_endpoint
=
%
tcp_endpoint
,
tcp_endpoint
=
%
tcp_endpoint
,
"Using TCP transport for instance"
"Using TCP transport for instance"
);
);
tcp_endpoint
.clone
()
(
tcp_endpoint
.clone
()
,
"transport.tcp.request"
)
}
}
TransportType
::
Nats
(
subject
)
=>
{
TransportType
::
Nats
(
subject
)
=>
{
tracing
::
debug!
(
tracing
::
debug!
(
...
@@ -392,13 +396,18 @@ where
...
@@ -392,13 +396,18 @@ where
subject
=
%
subject
,
subject
=
%
subject
,
"Using NATS transport for instance"
"Using NATS transport for instance"
);
);
subject
.clone
()
(
subject
.clone
()
,
"transport.nats.request"
)
}
}
}
}
};
};
let
request
=
request
.map
(|
req
|
AddressedRequest
::
new
(
req
,
address
));
let
request
=
request
.map
(|
req
|
AddressedRequest
::
new
(
req
,
address
));
STAGE_DURATION_SECONDS
.with_label_values
(
&
[
"route"
])
.observe
(
route_start
.elapsed
()
.as_secs_f64
());
let
_
nvtx_transport
=
dynamo_nvtx_range!
(
_
transport_kind
);
let
stream
:
anyhow
::
Result
<
ManyOut
<
U
>>
=
self
let
stream
:
anyhow
::
Result
<
ManyOut
<
U
>>
=
self
.addressed
.addressed
.generate
(
request
)
.generate
(
request
)
...
...
lib/runtime/src/pipeline/network/egress/tcp_client.rs
View file @
82794761
...
@@ -5,6 +5,9 @@
...
@@ -5,6 +5,9 @@
//!
//!
use
super
::
unified_client
::{
ClientStats
,
Headers
,
RequestPlaneClient
};
use
super
::
unified_client
::{
ClientStats
,
Headers
,
RequestPlaneClient
};
use
crate
::
metrics
::
transport_metrics
::{
TCP_BYTES_RECEIVED_TOTAL
,
TCP_BYTES_SENT_TOTAL
,
TCP_ERRORS_TOTAL
,
};
use
anyhow
::
Result
;
use
anyhow
::
Result
;
use
async_trait
::
async_trait
;
use
async_trait
::
async_trait
;
use
bytes
::
Bytes
;
use
bytes
::
Bytes
;
...
@@ -250,6 +253,7 @@ impl TcpConnection {
...
@@ -250,6 +253,7 @@ impl TcpConnection {
// With TCP_NODELAY, no need for explicit flush()
// With TCP_NODELAY, no need for explicit flush()
match
write_half
.write_all
(
&
req
.encoded_data
)
.await
{
match
write_half
.write_all
(
&
req
.encoded_data
)
.await
{
Ok
(())
=>
{
Ok
(())
=>
{
TCP_BYTES_SENT_TOTAL
.inc_by
(
req
.encoded_data
.len
()
as
f64
);
// Forward response channel to reader task (FIFO ordering)
// Forward response channel to reader task (FIFO ordering)
if
response_tx_channel
.send
(
req
.response_tx
)
.is_err
()
{
if
response_tx_channel
.send
(
req
.response_tx
)
.is_err
()
{
tracing
::
debug!
(
"Reader task closed, stopping writer"
);
tracing
::
debug!
(
"Reader task closed, stopping writer"
);
...
@@ -505,9 +509,10 @@ impl RequestPlaneClient for TcpRequestClient {
...
@@ -505,9 +509,10 @@ impl RequestPlaneClient for TcpRequestClient {
)
->
Result
<
Bytes
>
{
)
->
Result
<
Bytes
>
{
tracing
::
debug!
(
"TCP client sending request to address: {address}"
);
tracing
::
debug!
(
"TCP client sending request to address: {address}"
);
self
.stats.requests_sent
.fetch_add
(
1
,
Ordering
::
Relaxed
);
self
.stats.requests_sent
.fetch_add
(
1
,
Ordering
::
Relaxed
);
let
payload_len
=
payload
.len
();
self
.stats
self
.stats
.bytes_sent
.bytes_sent
.fetch_add
(
payload
.
len
()
as
u64
,
Ordering
::
Relaxed
);
.fetch_add
(
payload
_
len
as
u64
,
Ordering
::
Relaxed
);
let
(
addr
,
endpoint_name
)
=
Self
::
parse_address
(
&
address
)
?
;
let
(
addr
,
endpoint_name
)
=
Self
::
parse_address
(
&
address
)
?
;
...
@@ -535,6 +540,7 @@ impl RequestPlaneClient for TcpRequestClient {
...
@@ -535,6 +540,7 @@ impl RequestPlaneClient for TcpRequestClient {
self
.stats
self
.stats
.bytes_received
.bytes_received
.fetch_add
(
response
.len
()
as
u64
,
Ordering
::
Relaxed
);
.fetch_add
(
response
.len
()
as
u64
,
Ordering
::
Relaxed
);
TCP_BYTES_RECEIVED_TOTAL
.inc_by
(
response
.len
()
as
f64
);
// Return connection to pool (health check happens inside)
// Return connection to pool (health check happens inside)
self
.pool
.return_connection
(
conn
)
.await
;
self
.pool
.return_connection
(
conn
)
.await
;
...
@@ -543,6 +549,7 @@ impl RequestPlaneClient for TcpRequestClient {
...
@@ -543,6 +549,7 @@ impl RequestPlaneClient for TcpRequestClient {
}
}
Ok
(
Err
(
e
))
=>
{
Ok
(
Err
(
e
))
=>
{
self
.stats.errors
.fetch_add
(
1
,
Ordering
::
Relaxed
);
self
.stats.errors
.fetch_add
(
1
,
Ordering
::
Relaxed
);
TCP_ERRORS_TOTAL
.inc
();
tracing
::
warn!
(
"TCP request failed to {}: {}"
,
addr
,
e
);
tracing
::
warn!
(
"TCP request failed to {}: {}"
,
addr
,
e
);
// Don't return unhealthy connection to pool, let it drop
// Don't return unhealthy connection to pool, let it drop
let
cause
=
crate
::
error
::
DynamoError
::
from
(
let
cause
=
crate
::
error
::
DynamoError
::
from
(
...
@@ -558,6 +565,7 @@ impl RequestPlaneClient for TcpRequestClient {
...
@@ -558,6 +565,7 @@ impl RequestPlaneClient for TcpRequestClient {
}
}
Err
(
_
)
=>
{
Err
(
_
)
=>
{
self
.stats.errors
.fetch_add
(
1
,
Ordering
::
Relaxed
);
self
.stats.errors
.fetch_add
(
1
,
Ordering
::
Relaxed
);
TCP_ERRORS_TOTAL
.inc
();
tracing
::
warn!
(
"TCP request timeout to {addr}"
);
tracing
::
warn!
(
"TCP request timeout to {addr}"
);
// Don't return timed-out connection to pool
// Don't return timed-out connection to pool
Err
(
anyhow
::
anyhow!
(
Err
(
anyhow
::
anyhow!
(
...
...
lib/runtime/src/pipeline/network/ingress/push_handler.rs
View file @
82794761
...
@@ -4,6 +4,9 @@
...
@@ -4,6 +4,9 @@
use
super
::
*
;
use
super
::
*
;
use
crate
::
metrics
::
prometheus_names
::
work_handler
;
use
crate
::
metrics
::
prometheus_names
::
work_handler
;
use
crate
::
metrics
::
work_handler_perf
::{
WORK_HANDLER_NETWORK_TRANSIT_SECONDS
,
WORK_HANDLER_TIME_TO_FIRST_RESPONSE_SECONDS
,
};
use
crate
::
protocols
::
maybe_error
::
MaybeError
;
use
crate
::
protocols
::
maybe_error
::
MaybeError
;
use
prometheus
::{
Histogram
,
IntCounter
,
IntCounterVec
,
IntGauge
};
use
prometheus
::{
Histogram
,
IntCounter
,
IntCounterVec
,
IntGauge
};
use
serde
::{
Deserialize
,
Serialize
};
use
serde
::{
Deserialize
,
Serialize
};
...
@@ -137,6 +140,10 @@ where
...
@@ -137,6 +140,10 @@ where
}
}
async
fn
handle_payload
(
&
self
,
payload
:
Bytes
)
->
Result
<
(),
PipelineError
>
{
async
fn
handle_payload
(
&
self
,
payload
:
Bytes
)
->
Result
<
(),
PipelineError
>
{
let
t2_wallclock_ns
=
std
::
time
::
SystemTime
::
now
()
.duration_since
(
std
::
time
::
UNIX_EPOCH
)
.unwrap_or_default
()
.as_nanos
()
as
u64
;
let
start_time
=
std
::
time
::
Instant
::
now
();
let
start_time
=
std
::
time
::
Instant
::
now
();
// Increment inflight and ensure it's decremented on all exits via RAII guard
// Increment inflight and ensure it's decremented on all exits via RAII guard
...
@@ -194,6 +201,12 @@ where
...
@@ -194,6 +201,12 @@ where
}
}
};
};
// Compute network transit time (T2 - T1) using cross-process wall-clock timestamps
if
let
Some
(
t1_ns
)
=
control_msg
.frontend_send_ts_ns
{
let
transit_ns
=
t2_wallclock_ns
.saturating_sub
(
t1_ns
);
WORK_HANDLER_NETWORK_TRANSIT_SECONDS
.observe
(
transit_ns
as
f64
/
1_000_000_000.0
);
}
// extend request with context
// extend request with context
tracing
::
trace!
(
"received control message: {:?}"
,
control_msg
);
tracing
::
trace!
(
"received control message: {:?}"
,
control_msg
);
tracing
::
trace!
(
"received request: {:?}"
,
request
);
tracing
::
trace!
(
"received request: {:?}"
,
request
);
...
@@ -238,6 +251,8 @@ where
...
@@ -238,6 +251,8 @@ where
Ok
(
stream
)
=>
{
Ok
(
stream
)
=>
{
tracing
::
trace!
(
"Successfully generated response stream; sending prologue"
);
tracing
::
trace!
(
"Successfully generated response stream; sending prologue"
);
let
_
result
=
publisher
.send_prologue
(
None
)
.await
;
let
_
result
=
publisher
.send_prologue
(
None
)
.await
;
WORK_HANDLER_TIME_TO_FIRST_RESPONSE_SECONDS
.observe
(
start_time
.elapsed
()
.as_secs_f64
());
stream
stream
}
}
Err
(
e
)
=>
{
Err
(
e
)
=>
{
...
...
lib/runtime/src/pipeline/network/ingress/shared_tcp_endpoint.rs
View file @
82794761
...
@@ -186,6 +186,20 @@ impl SharedTcpServer {
...
@@ -186,6 +186,20 @@ impl SharedTcpServer {
"TCP worker processing request"
"TCP worker processing request"
);
);
// Compute network transit time from the transport header stamped right
// before the TCP write on the frontend side.
if
let
Some
(
t1_str
)
=
work_item
.headers
.get
(
"x-frontend-send-ts-ns"
)
&&
let
Ok
(
t1_ns
)
=
t1_str
.parse
::
<
u64
>
()
{
let
t2_ns
=
std
::
time
::
SystemTime
::
now
()
.duration_since
(
std
::
time
::
UNIX_EPOCH
)
.unwrap_or_default
()
.as_nanos
()
as
u64
;
let
transit_ns
=
t2_ns
.saturating_sub
(
t1_ns
);
crate
::
metrics
::
work_handler_perf
::
WORK_HANDLER_NETWORK_TRANSIT_SECONDS
.observe
(
transit_ns
as
f64
/
1_000_000_000.0
);
}
// Create span with trace context from headers
// Create span with trace context from headers
let
span
=
crate
::
logging
::
make_handle_payload_span_from_tcp_headers
(
let
span
=
crate
::
logging
::
make_handle_payload_span_from_tcp_headers
(
&
work_item
.headers
,
&
work_item
.headers
,
...
...
lib/runtime/src/pipeline/network/tcp/server.rs
View file @
82794761
...
@@ -159,8 +159,21 @@ impl TcpStreamServer {
...
@@ -159,8 +159,21 @@ impl TcpStreamServer {
match
resolved_ip
{
match
resolved_ip
{
Ok
(
addr
)
=>
addr
,
Ok
(
addr
)
=>
addr
,
Err
(
Error
::
LocalIpAddressNotFound
)
=>
IpAddr
::
from
([
127
,
0
,
0
,
1
]),
// Only fall back to loopback when no routable IP exists at all;
Err
(
err
)
=>
return
Err
(
err
.into
()),
// propagate other resolver errors (I/O, platform) so
// misconfigured hosts fail fast instead of silently binding
// to 127.0.0.1.
Err
(
Error
::
LocalIpAddressNotFound
)
=>
{
tracing
::
warn!
(
"No routable local IP address found; falling back to 127.0.0.1"
);
IpAddr
::
from
([
127
,
0
,
0
,
1
])
}
Err
(
err
)
=>
{
return
Err
(
PipelineError
::
Generic
(
format!
(
"Failed to resolve local IP address: {err}"
)));
}
}
}
.to_string
()
.to_string
()
}
}
...
...
lib/runtime/src/runtime.rs
View file @
82794761
...
@@ -51,6 +51,9 @@ pub struct Runtime {
...
@@ -51,6 +51,9 @@ pub struct Runtime {
impl
Runtime
{
impl
Runtime
{
fn
new
(
runtime
:
RuntimeType
,
secondary
:
Option
<
RuntimeType
>
)
->
anyhow
::
Result
<
Runtime
>
{
fn
new
(
runtime
:
RuntimeType
,
secondary
:
Option
<
RuntimeType
>
)
->
anyhow
::
Result
<
Runtime
>
{
// Initialise NVTX toggle once from environment (no-op when feature is off)
crate
::
nvtx
::
init
();
// worker id
// worker id
let
id
=
Arc
::
new
(
uuid
::
Uuid
::
new_v4
()
.to_string
());
let
id
=
Arc
::
new
(
uuid
::
Uuid
::
new_v4
()
.to_string
());
...
@@ -146,6 +149,12 @@ impl Runtime {
...
@@ -146,6 +149,12 @@ impl Runtime {
if
let
(
Some
(
pool
),
Some
(
permits
))
=
(
&
self
.compute_pool
,
&
self
.block_in_place_permits
)
{
if
let
(
Some
(
pool
),
Some
(
permits
))
=
(
&
self
.compute_pool
,
&
self
.block_in_place_permits
)
{
crate
::
compute
::
thread_local
::
initialize_context
(
Arc
::
clone
(
pool
),
Arc
::
clone
(
permits
));
crate
::
compute
::
thread_local
::
initialize_context
(
Arc
::
clone
(
pool
),
Arc
::
clone
(
permits
));
}
}
// Name this worker thread in the Nsight Systems timeline (no-op when nvtx feature is off)
let
thread_name
=
std
::
thread
::
current
()
.name
()
.map
(|
n
|
n
.to_string
())
.unwrap_or_else
(||
format!
(
"tokio-worker-{:?}"
,
std
::
thread
::
current
()
.id
()));
crate
::
nvtx
::
name_current_thread_impl
(
&
thread_name
);
}
}
/// Initialize thread-local compute context on all worker threads using a barrier
/// Initialize thread-local compute context on all worker threads using a barrier
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment