Unverified commit 0cb01b3f, authored by Neelay Shah, committed by GitHub
Browse files

feat: updates to structured logging (#2061)


Signed-off-by: Neelay Shah <neelays@nvidia.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent 803bfa81
......@@ -252,7 +252,7 @@ dependencies = [
"eventsource-stream",
"futures",
"rand 0.8.5",
"reqwest",
"reqwest 0.12.22",
"reqwest-eventsource",
"secrecy",
"serde",
......@@ -634,15 +634,30 @@ dependencies = [
"rayon",
]
[[package]]
name = "bit-set"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
dependencies = [
"bit-vec 0.6.3",
]
[[package]]
name = "bit-set"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
dependencies = [
"bit-vec",
"bit-vec 0.8.0",
]
[[package]]
name = "bit-vec"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
[[package]]
name = "bit-vec"
version = "0.8.0"
......@@ -736,6 +751,12 @@ version = "3.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf"
[[package]]
name = "bytecount"
version = "0.6.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e"
[[package]]
name = "bytemuck"
version = "1.23.1"
......@@ -943,7 +964,7 @@ version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
dependencies = [
"nom",
"nom 7.1.3",
]
[[package]]
......@@ -1906,7 +1927,7 @@ dependencies = [
"rand 0.9.1",
"rayon",
"regex",
"reqwest",
"reqwest 0.12.22",
"rmp-serde",
"rstest 0.18.2",
"rstest_reuse",
......@@ -1986,6 +2007,7 @@ dependencies = [
"figment",
"futures",
"humantime",
"jsonschema",
"local-ip-address",
"log",
"nid",
......@@ -1995,12 +2017,14 @@ dependencies = [
"prometheus",
"rand 0.9.1",
"regex",
"reqwest",
"reqwest 0.12.22",
"rstest 0.23.0",
"serde",
"serde_json",
"socket2",
"stdio-override",
"temp-env",
"tempfile",
"thiserror 2.0.12",
"tokio",
"tokio-stream",
......@@ -2246,7 +2270,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74fef4569247a5f429d9156b9d0a2599914385dd189c539334c625d8099d90ab"
dependencies = [
"futures-core",
"nom",
"nom 7.1.3",
"pin-project-lite",
]
......@@ -2271,13 +2295,23 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af9673d8203fcb076b19dfd17e38b3d4ae9f44959416ea532ce72415a6020365"
[[package]]
name = "fancy-regex"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b95f7c0680e4142284cf8b22c14a476e87d61b004a3a0861872b32ef7ead40a2"
dependencies = [
"bit-set 0.5.3",
"regex",
]
[[package]]
name = "fancy-regex"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
dependencies = [
"bit-set",
"bit-set 0.8.0",
"regex-automata 0.4.9",
"regex-syntax 0.8.5",
]
......@@ -2406,6 +2440,16 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "fraction"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3027ae1df8d41b4bed2241c8fdad4acc1e7af60c8e17743534b545e77182d678"
dependencies = [
"lazy_static",
"num",
]
[[package]]
name = "fuchsia-zircon"
version = "0.3.3"
......@@ -2846,7 +2890,7 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ac5654356c6f7f6116905aeaf92ab002c3d03414ada5dbe0bb2e32aa5fea173"
dependencies = [
"fancy-regex",
"fancy-regex 0.14.0",
"ggml-quants",
"indexmap 2.9.0",
"log",
......@@ -2888,6 +2932,25 @@ dependencies = [
"regex-syntax 0.8.5",
]
[[package]]
name = "h2"
version = "0.3.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d"
dependencies = [
"bytes",
"fnv",
"futures-core",
"futures-sink",
"futures-util",
"http 0.2.12",
"indexmap 2.9.0",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "h2"
version = "0.4.9"
......@@ -2963,7 +3026,7 @@ dependencies = [
"base64 0.21.7",
"byteorder",
"flate2",
"nom",
"nom 7.1.3",
"num-traits",
]
......@@ -3008,7 +3071,7 @@ dependencies = [
"log",
"num_cpus",
"rand 0.8.5",
"reqwest",
"reqwest 0.12.22",
"serde",
"serde_json",
"thiserror 2.0.12",
......@@ -3152,6 +3215,7 @@ dependencies = [
"futures-channel",
"futures-core",
"futures-util",
"h2 0.3.27",
"http 0.2.12",
"http-body 0.4.6",
"httparse",
......@@ -3174,7 +3238,7 @@ dependencies = [
"bytes",
"futures-channel",
"futures-util",
"h2",
"h2 0.4.9",
"http 1.3.1",
"http-body 1.0.1",
"httparse",
......@@ -3237,7 +3301,7 @@ dependencies = [
"percent-encoding",
"pin-project-lite",
"socket2",
"system-configuration",
"system-configuration 0.6.1",
"tokio",
"tower-service",
"tracing",
......@@ -3567,6 +3631,15 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "iso8601"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1082f0c48f143442a1ac6122f67e360ceee130b967af4d50996e5154a45df46"
dependencies = [
"nom 8.0.0",
]
[[package]]
name = "itertools"
version = "0.10.5"
......@@ -3659,6 +3732,36 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "jsonschema"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a071f4f7efc9a9118dfb627a0a94ef247986e1ab8606a4c806ae2b3aa3b6978"
dependencies = [
"ahash",
"anyhow",
"base64 0.21.7",
"bytecount",
"clap 4.5.40",
"fancy-regex 0.11.0",
"fraction",
"getrandom 0.2.16",
"iso8601",
"itoa",
"memchr",
"num-cmp",
"once_cell",
"parking_lot",
"percent-encoding",
"regex",
"reqwest 0.11.27",
"serde",
"serde_json",
"time",
"url",
"uuid 1.17.0",
]
[[package]]
name = "jwalk"
version = "0.8.1"
......@@ -4011,7 +4114,7 @@ dependencies = [
"futures",
"prometheus",
"rand 0.9.1",
"reqwest",
"reqwest 0.12.22",
"serde",
"serde_json",
"thiserror 2.0.12",
......@@ -4142,7 +4245,7 @@ dependencies = [
"indexmap 2.9.0",
"mistralrs-core",
"rand 0.9.1",
"reqwest",
"reqwest 0.12.22",
"serde",
"serde_json",
"tokio",
......@@ -4222,7 +4325,7 @@ dependencies = [
"rayon",
"regex",
"regex-automata 0.4.9",
"reqwest",
"reqwest 0.12.22",
"rubato",
"rust-mcp-schema",
"rustc-hash 2.1.1",
......@@ -4263,7 +4366,7 @@ dependencies = [
"async-trait",
"futures-util",
"http 1.3.1",
"reqwest",
"reqwest 0.12.22",
"rust-mcp-schema",
"serde",
"serde_json",
......@@ -4497,6 +4600,15 @@ dependencies = [
"minimal-lexical",
]
[[package]]
name = "nom"
version = "8.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405"
dependencies = [
"memchr",
]
[[package]]
name = "nonmax"
version = "0.5.5"
......@@ -4555,6 +4667,12 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-cmp"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63335b2e2c34fae2fb0aa2cecfd9f0832a1e24b3b32ecec612c3426d46dc8aaa"
[[package]]
name = "num-complex"
version = "0.4.6"
......@@ -5187,8 +5305,8 @@ version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14cae93065090804185d3b75f0bf93b8eeda30c7a9b4a33d3bdb3988d6229e50"
dependencies = [
"bit-set",
"bit-vec",
"bit-set 0.8.0",
"bit-vec 0.8.0",
"bitflags 2.9.0",
"lazy_static",
"num-traits",
......@@ -5666,6 +5784,42 @@ version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2"
[[package]]
name = "reqwest"
version = "0.11.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62"
dependencies = [
"base64 0.21.7",
"bytes",
"encoding_rs",
"futures-core",
"futures-util",
"h2 0.3.27",
"http 0.2.12",
"http-body 0.4.6",
"hyper 0.14.32",
"ipnet",
"js-sys",
"log",
"mime",
"once_cell",
"percent-encoding",
"pin-project-lite",
"serde",
"serde_json",
"serde_urlencoded",
"sync_wrapper 0.1.2",
"system-configuration 0.5.1",
"tokio",
"tower-service",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
"winreg",
]
[[package]]
name = "reqwest"
version = "0.12.22"
......@@ -5678,7 +5832,7 @@ dependencies = [
"futures-channel",
"futures-core",
"futures-util",
"h2",
"h2 0.4.9",
"http 1.3.1",
"http-body 1.0.1",
"http-body-util",
......@@ -5723,9 +5877,9 @@ dependencies = [
"futures-core",
"futures-timer",
"mime",
"nom",
"nom 7.1.3",
"pin-project-lite",
"reqwest",
"reqwest 0.12.22",
"thiserror 1.0.69",
]
......@@ -6556,7 +6710,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326"
dependencies = [
"base64 0.13.1",
"nom",
"nom 7.1.3",
"serde",
"unicode-segmentation",
]
......@@ -6573,6 +6727,16 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "stdio-override"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cffa8a2e517b4e9f270c47e1c4120df90506d9451c1efa67e3698d66446d30ce"
dependencies = [
"libc",
"winapi 0.3.9",
]
[[package]]
name = "stop-words"
version = "0.8.1"
......@@ -6879,6 +7043,17 @@ dependencies = [
"windows",
]
[[package]]
name = "system-configuration"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7"
dependencies = [
"bitflags 1.3.2",
"core-foundation 0.9.4",
"system-configuration-sys 0.5.0",
]
[[package]]
name = "system-configuration"
version = "0.6.1"
......@@ -6887,7 +7062,17 @@ checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
dependencies = [
"bitflags 2.9.0",
"core-foundation 0.9.4",
"system-configuration-sys",
"system-configuration-sys 0.6.0",
]
[[package]]
name = "system-configuration-sys"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
......@@ -7113,7 +7298,7 @@ dependencies = [
"dary_heap",
"derive_builder",
"esaxx-rs",
"fancy-regex",
"fancy-regex 0.14.0",
"getrandom 0.3.2",
"hf-hub",
"itertools 0.14.0",
......@@ -7350,7 +7535,7 @@ dependencies = [
"axum 0.7.9",
"base64 0.22.1",
"bytes",
"h2",
"h2 0.4.9",
"http 1.3.1",
"http-body 1.0.1",
"http-body-util",
......@@ -8404,6 +8589,16 @@ dependencies = [
"memchr",
]
[[package]]
name = "winreg"
version = "0.50.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1"
dependencies = [
"cfg-if 1.0.0",
"windows-sys 0.48.0",
]
[[package]]
name = "wit-bindgen-rt"
version = "0.39.0"
......
......@@ -70,3 +70,6 @@ env_logger = { version = "0.11" }
reqwest = { workspace = true }
rstest = { version = "0.23.0" }
temp-env = { version = "0.3.6" , features=["async_closure"] }
stdio-override = {version= "0.2.0"}
jsonschema = {version = "0.17"}
tempfile = { workspace = true }
\ No newline at end of file
......@@ -14,6 +14,7 @@
// limitations under the License.
use crate::config::HealthStatus;
use crate::logging::TraceParent;
use crate::metrics::MetricsRegistry;
use crate::traits::DistributedRuntimeProvider;
use axum::{body, http::StatusCode, response::IntoResponse, routing::get, Router};
......@@ -25,6 +26,7 @@ use std::time::Instant;
use tokio::{net::TcpListener, task::JoinHandle};
use tokio_util::sync::CancellationToken;
use tracing;
use tracing::Instrument;
/// HTTP server information containing socket address and handle
#[derive(Debug)]
......@@ -160,26 +162,35 @@ pub async fn spawn_http_server(
"/health",
get({
let state = Arc::clone(&server_state);
move || health_handler(state.clone())
move |tracing_ctx| health_handler(state, "health", tracing_ctx)
}),
)
.route(
"/live",
get({
let state = Arc::clone(&server_state);
move || health_handler(state)
move |tracing_ctx| health_handler(state, "live", tracing_ctx)
}),
)
.route(
"/metrics",
get({
let state = Arc::clone(&server_state);
move || metrics_handler(state)
move |tracing_ctx| metrics_handler(state, "metrics", tracing_ctx)
}),
)
.fallback(|| async {
.fallback(|tracing_ctx: TraceParent| {
async {
tracing::info!("[fallback handler] called");
(StatusCode::NOT_FOUND, "Route not found").into_response()
}
.instrument(tracing::trace_span!(
"fallback handler",
trace_id = tracing_ctx.trace_id,
parent_id = tracing_ctx.parent_id,
x_request_id = tracing_ctx.x_request_id,
tracestate = tracing_ctx.tracestate
))
});
let address = format!("{}:{}", host, port);
......@@ -216,8 +227,16 @@ pub async fn spawn_http_server(
}
/// Health handler
#[tracing::instrument(skip_all, level = "trace")]
async fn health_handler(state: Arc<HttpServerState>) -> impl IntoResponse {
#[tracing::instrument(skip_all, level="trace", fields(route= %route,
trace_id = trace_parent.trace_id,
parent_id = trace_parent.parent_id,
x_request_id= trace_parent.x_request_id,
tracestate= trace_parent.tracestate))]
async fn health_handler(
state: Arc<HttpServerState>,
route: &'static str, // Used for tracing only
trace_parent: TraceParent, // Used for tracing only
) -> impl IntoResponse {
let system_health = state.drt().system_health.lock().await;
let (mut healthy, endpoints) = system_health.get_health_status();
let uptime = match state.uptime() {
......@@ -248,7 +267,16 @@ async fn health_handler(state: Arc<HttpServerState>) -> impl IntoResponse {
}
/// Metrics handler with DistributedRuntime uptime
async fn metrics_handler(state: Arc<HttpServerState>) -> impl IntoResponse {
#[tracing::instrument(skip_all, level="trace", fields(route= %route,
trace_id = trace_parent.trace_id,
parent_id = trace_parent.parent_id,
x_request_id = trace_parent.x_request_id,
tracestate = trace_parent.tracestate))]
async fn metrics_handler(
state: Arc<HttpServerState>,
route: &'static str, // Used for tracing only
trace_parent: TraceParent, // Used for tracing only
) -> impl IntoResponse {
// Update the uptime gauge with current value
state.update_uptime_gauge();
......@@ -281,9 +309,17 @@ async fn create_test_drt_async() -> crate::DistributedRuntime {
#[cfg(test)]
mod tests {
use super::*;
use crate::logging::tests::load_log;
use crate::metrics::MetricsRegistry;
use anyhow::{anyhow, Result};
use chrono::{DateTime, Utc};
use jsonschema::{Draft, JSONSchema};
use rstest::rstest;
use serde_json::Value;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::sync::Arc;
use stdio_override::*;
use tokio::time::{sleep, Duration};
#[tokio::test]
......@@ -358,10 +394,10 @@ dynamo_uptime_seconds{namespace=\"http_server\"} 42
}
#[rstest]
#[cfg(feature = "integration")]
#[case("ready", 200, "ready")]
#[case("notready", 503, "notready")]
#[tokio::test]
#[cfg(feature = "integration")]
async fn test_health_endpoints(
#[case] starting_health_status: &'static str,
#[case] expected_status: u16,
......@@ -375,6 +411,8 @@ dynamo_uptime_seconds{namespace=\"http_server\"} 42
// Closure call is needed here to satisfy async_with_vars
crate::logging::init();
#[allow(clippy::redundant_closure_call)]
temp_env::async_with_vars(
[(
......@@ -402,6 +440,14 @@ dynamo_uptime_seconds{namespace=\"http_server\"} 42
("/someRandomPathNotFoundHere", 404, "Route not found"),
] {
println!("[test] Sending request to {}", path);
let traceparent_value =
"00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01";
let tracestate_value = "vendor1=opaqueValue1,vendor2=opaqueValue2";
let mut headers = reqwest::header::HeaderMap::new();
headers.insert(
reqwest::header::HeaderName.from_static("traceparent"),
reqwest::header::HeaderValue.from_str(traceparent_value)?,
);
let url = format!("http://{}{}", addr, path);
let response = client.get(&url).send().await.unwrap();
let status = response.status();
......@@ -427,6 +473,67 @@ dynamo_uptime_seconds{namespace=\"http_server\"} 42
.await;
}
/// Sends requests carrying W3C `traceparent` / `tracestate` headers to
/// `/health`, `/live`, and an unknown path while trace-level JSONL logging is
/// enabled through environment variables.
///
/// The test currently only checks that every request completes and logs the
/// response body and status; it does not yet inspect the emitted log records
/// for the trace id or parent id (see the TODO below).
///
/// # Errors
///
/// Returns an error if a header value cannot be constructed or if a request
/// fails. Errors raised inside the environment-scoped future are propagated
/// to the test harness instead of being discarded.
#[tokio::test]
#[cfg(feature = "integration")]
// The immediately-invoked async closure is needed to satisfy `async_with_vars`.
#[allow(clippy::redundant_closure_call)]
async fn test_health_endpoint_tracing() -> Result<()> {
    use std::sync::Arc;
    use tokio::time::sleep;
    use tokio_util::sync::CancellationToken;

    // Return the inner result directly so a failure inside the closure fails
    // the test rather than being dropped.
    temp_env::async_with_vars(
        [
            ("DYN_SYSTEM_STARTING_HEALTH_STATUS", Some("ready")),
            ("DYN_LOGGING_JSONL", Some("1")),
            ("DYN_LOG", Some("trace")),
        ],
        (async || {
            // TODO Add proper testing for
            // trace id and parent id

            crate::logging::init();

            let runtime = crate::Runtime::from_settings().unwrap();
            let drt = Arc::new(
                crate::DistributedRuntime::from_settings_without_discovery(runtime)
                    .await
                    .unwrap(),
            );
            let cancel_token = CancellationToken::new();
            let (addr, _) = spawn_http_server("127.0.0.1", 0, cancel_token.clone(), drt)
                .await
                .unwrap();
            // Give the spawned server time to start accepting connections.
            sleep(std::time::Duration::from_millis(1000)).await;

            // The trace headers are identical for every request, so build
            // them once and clone per request.
            let traceparent_value = "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01";
            let tracestate_value = "vendor1=opaqueValue1,vendor2=opaqueValue2";
            let mut headers = reqwest::header::HeaderMap::new();
            headers.insert(
                reqwest::header::HeaderName::from_static("traceparent"),
                reqwest::header::HeaderValue::from_str(traceparent_value)?,
            );
            headers.insert(
                reqwest::header::HeaderName::from_static("tracestate"),
                reqwest::header::HeaderValue::from_str(tracestate_value)?,
            );

            let client = reqwest::Client::new();
            for path in ["/health", "/live", "/someRandomPathNotFoundHere"] {
                let url = format!("http://{}{}", addr, path);
                let response = client.get(&url).headers(headers.clone()).send().await?;
                let status = response.status();
                let body = response.text().await?;
                tracing::info!(body = body, status = status.to_string());
            }

            // Signal the server to shut down so it does not outlive the test.
            cancel_token.cancel();

            Ok::<(), anyhow::Error>(())
        })(),
    )
    .await
}
#[cfg(feature = "integration")]
#[tokio::test]
async fn test_uptime_without_initialization() {
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment