Unverified commit 04442173, authored by Keiven C, committed by GitHub
Browse files

fix: replace metrics callback with background scraping to prevent tim… (#2480)


Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent dea0b201
...@@ -42,6 +42,7 @@ scrape_configs: ...@@ -42,6 +42,7 @@ scrape_configs:
- targets: ['host.docker.internal:8080'] # on the "monitoring" network - targets: ['host.docker.internal:8080'] # on the "monitoring" network
# Launch via: DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 dynamo.<backend> ... # Launch via: DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 dynamo.<backend> ...
# If you want to update the scrape_interval, you may want to also update component.rs's MAX_DELAY
- job_name: 'dynamo-backend' - job_name: 'dynamo-backend'
scrape_interval: 6s scrape_interval: 6s
static_configs: static_configs:
......
...@@ -1134,7 +1134,7 @@ dependencies = [ ...@@ -1134,7 +1134,7 @@ dependencies = [
[[package]] [[package]]
name = "dynamo-llm" name = "dynamo-llm"
version = "0.4.0" version = "0.4.0+post0"
dependencies = [ dependencies = [
"ahash", "ahash",
"akin", "akin",
...@@ -1202,7 +1202,7 @@ dependencies = [ ...@@ -1202,7 +1202,7 @@ dependencies = [
[[package]] [[package]]
name = "dynamo-py3" name = "dynamo-py3"
version = "0.4.0" version = "0.4.0+post0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-openai", "async-openai",
...@@ -1229,7 +1229,7 @@ dependencies = [ ...@@ -1229,7 +1229,7 @@ dependencies = [
[[package]] [[package]]
name = "dynamo-runtime" name = "dynamo-runtime"
version = "0.4.0" version = "0.4.0+post0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"arc-swap", "arc-swap",
......
...@@ -648,7 +648,7 @@ dependencies = [ ...@@ -648,7 +648,7 @@ dependencies = [
[[package]] [[package]]
name = "dynamo-runtime" name = "dynamo-runtime"
version = "0.4.0" version = "0.4.0+post0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"arc-swap", "arc-swap",
...@@ -1020,7 +1020,7 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" ...@@ -1020,7 +1020,7 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]] [[package]]
name = "hello_world" name = "hello_world"
version = "0.4.0" version = "0.4.0+post0"
dependencies = [ dependencies = [
"dynamo-runtime", "dynamo-runtime",
] ]
...@@ -2492,7 +2492,7 @@ dependencies = [ ...@@ -2492,7 +2492,7 @@ dependencies = [
[[package]] [[package]]
name = "service_metrics" name = "service_metrics"
version = "0.4.0" version = "0.4.0+post0"
dependencies = [ dependencies = [
"dynamo-runtime", "dynamo-runtime",
"futures", "futures",
...@@ -2668,7 +2668,7 @@ dependencies = [ ...@@ -2668,7 +2668,7 @@ dependencies = [
[[package]] [[package]]
name = "system_metrics" name = "system_metrics"
version = "0.4.0" version = "0.4.0+post0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"dynamo-runtime", "dynamo-runtime",
......
...@@ -43,7 +43,7 @@ async fn test_backend_with_metrics() -> Result<()> { ...@@ -43,7 +43,7 @@ async fn test_backend_with_metrics() -> Result<()> {
let distributed = DistributedRuntime::from_settings(runtime.clone()).await?; let distributed = DistributedRuntime::from_settings(runtime.clone()).await?;
// Get the System status server info to find the actual port // Get the System status server info to find the actual port
let system_status_info = distributed.system_status_info(); let system_status_info = distributed.system_status_server_info();
let system_status_port = match system_status_info { let system_status_port = match system_status_info {
Some(info) => { Some(info) => {
println!("System status server running on: {}", info.address()); println!("System status server running on: {}", info.address());
......
...@@ -259,6 +259,7 @@ impl Component { ...@@ -259,6 +259,7 @@ impl Component {
/// Scrape ServiceSet, which contains NATS stats as well as user defined stats /// Scrape ServiceSet, which contains NATS stats as well as user defined stats
/// embedded in data field of ServiceInfo. /// embedded in data field of ServiceInfo.
pub async fn scrape_stats(&self, timeout: Duration) -> Result<ServiceSet> { pub async fn scrape_stats(&self, timeout: Duration) -> Result<ServiceSet> {
// Debug: scraping stats for component
let service_name = self.service_name(); let service_name = self.service_name();
let service_client = self.drt().service_client(); let service_client = self.drt().service_client();
service_client service_client
...@@ -268,9 +269,15 @@ impl Component { ...@@ -268,9 +269,15 @@ impl Component {
/// Add Prometheus metrics for this component's service stats. /// Add Prometheus metrics for this component's service stats.
/// ///
/// Uses a channel to synchronize with the spawned async task, ensuring /// Starts a background task that scrapes stats every ~4.7s and updates metrics.
/// metrics are updated before the callback returns. /// The thinking was that it should be a little bit shorter than the Prometheus polling interval.
pub fn add_metrics_callback(&self) -> Result<()> { /// Currently Prometheus polls every 6 seconds, and I wanted every poll to be fresh, so this is set
/// as an arbitrary 4.7 seconds plus 0.3 seconds if it times out. It's a bit of a hand-wavey decision.
pub fn start_scraping_metrics(&self) -> Result<()> {
const NATS_TIMEOUT_AND_INITIAL_DELAY_MS: std::time::Duration =
std::time::Duration::from_millis(300);
const MAX_DELAY_MS: std::time::Duration = std::time::Duration::from_millis(4700);
let component_metrics = ComponentNatsPrometheusMetrics::new(self)?; let component_metrics = ComponentNatsPrometheusMetrics::new(self)?;
let component_clone = self.clone(); let component_clone = self.clone();
...@@ -281,60 +288,41 @@ impl Component { ...@@ -281,60 +288,41 @@ impl Component {
self.service_name() self.service_name()
); // it happens that in component, hierarchy and service name are the same ); // it happens that in component, hierarchy and service name are the same
// Register a metrics callback that scrapes component statistics // Start a background task that scrapes stats every 5 seconds
let metrics_callback = Arc::new(move || { let m = component_metrics.clone();
// Timeout for scraping metrics from components (in milliseconds) let c = component_clone.clone();
// This value is also used by KV Router metrics aggregator (300ms) and other components
const METRICS_SCRAPE_TIMEOUT_MS: u64 = 300; // Use std::thread for the background task to avoid runtime context issues
std::thread::spawn(move || {
// Get the current Tokio runtime handle // Use the existing secondary runtime from drt for background metrics scraping
let handle = tokio::runtime::Handle::try_current() let rt = c.drt().runtime().secondary();
.map_err(|err| anyhow::anyhow!("No Tokio runtime handle available: {}", err))?;
// Run the background scraping loop
let m = component_metrics.clone(); rt.block_on(async {
let c = component_clone.clone(); let timeout = NATS_TIMEOUT_AND_INITIAL_DELAY_MS;
let mut delay = NATS_TIMEOUT_AND_INITIAL_DELAY_MS;
// Create a channel to synchronize with the spawned task
let (tx, rx) = std::sync::mpsc::channel::<anyhow::Result<()>>(); loop {
match c.scrape_stats(timeout).await {
let timeout = std::time::Duration::from_millis(METRICS_SCRAPE_TIMEOUT_MS); Ok(service_set) => {
handle.spawn(async move { m.update_from_service_set(&service_set);
let result = match c.scrape_stats(timeout).await { }
Ok(service_set) => { Err(err) => {
m.update_from_service_set(&service_set); tracing::error!(
Ok(()) "Background scrape failed for {}: {}",
} c.service_name(),
Err(err) => { err
// Reset metrics on failure );
m.reset_to_zeros(); m.reset_to_zeros();
Err(anyhow::anyhow!("Failed to scrape stats: {}", err)) // Double delay on failure, capped at MAX_DELAY
delay = std::cmp::min(delay * 2, MAX_DELAY_MS);
}
} }
}; tokio::time::sleep(delay).await;
// Send the result back to the waiting thread
// If send fails, the receiver has already given up waiting
let _ = tx.send(result);
});
// Wait for the spawned task to complete (with a timeout to prevent hanging)
// Add 100ms buffer to the scrape timeout to account for processing overhead
let recv_timeout = std::time::Duration::from_millis(METRICS_SCRAPE_TIMEOUT_MS + 100);
match rx.recv_timeout(recv_timeout) {
Ok(result) => result, // Return the actual result from scraping
Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {
component_metrics.reset_to_zeros();
Err(anyhow::anyhow!("Metrics collection timed out"))
} }
Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => { });
component_metrics.reset_to_zeros();
Err(anyhow::anyhow!("Metrics collection task failed"))
}
}
}); });
self.drt()
.register_metrics_callback(hierarchies, metrics_callback);
Ok(()) Ok(())
} }
...@@ -587,7 +575,7 @@ impl Namespace { ...@@ -587,7 +575,7 @@ impl Namespace {
// Register the metrics callback for this component. // Register the metrics callback for this component.
// If registration fails, log a warning but do not propagate the error, // If registration fails, log a warning but do not propagate the error,
// as metrics are not mission critical and should not block component creation. // as metrics are not mission critical and should not block component creation.
if let Err(err) = component.add_metrics_callback() { if let Err(err) = component.start_scraping_metrics() {
tracing::warn!( tracing::warn!(
"Failed to add metrics callback for component '{}': {}", "Failed to add metrics callback for component '{}': {}",
component.service_name(), component.service_name(),
......
...@@ -247,7 +247,8 @@ impl DistributedRuntime { ...@@ -247,7 +247,8 @@ impl DistributedRuntime {
self.instance_sources.clone() self.instance_sources.clone()
} }
/// Add a Prometheus metric to a specific hierarchy's registry /// Add a Prometheus metric to a specific hierarchy's registry. Note that it is possible
/// to register the same metric name multiple times, as long as the labels are different.
pub fn add_prometheus_metric( pub fn add_prometheus_metric(
&self, &self,
hierarchy: &str, hierarchy: &str,
...@@ -257,16 +258,6 @@ impl DistributedRuntime { ...@@ -257,16 +258,6 @@ impl DistributedRuntime {
let mut registries = self.hierarchy_to_metricsregistry.write().unwrap(); let mut registries = self.hierarchy_to_metricsregistry.write().unwrap();
let entry = registries.entry(hierarchy.to_string()).or_default(); let entry = registries.entry(hierarchy.to_string()).or_default();
// If a metric with this name already exists for the hierarchy, warn and skip registration
if entry.has_metric_named(metric_name) {
tracing::warn!(
hierarchy = ?hierarchy,
metric_name = ?metric_name,
"Metric already exists in registry; skipping registration"
);
return Ok(());
}
// Try to register the metric and provide better error information // Try to register the metric and provide better error information
match entry.prometheus_registry.register(prometheus_metric) { match entry.prometheus_registry.register(prometheus_metric) {
Ok(_) => Ok(()), Ok(_) => Ok(()),
......
...@@ -1089,11 +1089,9 @@ mod test_metricsregistry_prometheus_fmt_outputs { ...@@ -1089,11 +1089,9 @@ mod test_metricsregistry_prometheus_fmt_outputs {
let endpoint_output = let endpoint_output =
super::test_helpers::remove_nats_lines(&endpoint_output_raw).join("\n"); super::test_helpers::remove_nats_lines(&endpoint_output_raw).join("\n");
let expected_endpoint_output = format!( let expected_endpoint_output = r#"# HELP dynamo_component_testcounter A test counter
r#"# HELP dynamo_component_testcounter A test counter
# TYPE dynamo_component_testcounter counter # TYPE dynamo_component_testcounter counter
dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"}} 123.456789"# dynamo_component_testcounter{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"} 123.456789"#.to_string();
);
assert_eq!( assert_eq!(
endpoint_output, expected_endpoint_output, endpoint_output, expected_endpoint_output,
...@@ -1120,14 +1118,12 @@ dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345" ...@@ -1120,14 +1118,12 @@ dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345"
let component_output = let component_output =
super::test_helpers::remove_nats_lines(&component_output_raw).join("\n"); super::test_helpers::remove_nats_lines(&component_output_raw).join("\n");
let expected_component_output = format!( let expected_component_output = r#"# HELP dynamo_component_testcounter A test counter
r#"# HELP dynamo_component_testcounter A test counter
# TYPE dynamo_component_testcounter counter # TYPE dynamo_component_testcounter counter
dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"}} 123.456789 dynamo_component_testcounter{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"} 123.456789
# HELP dynamo_component_testgauge A test gauge # HELP dynamo_component_testgauge A test gauge
# TYPE dynamo_component_testgauge gauge # TYPE dynamo_component_testgauge gauge
dynamo_component_testgauge{{dynamo_component="comp345",dynamo_namespace="ns345"}} 50000"# dynamo_component_testgauge{dynamo_component="comp345",dynamo_namespace="ns345"} 50000"#.to_string();
);
assert_eq!( assert_eq!(
component_output, expected_component_output, component_output, expected_component_output,
...@@ -1153,17 +1149,15 @@ dynamo_component_testgauge{{dynamo_component="comp345",dynamo_namespace="ns345"} ...@@ -1153,17 +1149,15 @@ dynamo_component_testgauge{{dynamo_component="comp345",dynamo_namespace="ns345"}
let namespace_output = let namespace_output =
super::test_helpers::remove_nats_lines(&namespace_output_raw).join("\n"); super::test_helpers::remove_nats_lines(&namespace_output_raw).join("\n");
let expected_namespace_output = format!( let expected_namespace_output = r#"# HELP dynamo_component_testcounter A test counter
r#"# HELP dynamo_component_testcounter A test counter
# TYPE dynamo_component_testcounter counter # TYPE dynamo_component_testcounter counter
dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"}} 123.456789 dynamo_component_testcounter{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"} 123.456789
# HELP dynamo_component_testgauge A test gauge # HELP dynamo_component_testgauge A test gauge
# TYPE dynamo_component_testgauge gauge # TYPE dynamo_component_testgauge gauge
dynamo_component_testgauge{{dynamo_component="comp345",dynamo_namespace="ns345"}} 50000 dynamo_component_testgauge{dynamo_component="comp345",dynamo_namespace="ns345"} 50000
# HELP dynamo_component_testintcounter A test int counter # HELP dynamo_component_testintcounter A test int counter
# TYPE dynamo_component_testintcounter counter # TYPE dynamo_component_testintcounter counter
dynamo_component_testintcounter{{dynamo_namespace="ns345"}} 12345"# dynamo_component_testintcounter{dynamo_namespace="ns345"} 12345"#.to_string();
);
assert_eq!( assert_eq!(
namespace_output, expected_namespace_output, namespace_output, expected_namespace_output,
...@@ -1186,7 +1180,7 @@ dynamo_component_testintcounter{{dynamo_namespace="ns345"}} 12345"# ...@@ -1186,7 +1180,7 @@ dynamo_component_testintcounter{{dynamo_namespace="ns345"}} 12345"#
.create_intgaugevec( .create_intgaugevec(
"testintgaugevec", "testintgaugevec",
"A test int gauge vector", "A test int gauge vector",
&["instance", "service", "status"], &["instance", "status"],
&[("service", "api")], &[("service", "api")],
) )
.unwrap(); .unwrap();
...@@ -1226,37 +1220,42 @@ dynamo_component_testintcounter{{dynamo_namespace="ns345"}} 12345"# ...@@ -1226,37 +1220,42 @@ dynamo_component_testintcounter{{dynamo_namespace="ns345"}} 12345"#
let filtered_drt_output = let filtered_drt_output =
super::test_helpers::remove_nats_lines(&drt_output_raw).join("\n"); super::test_helpers::remove_nats_lines(&drt_output_raw).join("\n");
let expected_drt_output = format!( let expected_drt_output = r#"# HELP dynamo_component_testcounter A test counter
r#"# HELP dynamo_component_testcounter A test counter
# TYPE dynamo_component_testcounter counter # TYPE dynamo_component_testcounter counter
dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"}} 123.456789 dynamo_component_testcounter{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"} 123.456789
# HELP dynamo_component_testcountervec A test counter vector # HELP dynamo_component_testcountervec A test counter vector
# TYPE dynamo_component_testcountervec counter # TYPE dynamo_component_testcountervec counter
dynamo_component_testcountervec{{method="GET",service="api",status="200"}} 10 dynamo_component_testcountervec{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345",method="GET",service="api",status="200"} 10
dynamo_component_testcountervec{{method="POST",service="api",status="201"}} 5 dynamo_component_testcountervec{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345",method="POST",service="api",status="201"} 5
# HELP dynamo_component_testgauge A test gauge # HELP dynamo_component_testgauge A test gauge
# TYPE dynamo_component_testgauge gauge # TYPE dynamo_component_testgauge gauge
dynamo_component_testgauge{{dynamo_component="comp345",dynamo_namespace="ns345"}} 50000 dynamo_component_testgauge{dynamo_component="comp345",dynamo_namespace="ns345"} 50000
# HELP dynamo_component_testhistogram A test histogram # HELP dynamo_component_testhistogram A test histogram
# TYPE dynamo_component_testhistogram histogram # TYPE dynamo_component_testhistogram histogram
dynamo_component_testhistogram_bucket{{le="1"}} 0 dynamo_component_testhistogram_bucket{dynamo_component="comp345",dynamo_namespace="ns345",le="0.005"} 0
dynamo_component_testhistogram_bucket{{le="2.5"}} 2 dynamo_component_testhistogram_bucket{dynamo_component="comp345",dynamo_namespace="ns345",le="0.01"} 0
dynamo_component_testhistogram_bucket{{le="5"}} 3 dynamo_component_testhistogram_bucket{dynamo_component="comp345",dynamo_namespace="ns345",le="0.025"} 0
dynamo_component_testhistogram_bucket{{le="10"}} 3 dynamo_component_testhistogram_bucket{dynamo_component="comp345",dynamo_namespace="ns345",le="0.05"} 0
dynamo_component_testhistogram_bucket{{le="+Inf"}} 3 dynamo_component_testhistogram_bucket{dynamo_component="comp345",dynamo_namespace="ns345",le="0.1"} 0
dynamo_component_testhistogram_sum 7.5 dynamo_component_testhistogram_bucket{dynamo_component="comp345",dynamo_namespace="ns345",le="0.25"} 0
dynamo_component_testhistogram_count 3 dynamo_component_testhistogram_bucket{dynamo_component="comp345",dynamo_namespace="ns345",le="0.5"} 0
dynamo_component_testhistogram_bucket{dynamo_component="comp345",dynamo_namespace="ns345",le="1"} 1
dynamo_component_testhistogram_bucket{dynamo_component="comp345",dynamo_namespace="ns345",le="2.5"} 2
dynamo_component_testhistogram_bucket{dynamo_component="comp345",dynamo_namespace="ns345",le="5"} 3
dynamo_component_testhistogram_bucket{dynamo_component="comp345",dynamo_namespace="ns345",le="10"} 3
dynamo_component_testhistogram_bucket{dynamo_component="comp345",dynamo_namespace="ns345",le="+Inf"} 3
dynamo_component_testhistogram_sum{dynamo_component="comp345",dynamo_namespace="ns345"} 7.5
dynamo_component_testhistogram_count{dynamo_component="comp345",dynamo_namespace="ns345"} 3
# HELP dynamo_component_testintcounter A test int counter # HELP dynamo_component_testintcounter A test int counter
# TYPE dynamo_component_testintcounter counter # TYPE dynamo_component_testintcounter counter
dynamo_component_testintcounter{{dynamo_namespace="ns345"}} 12345 dynamo_component_testintcounter{dynamo_namespace="ns345"} 12345
# HELP dynamo_component_testintgauge A test int gauge # HELP dynamo_component_testintgauge A test int gauge
# TYPE dynamo_component_testintgauge gauge # TYPE dynamo_component_testintgauge gauge
dynamo_component_testintgauge 42 dynamo_component_testintgauge{dynamo_namespace="ns345"} 42
# HELP dynamo_component_testintgaugevec A test int gauge vector # HELP dynamo_component_testintgaugevec A test int gauge vector
# TYPE dynamo_component_testintgaugevec gauge # TYPE dynamo_component_testintgaugevec gauge
dynamo_component_testintgaugevec{{instance="server1",service="api",status="active"}} 10 dynamo_component_testintgaugevec{dynamo_namespace="ns345",instance="server1",service="api",status="active"} 10
dynamo_component_testintgaugevec{{instance="server2",service="api",status="inactive"}} 0"# dynamo_component_testintgaugevec{dynamo_namespace="ns345",instance="server2",service="api",status="inactive"} 0"#.to_string();
);
assert_eq!( assert_eq!(
filtered_drt_output, expected_drt_output, filtered_drt_output, expected_drt_output,
...@@ -1480,7 +1479,7 @@ mod test_metricsregistry_nats { ...@@ -1480,7 +1479,7 @@ mod test_metricsregistry_nats {
input: SingleIn<String>, input: SingleIn<String>,
) -> Result<ManyOut<Annotated<String>>, Error> { ) -> Result<ManyOut<Annotated<String>>, Error> {
let (data, ctx) = input.into_parts(); let (data, ctx) = input.into_parts();
let response = format!("{}", data); let response = data.to_string();
let stream = stream::iter(vec![Annotated::from_data(response)]); let stream = stream::iter(vec![Annotated::from_data(response)]);
Ok(ResponseStream::new(Box::pin(stream), ctx.context())) Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
} }
...@@ -1505,7 +1504,7 @@ mod test_metricsregistry_nats { ...@@ -1505,7 +1504,7 @@ mod test_metricsregistry_nats {
let drt_output = drt.prometheus_metrics_fmt().unwrap(); let drt_output = drt.prometheus_metrics_fmt().unwrap();
let parsed_metrics: Vec<_> = drt_output let parsed_metrics: Vec<_> = drt_output
.lines() .lines()
.filter_map(|line| super::test_helpers::parse_prometheus_metric(line)) .filter_map(super::test_helpers::parse_prometheus_metric)
.collect(); .collect();
println!("=== Initial DRT metrics output ==="); println!("=== Initial DRT metrics output ===");
...@@ -1517,17 +1516,17 @@ mod test_metricsregistry_nats { ...@@ -1517,17 +1516,17 @@ mod test_metricsregistry_nats {
// DRT NATS metrics (ordered to match DRT_NATS_METRICS) // DRT NATS metrics (ordered to match DRT_NATS_METRICS)
(build_metric_name(nats::CONNECTION_STATE), 1.0, 1.0), // Should be connected (build_metric_name(nats::CONNECTION_STATE), 1.0, 1.0), // Should be connected
(build_metric_name(nats::CONNECTS), 1.0, 1.0), // Should have 1 connection (build_metric_name(nats::CONNECTS), 1.0, 1.0), // Should have 1 connection
(build_metric_name(nats::IN_TOTAL_BYTES), 300.0, 500.0), // ~75% to ~125% of 417 (build_metric_name(nats::IN_TOTAL_BYTES), 400.0, 1500.0), // Wide range around 923
(build_metric_name(nats::IN_MESSAGES), 0.0, 0.0), // No messages yet (build_metric_name(nats::IN_MESSAGES), 0.0, 5.0), // Wide range around 2
(build_metric_name(nats::OUT_OVERHEAD_BYTES), 500.0, 700.0), // ~75% to ~125% of 612 (includes endpoint creation overhead) (build_metric_name(nats::OUT_OVERHEAD_BYTES), 700.0, 2500.0), // Wide range around 1633
(build_metric_name(nats::OUT_MESSAGES), 0.0, 0.0), // No messages yet (build_metric_name(nats::OUT_MESSAGES), 0.0, 5.0), // Wide range around 2
// Component NATS metrics (ordered to match COMPONENT_NATS_METRICS) // Component NATS metrics (ordered to match COMPONENT_NATS_METRICS)
(build_metric_name(nats::AVG_PROCESSING_MS), 0.0, 0.0), // No processing yet (build_metric_name(nats::AVG_PROCESSING_MS), 0.0, 0.0), // No processing yet
(build_metric_name(nats::TOTAL_ERRORS), 0.0, 0.0), // No errors yet (build_metric_name(nats::TOTAL_ERRORS), 0.0, 0.0), // No errors yet
(build_metric_name(nats::TOTAL_REQUESTS), 0.0, 0.0), // No requests yet (build_metric_name(nats::TOTAL_REQUESTS), 0.0, 0.0), // No requests yet
(build_metric_name(nats::TOTAL_PROCESSING_MS), 0.0, 0.0), // No processing yet (build_metric_name(nats::TOTAL_PROCESSING_MS), 0.0, 0.0), // No processing yet
(build_metric_name(nats::ACTIVE_SERVICES), 0.0, 0.0), // No services yet (build_metric_name(nats::ACTIVE_SERVICES), 0.0, 2.0), // Service may not be fully active yet
(build_metric_name(nats::ACTIVE_ENDPOINTS), 0.0, 0.0), // No endpoints yet (build_metric_name(nats::ACTIVE_ENDPOINTS), 0.0, 2.0), // Endpoint may not be fully active yet
]; ];
for (metric_name, min_value, max_value) in &initial_expected_metric_values { for (metric_name, min_value, max_value) in &initial_expected_metric_values {
...@@ -1576,7 +1575,6 @@ mod test_metricsregistry_nats { ...@@ -1576,7 +1575,6 @@ mod test_metricsregistry_nats {
); );
} }
} }
sleep(Duration::from_millis(100)).await;
} }
println!("✓ Sent messages and received responses successfully"); println!("✓ Sent messages and received responses successfully");
...@@ -1592,42 +1590,46 @@ mod test_metricsregistry_nats { ...@@ -1592,42 +1590,46 @@ mod test_metricsregistry_nats {
let final_parsed_metrics: Vec<_> = super::test_helpers::extract_metrics(&final_drt_output) let final_parsed_metrics: Vec<_> = super::test_helpers::extract_metrics(&final_drt_output)
.iter() .iter()
.filter_map(|line| super::test_helpers::parse_prometheus_metric(line)) .filter_map(|line| super::test_helpers::parse_prometheus_metric(line.as_str()))
.collect(); .collect();
println!("\n=== Waiting 1 second for metrics to stabilize ===");
sleep(Duration::from_secs(1)).await;
println!("✓ Wait complete, checking final metrics...");
let post_expected_metric_values = [ let post_expected_metric_values = [
// DRT NATS metrics (ordered to match DRT_NATS_METRICS) // DRT NATS metrics
(build_metric_name(nats::CONNECTION_STATE), 1.0, 1.0), // Should remain connected (build_metric_name(nats::CONNECTION_STATE), 1.0, 1.0), // Connected
(build_metric_name(nats::CONNECTS), 1.0, 1.0), // Should remain 1 connection (build_metric_name(nats::CONNECTS), 1.0, 1.0), // 1 connection
(build_metric_name(nats::IN_TOTAL_BYTES), 22000.0, 28000.0), // ~75% to ~125% of 24977 (10 messages × 2000 bytes + overhead) (build_metric_name(nats::IN_TOTAL_BYTES), 20000.0, 32000.0), // Wide range around 26117
(build_metric_name(nats::IN_MESSAGES), 10.0, 12.0), // Allow small drift (callback may run twice) (build_metric_name(nats::IN_MESSAGES), 8.0, 20.0), // Wide range around 16
(build_metric_name(nats::OUT_OVERHEAD_BYTES), 2076.0, 3461.0), // ~75% to ~125% of 2769 (synchronous metrics collection overhead) (build_metric_name(nats::OUT_OVERHEAD_BYTES), 2500.0, 8000.0), // Wide range around 5524
(build_metric_name(nats::OUT_MESSAGES), 10.0, 12.0), // Allow small drift (callback may run twice) (build_metric_name(nats::OUT_MESSAGES), 8.0, 20.0), // Wide range around 16
// Component NATS metrics (ordered to match COMPONENT_NATS_METRICS) // Component NATS metrics
(build_metric_name(nats::AVG_PROCESSING_MS), 0.0, 1.0), // Should be low processing time (build_metric_name(nats::AVG_PROCESSING_MS), 0.0, 1.0), // Low processing time
(build_metric_name(nats::TOTAL_ERRORS), 0.0, 0.0), // Should have no errors (build_metric_name(nats::TOTAL_ERRORS), 0.0, 0.0), // No errors
(build_metric_name(nats::TOTAL_REQUESTS), 0.0, 0.0), // NATS metrics don't track work handler requests (build_metric_name(nats::TOTAL_REQUESTS), 0.0, 0.0), // No work handler requests
(build_metric_name(nats::TOTAL_PROCESSING_MS), 0.0, 5.0), // Should be low total processing time (build_metric_name(nats::TOTAL_PROCESSING_MS), 0.0, 5.0), // Low total processing time
(build_metric_name(nats::ACTIVE_SERVICES), 0.0, 0.0), // NATS metrics don't track work handler services (build_metric_name(nats::ACTIVE_SERVICES), 0.0, 2.0), // Service may not be fully active
(build_metric_name(nats::ACTIVE_ENDPOINTS), 0.0, 0.0), // NATS metrics don't track work handler endpoints (build_metric_name(nats::ACTIVE_ENDPOINTS), 0.0, 2.0), // Endpoint may not be fully active
// Work handler metrics with ranges // Work handler metrics
(build_metric_name(work_handler::REQUESTS_TOTAL), 10.0, 10.0), // Exact count (10 messages) (build_metric_name(work_handler::REQUESTS_TOTAL), 10.0, 10.0), // 10 messages
( (
build_metric_name(work_handler::REQUEST_BYTES_TOTAL), build_metric_name(work_handler::REQUEST_BYTES_TOTAL),
21000.0, 21000.0,
26000.0, 26000.0,
), // ~75% to ~125% of 23520 (10 × 2000 bytes + overhead) ), // ~75-125% of 23520
( (
build_metric_name(work_handler::RESPONSE_BYTES_TOTAL), build_metric_name(work_handler::RESPONSE_BYTES_TOTAL),
18000.0, 18000.0,
23000.0, 23000.0,
), // ~75% to ~125% of 20660 (10 × 2000 bytes + overhead, but response size varies) ), // ~75-125% of 20660
// Additional component metrics
( (
build_metric_name(work_handler::CONCURRENT_REQUESTS), build_metric_name(work_handler::CONCURRENT_REQUESTS),
0.0, 0.0,
1.0, 1.0,
), // Should be 0 or very low ), // 0 or very low
// Histograms have _{count,sum} suffixes
( (
format!( format!(
"{}_count", "{}_count",
...@@ -1635,15 +1637,15 @@ mod test_metricsregistry_nats { ...@@ -1635,15 +1637,15 @@ mod test_metricsregistry_nats {
), ),
10.0, 10.0,
10.0, 10.0,
), // Exact count (10 messages) ), // 10 messages
( (
format!( format!(
"{}_sum", "{}_sum",
build_metric_name(work_handler::REQUEST_DURATION_SECONDS) build_metric_name(work_handler::REQUEST_DURATION_SECONDS)
), ),
0.001, 0.0001,
0.999, 1.0,
), // Processing time sum (10 messages) ), // Processing time sum (wide range)
]; ];
println!("\n=== Checking Post-Activity All Metrics (NATS + Work Handler) ==="); println!("\n=== Checking Post-Activity All Metrics (NATS + Work Handler) ===");
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment