Unverified Commit e5a8628f authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

feat: add a hierarchical Prometheus MetricsRegistry trait for...


feat: add a hierarchical Prometheus MetricsRegistry trait for DistributedRuntime, Namespace, Components, and Endpoint (#2008)
Co-authored-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
Co-authored-by: default avatarRyan Olson <rolson@nvidia.com>
parent 20c5daf3
......@@ -49,7 +49,6 @@ hf-hub = { version = "0.4.2", default-features = false, features = ["tokio", "ru
humantime = { version = "2.2.0" }
libc = { version = "0.2" }
oneshot = { version = "0.1.11", features = ["std", "async"] }
opentelemetry = { version = "0.27" }
prometheus = { version = "0.14" }
rand = { version = "0.9.0" }
reqwest = { version = "0.12.22", default-features = false, features = ["json", "stream", "rustls-tls"] }
......
......@@ -173,6 +173,7 @@ async fn app(runtime: Runtime) -> Result<()> {
let namespace_clone = namespace.clone();
let metrics_collector_clone = metrics_collector.clone();
// Note: Subscribing to KVHitRateEvent for illustration purposes. They're not used in production.
// Spawn a task to handle KV hit rate events
tokio::spawn(async move {
match namespace_clone.subscribe(kv_hit_rate_subject).await {
......
......@@ -2567,6 +2567,18 @@ dependencies = [
"version-compare",
]
[[package]]
name = "system_metrics"
version = "0.3.2"
dependencies = [
"dynamo-runtime",
"futures",
"prometheus",
"serde",
"serde_json",
"tokio",
]
[[package]]
name = "target-lexicon"
version = "0.12.16"
......
......@@ -17,6 +17,7 @@
members = [
"hello_world",
"service_metrics",
"system_metrics",
]
resolver = "3"
......@@ -32,3 +33,4 @@ repository = "https://github.com/ai-dynamo/dynamo.git"
[workspace.dependencies]
# local or crates.io
dynamo-runtime = { path = "../" }
prometheus = { workspace = true }
......@@ -45,6 +45,9 @@ async fn app(runtime: Runtime) -> Result<()> {
println!("{:?}", resp);
}
// This is just an illustration to invoke the server's stats_registry(<action>), where
// the action currently increments the `service_requests_total` metric. You can validate
// the result by running `curl http://localhost:8000/metrics`
let service_set = component.scrape_stats(Duration::from_millis(100)).await?;
println!("{:?}", service_set);
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "system_metrics"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[dependencies]
dynamo-runtime = { workspace = true }
# third-party
futures = "0.3"
serde = { version = "1", features = ["derive"] }
serde_json = { version = "1" }
tokio = { version = "1", features = ["full"] }
prometheus = { version = "0.14" }
# System Metrics Example
Demonstrates custom metrics and monitoring in Dynamo Runtime using Prometheus.
## Overview
- Automatic hierarchical labeling: the runtime automatically adds `namespace`, `component`, and `endpoint` labels
- Uses existing Prometheus implementations
- HTTP metrics endpoint automatically added
## Quick Start
### Build
```bash
cd lib/runtime/examples/system_metrics
cargo build
```
### Run Server
```bash
export DYN_LOG=1 DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8000
cargo run --bin system_server
```
### Run Client
```bash
cargo run --bin system_client
```
Note: Running the client will increment `service_requests_total`.
### View Metrics
```bash
curl http://localhost:8000/metrics
```
Example output:
```
# HELP service_request_duration_seconds Time spent processing requests
# TYPE service_request_duration_seconds histogram
service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.005"} 2
service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.01"} 2
service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.025"} 2
service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.05"} 2
service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.1"} 2
service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.25"} 2
service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.5"} 2
service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="1"} 2
service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="2.5"} 2
service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="5"} 2
service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="10"} 2
service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="+Inf"} 2
service_request_duration_seconds_sum{component="component",endpoint="endpoint",namespace="system",service="backend"} 0.000022239000000000002
service_request_duration_seconds_count{component="component",endpoint="endpoint",namespace="system",service="backend"} 2
# HELP service_requests_total Total number of requests processed
# TYPE service_requests_total counter
service_requests_total{component="component",endpoint="endpoint",namespace="system",service="backend"} 2
# HELP uptime_seconds Total uptime of the DistributedRuntime in seconds
# TYPE uptime_seconds gauge
uptime_seconds{namespace="http_server"} 725.997013676
```
## Configuration
| Variable | Description | Default |
|----------|-------------|---------|
| `DYN_LOG` | Enable logging | `0` |
| `DYN_SYSTEM_ENABLED` | Enable system metrics | `false` |
| `DYN_SYSTEM_PORT` | HTTP server port | `8000` |
## Metrics
- `service_requests_total`: Request counter
- `service_request_duration_seconds`: Request duration histogram
- `uptime_seconds`: Server uptime gauge
The automatic hierarchical labels provide context and grouping for all metrics without manual configuration.
## Troubleshooting
- **Port in use**: Change `DYN_SYSTEM_PORT`
- **Connection refused**: Ensure server is running first
- **No metrics**: Verify `DYN_SYSTEM_ENABLED=true`
\ No newline at end of file
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use futures::StreamExt;
use system_metrics::DEFAULT_NAMESPACE;
use dynamo_runtime::{
logging, pipeline::PushRouter, protocols::annotated::Annotated, utils::Duration,
DistributedRuntime, Result, Runtime, Worker,
};
/// Client entry point: set up logging, then run `app` on a worker built from settings.
fn main() -> Result<()> {
    logging::init();
    Worker::from_settings()?.execute(app)
}
/// Sends one "hello world" request to the example endpoint, prints every streamed
/// response, then scrapes and prints the component's service stats before shutting
/// the runtime down.
async fn app(runtime: Runtime) -> Result<()> {
    let drt = DistributedRuntime::from_settings(runtime.clone()).await?;
    let ns = drt.namespace(DEFAULT_NAMESPACE)?;
    let component = ns.component("component")?;

    // Resolve the endpoint client and wait for instances before routing to it.
    let endpoint = component.endpoint("endpoint");
    let client = endpoint.client().await?;
    client.wait_for_instances().await?;

    let router =
        PushRouter::<String, Annotated<String>>::from_client(client, Default::default()).await?;

    let mut responses = router.random("hello world".to_string().into()).await?;
    while let Some(item) = responses.next().await {
        println!("{:?}", item);
    }

    // Scrape the stats reported by the component's service(s), waiting up to 100 ms.
    let scraped = component.scrape_stats(Duration::from_millis(100)).await?;
    println!("{:?}", scraped);

    runtime.shutdown();
    Ok(())
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use system_metrics::{MyStats, DEFAULT_NAMESPACE};
use dynamo_runtime::{
logging,
metrics::MetricsRegistry,
pipeline::{
async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
ResponseStream, SingleIn,
},
protocols::annotated::Annotated,
stream, DistributedRuntime, Result, Runtime, Worker,
};
use prometheus::{Counter, Histogram};
use std::sync::Arc;
/// Prometheus metrics recorded by this example's request handler.
///
/// Both handles are created through a `MetricsRegistry` in `MySystemStatsMetrics::new`.
pub struct MySystemStatsMetrics {
/// Counter registered as `service_requests_total`; incremented once per `generate` call.
pub request_counter: Arc<Counter>,
/// Histogram registered as `service_request_duration_seconds`; observed once per `generate` call.
pub request_duration: Arc<Histogram>,
}
impl MySystemStatsMetrics {
    /// Create the example's metrics on the given registry.
    ///
    /// Registers the `service_requests_total` counter first, then the
    /// `service_request_duration_seconds` histogram (no explicit buckets are passed).
    /// Both carry the static label `service="backend"`.
    ///
    /// # Errors
    /// Propagates whatever error the registry returns while creating either metric.
    pub fn new<R: MetricsRegistry>(
        metrics_registry: Arc<R>,
    ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
        // Shared static label set for both metrics.
        let labels = [("service", "backend")];

        Ok(Self {
            request_counter: metrics_registry.create_counter(
                "service_requests_total",
                "Total number of requests processed",
                &labels,
            )?,
            request_duration: metrics_registry.create_histogram(
                "service_request_duration_seconds",
                "Time spent processing requests",
                &labels,
                None,
            )?,
        })
    }
}
/// Server entry point: set up logging, then run `app` on a worker built from settings.
fn main() -> Result<()> {
    logging::init();
    Worker::from_settings()?.execute(app)
}
/// Builds the distributed runtime from settings and hands it to `backend`.
async fn app(runtime: Runtime) -> Result<()> {
    let drt = DistributedRuntime::from_settings(runtime.clone()).await?;
    let outcome = backend(drt).await;
    outcome
}
/// Engine that serves the example endpoint and records per-request metrics.
struct RequestHandler {
// Shared metric handles updated on every `generate` call.
metrics: Arc<MySystemStatsMetrics>,
}
impl RequestHandler {
fn new(metrics: Arc<MySystemStatsMetrics>) -> Arc<Self> {
Arc::new(Self { metrics })
}
}
#[async_trait]
impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for RequestHandler {
    /// Answer a request by streaming its input back one character at a time.
    ///
    /// Increments the request counter on entry. The duration histogram is observed
    /// just before the response stream is returned, so it covers building the
    /// stream, not the caller consuming it.
    async fn generate(&self, input: SingleIn<String>) -> Result<ManyOut<Annotated<String>>> {
        let timer = std::time::Instant::now();
        let metrics = &self.metrics;

        // Count the request as soon as it arrives.
        metrics.request_counter.inc();

        let (text, request_ctx) = input.into_parts();

        // One annotated item per input character; collected so the stream owns its data.
        let pieces: Vec<_> = text
            .chars()
            .map(|ch| ch.to_string())
            .map(Annotated::from_data)
            .collect();
        let responses = stream::iter(pieces);

        // Record how long the handler itself took.
        let elapsed = timer.elapsed().as_secs_f64();
        metrics.request_duration.observe(elapsed);

        Ok(ResponseStream::new(
            Box::pin(responses),
            request_ctx.context(),
        ))
    }
}
async fn backend(drt: DistributedRuntime) -> Result<()> {
let endpoint = drt
.namespace(DEFAULT_NAMESPACE)?
.component("component")?
.service_builder()
.create()
.await?
.endpoint("endpoint");
// make the ingress discoverable via a component service
// we must first create a service, then we can attach one more more endpoints
// attach an ingress to an engine, with the RequestHandler using the metrics struct
let endpoint_metrics = Arc::new(
MySystemStatsMetrics::new(Arc::new(endpoint.clone()))
.map_err(|e| Error::msg(e.to_string()))?,
);
let ingress = Ingress::for_engine(RequestHandler::new(endpoint_metrics.clone()))?;
endpoint
.endpoint_builder()
.stats_handler(|_stats| {
println!("Stats handler called with stats: {:?}", _stats);
let stats = MyStats { val: 10 };
serde_json::to_value(stats).unwrap()
})
.handler(ingress)
.start()
.await?;
Ok(())
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
/// Namespace name used by both the example server and the example client.
pub const DEFAULT_NAMESPACE: &str = "system";
#[derive(Serialize, Deserialize)]
/// Dummy stats object demonstrating how to attach a custom stats handler.
pub struct MyStats {
/// Placeholder payload value; the example server's stats handler reports `10`.
pub val: u32,
}
......@@ -29,7 +29,9 @@
//!
//! TODO: Top-level Overview of Endpoints/Functions
use crate::{discovery::Lease, service::ServiceSet, transports::etcd::EtcdPath};
use crate::{
discovery::Lease, metrics::MetricsRegistry, service::ServiceSet, transports::etcd::EtcdPath,
};
use super::{
error,
......@@ -168,6 +170,20 @@ impl RuntimeProvider for Component {
}
}
impl MetricsRegistry for Component {
    /// The component's own name.
    fn basename(&self) -> String {
        self.name.clone()
    }

    /// The namespace's ancestors followed by the namespace's own basename.
    fn parent_hierarchy(&self) -> Vec<String> {
        let mut ancestors = self.namespace.parent_hierarchy();
        ancestors.push(self.namespace.basename());
        ancestors
    }
}
impl Component {
/// The component part of an instance path in etcd.
pub fn etcd_root(&self) -> String {
......@@ -300,6 +316,20 @@ impl RuntimeProvider for Endpoint {
}
}
impl MetricsRegistry for Endpoint {
    /// The endpoint's own name.
    fn basename(&self) -> String {
        self.name.clone()
    }

    /// The component's ancestors followed by the component's own basename.
    fn parent_hierarchy(&self) -> Vec<String> {
        let mut ancestors = self.component.parent_hierarchy();
        ancestors.push(self.component.basename());
        ancestors
    }
}
impl Endpoint {
pub fn id(&self) -> EndpointId {
EndpointId {
......
......@@ -19,7 +19,7 @@ use futures::stream::StreamExt;
use futures::{Stream, TryStreamExt};
use super::*;
use crate::metrics::MetricsRegistry;
use crate::traits::events::{EventPublisher, EventSubscriber};
#[async_trait]
......@@ -78,6 +78,16 @@ impl EventSubscriber for Namespace {
}
}
impl MetricsRegistry for Namespace {
    /// The namespace's own name.
    fn basename(&self) -> String {
        self.name.clone()
    }

    /// A namespace's only ancestor is the distributed runtime.
    fn parent_hierarchy(&self) -> Vec<String> {
        let root = self.drt().basename();
        Vec::from([root])
    }
}
#[cfg(feature = "integration")]
#[cfg(test)]
mod tests {
......
......@@ -17,6 +17,7 @@ pub use crate::component::Component;
use crate::{
component::{self, ComponentBuilder, Endpoint, InstanceSource, Namespace},
discovery::DiscoveryClient,
metrics::MetricsRegistry,
service::ServiceClient,
transports::{etcd, nats, tcp},
ErrorContext,
......@@ -30,6 +31,16 @@ use std::collections::HashMap;
use tokio::sync::Mutex;
use tokio_util::sync::CancellationToken;
impl MetricsRegistry for DistributedRuntime {
    /// The runtime has no basename of its own; names only begin at the Namespace level.
    fn basename(&self) -> String {
        String::new()
    }

    /// The runtime is the root of the hierarchy, so it has no ancestors.
    fn parent_hierarchy(&self) -> Vec<String> {
        Vec::new()
    }
}
impl DistributedRuntime {
pub async fn new(runtime: Runtime, config: DistributedConfig) -> Result<Self> {
let secondary = runtime.secondary();
......@@ -65,6 +76,16 @@ impl DistributedRuntime {
})
.await??;
// Start HTTP server for health and metrics if enabled in configuration
let config = crate::config::RuntimeConfig::from_settings().unwrap_or_default();
// IMPORTANT: We must extract cancel_token from runtime BEFORE moving runtime into the struct below.
// This is because after moving, runtime is no longer accessible in this scope (ownership rules).
let cancel_token = if config.system_server_enabled() {
Some(runtime.clone().child_token())
} else {
None
};
let distributed_runtime = Self {
runtime,
etcd_client,
......@@ -73,24 +94,27 @@ impl DistributedRuntime {
component_registry: component::Registry::new(),
is_static,
instance_sources: Arc::new(Mutex::new(HashMap::new())),
start_time: std::time::Instant::now(),
prometheus_registries_by_prefix: Arc::new(std::sync::Mutex::new(HashMap::<
String,
prometheus::Registry,
>::new())),
};
// Start HTTP server for health and metrics (if enabled)
let config = crate::config::RuntimeConfig::from_settings().unwrap_or_default();
if config.system_server_enabled() {
let drt_arc = Arc::new(distributed_runtime.clone());
let runtime_clone = distributed_runtime.runtime.clone();
// spawn_http_server spawns its own background task:
// Start HTTP server if enabled
if let Some(cancel_token) = cancel_token {
let host = config.system_host.clone();
let port = config.system_port;
// Start HTTP server (it spawns its own task internally)
match crate::http_server::spawn_http_server(
&config.system_host,
config.system_port,
runtime_clone.child_token(),
drt_arc,
&host,
port,
cancel_token,
Arc::new(distributed_runtime.clone()),
)
.await
{
Ok((addr, _handle)) => {
Ok((addr, _)) => {
tracing::info!("HTTP server started successfully on {}", addr);
}
Err(e) => {
......@@ -191,11 +215,6 @@ impl DistributedRuntime {
pub fn instance_sources(&self) -> Arc<Mutex<HashMap<Endpoint, Weak<InstanceSource>>>> {
self.instance_sources.clone()
}
/// Get the uptime of this DistributedRuntime in seconds
pub fn uptime(&self) -> std::time::Duration {
self.start_time.elapsed()
}
}
#[derive(Dissolve)]
......
......@@ -13,77 +13,106 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::metrics::MetricsRegistry;
use crate::traits::DistributedRuntimeProvider;
use axum::{body, http::StatusCode, response::IntoResponse, routing::get, Router};
use prometheus::{
proto, register_gauge_with_registry, Encoder, Gauge, Opts, Registry, TextEncoder,
};
use std::sync::Arc;
use std::sync::OnceLock;
use std::time::Instant;
use tokio::net::TcpListener;
use tokio_util::sync::CancellationToken;
use tracing;
/// Runtime metrics for HTTP server
pub struct RuntimeMetrics {
uptime_gauge: Gauge,
pub struct HttpMetricsRegistry {
pub drt: Arc<crate::DistributedRuntime>,
}
impl RuntimeMetrics {
pub fn new(metrics_registry: &Arc<Registry>) -> anyhow::Result<Arc<Self>> {
let uptime_opts = Opts::new(
"uptime_seconds",
"Total uptime of the DistributedRuntime in seconds",
)
.namespace("dynamo")
.subsystem("runtime");
let uptime_gauge = register_gauge_with_registry!(uptime_opts, metrics_registry)?;
impl crate::traits::DistributedRuntimeProvider for HttpMetricsRegistry {
fn drt(&self) -> &crate::DistributedRuntime {
&self.drt
}
}
Ok(Arc::new(Self { uptime_gauge }))
impl MetricsRegistry for HttpMetricsRegistry {
fn basename(&self) -> String {
"http_server".to_string()
}
pub fn update_uptime(&self, uptime_seconds: f64) {
self.uptime_gauge.set(uptime_seconds);
fn parent_hierarchy(&self) -> Vec<String> {
[self.drt().parent_hierarchy(), vec![self.drt().basename()]].concat()
}
}
/// HTTP server state containing pre-created metrics
/// HTTP server state containing metrics and uptime tracking
pub struct HttpServerState {
drt: Arc<crate::DistributedRuntime>,
registry: Arc<Registry>,
runtime_metrics: Arc<RuntimeMetrics>,
// global drt registry is for printing out the entire Prometheus format output
root_drt: Arc<crate::DistributedRuntime>,
start_time: OnceLock<Instant>,
uptime_gauge: Arc<prometheus::Gauge>,
}
impl HttpServerState {
/// Create new HTTP server state with pre-created metrics
/// Create new HTTP server state with the provided metrics registry
pub fn new(drt: Arc<crate::DistributedRuntime>) -> anyhow::Result<Self> {
let registry = Arc::new(Registry::new());
let http_metrics_registry = Arc::new(HttpMetricsRegistry { drt: drt.clone() });
let uptime_gauge = http_metrics_registry.as_ref().create_gauge(
"uptime_seconds",
"Total uptime of the DistributedRuntime in seconds",
&[],
)?;
let state = Self {
root_drt: drt,
start_time: OnceLock::new(),
uptime_gauge,
};
Ok(state)
}
// Create runtime metrics
let runtime_metrics = RuntimeMetrics::new(&registry)?;
/// Record "now" as the server start time.
///
/// Succeeds only on the first call; any later call leaves the stored instant
/// untouched and returns an error.
pub fn initialize_start_time(&self) -> Result<(), &'static str> {
    let now = Instant::now();
    match self.start_time.set(now) {
        Ok(()) => Ok(()),
        Err(_) => Err("Start time already initialized"),
    }
}
Ok(Self {
drt,
registry,
runtime_metrics,
})
/// Time elapsed since the recorded start time, or an error if
/// `initialize_start_time` has not been called yet.
pub fn uptime(&self) -> Result<std::time::Duration, &'static str> {
    match self.start_time.get() {
        Some(started) => Ok(started.elapsed()),
        None => Err("Start time not initialized"),
    }
}
/// Borrow the distributed runtime held by this server state.
pub fn drt(&self) -> &crate::DistributedRuntime {
    &*self.root_drt
}
/// Set the uptime gauge to the current uptime in seconds.
///
/// If the start time was never initialized the gauge is left as-is and a
/// warning is logged.
pub fn update_uptime_gauge(&self) {
    match self.uptime() {
        Ok(elapsed) => self.uptime_gauge.set(elapsed.as_secs_f64()),
        Err(_) => tracing::warn!("Failed to update uptime gauge: start time not initialized"),
    }
}
}
/// Start HTTP server with DistributedRuntime support
/// Start HTTP server with metrics support
pub async fn spawn_http_server(
host: &str,
port: u16,
cancel_token: CancellationToken,
drt: Arc<crate::DistributedRuntime>,
) -> anyhow::Result<(std::net::SocketAddr, tokio::task::JoinHandle<()>)> {
tracing::info!(
"[spawn_http_server] called with host={}, port={}",
host,
port
);
// Create HTTP server state with pre-created metrics
// Create HTTP server state with the provided metrics registry
let server_state = Arc::new(HttpServerState::new(drt)?);
// Initialize the start time
server_state
.initialize_start_time()
.map_err(|e| anyhow::anyhow!("Failed to initialize start time: {}", e))?;
let app = Router::new()
.route(
"/health",
......@@ -146,48 +175,57 @@ pub async fn spawn_http_server(
/// Health handler
async fn health_handler(state: Arc<HttpServerState>) -> impl IntoResponse {
tracing::info!("[health_handler] called");
let uptime = state.drt.uptime();
let response = format!("OK\nUptime: {} seconds\n", uptime.as_secs());
(StatusCode::OK, response)
match state.uptime() {
Ok(uptime) => {
let response = format!("OK\nUptime: {} seconds\n", uptime.as_secs());
(StatusCode::OK, response)
}
Err(e) => {
tracing::error!("Failed to get uptime: {}", e);
(
StatusCode::INTERNAL_SERVER_ERROR,
"Failed to get uptime".to_string(),
)
}
}
}
/// Metrics handler with DistributedRuntime uptime
async fn metrics_handler(state: Arc<HttpServerState>) -> impl IntoResponse {
// Update the uptime gauge with current value
let uptime_seconds = state.drt.uptime().as_secs_f64();
state.runtime_metrics.update_uptime(uptime_seconds);
// Gather metrics from the registry
let metric_families = state.registry.gather();
state.update_uptime_gauge();
let encoder = TextEncoder::new();
let mut buffer = Vec::new();
match encoder.encode(&metric_families, &mut buffer) {
Ok(()) => match String::from_utf8(buffer) {
Ok(response) => (StatusCode::OK, response),
Err(e) => {
tracing::error!("Failed to encode metrics as UTF-8: {}", e);
(
StatusCode::INTERNAL_SERVER_ERROR,
"Failed to encode metrics as UTF-8".to_string(),
)
}
},
// Get metrics from the registry
match state.drt().prometheus_metrics_fmt() {
Ok(response) => (StatusCode::OK, response),
Err(e) => {
tracing::error!("Failed to encode metrics: {}", e);
tracing::error!("Failed to get metrics from registry: {}", e);
(
StatusCode::INTERNAL_SERVER_ERROR,
"Failed to encode metrics".to_string(),
"Failed to get metrics".to_string(),
)
}
}
}
// Regular tests: cargo test http_server --lib
// Integration tests: cargo test http_server --lib --features integration
#[cfg(test)]
/// Test helper: build a `DistributedRuntime` on the current runtime using the
/// constructor that skips discovery. Panics if either step fails.
async fn create_test_drt_async() -> crate::DistributedRuntime {
    let runtime = crate::Runtime::from_current().unwrap();
    let drt = crate::DistributedRuntime::from_settings_without_discovery(runtime).await;
    drt.unwrap()
}
#[cfg(test)]
mod tests {
use super::*;
use crate::metrics::MetricsRegistry;
use std::sync::Arc;
use tokio::time::{sleep, Duration};
#[tokio::test]
......@@ -220,68 +258,70 @@ mod tests {
);
}
#[cfg(feature = "integration")]
#[tokio::test]
async fn test_runtime_metrics_creation() {
// Test RuntimeMetrics creation and functionality
let registry = Arc::new(Registry::new());
let runtime_metrics = RuntimeMetrics::new(&registry).unwrap();
// Wait a bit to ensure uptime is measurable
tokio::time::sleep(Duration::from_millis(10)).await;
async fn test_runtime_metrics_initialization_and_namespace() {
// Test that metrics have correct namespace
let drt = create_test_drt_async().await;
let runtime_metrics = HttpServerState::new(Arc::new(drt)).unwrap();
// Test updating uptime
let uptime_seconds = 123.456;
runtime_metrics.update_uptime(uptime_seconds);
// Initialize start time
runtime_metrics.initialize_start_time().unwrap();
// Gather metrics from the registry
let metric_families = registry.gather();
runtime_metrics.uptime_gauge.set(42.0);
let encoder = TextEncoder::new();
let mut buffer = Vec::new();
encoder.encode(&metric_families, &mut buffer).unwrap();
let response = runtime_metrics.drt().prometheus_metrics_fmt().unwrap();
println!("Full metrics response:\n{}", response);
let response = String::from_utf8(buffer).unwrap();
assert!(response.contains("dynamo_runtime_uptime_seconds"));
assert!(response.contains("123.456"));
let expected = "\
# HELP uptime_seconds Total uptime of the DistributedRuntime in seconds
# TYPE uptime_seconds gauge
uptime_seconds{namespace=\"http_server\"} 42
";
assert_eq!(response, expected);
}
#[cfg(feature = "integration")]
#[tokio::test]
async fn test_runtime_metrics_namespace() {
// Test that metrics have correct namespace
let registry = Arc::new(Registry::new());
let runtime_metrics = RuntimeMetrics::new(&registry).unwrap();
async fn test_start_time_initialization() {
// Test that start time can only be initialized once
let drt = create_test_drt_async().await;
let runtime_metrics = HttpServerState::new(Arc::new(drt)).unwrap();
runtime_metrics.update_uptime(42.0);
// First initialization should succeed
assert!(runtime_metrics.initialize_start_time().is_ok());
let metric_families = registry.gather();
let encoder = TextEncoder::new();
let mut buffer = Vec::new();
encoder.encode(&metric_families, &mut buffer).unwrap();
// Second initialization should fail
assert!(runtime_metrics.initialize_start_time().is_err());
let response = String::from_utf8(buffer).unwrap();
// Check for the full metric name with namespace and subsystem
assert!(response.contains("dynamo_runtime_uptime_seconds"));
assert!(response.contains("Total uptime of the DistributedRuntime in seconds"));
// Uptime should work after initialization
let _uptime = runtime_metrics.uptime().unwrap();
// If we get here, uptime calculation works correctly
}
/*
#[cfg(feature = "integration")]
#[tokio::test]
async fn test_uptime_without_initialization() {
// Test that uptime returns an error if start time is not initialized
let drt = create_test_drt_async().await;
let runtime_metrics = HttpServerState::new(Arc::new(drt)).unwrap();
// This should return an error because start time is not initialized
let result = runtime_metrics.uptime();
assert!(result.is_err());
assert_eq!(result.unwrap_err(), "Start time not initialized");
}
#[cfg(feature = "integration")]
#[tokio::test]
async fn test_spawn_http_server_endpoints() {
use std::sync::Arc;
use tokio::time::sleep;
use tokio_util::sync::CancellationToken;
// use tokio::io::{AsyncReadExt, AsyncWriteExt};
// use reqwest for HTTP requests
let runtime = crate::Runtime::from_settings().unwrap();
let drt = Arc::new(
crate::DistributedRuntime::from_settings_without_discovery(runtime)
.await
.unwrap(),
);
let cancel_token = CancellationToken::new();
let (addr, server_handle) = spawn_http_server("127.0.0.1", 0, cancel_token.clone(), drt)
.await
.unwrap();
let drt = create_test_drt_async().await;
let (addr, server_handle) =
spawn_http_server("127.0.0.1", 0, cancel_token.clone(), Arc::new(drt))
.await
.unwrap();
println!("[test] Waiting for server to start...");
sleep(std::time::Duration::from_millis(1000)).await;
println!("[test] Server should be up, starting requests...");
......@@ -324,5 +364,36 @@ mod tests {
}
}
}
*/
#[cfg(feature = "integration")]
#[tokio::test]
async fn test_http_server_basic_functionality() {
    // Serve/shutdown lifecycle check against a bare router, so no etcd is required.
    let shutdown = CancellationToken::new();
    let server_shutdown = shutdown.clone();

    let router = Router::new().route("/test", get(|| async { (StatusCode::OK, "test") }));

    // Run the server in the background until the token is cancelled.
    let server = tokio::spawn(async move {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let serving =
            axum::serve(listener, router).with_graceful_shutdown(server_shutdown.cancelled_owned());
        let _ = serving.await;
    });

    // Give the server a moment to come up before asking it to stop.
    sleep(Duration::from_millis(100)).await;
    shutdown.cancel();

    // The background task must finish within the timeout once cancelled.
    let joined = tokio::time::timeout(Duration::from_secs(5), server).await;
    assert!(
        joined.is_ok(),
        "HTTP server should shut down when cancel token is cancelled"
    );
}
}
......@@ -38,6 +38,7 @@ pub mod discovery;
pub mod engine;
pub mod http_server;
pub mod logging;
pub mod metrics;
pub mod pipeline;
pub mod prelude;
pub mod protocols;
......@@ -99,6 +100,6 @@ pub struct DistributedRuntime {
instance_sources: Arc<Mutex<HashMap<Endpoint, Weak<InstanceSource>>>>,
// Start time for tracking uptime
start_time: std::time::Instant,
// This map associates metric prefixes with their corresponding Prometheus registries.
prometheus_registries_by_prefix: Arc<std::sync::Mutex<HashMap<String, prometheus::Registry>>>,
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Metric Registry Framework for Dynamo.
//!
//! This module provides registry classes for Prometheus metrics
//! that automatically populate the labels with the namespace-component-endpoint hierarchy.
use std::any::Any;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
// This constant determines whether metric names should include the full hierarchy as a prefix.
// If set to true, a hierarchy like ["", "mynamespace", "mycomponent", "myendpoint"]
// results in a metric name of "mynamespace_mycomponent_myendpoint__myendpoint".
// If false, the metric name will be just "myendpoint".
// This setting is applied *universally* to ensure consistent naming conventions.
pub const USE_PREFIXED_METRIC_NAMES: bool = false;
// If set to true, then metrics will be labeled with the namespace, component, and endpoint.
pub const USE_AUTO_LABELS: bool = true;
// Prometheus imports
use prometheus::Encoder;
/// Build the final metric name from a hierarchy prefix and a bare metric name.
///
/// The prefix is applied only when `USE_PREFIXED_METRIC_NAMES` is enabled and the
/// prefix is non-empty; otherwise the bare metric name is returned unchanged.
fn build_metric_name(prefix: &str, metric_name: &str) -> String {
    let wants_prefix = USE_PREFIXED_METRIC_NAMES && !prefix.is_empty();
    if wants_prefix {
        // Double underscore to separate between prefix and actual metric name
        format!("{}__{}", prefix, metric_name)
    } else {
        metric_name.to_string()
    }
}
/// Trait that defines common behavior for Prometheus metric types
pub trait PrometheusMetric: prometheus::core::Collector + Clone + Send + Sync + 'static {
/// Create a new metric with the given options
fn with_opts(opts: prometheus::Opts) -> Result<Self, prometheus::Error>
where
Self: Sized;
/// Create a new metric with histogram options and custom buckets
/// This is a default implementation that will panic for non-histogram metrics
fn with_histogram_opts_and_buckets(
_opts: prometheus::HistogramOpts,
_buckets: Option<Vec<f64>>,
) -> Result<Self, prometheus::Error>
where
Self: Sized,
{
panic!("with_histogram_opts_and_buckets is not implemented for this metric type");
}
/// Create a new metric with counter options and label names (for CounterVec)
/// This is a default implementation that will panic for non-countervec metrics
fn with_opts_and_label_names(
_opts: prometheus::Opts,
_label_names: &[&str],
) -> Result<Self, prometheus::Error>
where
Self: Sized,
{
panic!("with_opts_and_label_names is not implemented for this metric type");
}
}
// Implement the trait for Counter, IntCounter, and Gauge
impl PrometheusMetric for prometheus::Counter {
fn with_opts(opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
prometheus::Counter::with_opts(opts)
}
}
impl PrometheusMetric for prometheus::IntCounter {
fn with_opts(opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
prometheus::IntCounter::with_opts(opts)
}
}
impl PrometheusMetric for prometheus::Gauge {
fn with_opts(opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
prometheus::Gauge::with_opts(opts)
}
}
impl PrometheusMetric for prometheus::IntGauge {
fn with_opts(opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
prometheus::IntGauge::with_opts(opts)
}
}
impl PrometheusMetric for prometheus::IntGaugeVec {
fn with_opts(_opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
Err(prometheus::Error::Msg(
"IntGaugeVec requires label names, use with_opts_and_label_names instead".to_string(),
))
}
fn with_opts_and_label_names(
opts: prometheus::Opts,
label_names: &[&str],
) -> Result<Self, prometheus::Error> {
prometheus::IntGaugeVec::new(opts, label_names)
}
}
// Implement the trait for Histogram
impl PrometheusMetric for prometheus::Histogram {
    /// Creates a histogram with the default buckets from plain `Opts`.
    ///
    /// The whole `Opts` value is carried over into the `HistogramOpts` (via `From<Opts>`),
    /// so const labels, namespace, subsystem and variable labels set on `opts` are kept.
    /// Rebuilding the options from only `name` and `help` would silently drop them.
    fn with_opts(opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
        prometheus::Histogram::with_opts(prometheus::HistogramOpts::from(opts))
    }

    /// Creates a histogram from `HistogramOpts`, replacing its buckets with `buckets`
    /// when custom buckets are supplied and keeping the ones already in `opts` otherwise.
    fn with_histogram_opts_and_buckets(
        opts: prometheus::HistogramOpts,
        buckets: Option<Vec<f64>>,
    ) -> Result<Self, prometheus::Error> {
        let opts = match buckets {
            Some(custom_buckets) => opts.buckets(custom_buckets),
            None => opts,
        };
        prometheus::Histogram::with_opts(opts)
    }
}
// Implement the trait for CounterVec
impl PrometheusMetric for prometheus::CounterVec {
fn with_opts(_opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
// This will panic - CounterVec needs label names
panic!("CounterVec requires label names, use with_opts_and_label_names instead");
}
fn with_opts_and_label_names(
opts: prometheus::Opts,
label_names: &[&str],
) -> Result<Self, prometheus::Error> {
prometheus::CounterVec::new(opts, label_names)
}
}
/// Private helper function to create metrics - not accessible to trait implementors.
///
/// Builds a metric of type `T`, attaches const labels to it, and registers it in the
/// Prometheus registry of every level of `registry`'s hierarchy (the registry itself and
/// all of its ancestors), so that each level's output includes the metrics of its children.
///
/// # Parameters
/// - `registry`: the registry the metric is created on; supplies the hierarchy and prefix.
/// - `metric_name` / `metric_desc`: metric name (before optional prefixing) and help text.
/// - `labels`: user-supplied const labels as `(key, value)` pairs.
/// - `buckets`: custom histogram buckets; only valid for `prometheus::Histogram`.
/// - `const_labels`: despite the name, these are the *variable label names* passed to
///   `with_opts_and_label_names`; required for `CounterVec` / `IntGaugeVec`, invalid otherwise.
///
/// # Errors
/// Returns an error when `labels` contains a duplicate key, when a label key collides with an
/// auto-generated label (`namespace`, `component`, `endpoint`) while `USE_AUTO_LABELS` is on,
/// when `buckets` / `const_labels` are supplied for a metric type that does not accept them
/// (or `const_labels` is missing for a vector type), when the underlying Prometheus
/// constructor fails, or when the registry map mutex is poisoned.
fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
    registry: &R,
    metric_name: &str,
    metric_desc: &str,
    labels: &[(&str, &str)],
    buckets: Option<Vec<f64>>,
    const_labels: Option<&[&str]>,
) -> anyhow::Result<Arc<T>> {
    use std::any::TypeId;

    // Auto-generated label names, in hierarchy order: hierarchy[1], hierarchy[2], hierarchy[3].
    // hierarchy[0] is the DistributedRuntime level and never produces a label.
    const AUTO_LABEL_NAMES: [&str; 3] = ["namespace", "component", "endpoint"];

    let metric_name = build_metric_name(&registry.prefix(), metric_name);

    // Base names from the root of the hierarchy down to (and including) this registry.
    let mut hierarchy = registry.parent_hierarchy();
    hierarchy.push(registry.basename());

    // Validate that user-provided labels don't have duplicate keys
    let mut seen_keys = std::collections::HashSet::new();
    for (key, _) in labels {
        if !seen_keys.insert(*key) {
            anyhow::bail!("Duplicate label key '{}' found in labels", key);
        }
    }

    // Build updated_labels: auto-labels first, then user labels
    let mut updated_labels: Vec<(String, String)> = Vec::new();
    if USE_AUTO_LABELS {
        // Validate that user-provided labels don't conflict with auto-generated labels
        for (key, _) in labels {
            if AUTO_LABEL_NAMES.contains(key) {
                anyhow::bail!(
                    "Label '{}' is automatically added by auto_label feature and cannot be manually set",
                    key
                );
            }
        }
        // Pair each auto-label name with its hierarchy level; levels with an empty name
        // (or levels that do not exist for this registry) contribute no label.
        for (label, value) in AUTO_LABEL_NAMES.iter().zip(hierarchy.iter().skip(1)) {
            if !value.is_empty() {
                updated_labels.push(((*label).to_string(), value.clone()));
            }
        }
    }
    updated_labels.extend(
        labels
            .iter()
            .map(|(key, value)| ((*key).to_string(), (*value).to_string())),
    );

    // Options shared by every metric type: name, help text and all const labels.
    let build_opts = || {
        updated_labels.iter().fold(
            prometheus::Opts::new(metric_name.as_str(), metric_desc),
            |opts, (key, value)| opts.const_label(key.clone(), value.clone()),
        )
    };

    // Dispatch on the concrete metric type, rejecting parameters that do not apply to it.
    let metric_type = TypeId::of::<T>();
    let is_countervec = metric_type == TypeId::of::<prometheus::CounterVec>();
    let is_intgaugevec = metric_type == TypeId::of::<prometheus::IntGaugeVec>();

    let metric = if metric_type == TypeId::of::<prometheus::Histogram>() {
        // buckets parameter is valid for Histogram, const_labels is not used
        if const_labels.is_some() {
            anyhow::bail!("const_labels parameter is not valid for Histogram");
        }
        T::with_histogram_opts_and_buckets(prometheus::HistogramOpts::from(build_opts()), buckets)?
    } else if is_countervec || is_intgaugevec {
        // Vector metrics require label names and never take buckets.
        let type_name = if is_countervec {
            "CounterVec"
        } else {
            "IntGaugeVec"
        };
        if buckets.is_some() {
            anyhow::bail!("buckets parameter is not valid for {}", type_name);
        }
        let label_names = const_labels
            .ok_or_else(|| anyhow::anyhow!("{} requires const_labels parameter", type_name))?;
        T::with_opts_and_label_names(build_opts(), label_names)?
    } else {
        // Standard handling for Counter, IntCounter, Gauge, IntGauge
        if buckets.is_some() {
            anyhow::bail!(
                "buckets parameter is not valid for Counter, IntCounter, Gauge, or IntGauge"
            );
        }
        if const_labels.is_some() {
            anyhow::bail!(
                "const_labels parameter is not valid for Counter, IntCounter, Gauge, or IntGauge"
            );
        }
        T::with_opts(build_opts())?
    };

    // A poisoned mutex is reported as an error instead of panicking the caller.
    let mut registries_by_prefix = registry
        .drt()
        .prometheus_registries_by_prefix
        .lock()
        .map_err(|_| anyhow::anyhow!("prometheus registry map mutex is poisoned"))?;

    // Register the metric at every hierarchical level. The registry key accumulates the
    // non-empty base names joined by '_', e.g. for hierarchy ["", "mynamespace", "mycomponent"]:
    //   "" -> "mynamespace" -> "mynamespace_mycomponent"
    // This prefixing differentiates the registries of children and grandchildren.
    let mut current_prefix = String::new();
    for name in &hierarchy {
        if !current_prefix.is_empty() && !name.is_empty() {
            current_prefix.push('_');
        }
        current_prefix.push_str(name);

        let collector: Box<dyn prometheus::core::Collector> = Box::new(metric.clone());
        // Registration is best-effort: a failure at one level (e.g. the collector is already
        // registered there) is ignored and the metric is still returned to the caller.
        // NOTE(review): confirm that silently ignoring registration errors is intended.
        let _ = registries_by_prefix
            .entry(current_prefix.clone())
            .or_default()
            .register(collector);
    }

    Ok(Arc::new(metric))
}
/// Unified interface for creating metrics on a level of the registry hierarchy and for
/// rendering that level's metrics in Prometheus text format.
///
/// Implementors only describe their position in the hierarchy (`basename` and
/// `parent_hierarchy`) and give access to the distributed runtime through the
/// `DistributedRuntimeProvider` supertrait; the `create_*` factory methods and
/// `prometheus_metrics_fmt` are provided.
pub trait MetricsRegistry: Send + Sync + crate::traits::DistributedRuntimeProvider {
    /// Get the name of this registry (without any prefix).
    fn basename(&self) -> String;

    /// Get the full prefix of this registry: the non-empty base names of all ancestors and of
    /// this registry, joined with `_`.
    ///
    /// Empty names (such as the unnamed DistributedRuntime root) are skipped rather than
    /// joined and trimmed afterwards. This keeps the prefix identical to the registry key
    /// under which `create_metric` stores metrics for this level, including when a name is
    /// empty or itself starts with an underscore, so `prometheus_metrics_fmt` always reads
    /// the registry that was written to.
    fn prefix(&self) -> String {
        let mut segments = self.parent_hierarchy();
        segments.push(self.basename());
        segments.retain(|segment| !segment.is_empty());
        segments.join("_")
    }

    /// Get the parent hierarchy for this registry (just the base names, NOT the prefix).
    fn parent_hierarchy(&self) -> Vec<String>;

    // TODO: Add support for additional Prometheus metric types:
    // - Counter: ✅ IMPLEMENTED - create_counter()
    // - CounterVec: ✅ IMPLEMENTED - create_countervec()
    // - IntCounter: ✅ IMPLEMENTED - create_intcounter()
    // - Gauge: ✅ IMPLEMENTED - create_gauge()
    // - IntGauge/IntGaugeVec: ✅ IMPLEMENTED - create_intgauge() and create_intgaugevec()
    // - Histogram: ✅ IMPLEMENTED - create_histogram()
    // - Summary: create_summary() - for quantiles and sum/count metrics
    // - HistogramVec with custom buckets: create_histogram_with_buckets()
    // - SummaryVec: create_summary_vec() - for labeled summaries
    // - Untyped: create_untyped() - for untyped metrics
    // - Info: create_info() - for info metrics with labels
    // - Stateset: create_stateset() - for state-based metrics
    // - GaugeHistogram: create_gauge_histogram() - for gauge histograms

    /// Create a Counter metric with the given const `labels`.
    fn create_counter(
        &self,
        name: &str,
        description: &str,
        labels: &[(&str, &str)],
    ) -> anyhow::Result<Arc<prometheus::Counter>> {
        create_metric(self, name, description, labels, None, None)
    }

    /// Create a Gauge metric with the given const `labels`.
    fn create_gauge(
        &self,
        name: &str,
        description: &str,
        labels: &[(&str, &str)],
    ) -> anyhow::Result<Arc<prometheus::Gauge>> {
        create_metric(self, name, description, labels, None, None)
    }

    /// Create an IntCounter metric with the given const `labels`.
    fn create_intcounter(
        &self,
        name: &str,
        description: &str,
        labels: &[(&str, &str)],
    ) -> anyhow::Result<Arc<prometheus::IntCounter>> {
        create_metric(self, name, description, labels, None, None)
    }

    /// Create a Histogram metric, optionally with custom `buckets`.
    fn create_histogram(
        &self,
        name: &str,
        description: &str,
        labels: &[(&str, &str)],
        buckets: Option<Vec<f64>>,
    ) -> anyhow::Result<Arc<prometheus::Histogram>> {
        create_metric(self, name, description, labels, buckets, None)
    }

    /// Create a CounterVec metric with label names (for dynamic labels).
    ///
    /// Note the parameter naming: `const_labels` holds the *variable label names* whose
    /// values are supplied per observation, while `const_label_values` holds the constant
    /// `(key, value)` label pairs attached to every sample.
    fn create_countervec(
        &self,
        name: &str,
        description: &str,
        const_labels: &[&str],
        const_label_values: &[(&str, &str)],
    ) -> anyhow::Result<Arc<prometheus::CounterVec>> {
        create_metric(
            self,
            name,
            description,
            const_label_values,
            None,
            Some(const_labels),
        )
    }

    /// Create an IntGauge metric with the given const `labels`.
    fn create_intgauge(
        &self,
        name: &str,
        description: &str,
        labels: &[(&str, &str)],
    ) -> anyhow::Result<Arc<prometheus::IntGauge>> {
        create_metric(self, name, description, labels, None, None)
    }

    /// Create an IntGaugeVec metric with label names (for dynamic labels).
    ///
    /// Note the parameter naming: `const_labels` holds the *variable label names* whose
    /// values are supplied per observation, while `const_label_values` holds the constant
    /// `(key, value)` label pairs attached to every sample.
    fn create_intgaugevec(
        &self,
        name: &str,
        description: &str,
        const_labels: &[&str],
        const_label_values: &[(&str, &str)],
    ) -> anyhow::Result<Arc<prometheus::IntGaugeVec>> {
        create_metric(
            self,
            name,
            description,
            const_label_values,
            None,
            Some(const_labels),
        )
    }

    /// Get the metrics registered at this level of the hierarchy in Prometheus text format.
    ///
    /// # Errors
    /// Returns an error when the registry map mutex is poisoned, when encoding fails, or
    /// when the encoded output is not valid UTF-8.
    fn prometheus_metrics_fmt(&self) -> anyhow::Result<String> {
        let registry_key = self.prefix();
        let prometheus_registry = {
            // Hold the lock only long enough to clone the registry handle for this level;
            // an entry is created on demand so an unused level renders as empty output.
            let mut registries_by_prefix = self
                .drt()
                .prometheus_registries_by_prefix
                .lock()
                .map_err(|_| anyhow::anyhow!("prometheus registry map mutex is poisoned"))?;
            registries_by_prefix.entry(registry_key).or_default().clone()
        };
        let metric_families = prometheus_registry.gather();
        let encoder = prometheus::TextEncoder::new();
        let mut buffer = Vec::new();
        encoder.encode(&metric_families, &mut buffer)?;
        Ok(String::from_utf8(buffer)?)
    }
}
#[cfg(test)]
/// Helper function to create a DRT instance for testing
/// Uses the test-friendly constructor without discovery
///
/// A single-threaded `crate::Runtime` is created first and a clone of it is handed to
/// `DistributedRuntime::from_settings_without_discovery`. Because that constructor is async
/// and this helper is synchronous, a separate temporary Tokio runtime is created solely to
/// block on it.
///
/// # Panics
/// Panics if either runtime cannot be created or if constructing the DRT fails.
pub fn create_test_drt() -> crate::DistributedRuntime {
let rt = crate::Runtime::single_threaded().unwrap();
// Temporary Tokio runtime used only to drive the async constructor to completion.
tokio::runtime::Runtime::new().unwrap().block_on(async {
crate::DistributedRuntime::from_settings_without_discovery(rt.clone())
.await
.unwrap()
})
}
#[cfg(feature = "integration")]
#[cfg(test)]
mod test_prefixes {
    use super::create_test_drt;
    use super::*;
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Prints the basename, parent hierarchy and prefix of a registry under a titled banner.
    macro_rules! dump_registry {
        ($title:expr, $registry:expr) => {{
            println!("\n=== {} ===", $title);
            println!("basename: '{}'", $registry.basename());
            println!("parent_hierarchy: {:?}", $registry.parent_hierarchy());
            println!("prefix: '{}'", $registry.prefix());
        }};
    }

    /// Checks basename, parent hierarchy and prefix at every level of a
    /// DRT -> namespace -> component -> endpoint chain.
    #[test]
    fn test_hierarchical_prefixes_and_parent_hierarchies() {
        println!("=== Testing Names, Prefixes, and Parent Hierarchies ===");

        let drt = create_test_drt();

        // Namespace name with a numeric suffix derived by hashing a fixed string.
        let namespace_name = {
            let mut hasher = DefaultHasher::new();
            "test_namespace".hash(&mut hasher);
            format!("mynamespace{}", hasher.finish())
        };

        // Build the namespace -> component -> endpoint chain under the DRT.
        let ns = drt.namespace(&namespace_name).unwrap();
        let comp = ns.component("mycomponent").unwrap();
        let ep = comp.endpoint("myendpoint");

        // --- DistributedRuntime: the unnamed root of the hierarchy ---
        dump_registry!("DistributedRuntime", drt);
        assert_eq!(drt.basename(), "", "DRT basename should be empty");
        assert_eq!(
            drt.parent_hierarchy(),
            Vec::<String>::new(),
            "DRT parent hierarchy should be empty"
        );
        assert_eq!(drt.prefix(), "", "DRT prefix should be empty");

        // --- Namespace ---
        dump_registry!("Namespace", ns);
        assert_eq!(
            ns.basename(),
            namespace_name,
            "Namespace basename should match the generated name"
        );
        assert_eq!(
            ns.parent_hierarchy(),
            vec![""],
            "Namespace parent hierarchy should be [\"\"]"
        );
        assert_eq!(
            ns.prefix(),
            namespace_name,
            "Namespace prefix should match the generated name, because drt's prefix is empty"
        );

        // --- Component ---
        dump_registry!("Component", comp);
        assert_eq!(
            comp.basename(),
            "mycomponent",
            "Component basename should be 'mycomponent'"
        );
        assert_eq!(
            comp.parent_hierarchy(),
            vec!["", &namespace_name],
            "Component parent hierarchy should contain the generated namespace name"
        );
        assert_eq!(
            comp.prefix(),
            format!("{}_mycomponent", ns),
            "Component prefix should be 'namespace_mycomponent'"
        );

        // --- Endpoint ---
        dump_registry!("Endpoint", ep);
        assert_eq!(
            ep.basename(),
            "myendpoint",
            "Endpoint basename should be 'myendpoint'"
        );
        assert_eq!(
            ep.parent_hierarchy(),
            vec!["", &namespace_name, "mycomponent"],
            "Endpoint parent hierarchy should contain the generated namespace name"
        );
        assert_eq!(
            ep.prefix(),
            format!("{}_mycomponent_myendpoint", ns),
            "Endpoint prefix should be 'namespace_mycomponent_myendpoint'"
        );

        // Each level must list its direct parent's basename among its ancestors.
        println!("\n=== Hierarchy Relationships ===");
        assert!(
            ns.parent_hierarchy().contains(&drt.basename()),
            "Namespace should have DRT prefix in parent hierarchy"
        );
        assert!(
            comp.parent_hierarchy().contains(&ns.basename()),
            "Component should have Namespace prefix in parent hierarchy"
        );
        assert!(
            ep.parent_hierarchy().contains(&comp.basename()),
            "Endpoint should have Component prefix in parent hierarchy"
        );
        println!("✓ All parent-child relationships verified");

        // The number of ancestors grows by one per level.
        println!("\n=== Hierarchy Depth ===");
        let depth_expectations = [
            (
                drt.parent_hierarchy().len(),
                0,
                "DRT should have 0 parent hierarchy levels",
            ),
            (
                ns.parent_hierarchy().len(),
                1,
                "Namespace should have 1 parent hierarchy level",
            ),
            (
                comp.parent_hierarchy().len(),
                2,
                "Component should have 2 parent hierarchy levels",
            ),
            (
                ep.parent_hierarchy().len(),
                3,
                "Endpoint should have 3 parent hierarchy levels",
            ),
        ];
        for &(actual, expected, message) in &depth_expectations {
            assert_eq!(actual, expected, "{}", message);
        }
        println!("✓ All hierarchy depths verified");

        println!("\n=== Summary ===");
        println!("DRT prefix: '{}'", drt.prefix());
        println!("Namespace prefix: '{}'", ns.prefix());
        println!("Component prefix: '{}'", comp.prefix());
        println!("Endpoint prefix: '{}'", ep.prefix());
        println!("All hierarchy assertions passed!");
    }
}
// End-to-end test of the MetricsRegistry factory methods: metrics are created at the endpoint,
// component, namespace and DRT levels, and after each step the Prometheus text output of that
// level is compared against an exact expected snapshot. The order of the statements matters,
// because each snapshot reflects only the metrics created up to that point.
#[cfg(feature = "integration")]
#[cfg(test)]
mod test_simple_metricsregistry_trait {
use super::create_test_drt;
use super::*;
use prometheus::Counter;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::sync::Arc;
#[test]
fn test_factory_methods_via_registry_trait() {
// Setup real DRT and registry using the test-friendly constructor
let drt = create_test_drt();
// Build a namespace name with a numeric suffix derived by hashing a fixed string
// (the suffix is computed from a constant input; it is not actually random).
let mut hasher = DefaultHasher::new();
"test_factory_namespace".hash(&mut hasher);
let random_num = hasher.finish();
let namespace_name = format!("mynamespace{}", random_num);
let namespace = drt.namespace(&namespace_name).unwrap();
let component = namespace.component("mycomponent").unwrap();
let endpoint = component.endpoint("myendpoint");
// Test Counter creation (at the endpoint level)
let counter = endpoint
.create_counter("mycounter", "A test counter", &[])
.unwrap();
counter.inc_by(123.456789);
let epsilon = 0.01;
assert!((counter.get() - 123.456789).abs() < epsilon);
// The endpoint output contains only the counter, carrying all three auto-generated labels.
let endpoint_output = endpoint.prometheus_metrics_fmt().unwrap();
println!("Endpoint output:");
println!("{}", endpoint_output);
let expected_endpoint_output = format!(
r#"# HELP mycounter A test counter
# TYPE mycounter counter
mycounter{{component="mycomponent",endpoint="myendpoint",namespace="{}"}} 123.456789
"#,
namespace_name
);
assert_eq!(
endpoint_output, expected_endpoint_output,
"\n=== ENDPOINT COMPARISON FAILED ===\n\
Expected:\n{}\n\
Actual:\n{}\n\
==============================",
expected_endpoint_output, endpoint_output
);
// Test Gauge creation (at the component level)
let gauge = component
.create_gauge("mygauge", "A test gauge", &[])
.unwrap();
gauge.set(50000.0);
assert_eq!(gauge.get(), 50000.0);
// Test Prometheus format output for Component: its own gauge plus the counter created
// on the child endpoint.
let component_output = component.prometheus_metrics_fmt().unwrap();
println!("Component output:");
println!("{}", component_output);
let expected_component_output = format!(
r#"# HELP mycounter A test counter
# TYPE mycounter counter
mycounter{{component="mycomponent",endpoint="myendpoint",namespace="{}"}} 123.456789
# HELP mygauge A test gauge
# TYPE mygauge gauge
mygauge{{component="mycomponent",namespace="{}"}} 50000
"#,
namespace_name, namespace_name
);
assert_eq!(
component_output, expected_component_output,
"\n=== COMPONENT COMPARISON FAILED ===\n\
Expected:\n{}\n\
Actual:\n{}\n\
==============================",
expected_component_output, component_output
);
// Test IntCounter creation (at the namespace level)
let intcounter = namespace
.create_intcounter("myintcounter", "A test int counter", &[])
.unwrap();
intcounter.inc_by(12345);
assert_eq!(intcounter.get(), 12345);
// Test Prometheus format output for Namespace: its own int counter plus the gauge and
// counter created on its descendants.
let namespace_output = namespace.prometheus_metrics_fmt().unwrap();
println!("Namespace output:");
println!("{}", namespace_output);
let expected_namespace_output = format!(
r#"# HELP mycounter A test counter
# TYPE mycounter counter
mycounter{{component="mycomponent",endpoint="myendpoint",namespace="{}"}} 123.456789
# HELP mygauge A test gauge
# TYPE mygauge gauge
mygauge{{component="mycomponent",namespace="{}"}} 50000
# HELP myintcounter A test int counter
# TYPE myintcounter counter
myintcounter{{namespace="{}"}} 12345
"#,
namespace_name, namespace_name, namespace_name
);
assert_eq!(
namespace_output, expected_namespace_output,
"\n=== NAMESPACE COMPARISON FAILED ===\n\
Expected:\n{}\n\
Actual:\n{}\n\
==============================",
expected_namespace_output, namespace_output
);
// Create a histogram with specified buckets. The Prometheus format output will
// lack labels since the DistributedRuntime is unnamed.
let histogram = drt
.create_histogram(
"myhistogram",
"A test histogram",
&[],
Some(vec![1.0, 2.5, 5.0, 10.0]),
)
.unwrap();
histogram.observe(1.5);
histogram.observe(2.5);
histogram.observe(3.5);
// Test CounterVec creation: "method" and "status" are variable labels, "service" is a
// const label attached to every sample.
let countervec = drt
.create_countervec(
"mycountervec",
"A test counter vector",
&["method", "status"],
&[("service", "api")],
)
.unwrap();
countervec.with_label_values(&["GET", "200"]).inc_by(10.0);
countervec.with_label_values(&["POST", "201"]).inc_by(5.0);
// Test IntGauge creation
let intgauge = drt
.create_intgauge("myintgauge", "A test int gauge", &[])
.unwrap();
intgauge.set(42);
assert_eq!(intgauge.get(), 42);
// Test IntGaugeVec creation: "instance" and "status" are variable labels, "service" is a
// const label attached to every sample.
let intgaugevec = drt
.create_intgaugevec(
"myintgaugevec",
"A test int gauge vector",
&["instance", "status"],
&[("service", "api")],
)
.unwrap();
intgaugevec
.with_label_values(&["server1", "active"])
.set(10);
intgaugevec
.with_label_values(&["server2", "inactive"])
.set(0);
// Test Prometheus format output for DRT (which should contain everything)
let drt_output = drt.prometheus_metrics_fmt().unwrap();
println!("DRT output:");
println!("{}", drt_output);
let expected_drt_output = format!(
r#"# HELP mycounter A test counter
# TYPE mycounter counter
mycounter{{component="mycomponent",endpoint="myendpoint",namespace="{}"}} 123.456789
# HELP mycountervec A test counter vector
# TYPE mycountervec counter
mycountervec{{method="GET",service="api",status="200"}} 10
mycountervec{{method="POST",service="api",status="201"}} 5
# HELP mygauge A test gauge
# TYPE mygauge gauge
mygauge{{component="mycomponent",namespace="{}"}} 50000
# HELP myhistogram A test histogram
# TYPE myhistogram histogram
myhistogram_bucket{{le="1"}} 0
myhistogram_bucket{{le="2.5"}} 2
myhistogram_bucket{{le="5"}} 3
myhistogram_bucket{{le="10"}} 3
myhistogram_bucket{{le="+Inf"}} 3
myhistogram_sum 7.5
myhistogram_count 3
# HELP myintcounter A test int counter
# TYPE myintcounter counter
myintcounter{{namespace="{}"}} 12345
# HELP myintgauge A test int gauge
# TYPE myintgauge gauge
myintgauge 42
# HELP myintgaugevec A test int gauge vector
# TYPE myintgaugevec gauge
myintgaugevec{{instance="server1",service="api",status="active"}} 10
myintgaugevec{{instance="server2",service="api",status="inactive"}} 0
"#,
namespace_name, namespace_name, namespace_name
);
assert_eq!(
drt_output, expected_drt_output,
"\n=== DRT COMPARISON FAILED ===\n\
Expected:\n{}\n\
Actual:\n{}\n\
==============================",
expected_drt_output, drt_output
);
println!("✓ All Prometheus format outputs verified successfully!");
}
}
......@@ -31,3 +31,14 @@ impl RuntimeProvider for DistributedRuntime {
&self.runtime
}
}
// Why DistributedRuntime provides itself:
// - `MetricsRegistry` is declared as `MetricsRegistry: Send + Sync + DistributedRuntimeProvider`.
// - DistributedRuntime implements MetricsRegistry (in distributed.rs), so it has to satisfy
//   that supertrait bound as well.
// - Returning itself lets DistributedRuntime act both as the provider and as a metrics registry.
impl DistributedRuntimeProvider for DistributedRuntime {
    /// Returns this runtime itself.
    fn drt(&self) -> &Self {
        self
    }
}
......@@ -616,7 +616,7 @@ mod tests {
fn test_ectd_client() {
let rt = Runtime::from_settings().unwrap();
let rt_clone = rt.clone();
let config = DistributedConfig::from_settings();
let config = DistributedConfig::from_settings(false);
rt_clone.primary().block_on(async move {
let drt = DistributedRuntime::new(rt, config).await.unwrap();
......@@ -628,8 +628,11 @@ mod tests {
let key = "__integration_test_key";
let value = b"test_value";
let client = drt.etcd_client();
let lease_id = drt.primary_lease().id();
let client = drt.etcd_client().expect("etcd client should be available");
let lease_id = drt
.primary_lease()
.expect("primary lease should be available")
.id();
// Create the key
let result = client
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment