Unverified Commit acbdabc4 authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

feat(metrics): add NATS client metrics to prometheus_metrics_fmt (#2292)


Co-authored-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent 591f4d56
...@@ -31,7 +31,6 @@ use dynamo_llm::kv_router::scheduler::KVHitRateEvent; ...@@ -31,7 +31,6 @@ use dynamo_llm::kv_router::scheduler::KVHitRateEvent;
use dynamo_llm::kv_router::KV_HIT_RATE_SUBJECT; use dynamo_llm::kv_router::KV_HIT_RATE_SUBJECT;
use dynamo_runtime::{ use dynamo_runtime::{
error, logging, error, logging,
metrics::MetricsRegistry,
traits::events::{EventPublisher, EventSubscriber}, traits::events::{EventPublisher, EventSubscriber},
utils::{Duration, Instant}, utils::{Duration, Instant},
DistributedRuntime, ErrorContext, Result, Runtime, Worker, DistributedRuntime, ErrorContext, Result, Runtime, Worker,
...@@ -137,14 +136,7 @@ async fn app(runtime: Runtime) -> Result<()> { ...@@ -137,14 +136,7 @@ async fn app(runtime: Runtime) -> Result<()> {
.await .await
.context("Unable to create unique instance of Count; possibly one already exists")?; .context("Unable to create unique instance of Count; possibly one already exists")?;
let target_component = { let target_component = namespace.component(&config.component_name)?;
let c = namespace.component(&config.component_name)?;
if let Some(ref model) = config.model_name {
c.add_labels(&[("model", model.as_str())])?
} else {
c
}
};
let target_endpoint = target_component.endpoint(&config.endpoint_name); let target_endpoint = target_component.endpoint(&config.endpoint_name);
let service_path = target_endpoint.path(); let service_path = target_endpoint.path();
......
...@@ -485,21 +485,6 @@ impl Component { ...@@ -485,21 +485,6 @@ impl Component {
Ok(()) Ok(())
}) })
} }
/// Return a copy of this component that carries extra constant metric labels.
///
/// `labels` is a list of `(key, value)` string pairs. The labels are applied to a
/// clone of the wrapped runtime component, and the returned wrapper reuses this
/// component's event loop. An error from the runtime call is converted with `to_pyerr`.
fn add_labels(&self, labels: Vec<(String, String)>) -> PyResult<Component> {
    // Anonymous import: only the trait's methods are needed, not its name.
    use rs::metrics::MetricsRegistry as _;

    // The runtime API takes borrowed string pairs, so view the owned input as `&str`s.
    let mut borrowed: Vec<(&str, &str)> = Vec::with_capacity(labels.len());
    for (key, value) in &labels {
        borrowed.push((key.as_str(), value.as_str()));
    }

    let relabeled = self.inner.clone().add_labels(&borrowed).map_err(to_pyerr)?;
    let event_loop = self.event_loop.clone();
    Ok(Component {
        inner: relabeled,
        event_loop,
    })
}
} }
#[pymethods] #[pymethods]
......
...@@ -7,7 +7,6 @@ use anyhow::Context as _; ...@@ -7,7 +7,6 @@ use anyhow::Context as _;
use tokio::sync::{mpsc::Receiver, Notify}; use tokio::sync::{mpsc::Receiver, Notify};
use dynamo_runtime::{ use dynamo_runtime::{
metrics::MetricsRegistry,
pipeline::{ pipeline::{
network::egress::push_router::PushRouter, ManyOut, Operator, RouterMode, SegmentSource, network::egress::push_router::PushRouter, ManyOut, Operator, RouterMode, SegmentSource,
ServiceBackend, SingleIn, Source, ServiceBackend, SingleIn, Source,
...@@ -170,8 +169,7 @@ impl ModelWatcher { ...@@ -170,8 +169,7 @@ impl ModelWatcher {
let component = self let component = self
.drt .drt
.namespace(&endpoint_id.namespace)? .namespace(&endpoint_id.namespace)?
.component(&endpoint_id.component) .component(&endpoint_id.component)?;
.and_then(|c| c.add_labels(&[("model", &model_entry.name)]))?;
let client = component.endpoint(&endpoint_id.name).client().await?; let client = component.endpoint(&endpoint_id.name).client().await?;
let Some(etcd_client) = self.drt.etcd_client() else { let Some(etcd_client) = self.drt.etcd_client() else {
......
...@@ -27,7 +27,6 @@ use dynamo_runtime::{ ...@@ -27,7 +27,6 @@ use dynamo_runtime::{
component::Client, component::Client,
distributed::DistributedConfig, distributed::DistributedConfig,
engine::{AsyncEngineStream, Data}, engine::{AsyncEngineStream, Data},
metrics::MetricsRegistry,
pipeline::{ pipeline::{
Context, ManyOut, Operator, PushRouter, RouterMode, SegmentSource, ServiceBackend, Context, ManyOut, Operator, PushRouter, RouterMode, SegmentSource, ServiceBackend,
ServiceEngine, ServiceFrontend, SingleIn, Source, ServiceEngine, ServiceFrontend, SingleIn, Source,
...@@ -111,8 +110,7 @@ pub async fn prepare_engine( ...@@ -111,8 +110,7 @@ pub async fn prepare_engine(
let endpoint_id = local_model.endpoint_id(); let endpoint_id = local_model.endpoint_id();
let component = distributed_runtime let component = distributed_runtime
.namespace(&endpoint_id.namespace)? .namespace(&endpoint_id.namespace)?
.component(&endpoint_id.component) .component(&endpoint_id.component)?;
.and_then(|c| c.add_labels(&[("model", card.slug().to_string().as_str())]))?;
let client = component.endpoint(&endpoint_id.name).client().await?; let client = component.endpoint(&endpoint_id.name).client().await?;
......
...@@ -17,7 +17,6 @@ use crate::{ ...@@ -17,7 +17,6 @@ use crate::{
}; };
use dynamo_runtime::engine::AsyncEngineStream; use dynamo_runtime::engine::AsyncEngineStream;
use dynamo_runtime::metrics::MetricsRegistry;
use dynamo_runtime::pipeline::{ use dynamo_runtime::pipeline::{
network::Ingress, Context, ManyOut, Operator, SegmentSource, ServiceBackend, SingleIn, Source, network::Ingress, Context, ManyOut, Operator, SegmentSource, ServiceBackend, SingleIn, Source,
}; };
...@@ -33,25 +32,9 @@ pub async fn run( ...@@ -33,25 +32,9 @@ pub async fn run(
let cancel_token = distributed_runtime.primary_token().clone(); let cancel_token = distributed_runtime.primary_token().clone();
let endpoint_id: EndpointId = path.parse()?; let endpoint_id: EndpointId = path.parse()?;
let model_name = match &engine_config {
EngineConfig::StaticFull { model, .. } | EngineConfig::StaticCore { model, .. } => {
Some(model.service_name().to_string())
}
EngineConfig::StaticRemote(model) | EngineConfig::Dynamic(model) => {
Some(model.service_name().to_string())
}
};
let component = distributed_runtime let component = distributed_runtime
.namespace(&endpoint_id.namespace)? .namespace(&endpoint_id.namespace)?
.component(&endpoint_id.component) .component(&endpoint_id.component)?;
.and_then(|c| {
if let Some(ref name) = model_name {
c.add_labels(&[("model", name.as_str())])
} else {
Ok(c)
}
})?;
let endpoint = component let endpoint = component
.service_builder() .service_builder()
.create() .create()
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
// limitations under the License. // limitations under the License.
use dynamo_runtime::{ use dynamo_runtime::{
logging, metrics::MetricsRegistry, pipeline::PushRouter, protocols::annotated::Annotated, logging, pipeline::PushRouter, protocols::annotated::Annotated, stream::StreamExt,
stream::StreamExt, DistributedRuntime, Result, Runtime, Worker, DistributedRuntime, Result, Runtime, Worker,
}; };
use hello_world::DEFAULT_NAMESPACE; use hello_world::DEFAULT_NAMESPACE;
...@@ -31,7 +31,6 @@ async fn app(runtime: Runtime) -> Result<()> { ...@@ -31,7 +31,6 @@ async fn app(runtime: Runtime) -> Result<()> {
let client = distributed let client = distributed
.namespace(DEFAULT_NAMESPACE)? .namespace(DEFAULT_NAMESPACE)?
.component("backend")? .component("backend")?
.add_labels(&[("model", "hello_world_model")])?
.endpoint("generate") .endpoint("generate")
.client() .client()
.await?; .await?;
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
use dynamo_runtime::{ use dynamo_runtime::{
logging, logging,
metrics::MetricsRegistry,
pipeline::{ pipeline::{
async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
ResponseStream, SingleIn, ResponseStream, SingleIn,
...@@ -70,7 +69,6 @@ async fn backend(runtime: DistributedRuntime) -> Result<()> { ...@@ -70,7 +69,6 @@ async fn backend(runtime: DistributedRuntime) -> Result<()> {
runtime runtime
.namespace(DEFAULT_NAMESPACE)? .namespace(DEFAULT_NAMESPACE)?
.component("backend")? .component("backend")?
.add_labels(&[("model", "hello_world_model")])?
.service_builder() .service_builder()
.create() .create()
.await? .await?
......
...@@ -17,8 +17,8 @@ use futures::StreamExt; ...@@ -17,8 +17,8 @@ use futures::StreamExt;
use service_metrics::DEFAULT_NAMESPACE; use service_metrics::DEFAULT_NAMESPACE;
use dynamo_runtime::{ use dynamo_runtime::{
logging, metrics::MetricsRegistry, pipeline::PushRouter, protocols::annotated::Annotated, logging, pipeline::PushRouter, protocols::annotated::Annotated, utils::Duration,
utils::Duration, DistributedRuntime, Result, Runtime, Worker, DistributedRuntime, Result, Runtime, Worker,
}; };
fn main() -> Result<()> { fn main() -> Result<()> {
...@@ -31,9 +31,7 @@ async fn app(runtime: Runtime) -> Result<()> { ...@@ -31,9 +31,7 @@ async fn app(runtime: Runtime) -> Result<()> {
let distributed = DistributedRuntime::from_settings(runtime.clone()).await?; let distributed = DistributedRuntime::from_settings(runtime.clone()).await?;
let namespace = distributed.namespace(DEFAULT_NAMESPACE)?; let namespace = distributed.namespace(DEFAULT_NAMESPACE)?;
let component = namespace let component = namespace.component("backend")?;
.component("backend")?
.add_labels(&[("model", "service_metrics_model")])?;
let client = component.endpoint("generate").client().await?; let client = component.endpoint("generate").client().await?;
......
...@@ -17,7 +17,6 @@ use service_metrics::{MyStats, DEFAULT_NAMESPACE}; ...@@ -17,7 +17,6 @@ use service_metrics::{MyStats, DEFAULT_NAMESPACE};
use dynamo_runtime::{ use dynamo_runtime::{
logging, logging,
metrics::MetricsRegistry,
pipeline::{ pipeline::{
async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
ResponseStream, SingleIn, ResponseStream, SingleIn,
...@@ -72,7 +71,6 @@ async fn backend(runtime: DistributedRuntime) -> Result<()> { ...@@ -72,7 +71,6 @@ async fn backend(runtime: DistributedRuntime) -> Result<()> {
runtime runtime
.namespace(DEFAULT_NAMESPACE)? .namespace(DEFAULT_NAMESPACE)?
.component("backend")? .component("backend")?
.add_labels(&[("model", "service_metrics_model")])?
.service_builder() .service_builder()
.create() .create()
.await? .await?
......
...@@ -91,7 +91,6 @@ pub async fn backend(drt: DistributedRuntime, endpoint_name: Option<&str>) -> Re ...@@ -91,7 +91,6 @@ pub async fn backend(drt: DistributedRuntime, endpoint_name: Option<&str>) -> Re
let endpoint = drt let endpoint = drt
.namespace(DEFAULT_NAMESPACE)? .namespace(DEFAULT_NAMESPACE)?
.component(DEFAULT_COMPONENT)? .component(DEFAULT_COMPONENT)?
.add_labels(&[("model", DEFAULT_MODEL_NAME)])?
.service_builder() .service_builder()
.create() .create()
.await? .await?
......
...@@ -30,7 +30,10 @@ ...@@ -30,7 +30,10 @@
//! TODO: Top-level Overview of Endpoints/Functions //! TODO: Top-level Overview of Endpoints/Functions
use crate::{ use crate::{
config::HealthStatus, discovery::Lease, metrics::MetricsRegistry, service::ServiceSet, config::HealthStatus,
discovery::Lease,
metrics::{prometheus_names, MetricsRegistry},
service::ServiceSet,
transports::etcd::EtcdPath, transports::etcd::EtcdPath,
}; };
...@@ -45,6 +48,7 @@ use super::{ ...@@ -45,6 +48,7 @@ use super::{
use crate::pipeline::network::{ingress::push_endpoint::PushEndpoint, PushWorkHandler}; use crate::pipeline::network::{ingress::push_endpoint::PushEndpoint, PushWorkHandler};
use crate::protocols::Endpoint as EndpointId; use crate::protocols::Endpoint as EndpointId;
use crate::service::ComponentNatsPrometheusMetrics;
use async_nats::{ use async_nats::{
rustls::quic, rustls::quic,
service::{Service, ServiceExt}, service::{Service, ServiceExt},
...@@ -187,16 +191,6 @@ impl MetricsRegistry for Component { ...@@ -187,16 +191,6 @@ impl MetricsRegistry for Component {
] ]
.concat() .concat()
} }
/// Collect the labels stored on the parent namespace followed by the labels
/// stored on this component, as borrowed `(key, value)` pairs.
fn stored_labels(&self) -> Vec<(&str, &str)> {
    let own = self
        .labels
        .iter()
        .map(|(key, value)| (key.as_str(), value.as_str()));
    self.namespace
        .stored_labels()
        .into_iter()
        .chain(own)
        .collect()
}
/// Mutable access to the labels stored directly on this component.
fn labels_mut(&mut self) -> &mut Vec<(String, String)> {
&mut self.labels
}
} }
impl Component { impl Component {
...@@ -262,6 +256,8 @@ impl Component { ...@@ -262,6 +256,8 @@ impl Component {
Ok(out) Ok(out)
} }
/// Scrape ServiceSet, which contains NATS stats as well as user defined stats
/// embedded in data field of ServiceInfo.
pub async fn scrape_stats(&self, timeout: Duration) -> Result<ServiceSet> { pub async fn scrape_stats(&self, timeout: Duration) -> Result<ServiceSet> {
let service_name = self.service_name(); let service_name = self.service_name();
let service_client = self.drt().service_client(); let service_client = self.drt().service_client();
...@@ -270,6 +266,78 @@ impl Component { ...@@ -270,6 +266,78 @@ impl Component {
.await .await
} }
/// Register a metrics callback that refreshes this component's service stats.
///
/// The callback is registered on the distributed runtime under every entry of
/// this component's parent hierarchy plus its own hierarchy. Each invocation
/// spawns a stats scrape on the current Tokio runtime and then waits on a
/// standard-library channel for its outcome, so that the metrics have been
/// updated (or reset to zeros on failure) before the callback returns.
///
/// NOTE(review): the wait is a blocking `recv_timeout`; presumably the callback is
/// never invoked from a thread the spawned task itself needs in order to run — confirm.
///
/// # Errors
///
/// Fails if `ComponentNatsPrometheusMetrics::new` fails. The callback itself
/// reports an error when no Tokio runtime handle is available, when the scrape
/// fails, when the wait times out, or when the task goes away without reporting.
pub fn add_metrics_callback(&self) -> Result<()> {
    let nats_metrics = ComponentNatsPrometheusMetrics::new(self)?;
    let scrape_target = self.clone();

    // Keys to register under: every ancestor hierarchy, then this component's own.
    let mut hierarchies = self.parent_hierarchy();
    hierarchies.push(self.hierarchy());
    // For a component, the hierarchy and the service name happen to coincide.
    debug_assert_eq!(
        hierarchies.last().cloned().unwrap_or_default(),
        self.service_name()
    );

    let metrics_callback = Arc::new(move || {
        // Budget for a single scrape, in milliseconds. The same 300 ms figure is
        // reportedly used by the KV Router metrics aggregator and other components.
        const METRICS_SCRAPE_TIMEOUT_MS: u64 = 300;
        // Extra time granted to the waiting side to absorb processing overhead.
        const RECV_GRACE_MS: u64 = 100;

        let runtime = tokio::runtime::Handle::try_current()
            .map_err(|err| anyhow::anyhow!("No Tokio runtime handle available: {}", err))?;

        let scrape_budget = std::time::Duration::from_millis(METRICS_SCRAPE_TIMEOUT_MS);
        let wait_budget =
            std::time::Duration::from_millis(METRICS_SCRAPE_TIMEOUT_MS + RECV_GRACE_MS);

        // The spawned task needs its own handles; the channel carries its outcome back.
        let task_metrics = nats_metrics.clone();
        let task_component = scrape_target.clone();
        let (outcome_tx, outcome_rx) = std::sync::mpsc::channel::<anyhow::Result<()>>();

        runtime.spawn(async move {
            let outcome = task_component
                .scrape_stats(scrape_budget)
                .await
                .map(|service_set| {
                    task_metrics.update_from_service_set(&service_set);
                })
                .map_err(|err| {
                    // A failed scrape must not leave stale values behind.
                    task_metrics.reset_to_zeros();
                    anyhow::anyhow!("Failed to scrape stats: {}", err)
                });
            // A send error only means the receiver already stopped waiting.
            let _ = outcome_tx.send(outcome);
        });

        // On success the task's own result is returned unchanged; if the wait itself
        // fails, the metrics are zeroed and the failure is reported instead.
        outcome_rx
            .recv_timeout(wait_budget)
            .unwrap_or_else(|wait_err| {
                nats_metrics.reset_to_zeros();
                match wait_err {
                    std::sync::mpsc::RecvTimeoutError::Timeout => {
                        Err(anyhow::anyhow!("Metrics collection timed out"))
                    }
                    std::sync::mpsc::RecvTimeoutError::Disconnected => {
                        Err(anyhow::anyhow!("Metrics collection task failed"))
                    }
                }
            })
    });

    self.drt()
        .register_metrics_callback(hierarchies, metrics_callback);
    Ok(())
}
/// TODO /// TODO
/// ///
/// This method will scrape the stats for all available services /// This method will scrape the stats for all available services
...@@ -347,16 +415,6 @@ impl MetricsRegistry for Endpoint { ...@@ -347,16 +415,6 @@ impl MetricsRegistry for Endpoint {
] ]
.concat() .concat()
} }
/// Collect the labels stored on the parent component followed by the labels
/// stored on this endpoint, as borrowed `(key, value)` pairs.
fn stored_labels(&self) -> Vec<(&str, &str)> {
    let mut merged = self.component.stored_labels();
    merged.reserve(self.labels.len());
    for (key, value) in &self.labels {
        merged.push((key.as_str(), value.as_str()));
    }
    merged
}
/// Mutable access to the labels stored directly on this endpoint.
fn labels_mut(&mut self) -> &mut Vec<(String, String)> {
&mut self.labels
}
} }
impl Endpoint { impl Endpoint {
...@@ -520,11 +578,24 @@ impl Namespace { ...@@ -520,11 +578,24 @@ impl Namespace {
/// Create a [`Component`] in the namespace whose endpoints can be discovered with etcd /// Create a [`Component`] in the namespace whose endpoints can be discovered with etcd
pub fn component(&self, name: impl Into<String>) -> Result<Component> { pub fn component(&self, name: impl Into<String>) -> Result<Component> {
Ok(ComponentBuilder::from_runtime(self.runtime.clone()) let component = ComponentBuilder::from_runtime(self.runtime.clone())
.name(name) .name(name)
.namespace(self.clone()) .namespace(self.clone())
.is_static(self.is_static) .is_static(self.is_static)
.build()?) .build()?;
// Register the metrics callback for this component.
// If registration fails, log a warning but do not propagate the error,
// as metrics are not mission critical and should not block component creation.
if let Err(err) = component.add_metrics_callback() {
tracing::warn!(
"Failed to add metrics callback for component '{}': {}",
component.service_name(),
err
);
}
Ok(component)
} }
/// Create a [`Namespace`] in the parent namespace /// Create a [`Namespace`] in the parent namespace
......
...@@ -84,19 +84,19 @@ impl MetricsRegistry for Namespace { ...@@ -84,19 +84,19 @@ impl MetricsRegistry for Namespace {
} }
fn parent_hierarchy(&self) -> Vec<String> { fn parent_hierarchy(&self) -> Vec<String> {
vec![self.drt().basename()] // Build as: [ "" (DRT), non-empty parent basenames from root -> leaf ]
} let mut names = vec![String::new()]; // Start with empty string for DRT
fn stored_labels(&self) -> Vec<(&str, &str)> { // Collect parent basenames from root to leaf
// Convert Vec<(String, String)> to Vec<(&str, &str)> let parent_names: Vec<String> =
self.labels std::iter::successors(self.parent.as_deref(), |ns| ns.parent.as_deref())
.iter() .map(|ns| ns.basename())
.map(|(k, v)| (k.as_str(), v.as_str())) .filter(|name| !name.is_empty())
.collect() .collect();
}
// Append parent names in reverse order (root to leaf)
fn labels_mut(&mut self) -> &mut Vec<(String, String)> { names.extend(parent_names.into_iter().rev());
&mut self.labels names
} }
} }
......
...@@ -14,13 +14,14 @@ ...@@ -14,13 +14,14 @@
// limitations under the License. // limitations under the License.
pub use crate::component::Component; pub use crate::component::Component;
use crate::transports::nats::DRTNatsPrometheusMetrics;
use crate::{ use crate::{
component::{self, ComponentBuilder, Endpoint, InstanceSource, Namespace}, component::{self, ComponentBuilder, Endpoint, InstanceSource, Namespace},
discovery::DiscoveryClient, discovery::DiscoveryClient,
metrics::MetricsRegistry, metrics::MetricsRegistry,
service::ServiceClient, service::ServiceClient,
transports::{etcd, nats, tcp}, transports::{etcd, nats, tcp},
ErrorContext, ErrorContext, RuntimeCallback,
}; };
use super::{error, Arc, DistributedRuntime, OnceCell, Result, Runtime, SystemHealth, Weak, OK}; use super::{error, Arc, DistributedRuntime, OnceCell, Result, Runtime, SystemHealth, Weak, OK};
...@@ -40,18 +41,6 @@ impl MetricsRegistry for DistributedRuntime { ...@@ -40,18 +41,6 @@ impl MetricsRegistry for DistributedRuntime {
fn parent_hierarchy(&self) -> Vec<String> { fn parent_hierarchy(&self) -> Vec<String> {
vec![] // drt is the root, so no parent hierarchy vec![] // drt is the root, so no parent hierarchy
} }
/// Borrow the labels stored on the runtime as `(key, value)` string-slice pairs.
fn stored_labels(&self) -> Vec<(&str, &str)> {
    // View each owned `(String, String)` pair as `(&str, &str)` without copying.
    let mut pairs = Vec::with_capacity(self.labels.len());
    for (key, value) in &self.labels {
        pairs.push((key.as_str(), value.as_str()));
    }
    pairs
}
/// Mutable access to the labels stored directly on the runtime.
fn labels_mut(&mut self) -> &mut Vec<(String, String)> {
&mut self.labels
}
} }
impl DistributedRuntime { impl DistributedRuntime {
...@@ -88,6 +77,8 @@ impl DistributedRuntime { ...@@ -88,6 +77,8 @@ impl DistributedRuntime {
live_endpoint_path, live_endpoint_path,
))); )));
let nats_client_for_metrics = nats_client.clone();
let distributed_runtime = Self { let distributed_runtime = Self {
runtime, runtime,
etcd_client, etcd_client,
...@@ -97,14 +88,29 @@ impl DistributedRuntime { ...@@ -97,14 +88,29 @@ impl DistributedRuntime {
component_registry: component::Registry::new(), component_registry: component::Registry::new(),
is_static, is_static,
instance_sources: Arc::new(Mutex::new(HashMap::new())), instance_sources: Arc::new(Mutex::new(HashMap::new())),
prometheus_registries_by_prefix: Arc::new(std::sync::Mutex::new(HashMap::< hierarchy_to_metricsregistry: Arc::new(std::sync::RwLock::new(HashMap::<
String, String,
prometheus::Registry, crate::MetricsRegistryEntry,
>::new())), >::new())),
system_health, system_health,
labels: Vec::new(),
}; };
let sys_nats_metrics = DRTNatsPrometheusMetrics::new(
&distributed_runtime,
nats_client_for_metrics.client().clone(),
)?;
let mut drt_hierarchies = distributed_runtime.parent_hierarchy();
drt_hierarchies.push(distributed_runtime.hierarchy());
// Register a callback to update NATS client metrics
let nats_metrics_callback = Arc::new({
let sys_nats_metrics_clone = sys_nats_metrics.clone();
move || {
sys_nats_metrics_clone.set_from_client_stats();
Ok(())
}
});
distributed_runtime.register_metrics_callback(drt_hierarchies, nats_metrics_callback);
// Start system status server if enabled // Start system status server if enabled
if let Some(cancel_token) = cancel_token { if let Some(cancel_token) = cancel_token {
let host = config.system_host.clone(); let host = config.system_host.clone();
...@@ -240,6 +246,76 @@ impl DistributedRuntime { ...@@ -240,6 +246,76 @@ impl DistributedRuntime {
pub fn instance_sources(&self) -> Arc<Mutex<HashMap<Endpoint, Weak<InstanceSource>>>> { pub fn instance_sources(&self) -> Arc<Mutex<HashMap<Endpoint, Weak<InstanceSource>>>> {
self.instance_sources.clone() self.instance_sources.clone()
} }
/// Register `prometheus_metric` in the Prometheus registry kept for `hierarchy`.
///
/// The registry entry for `hierarchy` is created on first use. If that entry
/// already reports a metric named `metric_name`, a warning is logged and the
/// call returns `Ok(())` without registering anything.
///
/// # Errors
///
/// Returns the registry's error, after logging it, when registration fails.
///
/// # Panics
///
/// Panics if the registry map's lock is poisoned.
pub fn add_prometheus_metric(
    &self,
    hierarchy: &str,
    metric_name: &str,
    prometheus_metric: Box<dyn prometheus::core::Collector>,
) -> anyhow::Result<()> {
    let mut registry_map = self.hierarchy_to_metricsregistry.write().unwrap();
    let slot = registry_map.entry(hierarchy.to_string()).or_default();

    // A name that is already present is treated as a no-op rather than an error.
    if slot.has_metric_named(metric_name) {
        tracing::warn!(
            hierarchy = ?hierarchy,
            metric_name = ?metric_name,
            "Metric already exists in registry; skipping registration"
        );
        return Ok(());
    }

    // Log the failure with its context here, then hand the error to the caller.
    slot.prometheus_registry
        .register(prometheus_metric)
        .map_err(|e| {
            let error_msg = e.to_string();
            tracing::error!(
                hierarchy = ?hierarchy,
                error = ?error_msg,
                metric_name = ?metric_name,
                "Metric registration failed"
            );
            e.into()
        })
}
/// Attach `callback` to the metrics registry entry of every key in `hierarchies`.
///
/// Missing entries are created with their default value. Each entry receives
/// its own clone of the callback.
///
/// # Panics
///
/// Panics if the registry map's lock is poisoned.
pub fn register_metrics_callback(&self, hierarchies: Vec<String>, callback: RuntimeCallback) {
    let mut registry_map = self.hierarchy_to_metricsregistry.write().unwrap();
    hierarchies.into_iter().for_each(|key| {
        let shared = callback.clone();
        registry_map.entry(key).or_default().add_callback(shared);
    });
}
/// Run every callback registered under `hierarchy` and collect their results.
///
/// Returns an empty vector when nothing is registered for `hierarchy`.
///
/// # Panics
///
/// Panics if the registry map's lock is poisoned.
pub fn execute_metrics_callbacks(&self, hierarchy: &str) -> Vec<anyhow::Result<()>> {
    // Copy the callback list inside a narrow scope so the read lock is released
    // before any callback runs.
    let snapshot = {
        let registry_map = self.hierarchy_to_metricsregistry.read().unwrap();
        match registry_map.get(hierarchy) {
            Some(entry) => entry.runtime_callbacks.clone(),
            None => Vec::new(),
        }
    };

    let mut results = Vec::with_capacity(snapshot.len());
    for callback in &snapshot {
        results.push(callback());
    }
    results
}
/// List every hierarchy key that currently has a registry entry.
///
/// Kept private because it is only used for testing.
fn get_registered_hierarchies(&self) -> Vec<String> {
    let registry_map = self.hierarchy_to_metricsregistry.read().unwrap();
    let mut keys = Vec::with_capacity(registry_map.len());
    keys.extend(registry_map.keys().cloned());
    keys
}
} }
#[derive(Dissolve)] #[derive(Dissolve)]
......
...@@ -147,6 +147,70 @@ impl SystemHealth { ...@@ -147,6 +147,70 @@ impl SystemHealth {
} }
} }
/// Type alias for a shared runtime callback.
///
/// A callback takes no arguments and returns `anyhow::Result<()>`. It is wrapped
/// in an `Arc`, so it can be:
/// - shared across threads (the closure must be `Send + Sync`)
/// - cloned without duplicating the underlying closure
/// - stored where a `'static` lifetime is required
///
/// The `Arc` wrapper is part of the alias to make the sharing explicit.
type RuntimeCallback = Arc<dyn Fn() -> anyhow::Result<()> + Send + Sync + 'static>;
/// A Prometheus registry together with the runtime callbacks registered for one hierarchy key.
pub struct MetricsRegistryEntry {
/// The Prometheus registry holding the metrics registered for this hierarchy
pub prometheus_registry: prometheus::Registry,
/// Callbacks registered for this hierarchy; each takes no arguments and returns `anyhow::Result<()>`
pub runtime_callbacks: Vec<RuntimeCallback>,
}
impl MetricsRegistryEntry {
    /// Build an entry holding a fresh Prometheus registry and no callbacks.
    pub fn new() -> Self {
        let prometheus_registry = prometheus::Registry::new();
        let runtime_callbacks = Vec::new();
        Self {
            prometheus_registry,
            runtime_callbacks,
        }
    }

    /// Append `callback` to the callbacks stored on this entry.
    ///
    /// Callbacks take no arguments and return `anyhow::Result<()>`.
    pub fn add_callback(&mut self, callback: RuntimeCallback) {
        self.runtime_callbacks.push(callback);
    }

    /// Invoke every stored callback in insertion order and collect the results.
    pub fn execute_callbacks(&self) -> Vec<anyhow::Result<()>> {
        let mut results = Vec::with_capacity(self.runtime_callbacks.len());
        for callback in &self.runtime_callbacks {
            results.push(callback());
        }
        results
    }

    /// Report whether the registry's gathered output contains a metric family
    /// named `metric_name`.
    ///
    /// NOTE(review): this depends on what `gather()` returns; confirm that families
    /// which have no samples yet are still reported, otherwise they go undetected here.
    pub fn has_metric_named(&self, metric_name: &str) -> bool {
        for family in self.prometheus_registry.gather() {
            if family.name() == metric_name {
                return true;
            }
        }
        false
    }
}
/// The default entry is the same as [`MetricsRegistryEntry::new`].
impl Default for MetricsRegistryEntry {
fn default() -> Self {
Self::new()
}
}
/// Cloning an entry clones its Prometheus registry value and carries over every
/// registered callback.
impl Clone for MetricsRegistryEntry {
    fn clone(&self) -> Self {
        Self {
            prometheus_registry: self.prometheus_registry.clone(),
            // `RuntimeCallback` is an `Arc`, so cloning the list shares the
            // underlying closures instead of duplicating (or dropping) them.
            runtime_callbacks: self.runtime_callbacks.clone(),
        }
    }
}
/// Distributed [Runtime] which provides access to shared resources across the cluster, this includes /// Distributed [Runtime] which provides access to shared resources across the cluster, this includes
/// communication protocols and transports. /// communication protocols and transports.
#[derive(Clone)] #[derive(Clone)]
...@@ -176,9 +240,7 @@ pub struct DistributedRuntime { ...@@ -176,9 +240,7 @@ pub struct DistributedRuntime {
// Health Status // Health Status
system_health: Arc<std::sync::Mutex<SystemHealth>>, system_health: Arc<std::sync::Mutex<SystemHealth>>,
// This map associates metric prefixes with their corresponding Prometheus registries. // This map associates metric prefixes with their corresponding Prometheus registries and callbacks.
prometheus_registries_by_prefix: Arc<std::sync::Mutex<HashMap<String, prometheus::Registry>>>, // Uses RwLock for better concurrency - multiple threads can read (execute callbacks) simultaneously.
hierarchy_to_metricsregistry: Arc<std::sync::RwLock<HashMap<String, MetricsRegistryEntry>>>,
// Additional labels for metrics
labels: Vec<(String, String)>,
} }
...@@ -13,29 +13,46 @@ ...@@ -13,29 +13,46 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
//! Metric Registry Framework for Dynamo. //! Metrics registry trait and implementation for Prometheus metrics
//! //!
//! This module provides registry classes for Prometheus metrics //! This module provides a trait-based interface for creating and managing Prometheus metrics
//! that auto populates the labels with the component-endpoint hierarchy. //! with automatic label injection and hierarchical naming support.
//! All metrics are prefixed with "dynamo_component_" to avoid collisions with Kubernetes and other monitoring system labels.
pub mod prometheus_names;
use std::collections::HashSet;
use std::sync::Arc;
use std::sync::Mutex;
use crate::component::ComponentBuilder;
use anyhow;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use regex::Regex; use regex::Regex;
use std::any::Any; use std::any::Any;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::{Arc, Mutex};
// If set to true, then metrics will be labeled with the dynamo_namespace, dynamo_component, and dynamo_endpoint. // Import commonly used items to avoid verbose prefixes
use prometheus_names::{
build_metric_name, labels, name_prefix, nats, work_handler, COMPONENT_NATS_METRICS,
DRT_NATS_METRICS,
};
// Pipeline imports for endpoint creation
use crate::pipeline::{
async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
ResponseStream, SingleIn,
};
use crate::protocols::annotated::Annotated;
use crate::stream;
use crate::stream::StreamExt;
// If set to true, then metrics will be labeled with the namespace, component, and endpoint labels.
// These labels are prefixed with "dynamo_" to avoid collisions with Kubernetes and other monitoring system labels. // These labels are prefixed with "dynamo_" to avoid collisions with Kubernetes and other monitoring system labels.
pub const USE_AUTO_LABELS: bool = true; pub const USE_AUTO_LABELS: bool = true;
// Prometheus imports // Prometheus imports
use prometheus::Encoder; use prometheus::Encoder;
/// Prefix `metric_name` with `dynamo_component_` to form the full metric name.
fn build_metric_name(metric_name: &str) -> String {
    const PREFIX: &str = "dynamo_component_";
    let mut full_name = String::with_capacity(PREFIX.len() + metric_name.len());
    full_name.push_str(PREFIX);
    full_name.push_str(metric_name);
    full_name
}
/// Lints a metric name component by stripping off invalid characters and validating Prometheus naming pattern /// Lints a metric name component by stripping off invalid characters and validating Prometheus naming pattern
/// Prometheus doesn't provide a built-in function to validate metric names, but the specification requires /// Prometheus doesn't provide a built-in function to validate metric names, but the specification requires
/// names to follow the pattern [a-zA-Z_:][a-zA-Z0-9_:]*. This function implements that validation. /// names to follow the pattern [a-zA-Z_:][a-zA-Z0-9_:]*. This function implements that validation.
...@@ -212,15 +229,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>( ...@@ -212,15 +229,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
) -> anyhow::Result<T> { ) -> anyhow::Result<T> {
// Validate that user-provided labels don't have duplicate keys // Validate that user-provided labels don't have duplicate keys
validate_no_duplicate_label_keys(labels)?; validate_no_duplicate_label_keys(labels)?;
// Validate that user-provided labels don't conflict with stored labels // Note: stored labels functionality has been removed
for (key, _) in registry.stored_labels() {
if labels.iter().any(|(k, _)| *k == key) {
return Err(anyhow::anyhow!(
"Label key '{}' already exists in registry.",
key
));
}
}
let basename = registry.basename(); let basename = registry.basename();
let parent_hierarchy = registry.parent_hierarchy(); let parent_hierarchy = registry.parent_hierarchy();
...@@ -236,8 +245,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>( ...@@ -236,8 +245,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
if USE_AUTO_LABELS { if USE_AUTO_LABELS {
// Validate that user-provided labels don't conflict with auto-generated labels // Validate that user-provided labels don't conflict with auto-generated labels
for (key, _) in labels { for (key, _) in labels {
if *key == "dynamo_namespace" || *key == "dynamo_component" || *key == "dynamo_endpoint" if *key == labels::NAMESPACE || *key == labels::COMPONENT || *key == labels::ENDPOINT {
{
return Err(anyhow::anyhow!( return Err(anyhow::anyhow!(
"Label '{}' is automatically added by auto_label feature and cannot be manually set", "Label '{}' is automatically added by auto_label feature and cannot be manually set",
key key
...@@ -251,7 +259,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>( ...@@ -251,7 +259,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
if !namespace.is_empty() { if !namespace.is_empty() {
let valid_namespace = lint_prometheus_name(namespace)?; let valid_namespace = lint_prometheus_name(namespace)?;
if !valid_namespace.is_empty() { if !valid_namespace.is_empty() {
updated_labels.push(("dynamo_namespace".to_string(), valid_namespace)); updated_labels.push((labels::NAMESPACE.to_string(), valid_namespace));
} }
} }
} }
...@@ -260,7 +268,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>( ...@@ -260,7 +268,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
if !component.is_empty() { if !component.is_empty() {
let valid_component = lint_prometheus_name(component)?; let valid_component = lint_prometheus_name(component)?;
if !valid_component.is_empty() { if !valid_component.is_empty() {
updated_labels.push(("dynamo_component".to_string(), valid_component)); updated_labels.push((labels::COMPONENT.to_string(), valid_component));
} }
} }
} }
...@@ -269,7 +277,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>( ...@@ -269,7 +277,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
if !endpoint.is_empty() { if !endpoint.is_empty() {
let valid_endpoint = lint_prometheus_name(endpoint)?; let valid_endpoint = lint_prometheus_name(endpoint)?;
if !valid_endpoint.is_empty() { if !valid_endpoint.is_empty() {
updated_labels.push(("dynamo_endpoint".to_string(), valid_endpoint)); updated_labels.push((labels::ENDPOINT.to_string(), valid_endpoint));
} }
} }
} }
...@@ -281,13 +289,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>( ...@@ -281,13 +289,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
.iter() .iter()
.map(|(k, v)| ((*k).to_string(), (*v).to_string())), .map(|(k, v)| ((*k).to_string(), (*v).to_string())),
); );
// Add stored labels (safe because overlaps were rejected above) // Note: stored labels functionality has been removed
updated_labels.extend(
registry
.stored_labels()
.into_iter()
.map(|(k, v)| (k.to_string(), v.to_string())),
);
// Handle different metric types // Handle different metric types
let prometheus_metric = if std::any::TypeId::of::<T>() let prometheus_metric = if std::any::TypeId::of::<T>()
...@@ -371,33 +373,26 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>( ...@@ -371,33 +373,26 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
}; };
// Iterate over the DRT's registry and register this metric across all hierarchical levels. // Iterate over the DRT's registry and register this metric across all hierarchical levels.
// The prefixed_hierarchy is structured as: ["", "testnamespace", "testnamespace_testcomponent", "testnamespace_testcomponent_testendpoint"] // The accumulated hierarchy is structured as: ["", "testnamespace", "testnamespace_testcomponent", "testnamespace_testcomponent_testendpoint"]
// This prefixing is essential to differentiate between the names of children and grandchildren. // This accumulation is essential to differentiate between the names of children and grandchildren.
let mut prometheus_registry = registry // Build accumulated hierarchy and register metrics in a single loop
.drt()
.prometheus_registries_by_prefix
.lock()
.unwrap();
// Build prefixed hierarchy and register metrics in a single loop
// current_prefix accumulates the hierarchical path as we iterate through hierarchy // current_prefix accumulates the hierarchical path as we iterate through hierarchy
// For example, if hierarchy = ["", "testnamespace", "testcomponent"], then: // For example, if hierarchy = ["", "testnamespace", "testcomponent"], then:
// - Iteration 1: current_prefix = "" (empty string from DRT) // - Iteration 1: current_prefix = "" (empty string from DRT)
// - Iteration 2: current_prefix = "testnamespace" // - Iteration 2: current_prefix = "testnamespace"
// - Iteration 3: current_prefix = "testnamespace_testcomponent" // - Iteration 3: current_prefix = "testnamespace_testcomponent"
let mut current_prefix = String::new(); let mut current_hierarchy = String::new();
for name in &hierarchy { for name in &hierarchy {
if !current_prefix.is_empty() && !name.is_empty() { if !current_hierarchy.is_empty() && !name.is_empty() {
current_prefix.push('_'); current_hierarchy.push('_');
} }
current_prefix.push_str(name); current_hierarchy.push_str(name);
// Register metric at this hierarchical level // Register metric at this hierarchical level using the new helper function
let collector: Box<dyn prometheus::core::Collector> = Box::new(prometheus_metric.clone()); let collector: Box<dyn prometheus::core::Collector> = Box::new(prometheus_metric.clone());
let _ = prometheus_registry registry
.entry(current_prefix.clone()) .drt()
.or_default() .add_prometheus_metric(&current_hierarchy, &metric_name, collector)?;
.register(collector);
} }
Ok(prometheus_metric) Ok(prometheus_metric)
...@@ -406,55 +401,16 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>( ...@@ -406,55 +401,16 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
/// This trait should be implemented by all metric registries, including Prometheus, Envy, OpenTelemetry, and others. /// This trait should be implemented by all metric registries, including Prometheus, Envy, OpenTelemetry, and others.
/// It offers a unified interface for creating and managing metrics, organizing sub-registries, and /// It offers a unified interface for creating and managing metrics, organizing sub-registries, and
/// generating output in Prometheus text format. /// generating output in Prometheus text format.
pub trait MetricsRegistry: Send + Sync + crate::traits::DistributedRuntimeProvider { use crate::traits::DistributedRuntimeProvider;
// Get the name of this registry (without any prefix)
fn basename(&self) -> String;
/// Get any stored labels for this registry
fn stored_labels(&self) -> Vec<(&str, &str)> {
Vec::new()
}
/// Get mutable access to the labels storage - implementors must provide this
fn labels_mut(&mut self) -> &mut Vec<(String, String)>;
/// Add labels to this registry and return a new instance with the labels. pub trait MetricsRegistry: Send + Sync + DistributedRuntimeProvider {
/// This allows for method chaining like: runtime.namespace(...).add_labels(...)? // Get the name of this registry (without any hierarchy prefix)
/// Fails if: fn basename(&self) -> String;
/// - Provided `labels` contains duplicate keys, or
/// - Any provided key already exists in the registry's stored labels.
fn add_labels(mut self, labels: &[(&str, &str)]) -> anyhow::Result<Self>
where
Self: Sized,
{
validate_no_duplicate_label_keys(labels)?;
// 2) Validate no overlap with existing stored labels
let existing: std::collections::HashSet<&str> =
self.stored_labels().into_iter().map(|(k, _)| k).collect();
if let Some(conflict) = labels
.iter()
.map(|(k, _)| *k)
.find(|k| existing.contains(k))
{
return Err(anyhow::anyhow!(
"Label key '{}' already exists in registry; refusing to overwrite",
conflict
));
}
// 3) Safe to append
let labels_storage = self.labels_mut();
for (key, value) in labels {
labels_storage.push((key.to_string(), value.to_string()));
}
Ok(self)
}
/// Retrieve the complete hierarchy and basename for this registry. Currently, the prefix for drt is an empty string, /// Retrieve the complete hierarchy and basename for this registry. Currently, the hierarchy for drt is an empty string,
/// so we must account for the leading underscore. The existing code remains unchanged to accommodate any future /// so we must account for the leading underscore. The existing code remains unchanged to accommodate any future
/// scenarios where drt's prefix might be assigned a value. /// scenarios where drt's prefix might be assigned a value.
fn prefix(&self) -> String { fn hierarchy(&self) -> String {
[self.parent_hierarchy(), vec![self.basename()]] [self.parent_hierarchy(), vec![self.basename()]]
.concat() .concat()
.join("_") .join("_")
...@@ -462,7 +418,7 @@ pub trait MetricsRegistry: Send + Sync + crate::traits::DistributedRuntimeProvid ...@@ -462,7 +418,7 @@ pub trait MetricsRegistry: Send + Sync + crate::traits::DistributedRuntimeProvid
.to_string() .to_string()
} }
// Get the parent hierarchy for this registry (just the base names, NOT the prefix) // Get the parent hierarchy for this registry (just the base names, NOT the flattened hierarchy key)
fn parent_hierarchy(&self) -> Vec<String>; fn parent_hierarchy(&self) -> Vec<String>;
// TODO: Add support for additional Prometheus metric types: // TODO: Add support for additional Prometheus metric types:
...@@ -589,9 +545,24 @@ pub trait MetricsRegistry: Send + Sync + crate::traits::DistributedRuntimeProvid ...@@ -589,9 +545,24 @@ pub trait MetricsRegistry: Send + Sync + crate::traits::DistributedRuntimeProvid
/// Get metrics in Prometheus text format /// Get metrics in Prometheus text format
fn prometheus_metrics_fmt(&self) -> anyhow::Result<String> { fn prometheus_metrics_fmt(&self) -> anyhow::Result<String> {
// Execute callbacks first to ensure any new metrics are added to the registry
let callback_results = self.drt().execute_metrics_callbacks(&self.hierarchy());
// Log any callback errors but continue
for result in callback_results {
if let Err(e) = result {
tracing::error!("Error executing metrics callback: {}", e);
}
}
// Get the Prometheus registry for this hierarchy
let prometheus_registry = { let prometheus_registry = {
let mut registry = self.drt().prometheus_registries_by_prefix.lock().unwrap(); let mut registry_entry = self.drt().hierarchy_to_metricsregistry.write().unwrap();
registry.entry(self.prefix()).or_default().clone() registry_entry
.entry(self.hierarchy())
.or_default()
.prometheus_registry
.clone()
}; };
let metric_families = prometheus_registry.gather(); let metric_families = prometheus_registry.gather();
let encoder = prometheus::TextEncoder::new(); let encoder = prometheus::TextEncoder::new();
...@@ -602,19 +573,127 @@ pub trait MetricsRegistry: Send + Sync + crate::traits::DistributedRuntimeProvid ...@@ -602,19 +573,127 @@ pub trait MetricsRegistry: Send + Sync + crate::traits::DistributedRuntimeProvid
} }
#[cfg(test)] #[cfg(test)]
/// Helper function to create a DRT instance for testing mod test_helpers {
/// Uses the test-friendly constructor without discovery use super::prometheus_names::name_prefix;
pub fn create_test_drt() -> crate::DistributedRuntime { use super::prometheus_names::nats as nats_metrics;
use super::*;
/// Creates a test DistributedRuntime for integration tests.
/// Uses NATS; requires #[cfg(feature = "integration")].
#[cfg(feature = "integration")]
pub fn create_test_drt() -> crate::DistributedRuntime {
let rt = crate::Runtime::single_threaded().unwrap(); let rt = crate::Runtime::single_threaded().unwrap();
tokio::runtime::Runtime::new().unwrap().block_on(async { tokio::runtime::Runtime::new().unwrap().block_on(async {
crate::DistributedRuntime::from_settings_without_discovery(rt.clone()) crate::DistributedRuntime::from_settings_without_discovery(rt.clone())
.await .await
.unwrap() .unwrap()
}) })
}
/// Builds a `DistributedRuntime` for tests that already run inside an async context.
///
/// Creates a single-threaded `Runtime` and awaits
/// `DistributedRuntime::from_settings_without_discovery` on the caller's executor.
/// Only compiled with the `integration` feature.
///
/// # Panics
/// Panics if either the runtime or the distributed runtime cannot be constructed.
#[cfg(feature = "integration")]
pub async fn create_test_drt_async() -> crate::DistributedRuntime {
    let runtime = crate::Runtime::single_threaded().unwrap();
    let drt = crate::DistributedRuntime::from_settings_without_discovery(runtime.clone()).await;
    drt.unwrap()
}
/// Shared line filter behind the Prometheus text helpers.
///
/// Splits `input` into lines, calls `predicate` once per line in order, and
/// returns an owned copy of every line for which it answered `true`. Line
/// order is preserved.
fn filter_prometheus_lines<F>(input: &str, mut predicate: F) -> Vec<String>
where
    F: FnMut(&str) -> bool,
{
    let mut kept = Vec::new();
    for line in input.lines() {
        if predicate(line) {
            kept.push(line.to_owned());
        }
    }
    kept
}
/// Filters NATS metrics out of Prometheus text output for test comparisons.
///
/// A line is dropped when it contains the NATS metric-name prefix
/// (`name_prefix::COMPONENT` immediately followed by `nats_metrics::PREFIX`)
/// anywhere in the line, so comment lines that mention such a metric are
/// removed along with the samples. Blank (whitespace-only) lines are dropped
/// as well, which lets callers `join("\n")` the result and compare it against
/// a literal without trailing-newline noise.
pub fn remove_nats_lines(input: &str) -> Vec<String> {
    // Build the search string once; it is identical for every line.
    let nats_prefix = format!("{}{}", name_prefix::COMPONENT, nats_metrics::PREFIX);
    filter_prometheus_lines(input, |line| {
        !line.trim().is_empty() && !line.contains(nats_prefix.as_str())
    })
}
/// Keeps only the NATS metric lines of Prometheus text output for test comparisons.
///
/// A line is kept when it contains the NATS metric-name prefix
/// (`name_prefix::COMPONENT` immediately followed by `nats_metrics::PREFIX`)
/// anywhere in the line; comment lines that mention such a metric are therefore
/// kept too.
pub fn extract_nats_lines(input: &str) -> Vec<String> {
    // Build the search string once; it is identical for every line.
    let nats_prefix = format!("{}{}", name_prefix::COMPONENT, nats_metrics::PREFIX);
    filter_prometheus_lines(input, |line| line.contains(nats_prefix.as_str()))
}
/// Returns only the sample lines of component metrics from Prometheus text output.
///
/// A line is kept when it begins with `name_prefix::COMPONENT`. Comment lines
/// (those starting with `#`, i.e. help text and type definitions) and blank
/// lines are rejected, so every returned line is a metric line with a value.
pub fn extract_metrics(input: &str) -> Vec<String> {
    filter_prometheus_lines(input, |line| {
        let is_comment = line.starts_with('#');
        let is_blank = line.trim().is_empty();
        if is_comment || is_blank {
            return false;
        }
        line.starts_with(name_prefix::COMPONENT)
    })
}
/// Parses a Prometheus metric line and extracts the name, labels, and value.
/// Used instead of fetching metrics directly to test end-to-end results, not intermediate state.
///
/// The label set is scanned character by character, so quoted label values may
/// contain spaces, commas, `=` and braces, and the escapes `\"`, `\\` and `\n`
/// are decoded. Anything after the value (such as a timestamp) is ignored.
///
/// Returns `None` for blank lines, comment lines (`# HELP`, `# TYPE`, ...),
/// lines without a value, values that do not parse as `f64`, and label sets
/// that are never closed with `}`. Malformed input never panics.
///
/// # Example
/// ```
/// let line = "http_requests_total{method=\"GET\"} 1234";
/// let (name, labels, value) = parse_prometheus_metric(line).unwrap();
/// assert_eq!(name, "http_requests_total");
/// assert_eq!(labels.get("method"), Some(&"GET".to_string()));
/// assert_eq!(value, 1234.0);
/// ```
pub fn parse_prometheus_metric(
    line: &str,
) -> Option<(String, std::collections::HashMap<String, String>, f64)> {
    let line = line.trim();
    if line.is_empty() || line.starts_with('#') {
        return None;
    }

    // The metric name runs up to the label set or the first blank. A line with
    // neither cannot carry a value.
    let name_end = line.find(|c: char| c == '{' || c.is_whitespace())?;
    let name = &line[..name_end];
    let after_name = line[name_end..].trim_start();

    let mut labels = std::collections::HashMap::new();
    let remainder = match after_name.strip_prefix('{') {
        None => after_name,
        Some(label_set) => {
            let mut key = String::new();
            let mut value = String::new();
            let mut in_value = false; // true once the `=` of the current pair was seen
            let mut in_quotes = false; // true while inside a quoted label value
            let mut label_set_len = None; // byte length of the label set incl. its `}`

            let mut chars = label_set.char_indices();
            while let Some((idx, ch)) = chars.next() {
                if in_quotes {
                    match ch {
                        '"' => in_quotes = false,
                        // Decode the escapes of the text exposition format;
                        // unknown escapes are kept verbatim.
                        '\\' => match chars.next() {
                            Some((_, 'n')) => value.push('\n'),
                            Some((_, escaped @ ('"' | '\\'))) => value.push(escaped),
                            Some((_, other)) => {
                                value.push('\\');
                                value.push(other);
                            }
                            None => value.push('\\'),
                        },
                        _ => value.push(ch),
                    }
                    continue;
                }
                match ch {
                    '"' if in_value => in_quotes = true,
                    '=' if !in_value => in_value = true,
                    ',' | '}' => {
                        // Fragments without `=` (e.g. after a trailing comma)
                        // are not labels and are discarded.
                        if in_value {
                            labels.insert(std::mem::take(&mut key), std::mem::take(&mut value));
                        } else {
                            key.clear();
                        }
                        in_value = false;
                        if ch == '}' {
                            label_set_len = Some(idx + 1);
                            break;
                        }
                    }
                    // Blanks outside quotes are insignificant.
                    _ if ch.is_whitespace() => {}
                    _ if in_value => value.push(ch),
                    _ => key.push(ch),
                }
            }
            // An unterminated label set (or quoted value) is malformed.
            &label_set[label_set_len?..]
        }
    };

    let value = remainder.split_whitespace().next()?.parse::<f64>().ok()?;
    Some((name.to_string(), labels, value))
}
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod test_metricsregistry_units {
use super::*; use super::*;
#[test] #[test]
...@@ -689,268 +768,310 @@ mod tests { ...@@ -689,268 +768,310 @@ mod tests {
"testnamespace" "testnamespace"
); // Hyphen removed ); // Hyphen removed
assert_eq!( assert_eq!(
lint_prometheus_name("test-namespace_123").unwrap(), lint_prometheus_name("test-namespace-123").unwrap(),
"testnamespace_123" "testnamespace123"
); // Hyphen removed ); // Multiple hyphens removed
// Test validation errors for invalid patterns
assert!(lint_prometheus_name("123test").is_err()); // Starts with digit
assert!(lint_prometheus_name("").is_ok()); // Empty is allowed
} }
}
#[cfg(feature = "integration")] #[test]
#[cfg(test)] fn test_parse_prometheus_metric() {
mod test_prefixes { use super::test_helpers::parse_prometheus_metric;
use super::create_test_drt; use std::collections::HashMap;
use super::*;
use prometheus::core::Collector; // Test parsing a metric with labels
let line = "http_requests_total{method=\"GET\",status=\"200\"} 1234";
let parsed = parse_prometheus_metric(line);
assert!(parsed.is_some());
let (name, labels, value) = parsed.unwrap();
assert_eq!(name, "http_requests_total");
let mut expected_labels = HashMap::new();
expected_labels.insert("method".to_string(), "GET".to_string());
expected_labels.insert("status".to_string(), "200".to_string());
assert_eq!(labels, expected_labels);
assert_eq!(value, 1234.0);
// Test parsing a metric without labels
let line = "cpu_usage 98.5";
let parsed = parse_prometheus_metric(line);
assert!(parsed.is_some());
let (name, labels, value) = parsed.unwrap();
assert_eq!(name, "cpu_usage");
assert!(labels.is_empty());
assert_eq!(value, 98.5);
// Test parsing a metric with float value
let line = "response_time{service=\"api\"} 0.123";
let parsed = parse_prometheus_metric(line);
assert!(parsed.is_some());
let (name, labels, value) = parsed.unwrap();
assert_eq!(name, "response_time");
let mut expected_labels = HashMap::new();
expected_labels.insert("service".to_string(), "api".to_string());
assert_eq!(labels, expected_labels);
assert_eq!(value, 0.123);
// Test parsing invalid lines
assert!(parse_prometheus_metric("").is_none()); // Empty line
assert!(parse_prometheus_metric("# HELP metric description").is_none()); // Help text
assert!(parse_prometheus_metric("# TYPE metric counter").is_none()); // Type definition
assert!(parse_prometheus_metric("metric_name").is_none()); // No value
println!("✓ Prometheus metric parsing works correctly!");
}
#[cfg(feature = "integration")]
#[test] #[test]
fn test_hierarchical_prefixes_and_parent_hierarchies() { fn test_metrics_registry_entry_callbacks() {
println!("=== Testing Names, Prefixes, and Parent Hierarchies ==="); use crate::MetricsRegistryEntry;
use std::sync::atomic::{AtomicUsize, Ordering};
// Create a distributed runtime for testing // Test 1: Basic callback execution with counter increments
let drt = create_test_drt(); {
let mut entry = MetricsRegistryEntry::new();
let counter = Arc::new(AtomicUsize::new(0));
// Add callbacks with different increment values
for increment in [1, 10, 100] {
let counter_clone = counter.clone();
entry.add_callback(Arc::new(move || {
counter_clone.fetch_add(increment, Ordering::SeqCst);
Ok(())
}));
}
// Use a simple constant namespace name // Verify counter starts at 0
let namespace_name = "testnamespace"; assert_eq!(counter.load(Ordering::SeqCst), 0);
// Create namespace // First execution
let namespace = drt.namespace(namespace_name).unwrap(); let results = entry.execute_callbacks();
assert_eq!(results.len(), 3);
assert!(results.iter().all(|r| r.is_ok()));
assert_eq!(counter.load(Ordering::SeqCst), 111); // 1 + 10 + 100
// Create component // Second execution - callbacks should be reusable
let component = namespace.component("testcomponent").unwrap(); let results = entry.execute_callbacks();
assert_eq!(results.len(), 3);
assert_eq!(counter.load(Ordering::SeqCst), 222); // 111 + 111
// Create endpoint // Test cloning - cloned entry should have no callbacks
let endpoint = component.endpoint("testendpoint"); let cloned = entry.clone();
assert_eq!(cloned.execute_callbacks().len(), 0);
assert_eq!(counter.load(Ordering::SeqCst), 222); // No change
// Test DistributedRuntime hierarchy // Original still has callbacks
println!("\n=== DistributedRuntime ==="); entry.execute_callbacks();
println!("basename: '{}'", drt.basename()); assert_eq!(counter.load(Ordering::SeqCst), 333); // 222 + 111
println!("parent_hierarchy: {:?}", drt.parent_hierarchy()); }
println!("prefix: '{}'", drt.prefix());
assert_eq!(drt.basename(), "", "DRT basename should be empty"); // Test 2: Mixed success and error callbacks
assert_eq!( {
drt.parent_hierarchy(), let mut entry = MetricsRegistryEntry::new();
Vec::<String>::new(), let counter = Arc::new(AtomicUsize::new(0));
"DRT parent hierarchy should be empty"
);
assert_eq!(drt.prefix(), "", "DRT prefix should be empty");
// Test Namespace hierarchy // Successful callback
println!("\n=== Namespace ==="); let counter_clone = counter.clone();
println!("basename: '{}'", namespace.basename()); entry.add_callback(Arc::new(move || {
println!("parent_hierarchy: {:?}", namespace.parent_hierarchy()); counter_clone.fetch_add(1, Ordering::SeqCst);
println!("prefix: '{}'", namespace.prefix()); Ok(())
}));
assert_eq!( // Error callback
namespace.basename(), entry.add_callback(Arc::new(|| Err(anyhow::anyhow!("Simulated error"))));
namespace_name,
"Namespace basename should match the generated name"
);
assert_eq!(
namespace.parent_hierarchy(),
vec![""],
"Namespace parent hierarchy should be [\"\"]"
);
assert_eq!(
namespace.prefix(),
namespace_name,
"Namespace prefix should match the generated name, because drt's prefix is empty"
);
// Test Component hierarchy // Another successful callback
println!("\n=== Component ==="); let counter_clone = counter.clone();
println!("basename: '{}'", component.basename()); entry.add_callback(Arc::new(move || {
println!("parent_hierarchy: {:?}", component.parent_hierarchy()); counter_clone.fetch_add(10, Ordering::SeqCst);
println!("prefix: '{}'", component.prefix()); Ok(())
}));
// Execute and verify mixed results
let results = entry.execute_callbacks();
assert_eq!(results.len(), 3);
assert!(results[0].is_ok());
assert!(results[1].is_err());
assert!(results[2].is_ok());
// Verify error message
assert_eq!( assert_eq!(
component.basename(), results[1].as_ref().unwrap_err().to_string(),
"testcomponent", "Simulated error"
"Component basename should be 'testcomponent'"
);
assert_eq!(
component.parent_hierarchy(),
vec!["", &namespace_name],
"Component parent hierarchy should contain the generated namespace name"
);
assert_eq!(
component.prefix(),
format!("{}_testcomponent", namespace),
"Component prefix should be 'namespace_testcomponent'"
); );
// Test Endpoint hierarchy // Verify successful callbacks still executed
println!("\n=== Endpoint ==="); assert_eq!(counter.load(Ordering::SeqCst), 11); // 1 + 10
println!("basename: '{}'", endpoint.basename());
println!("parent_hierarchy: {:?}", endpoint.parent_hierarchy());
println!("prefix: '{}'", endpoint.prefix());
assert_eq!( // Execute again - errors should be consistent
endpoint.basename(), let results = entry.execute_callbacks();
"testendpoint", assert!(results[1].is_err());
"Endpoint basename should be 'testendpoint'" assert_eq!(counter.load(Ordering::SeqCst), 22); // 11 + 11
); }
assert_eq!(
endpoint.parent_hierarchy(),
vec!["", &namespace_name, "testcomponent"],
"Endpoint parent hierarchy should contain the generated namespace name"
);
assert_eq!(
endpoint.prefix(),
format!("{}_testcomponent_testendpoint", namespace),
"Endpoint prefix should be 'namespace_testcomponent_testendpoint'"
);
// Test hierarchy relationships // Test 3: Empty registry
println!("\n=== Hierarchy Relationships ==="); {
assert!( let entry = MetricsRegistryEntry::new();
namespace.parent_hierarchy().contains(&drt.basename()), let results = entry.execute_callbacks();
"Namespace should have DRT prefix in parent hierarchy" assert_eq!(results.len(), 0);
); }
assert!( }
component.parent_hierarchy().contains(&namespace.basename()), }
"Component should have Namespace prefix in parent hierarchy"
); #[cfg(feature = "integration")]
assert!( #[cfg(test)]
endpoint.parent_hierarchy().contains(&component.basename()), mod test_metricsregistry_prefixes {
"Endpoint should have Component prefix in parent hierarchy" use super::*;
); use prometheus::core::Collector;
println!("✓ All parent-child relationships verified");
// Test hierarchy depth #[test]
println!("\n=== Hierarchy Depth ==="); fn test_hierarchical_prefixes_and_parent_hierarchies() {
let drt = super::test_helpers::create_test_drt();
const DRT_NAME: &str = "";
const NAMESPACE_NAME: &str = "ns901";
const COMPONENT_NAME: &str = "comp901";
const ENDPOINT_NAME: &str = "ep901";
let namespace = drt.namespace(NAMESPACE_NAME).unwrap();
let component = namespace.component(COMPONENT_NAME).unwrap();
let endpoint = component.endpoint(ENDPOINT_NAME);
// DRT
assert_eq!(drt.basename(), DRT_NAME);
assert_eq!(drt.parent_hierarchy(), Vec::<String>::new());
assert_eq!(drt.hierarchy(), DRT_NAME);
// Namespace
assert_eq!(namespace.basename(), NAMESPACE_NAME);
assert_eq!(namespace.parent_hierarchy(), vec!["".to_string()]);
assert_eq!(namespace.hierarchy(), NAMESPACE_NAME);
// Component
assert_eq!(component.basename(), COMPONENT_NAME);
assert_eq!( assert_eq!(
drt.parent_hierarchy().len(), component.parent_hierarchy(),
0, vec!["".to_string(), NAMESPACE_NAME.to_string()]
"DRT should have 0 parent hierarchy levels"
); );
assert_eq!( assert_eq!(
namespace.parent_hierarchy().len(), component.hierarchy(),
1, format!("{}_{}", NAMESPACE_NAME, COMPONENT_NAME)
"Namespace should have 1 parent hierarchy level"
); );
// Endpoint
assert_eq!(endpoint.basename(), ENDPOINT_NAME);
assert_eq!( assert_eq!(
component.parent_hierarchy().len(), endpoint.parent_hierarchy(),
2, vec![
"Component should have 2 parent hierarchy levels" "".to_string(),
NAMESPACE_NAME.to_string(),
COMPONENT_NAME.to_string(),
]
); );
assert_eq!( assert_eq!(
endpoint.parent_hierarchy().len(), endpoint.hierarchy(),
3, format!("{}_{}_{}", NAMESPACE_NAME, COMPONENT_NAME, ENDPOINT_NAME)
"Endpoint should have 3 parent hierarchy levels"
); );
println!("✓ All hierarchy depths verified");
// Summary // Relationships
println!("\n=== Summary ==="); assert!(namespace.parent_hierarchy().contains(&drt.basename()));
println!("DRT prefix: '{}'", drt.prefix()); assert!(component.parent_hierarchy().contains(&namespace.basename()));
println!("Namespace prefix: '{}'", namespace.prefix()); assert!(endpoint.parent_hierarchy().contains(&component.basename()));
println!("Component prefix: '{}'", component.prefix());
println!("Endpoint prefix: '{}'", endpoint.prefix());
println!("All hierarchy assertions passed!");
// Test invalid namespace behavior // Depth
println!("\n=== Testing Invalid Namespace Behavior ==="); assert_eq!(drt.parent_hierarchy().len(), 0);
assert_eq!(namespace.parent_hierarchy().len(), 1);
assert_eq!(component.parent_hierarchy().len(), 2);
assert_eq!(endpoint.parent_hierarchy().len(), 3);
// Create a namespace with invalid name (contains hyphen) // Invalid namespace behavior (sanitization should still error after becoming "123")
let invalid_namespace = drt.namespace("@@123").unwrap(); let invalid_namespace = drt.namespace("@@123").unwrap();
let result = invalid_namespace.create_counter("test_counter", "A test counter", &[]);
assert!(result.is_err());
if let Err(e) = &result {
assert!(e.to_string().contains("123"));
}
// Debug: Let's see what the hierarchy looks like // Valid namespace works
println!( let valid_namespace = drt.namespace("ns567").unwrap();
"Invalid namespace basename: '{}'", assert!(valid_namespace
invalid_namespace.basename() .create_counter("test_counter", "A test counter", &[])
); .is_ok());
println!( }
"Invalid namespace parent_hierarchy: {:?}",
invalid_namespace.parent_hierarchy()
);
println!("Invalid namespace prefix: '{}'", invalid_namespace.prefix());
// Try to create a metric - this should succeed because the namespace name will be sanitized #[test]
let result = invalid_namespace.create_counter("test_counter", "A test counter", &[]); fn test_recursive_namespace() {
println!("Result with invalid namespace '@@123':"); // Create a distributed runtime for testing
println!("{:?}", result); let drt = super::test_helpers::create_test_drt();
// The result should be an error because '@@123' gets sanitized to '123' which is invalid // Create a deeply chained namespace: ns1.ns2.ns3
assert!( let ns1 = drt.namespace("ns1").unwrap();
result.is_err(), let ns2 = ns1.namespace("ns2").unwrap();
"Creating metric with namespace '@@123' should fail because it gets sanitized to '123' which is invalid" let ns3 = ns2.namespace("ns3").unwrap();
// Create a component in the deepest namespace
let component = ns3.component("test-component").unwrap();
// Verify the hierarchy structure
assert_eq!(ns1.basename(), "ns1");
assert_eq!(ns1.parent_hierarchy(), vec!("".to_string()));
assert_eq!(ns1.hierarchy(), "ns1");
assert_eq!(ns2.basename(), "ns2");
assert_eq!(
ns2.parent_hierarchy(),
vec!["".to_string(), "ns1".to_string()]
); );
assert_eq!(ns2.hierarchy(), "ns1_ns2");
// Verify the error message indicates the sanitized name is still invalid assert_eq!(ns3.basename(), "ns3");
if let Err(e) = &result { assert_eq!(
let error_msg = e.to_string(); ns3.parent_hierarchy(),
assert!( vec!["".to_string(), "ns1".to_string(), "ns2".to_string()]
error_msg.contains("123"),
"Error message should mention the sanitized name '123', got: {}",
error_msg
); );
} assert_eq!(ns3.hierarchy(), "ns1_ns2_ns3");
// For comparison, show a valid namespace works assert_eq!(component.basename(), "test-component");
let valid_namespace = drt.namespace("test_namespace").unwrap(); assert_eq!(
let valid_result = valid_namespace.create_counter("test_counter", "A test counter", &[]); component.parent_hierarchy(),
println!("Result with valid namespace 'test_namespace':"); vec![
println!("{:?}", valid_result); "".to_string(),
assert!( "ns1".to_string(),
valid_result.is_ok(), "ns2".to_string(),
"Creating metric with valid namespace should succeed" "ns3".to_string()
]
); );
assert_eq!(component.hierarchy(), "ns1_ns2_ns3_test-component");
println!("✓ Invalid namespace behavior verified!"); println!("✓ Chained namespace test passed - all prefixes correct");
} }
} }
#[cfg(feature = "integration")] #[cfg(feature = "integration")]
#[cfg(test)] #[cfg(test)]
mod test_simple_metricsregistry_trait { mod test_metricsregistry_prometheus_fmt_outputs {
use super::create_test_drt; use super::prometheus_names::name_prefix;
use super::prometheus_names::nats as nats_metrics;
use super::prometheus_names::{COMPONENT_NATS_METRICS, DRT_NATS_METRICS};
use super::*; use super::*;
use prometheus::Counter; use prometheus::Counter;
use std::sync::Arc; use std::sync::Arc;
#[test] #[test]
fn test_component_prometheus_output_contains_custom_label() { fn test_prometheusfactory_using_metrics_registry_trait() {
// Arrange: DRT → namespace → component with a custom label
let drt = create_test_drt();
let namespace = drt.namespace("testnamespace").unwrap();
let component = namespace
.component("testcomponent")
.unwrap()
.add_labels(&[("service", "api")])
.unwrap();
// Act: create a simple gauge and render Prometheus text
let gauge = component
.create_gauge("with_label", "Gauge with custom label", &[])
.unwrap();
gauge.set(1.0);
let output = component.prometheus_metrics_fmt().unwrap();
// Assert: custom label is present (don’t rely on label ordering)
assert!(
output.contains("dynamo_component_with_label{") && output.contains(r#"service="api""#),
"Expected custom label service=\"api\" in Prometheus output:\n{}",
output
);
}
#[test]
fn test_factory_methods_via_registry_trait() {
// Setup real DRT and registry using the test-friendly constructor // Setup real DRT and registry using the test-friendly constructor
let drt = create_test_drt(); let drt = super::test_helpers::create_test_drt();
// Use a simple constant namespace name // Use a simple constant namespace name
let namespace_name = "testnamespace"; let namespace_name = "ns345";
let namespace = drt.namespace(namespace_name).unwrap(); let namespace = drt.namespace(namespace_name).unwrap();
let component = namespace.component("testcomponent").unwrap(); let component = namespace.component("comp345").unwrap();
let endpoint = component.endpoint("testendpoint"); let endpoint = component.endpoint("ep345");
// Test Counter creation // Test Counter creation
let counter = endpoint let counter = endpoint
...@@ -960,15 +1081,18 @@ mod test_simple_metricsregistry_trait { ...@@ -960,15 +1081,18 @@ mod test_simple_metricsregistry_trait {
let epsilon = 0.01; let epsilon = 0.01;
assert!((counter.get() - 123.456789).abs() < epsilon); assert!((counter.get() - 123.456789).abs() < epsilon);
let endpoint_output = endpoint.prometheus_metrics_fmt().unwrap(); let endpoint_output_raw = endpoint.prometheus_metrics_fmt().unwrap();
println!("Endpoint output:"); println!("Endpoint output:");
println!("{}", endpoint_output); println!("{}", endpoint_output_raw);
// Filter out NATS service metrics for test comparison
let endpoint_output =
super::test_helpers::remove_nats_lines(&endpoint_output_raw).join("\n");
let expected_endpoint_output = format!( let expected_endpoint_output = format!(
r#"# HELP dynamo_component_testcounter A test counter r#"# HELP dynamo_component_testcounter A test counter
# TYPE dynamo_component_testcounter counter # TYPE dynamo_component_testcounter counter
dynamo_component_testcounter{{dynamo_component="testcomponent",dynamo_endpoint="testendpoint",dynamo_namespace="testnamespace"}} 123.456789 dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"}} 123.456789"#
"#
); );
assert_eq!( assert_eq!(
...@@ -988,18 +1112,21 @@ dynamo_component_testcounter{{dynamo_component="testcomponent",dynamo_endpoint=" ...@@ -988,18 +1112,21 @@ dynamo_component_testcounter{{dynamo_component="testcomponent",dynamo_endpoint="
assert_eq!(gauge.get(), 50000.0); assert_eq!(gauge.get(), 50000.0);
// Test Prometheus format output for Component (gauge + histogram) // Test Prometheus format output for Component (gauge + histogram)
let component_output = component.prometheus_metrics_fmt().unwrap(); let component_output_raw = component.prometheus_metrics_fmt().unwrap();
println!("Component output:"); println!("Component output:");
println!("{}", component_output); println!("{}", component_output_raw);
// Filter out NATS service metrics for test comparison
let component_output =
super::test_helpers::remove_nats_lines(&component_output_raw).join("\n");
let expected_component_output = format!( let expected_component_output = format!(
r#"# HELP dynamo_component_testcounter A test counter r#"# HELP dynamo_component_testcounter A test counter
# TYPE dynamo_component_testcounter counter # TYPE dynamo_component_testcounter counter
dynamo_component_testcounter{{dynamo_component="testcomponent",dynamo_endpoint="testendpoint",dynamo_namespace="testnamespace"}} 123.456789 dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"}} 123.456789
# HELP dynamo_component_testgauge A test gauge # HELP dynamo_component_testgauge A test gauge
# TYPE dynamo_component_testgauge gauge # TYPE dynamo_component_testgauge gauge
dynamo_component_testgauge{{dynamo_component="testcomponent",dynamo_namespace="testnamespace"}} 50000 dynamo_component_testgauge{{dynamo_component="comp345",dynamo_namespace="ns345"}} 50000"#
"#
); );
assert_eq!( assert_eq!(
...@@ -1018,21 +1145,24 @@ dynamo_component_testgauge{{dynamo_component="testcomponent",dynamo_namespace="t ...@@ -1018,21 +1145,24 @@ dynamo_component_testgauge{{dynamo_component="testcomponent",dynamo_namespace="t
assert_eq!(intcounter.get(), 12345); assert_eq!(intcounter.get(), 12345);
// Test Prometheus format output for Namespace (int_counter + gauge + histogram) // Test Prometheus format output for Namespace (int_counter + gauge + histogram)
let namespace_output = namespace.prometheus_metrics_fmt().unwrap(); let namespace_output_raw = namespace.prometheus_metrics_fmt().unwrap();
println!("Namespace output:"); println!("Namespace output:");
println!("{}", namespace_output); println!("{}", namespace_output_raw);
// Filter out NATS service metrics for test comparison
let namespace_output =
super::test_helpers::remove_nats_lines(&namespace_output_raw).join("\n");
let expected_namespace_output = format!( let expected_namespace_output = format!(
r#"# HELP dynamo_component_testcounter A test counter r#"# HELP dynamo_component_testcounter A test counter
# TYPE dynamo_component_testcounter counter # TYPE dynamo_component_testcounter counter
dynamo_component_testcounter{{dynamo_component="testcomponent",dynamo_endpoint="testendpoint",dynamo_namespace="testnamespace"}} 123.456789 dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"}} 123.456789
# HELP dynamo_component_testgauge A test gauge # HELP dynamo_component_testgauge A test gauge
# TYPE dynamo_component_testgauge gauge # TYPE dynamo_component_testgauge gauge
dynamo_component_testgauge{{dynamo_component="testcomponent",dynamo_namespace="testnamespace"}} 50000 dynamo_component_testgauge{{dynamo_component="comp345",dynamo_namespace="ns345"}} 50000
# HELP dynamo_component_testintcounter A test int counter # HELP dynamo_component_testintcounter A test int counter
# TYPE dynamo_component_testintcounter counter # TYPE dynamo_component_testintcounter counter
dynamo_component_testintcounter{{dynamo_namespace="testnamespace"}} 12345 dynamo_component_testintcounter{{dynamo_namespace="ns345"}} 12345"#
"#
); );
assert_eq!( assert_eq!(
...@@ -1044,45 +1174,19 @@ dynamo_component_testintcounter{{dynamo_namespace="testnamespace"}} 12345 ...@@ -1044,45 +1174,19 @@ dynamo_component_testintcounter{{dynamo_namespace="testnamespace"}} 12345
expected_namespace_output, namespace_output expected_namespace_output, namespace_output
); );
// Create a histogram with specified buckets. The Prometheus format output will
// lack labels since the DistributedRuntime is unnamed.
let histogram = drt
.create_histogram(
"testhistogram",
"A test histogram",
&[],
Some(vec![1.0, 2.5, 5.0, 10.0]),
)
.unwrap();
histogram.observe(1.5);
histogram.observe(2.5);
histogram.observe(3.5);
// Test CounterVec creation
let countervec = drt
.create_countervec(
"testcountervec",
"A test counter vector",
&["method", "status"],
&[("service", "api")],
)
.unwrap();
countervec.with_label_values(&["GET", "200"]).inc_by(10.0);
countervec.with_label_values(&["POST", "201"]).inc_by(5.0);
// Test IntGauge creation // Test IntGauge creation
let intgauge = drt let intgauge = namespace
.create_intgauge("testintgauge", "A test int gauge", &[]) .create_intgauge("testintgauge", "A test int gauge", &[])
.unwrap(); .unwrap();
intgauge.set(42); intgauge.set(42);
assert_eq!(intgauge.get(), 42); assert_eq!(intgauge.get(), 42);
// Test IntGaugeVec creation // Test IntGaugeVec creation
let intgaugevec = drt let intgaugevec = namespace
.create_intgaugevec( .create_intgaugevec(
"testintgaugevec", "testintgaugevec",
"A test int gauge vector", "A test int gauge vector",
&["instance", "status"], &["instance", "service", "status"],
&[("service", "api")], &[("service", "api")],
) )
.unwrap(); .unwrap();
...@@ -1093,22 +1197,46 @@ dynamo_component_testintcounter{{dynamo_namespace="testnamespace"}} 12345 ...@@ -1093,22 +1197,46 @@ dynamo_component_testintcounter{{dynamo_namespace="testnamespace"}} 12345
.with_label_values(&["server2", "inactive"]) .with_label_values(&["server2", "inactive"])
.set(0); .set(0);
// Test Prometheus format output for DRT (which should contain everything) // Test CounterVec creation
let drt_output = drt.prometheus_metrics_fmt().unwrap(); let countervec = endpoint
.create_countervec(
"testcountervec",
"A test counter vector",
&["method", "status"],
&[("service", "api")],
)
.unwrap();
countervec.with_label_values(&["GET", "200"]).inc_by(10.0);
countervec.with_label_values(&["POST", "201"]).inc_by(5.0);
// Test Histogram creation
let histogram = component
.create_histogram("testhistogram", "A test histogram", &[], None)
.unwrap();
histogram.observe(1.0);
histogram.observe(2.5);
histogram.observe(4.0);
// Test Prometheus format output for DRT (all metrics combined)
let drt_output_raw = drt.prometheus_metrics_fmt().unwrap();
println!("DRT output:"); println!("DRT output:");
println!("{}", drt_output); println!("{}", drt_output_raw);
// Filter out all NATS metrics for comparison
let filtered_drt_output =
super::test_helpers::remove_nats_lines(&drt_output_raw).join("\n");
let expected_drt_output = format!( let expected_drt_output = format!(
r#"# HELP dynamo_component_testcounter A test counter r#"# HELP dynamo_component_testcounter A test counter
# TYPE dynamo_component_testcounter counter # TYPE dynamo_component_testcounter counter
dynamo_component_testcounter{{dynamo_component="testcomponent",dynamo_endpoint="testendpoint",dynamo_namespace="testnamespace"}} 123.456789 dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"}} 123.456789
# HELP dynamo_component_testcountervec A test counter vector # HELP dynamo_component_testcountervec A test counter vector
# TYPE dynamo_component_testcountervec counter # TYPE dynamo_component_testcountervec counter
dynamo_component_testcountervec{{method="GET",service="api",status="200"}} 10 dynamo_component_testcountervec{{method="GET",service="api",status="200"}} 10
dynamo_component_testcountervec{{method="POST",service="api",status="201"}} 5 dynamo_component_testcountervec{{method="POST",service="api",status="201"}} 5
# HELP dynamo_component_testgauge A test gauge # HELP dynamo_component_testgauge A test gauge
# TYPE dynamo_component_testgauge gauge # TYPE dynamo_component_testgauge gauge
dynamo_component_testgauge{{dynamo_component="testcomponent",dynamo_namespace="testnamespace"}} 50000 dynamo_component_testgauge{{dynamo_component="comp345",dynamo_namespace="ns345"}} 50000
# HELP dynamo_component_testhistogram A test histogram # HELP dynamo_component_testhistogram A test histogram
# TYPE dynamo_component_testhistogram histogram # TYPE dynamo_component_testhistogram histogram
dynamo_component_testhistogram_bucket{{le="1"}} 0 dynamo_component_testhistogram_bucket{{le="1"}} 0
...@@ -1120,26 +1248,436 @@ dynamo_component_testhistogram_sum 7.5 ...@@ -1120,26 +1248,436 @@ dynamo_component_testhistogram_sum 7.5
dynamo_component_testhistogram_count 3 dynamo_component_testhistogram_count 3
# HELP dynamo_component_testintcounter A test int counter # HELP dynamo_component_testintcounter A test int counter
# TYPE dynamo_component_testintcounter counter # TYPE dynamo_component_testintcounter counter
dynamo_component_testintcounter{{dynamo_namespace="testnamespace"}} 12345 dynamo_component_testintcounter{{dynamo_namespace="ns345"}} 12345
# HELP dynamo_component_testintgauge A test int gauge # HELP dynamo_component_testintgauge A test int gauge
# TYPE dynamo_component_testintgauge gauge # TYPE dynamo_component_testintgauge gauge
dynamo_component_testintgauge 42 dynamo_component_testintgauge 42
# HELP dynamo_component_testintgaugevec A test int gauge vector # HELP dynamo_component_testintgaugevec A test int gauge vector
# TYPE dynamo_component_testintgaugevec gauge # TYPE dynamo_component_testintgaugevec gauge
dynamo_component_testintgaugevec{{instance="server1",service="api",status="active"}} 10 dynamo_component_testintgaugevec{{instance="server1",service="api",status="active"}} 10
dynamo_component_testintgaugevec{{instance="server2",service="api",status="inactive"}} 0 dynamo_component_testintgaugevec{{instance="server2",service="api",status="inactive"}} 0"#
"#
); );
assert_eq!( assert_eq!(
filtered_drt_output, expected_drt_output, filtered_drt_output, expected_drt_output,
"\n=== DRT COMPARISON FAILED ===\n\ "\n=== DRT COMPARISON FAILED ===\n\
Expected:\n{}\n\ Expected:\n{}\n\
Actual:\n{}\n\ Actual (filtered):\n{}\n\
==============================", ==============================",
expected_drt_output, filtered_drt_output expected_drt_output, filtered_drt_output
); );
println!("✓ All Prometheus format outputs verified successfully!"); println!("✓ All Prometheus format outputs verified successfully!");
} }
/// Checks the three text-filter helpers against a small, hand-written Prometheus
/// exposition sample that mixes NATS and non-NATS metrics.
#[test]
fn test_refactored_filter_functions() {
    // Twelve lines in total: 7 without "nats" in them, 5 with; 6 of the twelve are
    // sample lines (not `# HELP` / `# TYPE` comments).
    let sample = r#"# HELP dynamo_component_requests Total requests
# TYPE dynamo_component_requests counter
dynamo_component_requests 42
# HELP dynamo_component_nats_connection_state Connection state
# TYPE dynamo_component_nats_connection_state gauge
dynamo_component_nats_connection_state 1
# HELP dynamo_component_latency Response latency
# TYPE dynamo_component_latency histogram
dynamo_component_latency_bucket{le="0.1"} 10
dynamo_component_latency_bucket{le="0.5"} 25
dynamo_component_nats_total_requests 100
dynamo_component_nats_total_errors 5"#;

    // remove_nats_lines: every surviving line must be free of "nats".
    let without_nats = super::test_helpers::remove_nats_lines(sample);
    assert_eq!(without_nats.len(), 7); // 7 non-NATS lines
    assert!(without_nats.iter().all(|line| !line.contains("nats")));

    // extract_nats_lines: the complement, HELP/TYPE comment lines included.
    let only_nats = super::test_helpers::extract_nats_lines(sample);
    assert_eq!(only_nats.len(), 5); // 5 NATS lines
    assert!(!only_nats.iter().any(|line| !line.contains("nats")));

    // extract_metrics: sample lines only, no `#` comment lines.
    let samples_only = super::test_helpers::extract_metrics(sample);
    assert_eq!(samples_only.len(), 6); // 6 actual metric lines (excluding help/type)
    assert!(!samples_only
        .iter()
        .any(|line| !line.starts_with("dynamo_component") || line.starts_with("#")));

    println!("✓ All refactored filter functions work correctly!");
}
}
#[cfg(feature = "integration")]
#[cfg(test)]
mod test_metricsregistry_nats {
use super::prometheus_names::name_prefix;
use super::prometheus_names::nats as nats_metrics;
use super::prometheus_names::{COMPONENT_NATS_METRICS, DRT_NATS_METRICS};
use super::*;
use crate::pipeline::PushRouter;
use crate::{DistributedRuntime, Runtime};
use tokio::time::{sleep, Duration};
/// Verifies that a freshly created DistributedRuntime exposes exactly the metric
/// names listed in `DRT_NATS_METRICS` (after prefixing). Values are not checked.
#[test]
fn test_drt_nats_metrics() {
    // Real DRT and registry, built through the test-friendly constructor.
    let drt = super::test_helpers::create_test_drt();

    // Render everything the runtime currently exposes in Prometheus text format.
    let rendered = drt.prometheus_metrics_fmt().unwrap();
    println!("DRT output with NATS metrics:");
    println!("{}", rendered);

    // Presence check only: at least one NATS line has to be there.
    let nats_lines = super::test_helpers::extract_nats_lines(&rendered);
    assert!(
        !nats_lines.is_empty(),
        "NATS client metrics should be present"
    );

    // Reduce each sample line to its bare metric name: keep what precedes the label
    // block (`{`), then what precedes the first space (drops the value).
    // The list is NOT sorted here; it keeps the order of the rendered output.
    // NOTE(review): presumably the exposition output is already name-ordered - confirm.
    let sample_lines = super::test_helpers::extract_metrics(&rendered);
    let mut actual_names: Vec<&str> = Vec::with_capacity(sample_lines.len());
    for line in sample_lines.iter() {
        let before_labels = line.split('{').next().unwrap_or(line);
        actual_names.push(before_labels.split(' ').next().unwrap_or(before_labels));
    }

    // Expected names: every DRT-level NATS metric with the component prefix, sorted.
    let mut expected_names: Vec<String> = Vec::new();
    for metric in DRT_NATS_METRICS.iter() {
        expected_names.push(build_metric_name(metric));
    }
    expected_names.sort();

    // Show both lists so a mismatch is easy to diagnose from the test log.
    println!("actual_drt_nats_metrics_sorted: {:?}", actual_names);
    println!("expect_drt_nats_metrics_sorted: {:?}", expected_names);

    assert_eq!(
        actual_names,
        expected_names,
        "DRT_NATS_METRICS with prefix and expected_nats_metrics should be identical when sorted"
    );

    println!("✓ DistributedRuntime NATS metrics integration test passed!");
}
/// Verifies which NATS metric names exist once a component has been created.
///
/// Only names are checked, never values: the component must expose exactly the
/// prefixed `COMPONENT_NATS_METRICS`, and the runtime must then expose as many
/// sample lines as `DRT_NATS_METRICS` and `COMPONENT_NATS_METRICS` combined.
#[test]
fn test_nats_metric_names() {
    // Real DRT and registry, built through the test-friendly constructor.
    let drt = super::test_helpers::create_test_drt();

    // One namespace and one component hanging off the runtime.
    let namespace = drt.namespace("ns789").unwrap();
    let components = namespace.component("comp789").unwrap();

    // First render of the component: there must be at least one NATS line.
    let first_render = components.prometheus_metrics_fmt().unwrap();
    let nats_lines = super::test_helpers::extract_nats_lines(&first_render);
    println!("Component NATS metrics count: {}", nats_lines.len());
    assert!(
        !nats_lines.is_empty(),
        "NATS client metrics should be present"
    );

    // Second render of the component: reduce each sample line to its bare metric name
    // (text before the label block `{`, then text before the first space).
    // The list keeps the order of the rendered output; it is not sorted here.
    let second_render = components.prometheus_metrics_fmt().unwrap();
    let sample_lines = super::test_helpers::extract_metrics(&second_render);
    let mut actual_names: Vec<&str> = Vec::with_capacity(sample_lines.len());
    for line in sample_lines.iter() {
        let before_labels = line.split('{').next().unwrap_or(line);
        actual_names.push(before_labels.split(' ').next().unwrap_or(before_labels));
    }

    // Expected names: every component-level NATS metric with the prefix, sorted.
    let mut expected_names: Vec<String> = Vec::new();
    for metric in COMPONENT_NATS_METRICS.iter() {
        expected_names.push(build_metric_name(metric));
    }
    expected_names.sort();

    // Show both lists so a mismatch is easy to diagnose from the test log.
    println!("actual_component_nats_metrics_sorted: {:?}", actual_names);
    println!("expect_component_nats_metrics_sorted: {:?}", expected_names);

    assert_eq!(
        actual_names,
        expected_names,
        "COMPONENT_NATS_METRICS with prefix and expected_nats_metrics should be identical when sorted"
    );

    // The runtime-level render should now contain the DRT metrics plus the component's.
    let drt_render = drt.prometheus_metrics_fmt().unwrap();
    let all_sample_lines = super::test_helpers::extract_metrics(&drt_render);
    println!(
        "DRT and component metrics count: {}",
        all_sample_lines.len()
    );
    assert_eq!(
        all_sample_lines.len(),
        DRT_NATS_METRICS.len() + COMPONENT_NATS_METRICS.len(),
        "DRT at this point should have both the DRT and component NATS metrics"
    );

    println!("✓ Component NATS metrics integration test passed!");
}
/// Tests NATS and work-handler metric values before and after endpoint activity.
///
/// Flow:
/// 1. Start an "echo" endpoint (namespace `ns123`, component `comp123`) in a background task.
/// 2. Render the runtime's metrics and check each initial value against a `[min, max]` window.
/// 3. Create a second DistributedRuntime as a client and send ten 2000-byte messages through
///    a `PushRouter`, draining each response stream.
/// 4. Render the first runtime's metrics again and check NATS plus work-handler values
///    against post-activity windows.
///
/// A missing metric or an out-of-window value panics; setup errors propagate through `?`.
#[tokio::test]
async fn test_nats_metrics_values() -> anyhow::Result<()> {
// Minimal engine that echoes the request payload back as a single-item stream.
struct MessageHandler {}
impl MessageHandler {
fn new() -> std::sync::Arc<Self> {
std::sync::Arc::new(Self {})
}
}
#[async_trait]
impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for MessageHandler {
async fn generate(
&self,
input: SingleIn<String>,
) -> Result<ManyOut<Annotated<String>>, Error> {
let (data, ctx) = input.into_parts();
// The response is the request text, unchanged.
let response = format!("{}", data);
let stream = stream::iter(vec![Annotated::from_data(response)]);
Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
}
}
println!("\n=== Initializing DistributedRuntime ===");
let runtime = Runtime::from_current()?;
let drt = DistributedRuntime::from_settings(runtime.clone()).await?;
let namespace = drt.namespace("ns123").unwrap();
let component = namespace.component("comp123").unwrap();
let ingress = Ingress::for_engine(MessageHandler::new()).unwrap();
// Serve the "echo" endpoint from a spawned task; the handle is never awaited.
let _backend_handle = tokio::spawn(async move {
let service = component.service_builder().create().await.unwrap();
let endpoint = service.endpoint("echo").endpoint_builder().handler(ingress);
endpoint.start().await.unwrap();
});
// Fixed pause before sampling metrics - presumably to let the spawned task come up.
sleep(Duration::from_millis(500)).await;
println!("✓ Launched endpoint service in background successfully");
// First snapshot: parse every line that the helper recognises as a metric sample.
let drt_output = drt.prometheus_metrics_fmt().unwrap();
let parsed_metrics: Vec<_> = drt_output
.lines()
.filter_map(|line| super::test_helpers::parse_prometheus_metric(line))
.collect();
println!("=== Initial DRT metrics output ===");
println!("{}", drt_output);
println!("\n=== Checking Initial Metric Values ===");
// Each entry is (full metric name, inclusive minimum, inclusive maximum).
// NOTE(review): this module imports `prometheus_names::nats as nats_metrics`, yet the entries
// below use the path `nats::` - presumably resolved through `use super::*`; confirm it
// points at the metric-name module.
let initial_expected_metric_values = [
// DRT NATS metrics (ordered to match DRT_NATS_METRICS)
(build_metric_name(nats::CONNECTION_STATE), 1.0, 1.0), // Should be connected
(build_metric_name(nats::CONNECTS), 1.0, 1.0), // Should have 1 connection
(build_metric_name(nats::IN_TOTAL_BYTES), 300.0, 500.0), // window around a reference value of ~417
(build_metric_name(nats::IN_MESSAGES), 0.0, 0.0), // No messages yet
(build_metric_name(nats::OUT_OVERHEAD_BYTES), 500.0, 700.0), // window around a reference value of ~612 (includes endpoint creation overhead)
(build_metric_name(nats::OUT_MESSAGES), 0.0, 0.0), // No messages yet
// Component NATS metrics (ordered to match COMPONENT_NATS_METRICS)
(build_metric_name(nats::AVG_PROCESSING_MS), 0.0, 0.0), // No processing yet
(build_metric_name(nats::TOTAL_ERRORS), 0.0, 0.0), // No errors yet
(build_metric_name(nats::TOTAL_REQUESTS), 0.0, 0.0), // No requests yet
(build_metric_name(nats::TOTAL_PROCESSING_MS), 0.0, 0.0), // No processing yet
(build_metric_name(nats::ACTIVE_SERVICES), 0.0, 0.0), // No services yet
(build_metric_name(nats::ACTIVE_ENDPOINTS), 0.0, 0.0), // No endpoints yet
];
// Look each metric up by name (first match wins) and check it lies inside its window.
for (metric_name, min_value, max_value) in &initial_expected_metric_values {
let actual_value = parsed_metrics
.iter()
.find(|(name, _, _)| name == metric_name)
.map(|(_, _, value)| *value)
.unwrap_or_else(|| panic!("Could not find expected metric: {}", metric_name));
assert!(
actual_value >= *min_value && actual_value <= *max_value,
"Initial metric {} should be between {} and {}, but got {}",
metric_name,
min_value,
max_value,
actual_value
);
}
println!("\n=== Client Runtime to hit the endpoint ===");
// A second DistributedRuntime plays the client; the metrics checked later still come from `drt`.
let client_runtime = Runtime::from_current()?;
let client_distributed = DistributedRuntime::from_settings(client_runtime.clone()).await?;
let namespace = client_distributed.namespace("ns123")?;
let component = namespace.component("comp123")?;
let client = component.endpoint("echo").client().await?;
client.wait_for_instances().await?;
println!("✓ Connected to endpoint, waiting for instances...");
let router =
PushRouter::<String, Annotated<String>>::from_client(client, Default::default())
.await?;
// Ten requests; `i` is a single digit, so each payload is exactly 2000 bytes.
for i in 0..10 {
let msg = i.to_string().repeat(2000); // 2k bytes message
let mut stream = router.random(msg.clone().into()).await?;
// Drain the response stream; a mismatch with the request is only printed, not asserted.
while let Some(resp) = stream.next().await {
// Check if response matches the original message
if let Some(data) = &resp.data {
let is_same = data == &msg;
println!(
"Response {}: {} bytes, matches original: {}",
i,
data.len(),
is_same
);
}
}
sleep(Duration::from_millis(100)).await;
}
println!("✓ Sent messages and received responses successfully");
// Second snapshot, taken from the serving runtime after the traffic.
let final_drt_output = drt.prometheus_metrics_fmt().unwrap();
println!("\n=== Final Prometheus DRT output ===");
println!("{}", final_drt_output);
let final_drt_nats_output = super::test_helpers::extract_nats_lines(&final_drt_output);
println!("\n=== Filtered NATS metrics from final DRT output ===");
for line in &final_drt_nats_output {
println!("{}", line);
}
let final_parsed_metrics: Vec<_> = super::test_helpers::extract_metrics(&final_drt_output)
.iter()
.filter_map(|line| super::test_helpers::parse_prometheus_metric(line))
.collect();
// Each entry is (full metric name, inclusive minimum, inclusive maximum).
let post_expected_metric_values = [
// DRT NATS metrics (ordered to match DRT_NATS_METRICS)
(build_metric_name(nats::CONNECTION_STATE), 1.0, 1.0), // Should remain connected
(build_metric_name(nats::CONNECTS), 1.0, 1.0), // Should remain 1 connection
(build_metric_name(nats::IN_TOTAL_BYTES), 22000.0, 28000.0), // window around a reference value of ~24977 (10 messages × 2000 bytes + overhead)
(build_metric_name(nats::IN_MESSAGES), 10.0, 12.0), // Allow small drift (callback may run twice)
(build_metric_name(nats::OUT_OVERHEAD_BYTES), 2076.0, 3461.0), // ~75% to ~125% of 2769 (synchronous metrics collection overhead)
(build_metric_name(nats::OUT_MESSAGES), 10.0, 12.0), // Allow small drift (callback may run twice)
// Component NATS metrics (ordered to match COMPONENT_NATS_METRICS)
(build_metric_name(nats::AVG_PROCESSING_MS), 0.0, 1.0), // Should be low processing time
(build_metric_name(nats::TOTAL_ERRORS), 0.0, 0.0), // Should have no errors
(build_metric_name(nats::TOTAL_REQUESTS), 0.0, 0.0), // NATS metrics don't track work handler requests
(build_metric_name(nats::TOTAL_PROCESSING_MS), 0.0, 5.0), // Should be low total processing time
(build_metric_name(nats::ACTIVE_SERVICES), 0.0, 0.0), // NATS metrics don't track work handler services
(build_metric_name(nats::ACTIVE_ENDPOINTS), 0.0, 0.0), // NATS metrics don't track work handler endpoints
// Work handler metrics with ranges
(build_metric_name(work_handler::REQUESTS_TOTAL), 10.0, 10.0), // Exact count (10 messages)
(
build_metric_name(work_handler::REQUEST_BYTES_TOTAL),
21000.0,
26000.0,
), // window around a reference value of ~23520 (10 × 2000 bytes + overhead)
(
build_metric_name(work_handler::RESPONSE_BYTES_TOTAL),
18000.0,
23000.0,
), // window around a reference value of ~20660 (10 × 2000 bytes + overhead, but response size varies)
// Additional component metrics
(
build_metric_name(work_handler::CONCURRENT_REQUESTS),
0.0,
1.0,
), // Should be 0 or very low
// Histogram series: `_count` and `_sum` are looked up as separate metric names.
(
format!(
"{}_count",
build_metric_name(work_handler::REQUEST_DURATION_SECONDS)
),
10.0,
10.0,
), // Exact count (10 messages)
(
format!(
"{}_sum",
build_metric_name(work_handler::REQUEST_DURATION_SECONDS)
),
0.001,
0.999,
), // Processing time sum (10 messages), in seconds per the metric name
];
println!("\n=== Checking Post-Activity All Metrics (NATS + Work Handler) ===");
// Same lookup-and-window check as for the initial snapshot, plus a log line per metric.
for (metric_name, min_value, max_value) in &post_expected_metric_values {
let actual_value = final_parsed_metrics
.iter()
.find(|(name, _, _)| name == metric_name)
.map(|(_, _, value)| *value)
.unwrap_or_else(|| {
panic!(
"Could not find expected post-activity metric: {}",
metric_name
)
});
assert!(
actual_value >= *min_value && actual_value <= *max_value,
"Post-activity metric {} should be between {} and {}, but got {}",
metric_name,
min_value,
max_value,
actual_value
);
println!(
"✓ {}: {} (range: {} to {})",
metric_name, actual_value, min_value, max_value
);
}
// NOTE(review): the summary lines below are informational only; e.g. no assertion in this
// test checks a ">= 100 bytes" threshold specifically.
println!("✓ All NATS and component metrics parsed successfully!");
println!("✓ Byte metrics verified to be >= 100 bytes!");
println!("✓ Post-activity metrics verified with higher thresholds!");
println!("✓ Work handler metrics reflect increased activity!");
Ok(())
}
} }
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Prometheus metric name constants
//!
//! This module provides centralized Prometheus metric name constants for various components
//! to ensure consistency and avoid duplication across the codebase.
/// Returns `metric_name` with the component prefix (`name_prefix::COMPONENT`) in front.
///
/// No separator is inserted and `metric_name` is not validated or altered.
pub fn build_metric_name(metric_name: &str) -> String {
    let prefix = name_prefix::COMPONENT;
    let mut full_name = String::with_capacity(prefix.len() + metric_name.len());
    full_name.push_str(prefix);
    full_name.push_str(metric_name);
    full_name
}
/// Prefixes that are put in front of metric names by the metrics system.
pub mod name_prefix {
    /// Leading segment of every Prometheus metric name built with `build_metric_name`.
    pub const COMPONENT: &str = "dynamo_component_";

    // TODO(keivenc): uncomment below for the frontend
    // pub const FRONTEND: &str = "dynamo_frontend_";
}
/// Names of the Prometheus labels that the metrics system inserts automatically.
///
/// Listed from the outermost scope (namespace) to the innermost (endpoint).
pub mod labels {
    /// Label carrying the namespace a metric belongs to
    pub const NAMESPACE: &str = "dynamo_namespace";

    /// Label carrying the component a metric belongs to
    pub const COMPONENT: &str = "dynamo_component";

    /// Label carrying the endpoint a metric belongs to
    pub const ENDPOINT: &str = "dynamo_endpoint";
}
/// NATS Prometheus metric names
///
/// Each constant is the metric name *without* the component prefix; pass it to
/// `build_metric_name` to obtain the full Prometheus name. Every name starts
/// with [`PREFIX`].
pub mod nats {
    /// Prefix for all NATS client metrics
    pub const PREFIX: &str = "nats_";

    // ===== DistributedRuntime metrics =====
    // (Plain `//` on purpose: a `///` banner would be rendered as part of the
    // documentation of the constant that follows it.)

    /// Total number of bytes received by NATS client
    pub const IN_TOTAL_BYTES: &str = "nats_in_total_bytes";

    /// Total number of bytes sent by NATS client
    // NOTE(review): the name says "overhead" bytes - confirm whether payload bytes are included.
    pub const OUT_OVERHEAD_BYTES: &str = "nats_out_overhead_bytes";

    /// Total number of messages received by NATS client
    pub const IN_MESSAGES: &str = "nats_in_messages";

    /// Total number of messages sent by NATS client
    pub const OUT_MESSAGES: &str = "nats_out_messages";

    /// Total number of connections established by NATS client
    pub const CONNECTS: &str = "nats_connects";

    /// Current connection state of NATS client (0=disconnected, 1=connected, 2=reconnecting)
    pub const CONNECTION_STATE: &str = "nats_connection_state";

    // ===== Component metrics (ordered to match NatsStatsMetrics fields) =====

    /// Average processing time in milliseconds (maps to: average_processing_time in ms)
    pub const AVG_PROCESSING_MS: &str = "nats_avg_processing_time_ms";

    /// Total errors across all endpoints (maps to: num_errors)
    pub const TOTAL_ERRORS: &str = "nats_total_errors";

    /// Total requests across all endpoints (maps to: num_requests)
    pub const TOTAL_REQUESTS: &str = "nats_total_requests";

    /// Total processing time in milliseconds (maps to: processing_time in ms)
    pub const TOTAL_PROCESSING_MS: &str = "nats_total_processing_time_ms";

    /// Number of active services (derived from ServiceSet.services)
    pub const ACTIVE_SERVICES: &str = "nats_active_services";

    /// Number of active endpoints (derived from ServiceInfo.endpoints)
    pub const ACTIVE_ENDPOINTS: &str = "nats_active_endpoints";
}
/// All NATS client Prometheus metric names as an array for iteration/validation
///
/// Entries are the un-prefixed names from the `nats` module; callers that compare against
/// rendered output run them through `build_metric_name` first. The list is not in
/// lexicographic order, so sort a copy before comparing with name-ordered output.
pub const DRT_NATS_METRICS: &[&str] = &[
nats::CONNECTION_STATE,
nats::CONNECTS,
nats::IN_TOTAL_BYTES,
nats::IN_MESSAGES,
nats::OUT_OVERHEAD_BYTES,
nats::OUT_MESSAGES,
];
/// All component service Prometheus metric names as an array for iteration/validation
/// (ordered to match NatsStatsMetrics fields)
///
/// Entries are the un-prefixed names from the `nats` module. The two `_MS` metrics are named
/// in milliseconds; the trailing notes give the unit of the *source* field, not of the metric.
pub const COMPONENT_NATS_METRICS: &[&str] = &[
nats::AVG_PROCESSING_MS, // maps to: average_processing_time (source field in nanoseconds; metric named in ms - conversion presumably done by the collector, confirm)
nats::TOTAL_ERRORS, // maps to: num_errors
nats::TOTAL_REQUESTS, // maps to: num_requests
nats::TOTAL_PROCESSING_MS, // maps to: processing_time (source field in nanoseconds; metric named in ms - conversion presumably done by the collector, confirm)
nats::ACTIVE_SERVICES, // derived from ServiceSet.services
nats::ACTIVE_ENDPOINTS, // derived from ServiceInfo.endpoints
];
/// Prometheus metric names reported for the work handler.
///
/// Each constant is the metric name without the component prefix.
pub mod work_handler {
    /// Counter: requests the work handler has processed
    pub const REQUESTS_TOTAL: &str = "requests_total";

    /// Counter: bytes the work handler has received in requests
    pub const REQUEST_BYTES_TOTAL: &str = "request_bytes_total";

    /// Counter: bytes the work handler has sent in responses
    pub const RESPONSE_BYTES_TOTAL: &str = "response_bytes_total";

    /// Gauge: requests the work handler is processing right now
    pub const CONCURRENT_REQUESTS: &str = "concurrent_requests";

    /// Histogram: time the work handler spends processing a request
    pub const REQUEST_DURATION_SECONDS: &str = "request_duration_seconds";
}
...@@ -19,13 +19,22 @@ ...@@ -19,13 +19,22 @@
// we will want to associate the components cancellation token with the // we will want to associate the components cancellation token with the
// component's "service state" // component's "service state"
use crate::{error, transports::nats, utils::stream, Result}; use crate::{
component::Component,
error,
metrics::{prometheus_names, MetricsRegistry},
traits::*,
transports::nats,
utils::stream,
DistributedRuntime, Result,
};
use async_nats::Message; use async_nats::Message;
use async_stream::try_stream; use async_stream::try_stream;
use bytes::Bytes; use bytes::Bytes;
use derive_getters::Dissolve; use derive_getters::Dissolve;
use futures::stream::{StreamExt, TryStreamExt}; use futures::stream::{StreamExt, TryStreamExt};
use prometheus;
use serde::{de::DeserializeOwned, Deserialize, Serialize}; use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::time::Duration; use std::time::Duration;
...@@ -39,11 +48,55 @@ impl ServiceClient { ...@@ -39,11 +48,55 @@ impl ServiceClient {
} }
} }
/// ServiceSet contains a collection of services with their endpoints and metrics
///
/// Tree structure:
/// - ServiceSet
/// - services: Vec<ServiceInfo>
/// - name: String
/// - id: String
/// - version: String
/// - started: String
/// - endpoints: Vec<EndpointInfo>
/// - name: String
/// - subject: String
/// - data: Option<NatsStatsMetrics>
/// - average_processing_time: u64 (nanoseconds)
/// - last_error: String
/// - num_errors: u64
/// - num_requests: u64
/// - processing_time: u64
/// - queue_group: String
/// - data: serde_json::Value (custom stats)
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceSet { pub struct ServiceSet {
services: Vec<ServiceInfo>, services: Vec<ServiceInfo>,
} }
/// This is an example JSON response from `nats req '$SRV.STATS.dynamo_backend'`:
/// {
/// "type": "io.nats.micro.v1.stats_response",
/// "name": "dynamo_backend",
/// "id": "bdu7nA8tbhy9mEkxIWlkBA",
/// "version": "0.0.1",
/// "started": "2025-08-08T05:07:17.720783523Z",
/// "endpoints": [
/// {
/// "name": "dynamo_backend-generate-694d988806b92e39",
/// "subject": "dynamo_backend.generate-694d988806b92e39",
/// "num_requests": 0,
/// "num_errors": 0,
/// "processing_time": 0,
/// "average_processing_time": 0,
/// "last_error": "",
/// "data": {
/// "val": 10
/// },
/// "queue_group": "q"
/// }
/// ]
/// }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceInfo { pub struct ServiceInfo {
pub name: String, pub name: String,
...@@ -53,13 +106,15 @@ pub struct ServiceInfo { ...@@ -53,13 +106,15 @@ pub struct ServiceInfo {
pub endpoints: Vec<EndpointInfo>, pub endpoints: Vec<EndpointInfo>,
} }
/// Each endpoint has name, subject, num_requests, num_errors, processing_time, average_processing_time, last_error, queue_group, and data
#[derive(Debug, Clone, Serialize, Deserialize, Dissolve)] #[derive(Debug, Clone, Serialize, Deserialize, Dissolve)]
pub struct EndpointInfo { pub struct EndpointInfo {
pub name: String, pub name: String,
pub subject: String, pub subject: String,
/// All remaining endpoint fields (everything except `name` and `subject`) are flattened into the NatsStatsMetrics struct.
#[serde(flatten)] #[serde(flatten)]
pub data: Option<Metrics>, pub data: Option<NatsStatsMetrics>,
} }
impl EndpointInfo { impl EndpointInfo {
...@@ -79,20 +134,21 @@ impl EndpointInfo { ...@@ -79,20 +134,21 @@ impl EndpointInfo {
// for easy deserialization. Ideally, this type already exists or can // for easy deserialization. Ideally, this type already exists or can
// be exposed in the library somewhere. // be exposed in the library somewhere.
/// Stats structure returned from NATS service API /// Stats structure returned from NATS service API
/// https://github.com/nats-io/nats.rs/blob/main/async-nats/src/service/endpoint.rs
#[derive(Debug, Clone, Serialize, Deserialize, Dissolve)] #[derive(Debug, Clone, Serialize, Deserialize, Dissolve)]
pub struct Metrics { pub struct NatsStatsMetrics {
// Standard NATS Service API fields // Standard NATS Stats Service API fields from $SRV.STATS.<service_name> requests
pub average_processing_time: f64, pub average_processing_time: u64, // in nanoseconds according to nats-io
pub last_error: String, pub last_error: String,
pub num_errors: u64, pub num_errors: u64,
pub num_requests: u64, pub num_requests: u64,
pub processing_time: u64, pub processing_time: u64, // in nanoseconds according to nats-io
pub queue_group: String, pub queue_group: String,
// Field containing custom stats handler data // Field containing custom stats handler data
pub data: serde_json::Value, pub data: serde_json::Value,
} }
impl Metrics { impl NatsStatsMetrics {
pub fn decode<T: for<'de> Deserialize<'de>>(self) -> Result<T> { pub fn decode<T: for<'de> Deserialize<'de>>(self) -> Result<T> {
serde_json::from_value(self.data).map_err(Into::into) serde_json::from_value(self.data).map_err(Into::into)
} }
...@@ -154,6 +210,11 @@ impl ServiceSet { ...@@ -154,6 +210,11 @@ impl ServiceSet {
.into_iter() .into_iter()
.flat_map(|s| s.endpoints.into_iter()) .flat_map(|s| s.endpoints.into_iter())
} }
/// Borrow the services held by this ServiceSet as a slice
pub fn services(&self) -> &[ServiceInfo] {
    self.services.as_slice()
}
} }
#[cfg(test)] #[cfg(test)]
...@@ -173,8 +234,8 @@ mod tests { ...@@ -173,8 +234,8 @@ mod tests {
EndpointInfo { EndpointInfo {
name: "endpoint1".to_string(), name: "endpoint1".to_string(),
subject: "subject1".to_string(), subject: "subject1".to_string(),
data: Some(Metrics { data: Some(NatsStatsMetrics {
average_processing_time: 0.1, average_processing_time: 100_000, // 0.1ms = 100,000 nanoseconds
last_error: "none".to_string(), last_error: "none".to_string(),
num_errors: 0, num_errors: 0,
num_requests: 10, num_requests: 10,
...@@ -186,8 +247,8 @@ mod tests { ...@@ -186,8 +247,8 @@ mod tests {
EndpointInfo { EndpointInfo {
name: "endpoint2-foo".to_string(), name: "endpoint2-foo".to_string(),
subject: "subject2".to_string(), subject: "subject2".to_string(),
data: Some(Metrics { data: Some(NatsStatsMetrics {
average_processing_time: 0.1, average_processing_time: 100_000, // 0.1ms = 100,000 nanoseconds
last_error: "none".to_string(), last_error: "none".to_string(),
num_errors: 0, num_errors: 0,
num_requests: 10, num_requests: 10,
...@@ -207,8 +268,8 @@ mod tests { ...@@ -207,8 +268,8 @@ mod tests {
EndpointInfo { EndpointInfo {
name: "endpoint1".to_string(), name: "endpoint1".to_string(),
subject: "subject1".to_string(), subject: "subject1".to_string(),
data: Some(Metrics { data: Some(NatsStatsMetrics {
average_processing_time: 0.1, average_processing_time: 100_000, // 0.1ms = 100,000 nanoseconds
last_error: "none".to_string(), last_error: "none".to_string(),
num_errors: 0, num_errors: 0,
num_requests: 10, num_requests: 10,
...@@ -220,8 +281,8 @@ mod tests { ...@@ -220,8 +281,8 @@ mod tests {
EndpointInfo { EndpointInfo {
name: "endpoint2-bar".to_string(), name: "endpoint2-bar".to_string(),
subject: "subject2".to_string(), subject: "subject2".to_string(),
data: Some(Metrics { data: Some(NatsStatsMetrics {
average_processing_time: 0.1, average_processing_time: 100_000, // 0.1ms = 100,000 nanoseconds
last_error: "none".to_string(), last_error: "none".to_string(),
num_errors: 0, num_errors: 0,
num_requests: 10, num_requests: 10,
...@@ -244,3 +305,135 @@ mod tests { ...@@ -244,3 +305,135 @@ mod tests {
assert_eq!(endpoints.len(), 2); assert_eq!(endpoints.len(), 2);
} }
} }
/// Prometheus gauges summarising NATS service statistics for one component
/// (fields ordered to match NatsStatsMetrics).
///
/// IMPORTANT: these gauges hold COPIES of NATS data, not live references.
///
/// How it works:
/// 1. NATS provides the source data, deserialized into NatsStatsMetrics
/// 2. `update_from_service_set()` aggregates a scraped ServiceSet and overwrites every gauge
/// 3. Prometheus scrapes the gauge values (snapshots, not live data)
///
/// Flow: NATS Service → NatsStatsMetrics → Metrics Callback → Prometheus Gauge
/// Note: the system status server's metrics handler calls execute_metrics_callbacks()
/// before formatting its output; the values are only as fresh as the last update.
#[derive(Debug, Clone)]
pub struct ComponentNatsPrometheusMetrics {
/// Average processing time per request in milliseconds: summed `processing_time`
/// divided by summed `num_requests`, converted from nanoseconds; 0.0 when there are no requests
pub avg_processing_ms: prometheus::Gauge,
/// Sum of `num_errors` across all endpoints of all services
pub total_errors: prometheus::IntGauge,
/// Sum of `num_requests` across all endpoints of all services
pub total_requests: prometheus::IntGauge,
/// Sum of `processing_time` across all endpoints, converted from nanoseconds to whole milliseconds
pub total_processing_ms: prometheus::IntGauge,
/// Number of services in the scraped ServiceSet (length of ServiceSet.services)
pub active_services: prometheus::IntGauge,
/// Number of endpoints across all services (counted whether or not the endpoint carries stats)
pub active_endpoints: prometheus::IntGauge,
}
impl ComponentNatsPrometheusMetrics {
    /// Nanoseconds per millisecond; NATS stats report processing times in nanoseconds.
    const NANOS_PER_MILLI: u64 = 1_000_000;

    /// Convert an unsigned counter into a gauge value, clamping at `i64::MAX`
    /// instead of wrapping to a negative number.
    fn clamp_to_i64(value: u64) -> i64 {
        value.min(i64::MAX as u64) as i64
    }

    /// Create a new `ComponentNatsPrometheusMetrics` using the Component's Prometheus constructors.
    ///
    /// The gauges are created in field order, each without extra labels.
    ///
    /// # Errors
    /// Propagates the first error returned by `create_gauge` / `create_intgauge`.
    pub fn new(component: &Component) -> Result<Self> {
        // Struct-literal fields are evaluated top to bottom, so the gauges are
        // created in the same order as the fields are declared.
        Ok(Self {
            avg_processing_ms: component.create_gauge(
                prometheus_names::nats::AVG_PROCESSING_MS,
                "Average processing time across all component endpoints in milliseconds",
                &[],
            )?,
            total_errors: component.create_intgauge(
                prometheus_names::nats::TOTAL_ERRORS,
                "Total number of errors across all component endpoints",
                &[],
            )?,
            total_requests: component.create_intgauge(
                prometheus_names::nats::TOTAL_REQUESTS,
                "Total number of requests across all component endpoints",
                &[],
            )?,
            total_processing_ms: component.create_intgauge(
                prometheus_names::nats::TOTAL_PROCESSING_MS,
                "Total processing time across all component endpoints in milliseconds",
                &[],
            )?,
            active_services: component.create_intgauge(
                prometheus_names::nats::ACTIVE_SERVICES,
                "Number of active services in this component",
                &[],
            )?,
            active_endpoints: component.create_intgauge(
                prometheus_names::nats::ACTIVE_ENDPOINTS,
                "Number of active endpoints across all services",
                &[],
            )?,
        })
    }

    /// Update every gauge from scraped ServiceSet data.
    ///
    /// Endpoints without stats (`data == None`) still count towards
    /// `active_endpoints` but contribute nothing to the totals. All sums
    /// saturate rather than overflow.
    pub fn update_from_service_set(&self, service_set: &ServiceSet) {
        let services = service_set.services();

        // Accumulators, ordered to match the NatsStatsMetrics fields
        let mut total_errors = 0u64; // maps to: num_errors
        let mut total_requests = 0u64; // maps to: num_requests
        let mut total_processing_time_nanos = 0u64; // maps to: processing_time
        let mut endpoint_count = 0u64; // derived

        for endpoint in services.iter().flat_map(|service| service.endpoints.iter()) {
            endpoint_count = endpoint_count.saturating_add(1);
            if let Some(stats) = endpoint.data.as_ref() {
                total_errors = total_errors.saturating_add(stats.num_errors);
                total_requests = total_requests.saturating_add(stats.num_requests);
                total_processing_time_nanos =
                    total_processing_time_nanos.saturating_add(stats.processing_time);
            }
        }

        // Request-weighted average over all endpoints (maps to: average_processing_time).
        // A non-zero request total already implies at least one endpoint reported
        // requests, so no separate sample counter is needed.
        let avg_time_ms = if total_requests > 0 {
            let avg_time_nanos = total_processing_time_nanos as f64 / total_requests as f64;
            avg_time_nanos / Self::NANOS_PER_MILLI as f64
        } else {
            0.0
        };

        self.avg_processing_ms.set(avg_time_ms);
        self.total_errors.set(Self::clamp_to_i64(total_errors));
        self.total_requests.set(Self::clamp_to_i64(total_requests));
        // Integer division: sub-millisecond remainders are dropped
        self.total_processing_ms.set(Self::clamp_to_i64(
            total_processing_time_nanos / Self::NANOS_PER_MILLI,
        ));
        self.active_services
            .set(Self::clamp_to_i64(services.len() as u64));
        self.active_endpoints.set(Self::clamp_to_i64(endpoint_count));
    }

    /// Reset all metrics to zero. Useful when no data is available or to clear stale values.
    pub fn reset_to_zeros(&self) {
        self.avg_processing_ms.set(0.0);
        for gauge in [
            &self.total_errors,
            &self.total_requests,
            &self.total_processing_ms,
            &self.active_services,
            &self.active_endpoints,
        ] {
            gauge.set(0);
        }
    }
}
...@@ -209,6 +209,7 @@ pub async fn spawn_system_status_server( ...@@ -209,6 +209,7 @@ pub async fn spawn_system_status_server(
tracing::error!("System status server error: {}", e); tracing::error!("System status server error: {}", e);
} }
}); });
Ok((actual_address, handle)) Ok((actual_address, handle))
} }
...@@ -254,7 +255,18 @@ async fn metrics_handler(state: Arc<SystemStatusState>) -> impl IntoResponse { ...@@ -254,7 +255,18 @@ async fn metrics_handler(state: Arc<SystemStatusState>) -> impl IntoResponse {
// Update the uptime gauge with current value // Update the uptime gauge with current value
state.update_uptime_gauge(); state.update_uptime_gauge();
// Get metrics from the registry // Execute all the callbacks starting at the DistributedRuntime level
assert!(state.drt().basename() == "");
let callback_results = state
.drt()
.execute_metrics_callbacks(&state.drt().hierarchy());
for result in callback_results {
if let Err(e) = result {
tracing::error!("Error executing metrics callback: {}", e);
}
}
// Get all metrics from DistributedRuntime (top-level)
match state.drt().prometheus_metrics_fmt() { match state.drt().prometheus_metrics_fmt() {
Ok(response) => (StatusCode::OK, response), Ok(response) => (StatusCode::OK, response),
Err(e) => { Err(e) => {
...@@ -341,12 +353,20 @@ mod tests { ...@@ -341,12 +353,20 @@ mod tests {
let response = runtime_metrics.drt().prometheus_metrics_fmt().unwrap(); let response = runtime_metrics.drt().prometheus_metrics_fmt().unwrap();
println!("Full metrics response:\n{}", response); println!("Full metrics response:\n{}", response);
// Filter out NATS client metrics for comparison
use crate::metrics::prometheus_names::nats as nats_metrics;
let filtered_response: String = response
.lines()
.filter(|line| !line.contains(nats_metrics::PREFIX))
.collect::<Vec<_>>()
.join("\n");
let expected = "\ let expected = "\
# HELP dynamo_component_dynamo_uptime_seconds Total uptime of the DistributedRuntime in seconds # HELP dynamo_component_dynamo_uptime_seconds Total uptime of the DistributedRuntime in seconds
# TYPE dynamo_component_dynamo_uptime_seconds gauge # TYPE dynamo_component_dynamo_uptime_seconds gauge
dynamo_component_dynamo_uptime_seconds 42 dynamo_component_dynamo_uptime_seconds 42";
"; assert_eq!(filtered_response, expected);
assert_eq!(response, expected);
} }
#[cfg(feature = "integration")] #[cfg(feature = "integration")]
......
...@@ -632,15 +632,11 @@ mod tests { ...@@ -632,15 +632,11 @@ mod tests {
.id(); .id();
// Create the key // Create the key
let result = client let result = client.kv_create(key, value.to_vec(), Some(lease_id)).await;
.kv_create(key.to_string(), value.to_vec(), Some(lease_id))
.await;
assert!(result.is_ok(), ""); assert!(result.is_ok(), "");
// Try to create the key again - this should fail // Try to create the key again - this should fail
let result = client let result = client.kv_create(key, value.to_vec(), Some(lease_id)).await;
.kv_create(key.to_string(), value.to_vec(), Some(lease_id))
.await;
assert!(result.is_err()); assert!(result.is_err());
// Create or validate should succeed as the values match // Create or validate should succeed as the values match
......
...@@ -28,20 +28,23 @@ ...@@ -28,20 +28,23 @@
//! - `NATS_AUTH_CREDENTIALS_FILE`: the path to the credentials file //! - `NATS_AUTH_CREDENTIALS_FILE`: the path to the credentials file
//! //!
//! Note: `NATS_AUTH_USERNAME` and `NATS_AUTH_PASSWORD` must be used together. //! Note: `NATS_AUTH_USERNAME` and `NATS_AUTH_PASSWORD` must be used together.
use crate::Result; use crate::{metrics::MetricsRegistry, Result};
use async_nats::connection::State;
use async_nats::{client, jetstream, Subscriber}; use async_nats::{client, jetstream, Subscriber};
use bytes::Bytes; use bytes::Bytes;
use derive_builder::Builder; use derive_builder::Builder;
use futures::{StreamExt, TryStreamExt}; use futures::{StreamExt, TryStreamExt};
use prometheus::{Counter, Gauge, Histogram, HistogramOpts, IntCounter, IntGauge, Opts, Registry};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::Arc; use std::sync::atomic::Ordering;
use tokio::fs::File as TokioFile; use tokio::fs::File as TokioFile;
use tokio::io::AsyncRead; use tokio::io::AsyncRead;
use tokio::time; use tokio::time;
use url::Url; use url::Url;
use validator::{Validate, ValidationError}; use validator::{Validate, ValidationError};
use crate::metrics::prometheus_names::nats as nats_metrics;
pub use crate::slug::Slug; pub use crate::slug::Slug;
use tracing as log; use tracing as log;
...@@ -504,6 +507,109 @@ impl NatsQueue { ...@@ -504,6 +507,109 @@ impl NatsQueue {
} }
} }
/// Prometheus metrics that mirror the NATS client statistics (in primitive types)
/// to be used for the System Status Server.
///
/// IMPORTANT: these gauges hold COPIES of NATS client data, not live references.
///
/// How it works:
/// 1. The NATS client provides source data via client.statistics() and connection_state()
/// 2. set_from_client_stats() reads the current NATS values and overwrites these gauges
/// 3. Prometheus scrapes the gauge values (snapshots, not live data)
///
/// Flow: NATS Client → Client Statistics → set_from_client_stats() → Prometheus Gauge
/// Note: the values are only as fresh as the last set_from_client_stats() call.
#[derive(Debug, Clone)]
pub struct DRTNatsPrometheusMetrics {
// Client handle whose statistics and connection state are read by set_from_client_stats()
nats_client: client::Client,
/// Number of bytes received (excluding protocol overhead)
pub in_bytes: IntGauge,
/// Number of bytes sent (excluding protocol overhead)
pub out_bytes: IntGauge,
/// Number of messages received
pub in_messages: IntGauge,
/// Number of messages sent
pub out_messages: IntGauge,
/// Number of times connection was established
pub connects: IntGauge,
/// Current connection state as written by set_from_client_stats():
/// 1 = connected, 0 = disconnected or pending
pub connection_state: IntGauge,
}
impl DRTNatsPrometheusMetrics {
    /// Convert an unsigned counter into a gauge value, clamping at `i64::MAX`
    /// instead of wrapping to a negative number.
    fn clamp_to_i64(value: u64) -> i64 {
        value.min(i64::MAX as u64) as i64
    }

    /// Create a new instance of NATS client metrics using a DistributedRuntime's
    /// Prometheus constructors.
    ///
    /// `nats_client` is stored so that `set_from_client_stats()` can read its
    /// statistics and connection state later.
    ///
    /// # Errors
    /// Propagates the first error returned by `create_intgauge`.
    pub fn new(drt: &crate::DistributedRuntime, nats_client: client::Client) -> Result<Self> {
        // Struct-literal fields are evaluated top to bottom, so the gauges are
        // created in the same order as before.
        Ok(Self {
            // NOTE(review): the constant names (IN_TOTAL_BYTES / OUT_OVERHEAD_BYTES) do not
            // obviously match the values stored here (statistics in_bytes / out_bytes) —
            // confirm against prometheus_names::nats.
            in_bytes: drt.create_intgauge(
                nats_metrics::IN_TOTAL_BYTES,
                "Total number of bytes received by NATS client",
                &[],
            )?,
            out_bytes: drt.create_intgauge(
                nats_metrics::OUT_OVERHEAD_BYTES,
                "Total number of bytes sent by NATS client",
                &[],
            )?,
            in_messages: drt.create_intgauge(
                nats_metrics::IN_MESSAGES,
                "Total number of messages received by NATS client",
                &[],
            )?,
            out_messages: drt.create_intgauge(
                nats_metrics::OUT_MESSAGES,
                "Total number of messages sent by NATS client",
                &[],
            )?,
            connects: drt.create_intgauge(
                nats_metrics::CONNECTS,
                "Total number of connections established by NATS client",
                &[],
            )?,
            // Help text lists only the values set_from_client_stats() can actually write.
            connection_state: drt.create_intgauge(
                nats_metrics::CONNECTION_STATE,
                "Current connection state of NATS client (0=disconnected or pending, 1=connected)",
                &[],
            )?,
            nats_client,
        })
    }

    /// Copy statistics from the stored NATS client to these Prometheus metrics.
    ///
    /// Gauges (rather than counters) are used so the absolute values read from
    /// the client can be written directly.
    pub fn set_from_client_stats(&self) {
        let stats = self.nats_client.statistics();

        // Connected maps to 1; Disconnected and Pending are both treated as "down".
        let connection_state = match self.nats_client.connection_state() {
            State::Connected => 1,
            State::Disconnected | State::Pending => 0,
        };

        // Relaxed loads: each value is an independent snapshot.
        self.in_bytes
            .set(Self::clamp_to_i64(stats.in_bytes.load(Ordering::Relaxed)));
        self.out_bytes
            .set(Self::clamp_to_i64(stats.out_bytes.load(Ordering::Relaxed)));
        self.in_messages
            .set(Self::clamp_to_i64(stats.in_messages.load(Ordering::Relaxed)));
        self.out_messages
            .set(Self::clamp_to_i64(stats.out_messages.load(Ordering::Relaxed)));
        self.connects
            .set(Self::clamp_to_i64(stats.connects.load(Ordering::Relaxed)));
        self.connection_state.set(connection_state);
    }
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment