Unverified Commit acbdabc4 authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

feat(metrics): add NATS client metrics to prometheus_metrics_fmt (#2292)


Co-authored-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent 591f4d56
......@@ -31,7 +31,6 @@ use dynamo_llm::kv_router::scheduler::KVHitRateEvent;
use dynamo_llm::kv_router::KV_HIT_RATE_SUBJECT;
use dynamo_runtime::{
error, logging,
metrics::MetricsRegistry,
traits::events::{EventPublisher, EventSubscriber},
utils::{Duration, Instant},
DistributedRuntime, ErrorContext, Result, Runtime, Worker,
......@@ -137,14 +136,7 @@ async fn app(runtime: Runtime) -> Result<()> {
.await
.context("Unable to create unique instance of Count; possibly one already exists")?;
let target_component = {
let c = namespace.component(&config.component_name)?;
if let Some(ref model) = config.model_name {
c.add_labels(&[("model", model.as_str())])?
} else {
c
}
};
let target_component = namespace.component(&config.component_name)?;
let target_endpoint = target_component.endpoint(&config.endpoint_name);
let service_path = target_endpoint.path();
......
......@@ -485,21 +485,6 @@ impl Component {
Ok(())
})
}
/// Add constant labels to this component (for metrics). Returns a new Component with labels.
/// labels: list of (key, value) tuples.
fn add_labels(&self, labels: Vec<(String, String)>) -> PyResult<Component> {
use rs::metrics::MetricsRegistry as _;
let pairs: Vec<(&str, &str)> = labels
.iter()
.map(|(k, v)| (k.as_str(), v.as_str()))
.collect();
let inner = self.inner.clone().add_labels(&pairs).map_err(to_pyerr)?;
Ok(Component {
inner,
event_loop: self.event_loop.clone(),
})
}
}
#[pymethods]
......
......@@ -7,7 +7,6 @@ use anyhow::Context as _;
use tokio::sync::{mpsc::Receiver, Notify};
use dynamo_runtime::{
metrics::MetricsRegistry,
pipeline::{
network::egress::push_router::PushRouter, ManyOut, Operator, RouterMode, SegmentSource,
ServiceBackend, SingleIn, Source,
......@@ -170,8 +169,7 @@ impl ModelWatcher {
let component = self
.drt
.namespace(&endpoint_id.namespace)?
.component(&endpoint_id.component)
.and_then(|c| c.add_labels(&[("model", &model_entry.name)]))?;
.component(&endpoint_id.component)?;
let client = component.endpoint(&endpoint_id.name).client().await?;
let Some(etcd_client) = self.drt.etcd_client() else {
......
......@@ -27,7 +27,6 @@ use dynamo_runtime::{
component::Client,
distributed::DistributedConfig,
engine::{AsyncEngineStream, Data},
metrics::MetricsRegistry,
pipeline::{
Context, ManyOut, Operator, PushRouter, RouterMode, SegmentSource, ServiceBackend,
ServiceEngine, ServiceFrontend, SingleIn, Source,
......@@ -111,8 +110,7 @@ pub async fn prepare_engine(
let endpoint_id = local_model.endpoint_id();
let component = distributed_runtime
.namespace(&endpoint_id.namespace)?
.component(&endpoint_id.component)
.and_then(|c| c.add_labels(&[("model", card.slug().to_string().as_str())]))?;
.component(&endpoint_id.component)?;
let client = component.endpoint(&endpoint_id.name).client().await?;
......
......@@ -17,7 +17,6 @@ use crate::{
};
use dynamo_runtime::engine::AsyncEngineStream;
use dynamo_runtime::metrics::MetricsRegistry;
use dynamo_runtime::pipeline::{
network::Ingress, Context, ManyOut, Operator, SegmentSource, ServiceBackend, SingleIn, Source,
};
......@@ -33,25 +32,9 @@ pub async fn run(
let cancel_token = distributed_runtime.primary_token().clone();
let endpoint_id: EndpointId = path.parse()?;
let model_name = match &engine_config {
EngineConfig::StaticFull { model, .. } | EngineConfig::StaticCore { model, .. } => {
Some(model.service_name().to_string())
}
EngineConfig::StaticRemote(model) | EngineConfig::Dynamic(model) => {
Some(model.service_name().to_string())
}
};
let component = distributed_runtime
.namespace(&endpoint_id.namespace)?
.component(&endpoint_id.component)
.and_then(|c| {
if let Some(ref name) = model_name {
c.add_labels(&[("model", name.as_str())])
} else {
Ok(c)
}
})?;
.component(&endpoint_id.component)?;
let endpoint = component
.service_builder()
.create()
......
......@@ -14,8 +14,8 @@
// limitations under the License.
use dynamo_runtime::{
logging, metrics::MetricsRegistry, pipeline::PushRouter, protocols::annotated::Annotated,
stream::StreamExt, DistributedRuntime, Result, Runtime, Worker,
logging, pipeline::PushRouter, protocols::annotated::Annotated, stream::StreamExt,
DistributedRuntime, Result, Runtime, Worker,
};
use hello_world::DEFAULT_NAMESPACE;
......@@ -31,7 +31,6 @@ async fn app(runtime: Runtime) -> Result<()> {
let client = distributed
.namespace(DEFAULT_NAMESPACE)?
.component("backend")?
.add_labels(&[("model", "hello_world_model")])?
.endpoint("generate")
.client()
.await?;
......
......@@ -15,7 +15,6 @@
use dynamo_runtime::{
logging,
metrics::MetricsRegistry,
pipeline::{
async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
ResponseStream, SingleIn,
......@@ -70,7 +69,6 @@ async fn backend(runtime: DistributedRuntime) -> Result<()> {
runtime
.namespace(DEFAULT_NAMESPACE)?
.component("backend")?
.add_labels(&[("model", "hello_world_model")])?
.service_builder()
.create()
.await?
......
......@@ -17,8 +17,8 @@ use futures::StreamExt;
use service_metrics::DEFAULT_NAMESPACE;
use dynamo_runtime::{
logging, metrics::MetricsRegistry, pipeline::PushRouter, protocols::annotated::Annotated,
utils::Duration, DistributedRuntime, Result, Runtime, Worker,
logging, pipeline::PushRouter, protocols::annotated::Annotated, utils::Duration,
DistributedRuntime, Result, Runtime, Worker,
};
fn main() -> Result<()> {
......@@ -31,9 +31,7 @@ async fn app(runtime: Runtime) -> Result<()> {
let distributed = DistributedRuntime::from_settings(runtime.clone()).await?;
let namespace = distributed.namespace(DEFAULT_NAMESPACE)?;
let component = namespace
.component("backend")?
.add_labels(&[("model", "service_metrics_model")])?;
let component = namespace.component("backend")?;
let client = component.endpoint("generate").client().await?;
......
......@@ -17,7 +17,6 @@ use service_metrics::{MyStats, DEFAULT_NAMESPACE};
use dynamo_runtime::{
logging,
metrics::MetricsRegistry,
pipeline::{
async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
ResponseStream, SingleIn,
......@@ -72,7 +71,6 @@ async fn backend(runtime: DistributedRuntime) -> Result<()> {
runtime
.namespace(DEFAULT_NAMESPACE)?
.component("backend")?
.add_labels(&[("model", "service_metrics_model")])?
.service_builder()
.create()
.await?
......
......@@ -91,7 +91,6 @@ pub async fn backend(drt: DistributedRuntime, endpoint_name: Option<&str>) -> Re
let endpoint = drt
.namespace(DEFAULT_NAMESPACE)?
.component(DEFAULT_COMPONENT)?
.add_labels(&[("model", DEFAULT_MODEL_NAME)])?
.service_builder()
.create()
.await?
......
......@@ -30,7 +30,10 @@
//! TODO: Top-level Overview of Endpoints/Functions
use crate::{
config::HealthStatus, discovery::Lease, metrics::MetricsRegistry, service::ServiceSet,
config::HealthStatus,
discovery::Lease,
metrics::{prometheus_names, MetricsRegistry},
service::ServiceSet,
transports::etcd::EtcdPath,
};
......@@ -45,6 +48,7 @@ use super::{
use crate::pipeline::network::{ingress::push_endpoint::PushEndpoint, PushWorkHandler};
use crate::protocols::Endpoint as EndpointId;
use crate::service::ComponentNatsPrometheusMetrics;
use async_nats::{
rustls::quic,
service::{Service, ServiceExt},
......@@ -187,16 +191,6 @@ impl MetricsRegistry for Component {
]
.concat()
}
fn stored_labels(&self) -> Vec<(&str, &str)> {
let mut all_labels = self.namespace.stored_labels();
all_labels.extend(self.labels.iter().map(|(k, v)| (k.as_str(), v.as_str())));
all_labels
}
fn labels_mut(&mut self) -> &mut Vec<(String, String)> {
&mut self.labels
}
}
impl Component {
......@@ -262,6 +256,8 @@ impl Component {
Ok(out)
}
/// Scrape ServiceSet, which contains NATS stats as well as user defined stats
/// embedded in data field of ServiceInfo.
pub async fn scrape_stats(&self, timeout: Duration) -> Result<ServiceSet> {
let service_name = self.service_name();
let service_client = self.drt().service_client();
......@@ -270,6 +266,78 @@ impl Component {
.await
}
/// Registers a Prometheus metrics callback that refreshes this component's service stats.
///
/// The callback is registered for every hierarchy level returned by
/// `parent_hierarchy()` plus this component's own `hierarchy()`. When invoked, it
/// spawns a `scrape_stats` task on the current Tokio runtime and then blocks on a
/// std channel until that task reports back, so the metrics are updated before
/// the callback returns. On scrape failure, timeout, or a dropped sender the
/// metrics are reset to zeros and the callback returns an error.
///
/// # Errors
/// Propagates any error from `ComponentNatsPrometheusMetrics::new`.
pub fn add_metrics_callback(&self) -> Result<()> {
    let component_metrics = ComponentNatsPrometheusMetrics::new(self)?;
    let component_clone = self.clone();

    // Hierarchy keys of all parents, followed by this component's own key.
    let mut hierarchies = self.parent_hierarchy();
    hierarchies.push(self.hierarchy());
    debug_assert_eq!(
        hierarchies.last().cloned().unwrap_or_default(),
        self.service_name()
    ); // it happens that in component, hierarchy and service name are the same

    // Callback that scrapes component statistics each time metrics are rendered.
    let metrics_callback = Arc::new(move || {
        // Timeout for scraping metrics from components (in milliseconds)
        // This value is also used by KV Router metrics aggregator (300ms) and other components
        const METRICS_SCRAPE_TIMEOUT_MS: u64 = 300;

        use std::sync::mpsc::RecvTimeoutError;

        // The scrape itself is async, so a Tokio runtime handle is required.
        let runtime_handle = match tokio::runtime::Handle::try_current() {
            Ok(handle) => handle,
            Err(err) => {
                return Err(anyhow::anyhow!(
                    "No Tokio runtime handle available: {}",
                    err
                ))
            }
        };

        let task_metrics = component_metrics.clone();
        let task_component = component_clone.clone();
        let scrape_timeout = std::time::Duration::from_millis(METRICS_SCRAPE_TIMEOUT_MS);

        // Channel used by the spawned task to hand its outcome back to this thread.
        let (result_tx, result_rx) = std::sync::mpsc::channel::<anyhow::Result<()>>();

        runtime_handle.spawn(async move {
            let outcome = task_component
                .scrape_stats(scrape_timeout)
                .await
                .map(|service_set| {
                    task_metrics.update_from_service_set(&service_set);
                })
                .map_err(|err| {
                    // Reset metrics on failure
                    task_metrics.reset_to_zeros();
                    anyhow::anyhow!("Failed to scrape stats: {}", err)
                });

            // A failed send only means the receiver already stopped waiting.
            let _ = result_tx.send(outcome);
        });

        // Wait for the task, allowing 100ms on top of the scrape timeout for
        // processing overhead, so the callback can never hang indefinitely.
        let wait_timeout = std::time::Duration::from_millis(METRICS_SCRAPE_TIMEOUT_MS + 100);
        match result_rx.recv_timeout(wait_timeout) {
            // Hand back whatever the scrape task reported.
            Ok(outcome) => outcome,
            Err(RecvTimeoutError::Timeout) => {
                component_metrics.reset_to_zeros();
                Err(anyhow::anyhow!("Metrics collection timed out"))
            }
            Err(RecvTimeoutError::Disconnected) => {
                component_metrics.reset_to_zeros();
                Err(anyhow::anyhow!("Metrics collection task failed"))
            }
        }
    });

    self.drt()
        .register_metrics_callback(hierarchies, metrics_callback);

    Ok(())
}
/// TODO
///
/// This method will scrape the stats for all available services
......@@ -347,16 +415,6 @@ impl MetricsRegistry for Endpoint {
]
.concat()
}
fn stored_labels(&self) -> Vec<(&str, &str)> {
let mut all_labels = self.component.stored_labels();
all_labels.extend(self.labels.iter().map(|(k, v)| (k.as_str(), v.as_str())));
all_labels
}
fn labels_mut(&mut self) -> &mut Vec<(String, String)> {
&mut self.labels
}
}
impl Endpoint {
......@@ -520,11 +578,24 @@ impl Namespace {
/// Create a [`Component`] in the namespace whose endpoints can be discovered with etcd
pub fn component(&self, name: impl Into<String>) -> Result<Component> {
Ok(ComponentBuilder::from_runtime(self.runtime.clone())
let component = ComponentBuilder::from_runtime(self.runtime.clone())
.name(name)
.namespace(self.clone())
.is_static(self.is_static)
.build()?)
.build()?;
// Register the metrics callback for this component.
// If registration fails, log a warning but do not propagate the error,
// as metrics are not mission critical and should not block component creation.
if let Err(err) = component.add_metrics_callback() {
tracing::warn!(
"Failed to add metrics callback for component '{}': {}",
component.service_name(),
err
);
}
Ok(component)
}
/// Create a [`Namespace`] in the parent namespace
......
......@@ -84,19 +84,19 @@ impl MetricsRegistry for Namespace {
}
fn parent_hierarchy(&self) -> Vec<String> {
vec![self.drt().basename()]
}
fn stored_labels(&self) -> Vec<(&str, &str)> {
// Convert Vec<(String, String)> to Vec<(&str, &str)>
self.labels
.iter()
.map(|(k, v)| (k.as_str(), v.as_str()))
.collect()
}
fn labels_mut(&mut self) -> &mut Vec<(String, String)> {
&mut self.labels
// Build as: [ "" (DRT), non-empty parent basenames from root -> leaf ]
let mut names = vec![String::new()]; // Start with empty string for DRT
// Collect parent basenames from root to leaf
let parent_names: Vec<String> =
std::iter::successors(self.parent.as_deref(), |ns| ns.parent.as_deref())
.map(|ns| ns.basename())
.filter(|name| !name.is_empty())
.collect();
// Append parent names in reverse order (root to leaf)
names.extend(parent_names.into_iter().rev());
names
}
}
......
......@@ -14,13 +14,14 @@
// limitations under the License.
pub use crate::component::Component;
use crate::transports::nats::DRTNatsPrometheusMetrics;
use crate::{
component::{self, ComponentBuilder, Endpoint, InstanceSource, Namespace},
discovery::DiscoveryClient,
metrics::MetricsRegistry,
service::ServiceClient,
transports::{etcd, nats, tcp},
ErrorContext,
ErrorContext, RuntimeCallback,
};
use super::{error, Arc, DistributedRuntime, OnceCell, Result, Runtime, SystemHealth, Weak, OK};
......@@ -40,18 +41,6 @@ impl MetricsRegistry for DistributedRuntime {
fn parent_hierarchy(&self) -> Vec<String> {
vec![] // drt is the root, so no parent hierarchy
}
fn stored_labels(&self) -> Vec<(&str, &str)> {
// Convert Vec<(String, String)> to Vec<(&str, &str)>
self.labels
.iter()
.map(|(k, v)| (k.as_str(), v.as_str()))
.collect()
}
fn labels_mut(&mut self) -> &mut Vec<(String, String)> {
&mut self.labels
}
}
impl DistributedRuntime {
......@@ -88,6 +77,8 @@ impl DistributedRuntime {
live_endpoint_path,
)));
let nats_client_for_metrics = nats_client.clone();
let distributed_runtime = Self {
runtime,
etcd_client,
......@@ -97,14 +88,29 @@ impl DistributedRuntime {
component_registry: component::Registry::new(),
is_static,
instance_sources: Arc::new(Mutex::new(HashMap::new())),
prometheus_registries_by_prefix: Arc::new(std::sync::Mutex::new(HashMap::<
hierarchy_to_metricsregistry: Arc::new(std::sync::RwLock::new(HashMap::<
String,
prometheus::Registry,
crate::MetricsRegistryEntry,
>::new())),
system_health,
labels: Vec::new(),
};
let sys_nats_metrics = DRTNatsPrometheusMetrics::new(
&distributed_runtime,
nats_client_for_metrics.client().clone(),
)?;
let mut drt_hierarchies = distributed_runtime.parent_hierarchy();
drt_hierarchies.push(distributed_runtime.hierarchy());
// Register a callback to update NATS client metrics
let nats_metrics_callback = Arc::new({
let sys_nats_metrics_clone = sys_nats_metrics.clone();
move || {
sys_nats_metrics_clone.set_from_client_stats();
Ok(())
}
});
distributed_runtime.register_metrics_callback(drt_hierarchies, nats_metrics_callback);
// Start system status server if enabled
if let Some(cancel_token) = cancel_token {
let host = config.system_host.clone();
......@@ -240,6 +246,76 @@ impl DistributedRuntime {
pub fn instance_sources(&self) -> Arc<Mutex<HashMap<Endpoint, Weak<InstanceSource>>>> {
self.instance_sources.clone()
}
/// Add a Prometheus metric to a specific hierarchy's registry
pub fn add_prometheus_metric(
&self,
hierarchy: &str,
metric_name: &str,
prometheus_metric: Box<dyn prometheus::core::Collector>,
) -> anyhow::Result<()> {
let mut registries = self.hierarchy_to_metricsregistry.write().unwrap();
let entry = registries.entry(hierarchy.to_string()).or_default();
// If a metric with this name already exists for the hierarchy, warn and skip registration
if entry.has_metric_named(metric_name) {
tracing::warn!(
hierarchy = ?hierarchy,
metric_name = ?metric_name,
"Metric already exists in registry; skipping registration"
);
return Ok(());
}
// Try to register the metric and provide better error information
match entry.prometheus_registry.register(prometheus_metric) {
Ok(_) => Ok(()),
Err(e) => {
let error_msg = e.to_string();
tracing::error!(
hierarchy = ?hierarchy,
error = ?error_msg,
metric_name = ?metric_name,
"Metric registration failed"
);
Err(e.into())
}
}
}
/// Attaches `callback` to the metrics registry entry of every hierarchy key in
/// `hierarchies`, creating entries that do not exist yet.
///
/// # Panics
/// Panics if the registry lock is poisoned.
pub fn register_metrics_callback(&self, hierarchies: Vec<String>, callback: RuntimeCallback) {
    let mut registries = self.hierarchy_to_metricsregistry.write().unwrap();
    hierarchies.into_iter().for_each(|key| {
        let entry = registries.entry(key).or_default();
        entry.add_callback(callback.clone());
    });
}
/// Runs every callback registered under `hierarchy` and collects their results.
///
/// Returns an empty vector when no entry exists for `hierarchy`.
///
/// # Panics
/// Panics if the registry lock is poisoned.
pub fn execute_metrics_callbacks(&self, hierarchy: &str) -> Vec<anyhow::Result<()>> {
    // Snapshot the callbacks under the read lock; the guard is dropped at the
    // end of this block so the callbacks run without the lock held.
    let snapshot = {
        let guard = self.hierarchy_to_metricsregistry.read().unwrap();
        match guard.get(hierarchy) {
            Some(entry) => entry.runtime_callbacks.clone(),
            None => return Vec::new(),
        }
    };

    let mut results = Vec::with_capacity(snapshot.len());
    for callback in snapshot.iter() {
        results.push(callback());
    }
    results
}
/// Returns a copy of every hierarchy key currently present in the registry map.
/// Private because it is only used for testing.
fn get_registered_hierarchies(&self) -> Vec<String> {
    let guard = self.hierarchy_to_metricsregistry.read().unwrap();
    let mut keys = Vec::with_capacity(guard.len());
    keys.extend(guard.keys().cloned());
    keys
}
}
#[derive(Dissolve)]
......
......@@ -147,6 +147,70 @@ impl SystemHealth {
}
}
/// Type alias for runtime callback functions to reduce complexity
///
/// This type represents an Arc-wrapped callback function that can be:
/// - Shared efficiently across multiple threads and contexts
/// - Cloned without duplicating the underlying closure
/// - Used in generic contexts requiring 'static lifetime
///
/// A callback takes no arguments and returns `anyhow::Result<()>`.
///
/// The Arc wrapper is included in the type to make sharing explicit.
type RuntimeCallback = Arc<dyn Fn() -> anyhow::Result<()> + Send + Sync + 'static>;
/// Structure to hold Prometheus registries and associated callbacks for a given hierarchy
pub struct MetricsRegistryEntry {
    /// The Prometheus registry for this hierarchy
    pub prometheus_registry: prometheus::Registry,
    /// Zero-argument callbacks (each returning `anyhow::Result<()>`) registered for this hierarchy
    pub runtime_callbacks: Vec<RuntimeCallback>,
}
impl MetricsRegistryEntry {
    /// Builds an entry holding a fresh Prometheus registry and no callbacks.
    pub fn new() -> Self {
        let prometheus_registry = prometheus::Registry::new();
        let runtime_callbacks = Vec::new();
        Self {
            prometheus_registry,
            runtime_callbacks,
        }
    }

    /// Appends a zero-argument callback to this entry's callback list.
    pub fn add_callback(&mut self, callback: RuntimeCallback) {
        self.runtime_callbacks.push(callback);
    }

    /// Invokes every stored callback in insertion order and returns their results.
    pub fn execute_callbacks(&self) -> Vec<anyhow::Result<()>> {
        let mut outcomes = Vec::with_capacity(self.runtime_callbacks.len());
        for callback in &self.runtime_callbacks {
            outcomes.push(callback());
        }
        outcomes
    }

    /// Returns true if the registry's gathered output contains a metric family
    /// named `metric_name`.
    ///
    /// NOTE(review): this relies on `Registry::gather()`; confirm whether it
    /// reports families that have no samples yet.
    pub fn has_metric_named(&self, metric_name: &str) -> bool {
        for family in self.prometheus_registry.gather() {
            if family.name() == metric_name {
                return true;
            }
        }
        false
    }
}

/// The default entry is the same as [`MetricsRegistryEntry::new`].
impl Default for MetricsRegistryEntry {
    fn default() -> Self {
        Self::new()
    }
}
/// Manual `Clone`: the clone gets a clone of the Prometheus registry but an
/// empty callback list.
impl Clone for MetricsRegistryEntry {
    fn clone(&self) -> Self {
        Self {
            prometheus_registry: self.prometheus_registry.clone(),
            // The clone starts with no callbacks.
            // NOTE(review): `RuntimeCallback` is an `Arc`, so the callbacks could be
            // cloned; confirm that dropping them here is intended.
            runtime_callbacks: Vec::new(),
        }
    }
}
/// Distributed [Runtime] which provides access to shared resources across the cluster, this includes
/// communication protocols and transports.
#[derive(Clone)]
......@@ -176,9 +240,7 @@ pub struct DistributedRuntime {
// Health Status
system_health: Arc<std::sync::Mutex<SystemHealth>>,
// This map associates metric prefixes with their corresponding Prometheus registries.
prometheus_registries_by_prefix: Arc<std::sync::Mutex<HashMap<String, prometheus::Registry>>>,
// Additional labels for metrics
labels: Vec<(String, String)>,
// This map associates metric prefixes with their corresponding Prometheus registries and callbacks.
// Uses RwLock for better concurrency - multiple threads can read (execute callbacks) simultaneously.
hierarchy_to_metricsregistry: Arc<std::sync::RwLock<HashMap<String, MetricsRegistryEntry>>>,
}
......@@ -13,29 +13,46 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Metric Registry Framework for Dynamo.
//! Metrics registry trait and implementation for Prometheus metrics
//!
//! This module provides registry classes for Prometheus metrics
//! that auto populates the labels with the component-endpoint hierarchy.
//! All metrics are prefixed with "dynamo_component_" to avoid collisions with Kubernetes and other monitoring system labels.
//! This module provides a trait-based interface for creating and managing Prometheus metrics
//! with automatic label injection and hierarchical naming support.
pub mod prometheus_names;
use std::collections::HashSet;
use std::sync::Arc;
use std::sync::Mutex;
use crate::component::ComponentBuilder;
use anyhow;
use once_cell::sync::Lazy;
use regex::Regex;
use std::any::Any;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
// If set to true, then metrics will be labeled with the dynamo_namespace, dynamo_component, and dynamo_endpoint.
// Import commonly used items to avoid verbose prefixes
use prometheus_names::{
build_metric_name, labels, name_prefix, nats, work_handler, COMPONENT_NATS_METRICS,
DRT_NATS_METRICS,
};
// Pipeline imports for endpoint creation
use crate::pipeline::{
async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
ResponseStream, SingleIn,
};
use crate::protocols::annotated::Annotated;
use crate::stream;
use crate::stream::StreamExt;
// If set to true, then metrics will be labeled with the namespace, component, and endpoint labels.
// These labels are prefixed with "dynamo_" to avoid collisions with Kubernetes and other monitoring system labels.
pub const USE_AUTO_LABELS: bool = true;
// Prometheus imports
use prometheus::Encoder;
fn build_metric_name(metric_name: &str) -> String {
format!("dynamo_component_{}", metric_name)
}
/// Lints a metric name component by stripping off invalid characters and validating Prometheus naming pattern
/// Prometheus doesn't provide a built-in function to validate metric names, but the specification requires
/// names to follow the pattern [a-zA-Z_:][a-zA-Z0-9_:]*. This function implements that validation.
......@@ -212,15 +229,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
) -> anyhow::Result<T> {
// Validate that user-provided labels don't have duplicate keys
validate_no_duplicate_label_keys(labels)?;
// Validate that user-provided labels don't conflict with stored labels
for (key, _) in registry.stored_labels() {
if labels.iter().any(|(k, _)| *k == key) {
return Err(anyhow::anyhow!(
"Label key '{}' already exists in registry.",
key
));
}
}
// Note: stored labels functionality has been removed
let basename = registry.basename();
let parent_hierarchy = registry.parent_hierarchy();
......@@ -236,8 +245,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
if USE_AUTO_LABELS {
// Validate that user-provided labels don't conflict with auto-generated labels
for (key, _) in labels {
if *key == "dynamo_namespace" || *key == "dynamo_component" || *key == "dynamo_endpoint"
{
if *key == labels::NAMESPACE || *key == labels::COMPONENT || *key == labels::ENDPOINT {
return Err(anyhow::anyhow!(
"Label '{}' is automatically added by auto_label feature and cannot be manually set",
key
......@@ -251,7 +259,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
if !namespace.is_empty() {
let valid_namespace = lint_prometheus_name(namespace)?;
if !valid_namespace.is_empty() {
updated_labels.push(("dynamo_namespace".to_string(), valid_namespace));
updated_labels.push((labels::NAMESPACE.to_string(), valid_namespace));
}
}
}
......@@ -260,7 +268,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
if !component.is_empty() {
let valid_component = lint_prometheus_name(component)?;
if !valid_component.is_empty() {
updated_labels.push(("dynamo_component".to_string(), valid_component));
updated_labels.push((labels::COMPONENT.to_string(), valid_component));
}
}
}
......@@ -269,7 +277,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
if !endpoint.is_empty() {
let valid_endpoint = lint_prometheus_name(endpoint)?;
if !valid_endpoint.is_empty() {
updated_labels.push(("dynamo_endpoint".to_string(), valid_endpoint));
updated_labels.push((labels::ENDPOINT.to_string(), valid_endpoint));
}
}
}
......@@ -281,13 +289,7 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
.iter()
.map(|(k, v)| ((*k).to_string(), (*v).to_string())),
);
// Add stored labels (safe because overlaps were rejected above)
updated_labels.extend(
registry
.stored_labels()
.into_iter()
.map(|(k, v)| (k.to_string(), v.to_string())),
);
// Note: stored labels functionality has been removed
// Handle different metric types
let prometheus_metric = if std::any::TypeId::of::<T>()
......@@ -371,33 +373,26 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
};
// Iterate over the DRT's registry and register this metric across all hierarchical levels.
// The prefixed_hierarchy is structured as: ["", "testnamespace", "testnamespace_testcomponent", "testnamespace_testcomponent_testendpoint"]
// This prefixing is essential to differentiate between the names of children and grandchildren.
let mut prometheus_registry = registry
.drt()
.prometheus_registries_by_prefix
.lock()
.unwrap();
// Build prefixed hierarchy and register metrics in a single loop
// The accumulated hierarchy is structured as: ["", "testnamespace", "testnamespace_testcomponent", "testnamespace_testcomponent_testendpoint"]
// This accumulation is essential to differentiate between the names of children and grandchildren.
// Build accumulated hierarchy and register metrics in a single loop
// current_prefix accumulates the hierarchical path as we iterate through hierarchy
// For example, if hierarchy = ["", "testnamespace", "testcomponent"], then:
// - Iteration 1: current_prefix = "" (empty string from DRT)
// - Iteration 2: current_prefix = "testnamespace"
// - Iteration 3: current_prefix = "testnamespace_testcomponent"
let mut current_prefix = String::new();
let mut current_hierarchy = String::new();
for name in &hierarchy {
if !current_prefix.is_empty() && !name.is_empty() {
current_prefix.push('_');
if !current_hierarchy.is_empty() && !name.is_empty() {
current_hierarchy.push('_');
}
current_prefix.push_str(name);
current_hierarchy.push_str(name);
// Register metric at this hierarchical level
// Register metric at this hierarchical level using the new helper function
let collector: Box<dyn prometheus::core::Collector> = Box::new(prometheus_metric.clone());
let _ = prometheus_registry
.entry(current_prefix.clone())
.or_default()
.register(collector);
registry
.drt()
.add_prometheus_metric(&current_hierarchy, &metric_name, collector)?;
}
Ok(prometheus_metric)
......@@ -406,55 +401,16 @@ fn create_metric<T: PrometheusMetric, R: MetricsRegistry + ?Sized>(
/// This trait should be implemented by all metric registries, including Prometheus, Envy, OpenTelemetry, and others.
/// It offers a unified interface for creating and managing metrics, organizing sub-registries, and
/// generating output in Prometheus text format.
pub trait MetricsRegistry: Send + Sync + crate::traits::DistributedRuntimeProvider {
// Get the name of this registry (without any prefix)
fn basename(&self) -> String;
/// Get any stored labels for this registry
fn stored_labels(&self) -> Vec<(&str, &str)> {
Vec::new()
}
/// Get mutable access to the labels storage - implementors must provide this
fn labels_mut(&mut self) -> &mut Vec<(String, String)>;
/// Add labels to this registry and return a new instance with the labels.
/// This allows for method chaining like: runtime.namespace(...).add_labels(...)?
/// Fails if:
/// - Provided `labels` contains duplicate keys, or
/// - Any provided key already exists in the registry's stored labels.
fn add_labels(mut self, labels: &[(&str, &str)]) -> anyhow::Result<Self>
where
Self: Sized,
{
validate_no_duplicate_label_keys(labels)?;
// 2) Validate no overlap with existing stored labels
let existing: std::collections::HashSet<&str> =
self.stored_labels().into_iter().map(|(k, _)| k).collect();
if let Some(conflict) = labels
.iter()
.map(|(k, _)| *k)
.find(|k| existing.contains(k))
{
return Err(anyhow::anyhow!(
"Label key '{}' already exists in registry; refusing to overwrite",
conflict
));
}
use crate::traits::DistributedRuntimeProvider;
// 3) Safe to append
let labels_storage = self.labels_mut();
for (key, value) in labels {
labels_storage.push((key.to_string(), value.to_string()));
}
Ok(self)
}
pub trait MetricsRegistry: Send + Sync + DistributedRuntimeProvider {
// Get the name of this registry (without any hierarchy prefix)
fn basename(&self) -> String;
/// Retrieve the complete hierarchy and basename for this registry. Currently, the prefix for drt is an empty string,
/// Retrieve the complete hierarchy and basename for this registry. Currently, the hierarchy for drt is an empty string,
/// so we must account for the leading underscore. The existing code remains unchanged to accommodate any future
/// scenarios where drt's prefix might be assigned a value.
fn prefix(&self) -> String {
fn hierarchy(&self) -> String {
[self.parent_hierarchy(), vec![self.basename()]]
.concat()
.join("_")
......@@ -462,7 +418,7 @@ pub trait MetricsRegistry: Send + Sync + crate::traits::DistributedRuntimeProvid
.to_string()
}
// Get the parent hierarchy for this registry (just the base names, NOT the prefix)
// Get the parent hierarchy for this registry (just the base names, NOT the flattened hierarchy key)
fn parent_hierarchy(&self) -> Vec<String>;
// TODO: Add support for additional Prometheus metric types:
......@@ -589,9 +545,24 @@ pub trait MetricsRegistry: Send + Sync + crate::traits::DistributedRuntimeProvid
/// Get metrics in Prometheus text format
fn prometheus_metrics_fmt(&self) -> anyhow::Result<String> {
// Execute callbacks first to ensure any new metrics are added to the registry
let callback_results = self.drt().execute_metrics_callbacks(&self.hierarchy());
// Log any callback errors but continue
for result in callback_results {
if let Err(e) = result {
tracing::error!("Error executing metrics callback: {}", e);
}
}
// Get the Prometheus registry for this hierarchy
let prometheus_registry = {
let mut registry = self.drt().prometheus_registries_by_prefix.lock().unwrap();
registry.entry(self.prefix()).or_default().clone()
let mut registry_entry = self.drt().hierarchy_to_metricsregistry.write().unwrap();
registry_entry
.entry(self.hierarchy())
.or_default()
.prometheus_registry
.clone()
};
let metric_families = prometheus_registry.gather();
let encoder = prometheus::TextEncoder::new();
......@@ -602,19 +573,127 @@ pub trait MetricsRegistry: Send + Sync + crate::traits::DistributedRuntimeProvid
}
#[cfg(test)]
/// Helper function to create a DRT instance for testing
/// Uses the test-friendly constructor without discovery
pub fn create_test_drt() -> crate::DistributedRuntime {
let rt = crate::Runtime::single_threaded().unwrap();
tokio::runtime::Runtime::new().unwrap().block_on(async {
mod test_helpers {
use super::prometheus_names::name_prefix;
use super::prometheus_names::nats as nats_metrics;
use super::*;
/// Creates a test DistributedRuntime for integration tests.
/// Uses NATS; requires #[cfg(feature = "integration")].
#[cfg(feature = "integration")]
pub fn create_test_drt() -> crate::DistributedRuntime {
let rt = crate::Runtime::single_threaded().unwrap();
tokio::runtime::Runtime::new().unwrap().block_on(async {
crate::DistributedRuntime::from_settings_without_discovery(rt.clone())
.await
.unwrap()
})
}
/// Helper function to create a DRT instance for testing in async contexts
#[cfg(feature = "integration")]
pub async fn create_test_drt_async() -> crate::DistributedRuntime {
let rt = crate::Runtime::single_threaded().unwrap();
crate::DistributedRuntime::from_settings_without_discovery(rt.clone())
.await
.unwrap()
})
}
/// Shared line filter behind the Prometheus test helpers.
///
/// Walks `input` line by line, offers each line to `predicate` exactly once (in order), and
/// returns an owned copy of every line the predicate accepts.
fn filter_prometheus_lines<F>(input: &str, mut predicate: F) -> Vec<String>
where
    F: FnMut(&str) -> bool,
{
    let mut kept = Vec::new();
    for candidate in input.lines() {
        if predicate(candidate) {
            kept.push(candidate.to_owned());
        }
    }
    kept
}
/// Filters out all NATS metrics from Prometheus output for test comparisons.
///
/// A line is dropped when it contains the component prefix immediately followed by the NATS
/// prefix anywhere in its text (so matching comment lines are dropped as well), or when it is
/// blank after trimming. The remaining lines are returned in their original order.
pub fn remove_nats_lines(input: &str) -> Vec<String> {
    // The marker does not depend on the line, so build it once rather than per line.
    let nats_marker = format!("{}{}", name_prefix::COMPONENT, nats_metrics::PREFIX);
    filter_prometheus_lines(input, |line| {
        !line.contains(nats_marker.as_str()) && !line.trim().is_empty()
    })
}
/// Filters to only include NATS metrics from Prometheus output for test comparisons.
///
/// A line is kept when it contains the component prefix immediately followed by the NATS
/// prefix anywhere in its text, so matching comment lines are kept alongside sample lines.
/// Kept lines are returned in their original order.
pub fn extract_nats_lines(input: &str) -> Vec<String> {
    // The marker does not depend on the line, so build it once rather than per line.
    let nats_marker = format!("{}{}", name_prefix::COMPONENT, nats_metrics::PREFIX);
    filter_prometheus_lines(input, |line| line.contains(nats_marker.as_str()))
}
/// Returns the sample lines of component metrics, i.e. the lines that carry a value.
///
/// A line qualifies when it starts with the component name prefix and is neither a `#`
/// comment line (help text / type definition) nor blank.
pub fn extract_metrics(input: &str) -> Vec<String> {
    filter_prometheus_lines(input, |line| {
        let is_comment = line.starts_with("#");
        let is_blank = line.trim().is_empty();
        line.starts_with(name_prefix::COMPONENT) && !(is_comment || is_blank)
    })
}
/// Splits the text between `{` and `}` into raw `key="value"` pairs.
///
/// A comma only separates pairs when it sits outside double quotes, and a backslash inside
/// quotes protects the character that follows it, so `a="x,y",b="\"z\""` yields two pairs.
fn split_label_pairs(raw: &str) -> Vec<&str> {
    let mut pairs = Vec::new();
    let mut pair_start = 0;
    let mut in_quotes = false;
    let mut escaped = false;
    for (idx, ch) in raw.char_indices() {
        if escaped {
            // This character was protected by the preceding backslash.
            escaped = false;
            continue;
        }
        match ch {
            '\\' if in_quotes => escaped = true,
            '"' => in_quotes = !in_quotes,
            ',' if !in_quotes => {
                pairs.push(&raw[pair_start..idx]);
                pair_start = idx + 1;
            }
            _ => {}
        }
    }
    pairs.push(&raw[pair_start..]);
    pairs
}

/// Decodes the escape sequences of the Prometheus text format (`\\`, `\"`, `\n`).
/// Any other backslash sequence is copied through unchanged.
fn unescape_label_value(raw: &str) -> String {
    let mut decoded = String::with_capacity(raw.len());
    let mut chars = raw.chars();
    while let Some(ch) = chars.next() {
        if ch != '\\' {
            decoded.push(ch);
            continue;
        }
        match chars.next() {
            Some('n') => decoded.push('\n'),
            Some('"') => decoded.push('"'),
            Some('\\') => decoded.push('\\'),
            Some(other) => {
                decoded.push('\\');
                decoded.push(other);
            }
            None => decoded.push('\\'),
        }
    }
    decoded
}

/// Parses a Prometheus metric line and extracts the name, labels, and value.
/// Used instead of fetching metrics directly to test end-to-end results, not intermediate state.
///
/// The label set is located before the line is split on whitespace, so label values may
/// contain spaces, commas and escaped quotes. Anything after the value (for example a
/// timestamp) is ignored.
///
/// Returns `None` for blank lines, `#` comment lines, lines without a value, lines whose
/// value is not a valid `f64`, and lines with an opening `{` but no closing `}`.
///
/// # Example
/// ```ignore
/// let line = "http_requests_total{method=\"GET\"} 1234";
/// let (name, labels, value) = parse_prometheus_metric(line).unwrap();
/// assert_eq!(name, "http_requests_total");
/// assert_eq!(labels.get("method"), Some(&"GET".to_string()));
/// assert_eq!(value, 1234.0);
/// ```
pub fn parse_prometheus_metric(
    line: &str,
) -> Option<(String, std::collections::HashMap<String, String>, f64)> {
    let line = line.trim();
    if line.is_empty() || line.starts_with('#') {
        return None;
    }

    let mut labels = std::collections::HashMap::new();
    let (name, rest) = match line.find('{') {
        Some(open) => {
            // Take the last `}` so that braces inside label values cannot end the set early.
            let close = line.rfind('}').filter(|&close| close > open)?;
            for pair in split_label_pairs(&line[open + 1..close]) {
                // Pairs without `=` (e.g. the empty one after a trailing comma) are skipped.
                if let Some((key, raw_value)) = pair.split_once('=') {
                    let raw_value = raw_value.trim();
                    let unquoted = raw_value
                        .strip_prefix('"')
                        .and_then(|v| v.strip_suffix('"'))
                        .unwrap_or(raw_value);
                    labels.insert(key.trim().to_string(), unescape_label_value(unquoted));
                }
            }
            (&line[..open], &line[close + 1..])
        }
        None => line.split_once(char::is_whitespace)?,
    };

    let value: f64 = rest.split_whitespace().next()?.parse().ok()?;
    Some((name.trim_end().to_string(), labels, value))
}
}
#[cfg(test)]
mod tests {
mod test_metricsregistry_units {
use super::*;
#[test]
......@@ -689,268 +768,310 @@ mod tests {
"testnamespace"
); // Hyphen removed
assert_eq!(
lint_prometheus_name("test-namespace_123").unwrap(),
"testnamespace_123"
); // Hyphen removed
// Test validation errors for invalid patterns
assert!(lint_prometheus_name("123test").is_err()); // Starts with digit
assert!(lint_prometheus_name("").is_ok()); // Empty is allowed
lint_prometheus_name("test-namespace-123").unwrap(),
"testnamespace123"
); // Multiple hyphens removed
}
}
#[cfg(feature = "integration")]
#[cfg(test)]
mod test_prefixes {
use super::create_test_drt;
use super::*;
use prometheus::core::Collector;
/// Exercises `parse_prometheus_metric` on labelled, unlabelled and fractional samples, and
/// checks that non-sample lines are rejected.
#[test]
fn test_parse_prometheus_metric() {
    use super::test_helpers::parse_prometheus_metric;
    use std::collections::HashMap;

    // Builds the owned label map the parser is expected to return.
    let label_map = |pairs: &[(&str, &str)]| -> HashMap<String, String> {
        pairs
            .iter()
            .map(|(key, value)| (key.to_string(), value.to_string()))
            .collect()
    };

    // Sample with two labels
    let (name, labels, value) =
        parse_prometheus_metric("http_requests_total{method=\"GET\",status=\"200\"} 1234")
            .unwrap();
    assert_eq!(name, "http_requests_total");
    assert_eq!(labels, label_map(&[("method", "GET"), ("status", "200")]));
    assert_eq!(value, 1234.0);

    // Sample without any labels
    let (name, labels, value) = parse_prometheus_metric("cpu_usage 98.5").unwrap();
    assert_eq!(name, "cpu_usage");
    assert!(labels.is_empty());
    assert_eq!(value, 98.5);

    // Sample with a fractional value
    let (name, labels, value) =
        parse_prometheus_metric("response_time{service=\"api\"} 0.123").unwrap();
    assert_eq!(name, "response_time");
    assert_eq!(labels, label_map(&[("service", "api")]));
    assert_eq!(value, 0.123);

    // Lines that are not samples: empty, help text, type definition, missing value
    let rejected = [
        "",
        "# HELP metric description",
        "# TYPE metric counter",
        "metric_name",
    ];
    for line in rejected.iter() {
        assert!(parse_prometheus_metric(line).is_none());
    }

    println!("✓ Prometheus metric parsing works correctly!");
}
#[cfg(feature = "integration")]
#[test]
fn test_hierarchical_prefixes_and_parent_hierarchies() {
println!("=== Testing Names, Prefixes, and Parent Hierarchies ===");
fn test_metrics_registry_entry_callbacks() {
use crate::MetricsRegistryEntry;
use std::sync::atomic::{AtomicUsize, Ordering};
// Create a distributed runtime for testing
let drt = create_test_drt();
// Test 1: Basic callback execution with counter increments
{
let mut entry = MetricsRegistryEntry::new();
let counter = Arc::new(AtomicUsize::new(0));
// Add callbacks with different increment values
for increment in [1, 10, 100] {
let counter_clone = counter.clone();
entry.add_callback(Arc::new(move || {
counter_clone.fetch_add(increment, Ordering::SeqCst);
Ok(())
}));
}
// Use a simple constant namespace name
let namespace_name = "testnamespace";
// Verify counter starts at 0
assert_eq!(counter.load(Ordering::SeqCst), 0);
// Create namespace
let namespace = drt.namespace(namespace_name).unwrap();
// First execution
let results = entry.execute_callbacks();
assert_eq!(results.len(), 3);
assert!(results.iter().all(|r| r.is_ok()));
assert_eq!(counter.load(Ordering::SeqCst), 111); // 1 + 10 + 100
// Create component
let component = namespace.component("testcomponent").unwrap();
// Second execution - callbacks should be reusable
let results = entry.execute_callbacks();
assert_eq!(results.len(), 3);
assert_eq!(counter.load(Ordering::SeqCst), 222); // 111 + 111
// Create endpoint
let endpoint = component.endpoint("testendpoint");
// Test cloning - cloned entry should have no callbacks
let cloned = entry.clone();
assert_eq!(cloned.execute_callbacks().len(), 0);
assert_eq!(counter.load(Ordering::SeqCst), 222); // No change
// Test DistributedRuntime hierarchy
println!("\n=== DistributedRuntime ===");
println!("basename: '{}'", drt.basename());
println!("parent_hierarchy: {:?}", drt.parent_hierarchy());
println!("prefix: '{}'", drt.prefix());
// Original still has callbacks
entry.execute_callbacks();
assert_eq!(counter.load(Ordering::SeqCst), 333); // 222 + 111
}
assert_eq!(drt.basename(), "", "DRT basename should be empty");
assert_eq!(
drt.parent_hierarchy(),
Vec::<String>::new(),
"DRT parent hierarchy should be empty"
);
assert_eq!(drt.prefix(), "", "DRT prefix should be empty");
// Test 2: Mixed success and error callbacks
{
let mut entry = MetricsRegistryEntry::new();
let counter = Arc::new(AtomicUsize::new(0));
// Successful callback
let counter_clone = counter.clone();
entry.add_callback(Arc::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
Ok(())
}));
// Error callback
entry.add_callback(Arc::new(|| Err(anyhow::anyhow!("Simulated error"))));
// Another successful callback
let counter_clone = counter.clone();
entry.add_callback(Arc::new(move || {
counter_clone.fetch_add(10, Ordering::SeqCst);
Ok(())
}));
// Execute and verify mixed results
let results = entry.execute_callbacks();
assert_eq!(results.len(), 3);
assert!(results[0].is_ok());
assert!(results[1].is_err());
assert!(results[2].is_ok());
// Verify error message
assert_eq!(
results[1].as_ref().unwrap_err().to_string(),
"Simulated error"
);
// Test Namespace hierarchy
println!("\n=== Namespace ===");
println!("basename: '{}'", namespace.basename());
println!("parent_hierarchy: {:?}", namespace.parent_hierarchy());
println!("prefix: '{}'", namespace.prefix());
// Verify successful callbacks still executed
assert_eq!(counter.load(Ordering::SeqCst), 11); // 1 + 10
assert_eq!(
namespace.basename(),
namespace_name,
"Namespace basename should match the generated name"
);
assert_eq!(
namespace.parent_hierarchy(),
vec![""],
"Namespace parent hierarchy should be [\"\"]"
);
assert_eq!(
namespace.prefix(),
namespace_name,
"Namespace prefix should match the generated name, because drt's prefix is empty"
);
// Execute again - errors should be consistent
let results = entry.execute_callbacks();
assert!(results[1].is_err());
assert_eq!(counter.load(Ordering::SeqCst), 22); // 11 + 11
}
// Test Component hierarchy
println!("\n=== Component ===");
println!("basename: '{}'", component.basename());
println!("parent_hierarchy: {:?}", component.parent_hierarchy());
println!("prefix: '{}'", component.prefix());
// Test 3: Empty registry
{
let entry = MetricsRegistryEntry::new();
let results = entry.execute_callbacks();
assert_eq!(results.len(), 0);
}
}
}
assert_eq!(
component.basename(),
"testcomponent",
"Component basename should be 'testcomponent'"
);
#[cfg(feature = "integration")]
#[cfg(test)]
mod test_metricsregistry_prefixes {
use super::*;
use prometheus::core::Collector;
#[test]
fn test_hierarchical_prefixes_and_parent_hierarchies() {
let drt = super::test_helpers::create_test_drt();
const DRT_NAME: &str = "";
const NAMESPACE_NAME: &str = "ns901";
const COMPONENT_NAME: &str = "comp901";
const ENDPOINT_NAME: &str = "ep901";
let namespace = drt.namespace(NAMESPACE_NAME).unwrap();
let component = namespace.component(COMPONENT_NAME).unwrap();
let endpoint = component.endpoint(ENDPOINT_NAME);
// DRT
assert_eq!(drt.basename(), DRT_NAME);
assert_eq!(drt.parent_hierarchy(), Vec::<String>::new());
assert_eq!(drt.hierarchy(), DRT_NAME);
// Namespace
assert_eq!(namespace.basename(), NAMESPACE_NAME);
assert_eq!(namespace.parent_hierarchy(), vec!["".to_string()]);
assert_eq!(namespace.hierarchy(), NAMESPACE_NAME);
// Component
assert_eq!(component.basename(), COMPONENT_NAME);
assert_eq!(
component.parent_hierarchy(),
vec!["", &namespace_name],
"Component parent hierarchy should contain the generated namespace name"
vec!["".to_string(), NAMESPACE_NAME.to_string()]
);
assert_eq!(
component.prefix(),
format!("{}_testcomponent", namespace),
"Component prefix should be 'namespace_testcomponent'"
component.hierarchy(),
format!("{}_{}", NAMESPACE_NAME, COMPONENT_NAME)
);
// Test Endpoint hierarchy
println!("\n=== Endpoint ===");
println!("basename: '{}'", endpoint.basename());
println!("parent_hierarchy: {:?}", endpoint.parent_hierarchy());
println!("prefix: '{}'", endpoint.prefix());
assert_eq!(
endpoint.basename(),
"testendpoint",
"Endpoint basename should be 'testendpoint'"
);
// Endpoint
assert_eq!(endpoint.basename(), ENDPOINT_NAME);
assert_eq!(
endpoint.parent_hierarchy(),
vec!["", &namespace_name, "testcomponent"],
"Endpoint parent hierarchy should contain the generated namespace name"
vec![
"".to_string(),
NAMESPACE_NAME.to_string(),
COMPONENT_NAME.to_string(),
]
);
assert_eq!(
endpoint.prefix(),
format!("{}_testcomponent_testendpoint", namespace),
"Endpoint prefix should be 'namespace_testcomponent_testendpoint'"
endpoint.hierarchy(),
format!("{}_{}_{}", NAMESPACE_NAME, COMPONENT_NAME, ENDPOINT_NAME)
);
// Test hierarchy relationships
println!("\n=== Hierarchy Relationships ===");
assert!(
namespace.parent_hierarchy().contains(&drt.basename()),
"Namespace should have DRT prefix in parent hierarchy"
);
assert!(
component.parent_hierarchy().contains(&namespace.basename()),
"Component should have Namespace prefix in parent hierarchy"
);
assert!(
endpoint.parent_hierarchy().contains(&component.basename()),
"Endpoint should have Component prefix in parent hierarchy"
);
println!("✓ All parent-child relationships verified");
// Relationships
assert!(namespace.parent_hierarchy().contains(&drt.basename()));
assert!(component.parent_hierarchy().contains(&namespace.basename()));
assert!(endpoint.parent_hierarchy().contains(&component.basename()));
// Test hierarchy depth
println!("\n=== Hierarchy Depth ===");
assert_eq!(
drt.parent_hierarchy().len(),
0,
"DRT should have 0 parent hierarchy levels"
);
assert_eq!(
namespace.parent_hierarchy().len(),
1,
"Namespace should have 1 parent hierarchy level"
);
assert_eq!(
component.parent_hierarchy().len(),
2,
"Component should have 2 parent hierarchy levels"
);
assert_eq!(
endpoint.parent_hierarchy().len(),
3,
"Endpoint should have 3 parent hierarchy levels"
);
println!("✓ All hierarchy depths verified");
// Depth
assert_eq!(drt.parent_hierarchy().len(), 0);
assert_eq!(namespace.parent_hierarchy().len(), 1);
assert_eq!(component.parent_hierarchy().len(), 2);
assert_eq!(endpoint.parent_hierarchy().len(), 3);
// Summary
println!("\n=== Summary ===");
println!("DRT prefix: '{}'", drt.prefix());
println!("Namespace prefix: '{}'", namespace.prefix());
println!("Component prefix: '{}'", component.prefix());
println!("Endpoint prefix: '{}'", endpoint.prefix());
println!("All hierarchy assertions passed!");
// Invalid namespace behavior (sanitization should still error after becoming "123")
let invalid_namespace = drt.namespace("@@123").unwrap();
let result = invalid_namespace.create_counter("test_counter", "A test counter", &[]);
assert!(result.is_err());
if let Err(e) = &result {
assert!(e.to_string().contains("123"));
}
// Test invalid namespace behavior
println!("\n=== Testing Invalid Namespace Behavior ===");
// Valid namespace works
let valid_namespace = drt.namespace("ns567").unwrap();
assert!(valid_namespace
.create_counter("test_counter", "A test counter", &[])
.is_ok());
}
// Create a namespace with invalid name (contains hyphen)
let invalid_namespace = drt.namespace("@@123").unwrap();
#[test]
fn test_recursive_namespace() {
// Create a distributed runtime for testing
let drt = super::test_helpers::create_test_drt();
// Debug: Let's see what the hierarchy looks like
println!(
"Invalid namespace basename: '{}'",
invalid_namespace.basename()
);
println!(
"Invalid namespace parent_hierarchy: {:?}",
invalid_namespace.parent_hierarchy()
);
println!("Invalid namespace prefix: '{}'", invalid_namespace.prefix());
// Create a deeply chained namespace: ns1.ns2.ns3
let ns1 = drt.namespace("ns1").unwrap();
let ns2 = ns1.namespace("ns2").unwrap();
let ns3 = ns2.namespace("ns3").unwrap();
// Try to create a metric - this should succeed because the namespace name will be sanitized
let result = invalid_namespace.create_counter("test_counter", "A test counter", &[]);
println!("Result with invalid namespace '@@123':");
println!("{:?}", result);
// Create a component in the deepest namespace
let component = ns3.component("test-component").unwrap();
// The result should be an error because '@@123' gets sanitized to '123' which is invalid
assert!(
result.is_err(),
"Creating metric with namespace '@@123' should fail because it gets sanitized to '123' which is invalid"
// Verify the hierarchy structure
assert_eq!(ns1.basename(), "ns1");
assert_eq!(ns1.parent_hierarchy(), vec!("".to_string()));
assert_eq!(ns1.hierarchy(), "ns1");
assert_eq!(ns2.basename(), "ns2");
assert_eq!(
ns2.parent_hierarchy(),
vec!["".to_string(), "ns1".to_string()]
);
assert_eq!(ns2.hierarchy(), "ns1_ns2");
// Verify the error message indicates the sanitized name is still invalid
if let Err(e) = &result {
let error_msg = e.to_string();
assert!(
error_msg.contains("123"),
"Error message should mention the sanitized name '123', got: {}",
error_msg
);
}
assert_eq!(ns3.basename(), "ns3");
assert_eq!(
ns3.parent_hierarchy(),
vec!["".to_string(), "ns1".to_string(), "ns2".to_string()]
);
assert_eq!(ns3.hierarchy(), "ns1_ns2_ns3");
// For comparison, show a valid namespace works
let valid_namespace = drt.namespace("test_namespace").unwrap();
let valid_result = valid_namespace.create_counter("test_counter", "A test counter", &[]);
println!("Result with valid namespace 'test_namespace':");
println!("{:?}", valid_result);
assert!(
valid_result.is_ok(),
"Creating metric with valid namespace should succeed"
assert_eq!(component.basename(), "test-component");
assert_eq!(
component.parent_hierarchy(),
vec![
"".to_string(),
"ns1".to_string(),
"ns2".to_string(),
"ns3".to_string()
]
);
assert_eq!(component.hierarchy(), "ns1_ns2_ns3_test-component");
println!("✓ Invalid namespace behavior verified!");
println!("✓ Chained namespace test passed - all prefixes correct");
}
}
#[cfg(feature = "integration")]
#[cfg(test)]
mod test_simple_metricsregistry_trait {
use super::create_test_drt;
mod test_metricsregistry_prometheus_fmt_outputs {
use super::prometheus_names::name_prefix;
use super::prometheus_names::nats as nats_metrics;
use super::prometheus_names::{COMPONENT_NATS_METRICS, DRT_NATS_METRICS};
use super::*;
use prometheus::Counter;
use std::sync::Arc;
#[test]
fn test_component_prometheus_output_contains_custom_label() {
// Arrange: DRT → namespace → component with a custom label
let drt = create_test_drt();
let namespace = drt.namespace("testnamespace").unwrap();
let component = namespace
.component("testcomponent")
.unwrap()
.add_labels(&[("service", "api")])
.unwrap();
// Act: create a simple gauge and render Prometheus text
let gauge = component
.create_gauge("with_label", "Gauge with custom label", &[])
.unwrap();
gauge.set(1.0);
let output = component.prometheus_metrics_fmt().unwrap();
// Assert: custom label is present (don’t rely on label ordering)
assert!(
output.contains("dynamo_component_with_label{") && output.contains(r#"service="api""#),
"Expected custom label service=\"api\" in Prometheus output:\n{}",
output
);
}
#[test]
fn test_factory_methods_via_registry_trait() {
fn test_prometheusfactory_using_metrics_registry_trait() {
// Setup real DRT and registry using the test-friendly constructor
let drt = create_test_drt();
let drt = super::test_helpers::create_test_drt();
// Use a simple constant namespace name
let namespace_name = "testnamespace";
let namespace_name = "ns345";
let namespace = drt.namespace(namespace_name).unwrap();
let component = namespace.component("testcomponent").unwrap();
let endpoint = component.endpoint("testendpoint");
let component = namespace.component("comp345").unwrap();
let endpoint = component.endpoint("ep345");
// Test Counter creation
let counter = endpoint
......@@ -960,15 +1081,18 @@ mod test_simple_metricsregistry_trait {
let epsilon = 0.01;
assert!((counter.get() - 123.456789).abs() < epsilon);
let endpoint_output = endpoint.prometheus_metrics_fmt().unwrap();
let endpoint_output_raw = endpoint.prometheus_metrics_fmt().unwrap();
println!("Endpoint output:");
println!("{}", endpoint_output);
println!("{}", endpoint_output_raw);
// Filter out NATS service metrics for test comparison
let endpoint_output =
super::test_helpers::remove_nats_lines(&endpoint_output_raw).join("\n");
let expected_endpoint_output = format!(
r#"# HELP dynamo_component_testcounter A test counter
# TYPE dynamo_component_testcounter counter
dynamo_component_testcounter{{dynamo_component="testcomponent",dynamo_endpoint="testendpoint",dynamo_namespace="testnamespace"}} 123.456789
"#
dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"}} 123.456789"#
);
assert_eq!(
......@@ -988,18 +1112,21 @@ dynamo_component_testcounter{{dynamo_component="testcomponent",dynamo_endpoint="
assert_eq!(gauge.get(), 50000.0);
// Test Prometheus format output for Component (gauge + histogram)
let component_output = component.prometheus_metrics_fmt().unwrap();
let component_output_raw = component.prometheus_metrics_fmt().unwrap();
println!("Component output:");
println!("{}", component_output);
println!("{}", component_output_raw);
// Filter out NATS service metrics for test comparison
let component_output =
super::test_helpers::remove_nats_lines(&component_output_raw).join("\n");
let expected_component_output = format!(
r#"# HELP dynamo_component_testcounter A test counter
# TYPE dynamo_component_testcounter counter
dynamo_component_testcounter{{dynamo_component="testcomponent",dynamo_endpoint="testendpoint",dynamo_namespace="testnamespace"}} 123.456789
dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"}} 123.456789
# HELP dynamo_component_testgauge A test gauge
# TYPE dynamo_component_testgauge gauge
dynamo_component_testgauge{{dynamo_component="testcomponent",dynamo_namespace="testnamespace"}} 50000
"#
dynamo_component_testgauge{{dynamo_component="comp345",dynamo_namespace="ns345"}} 50000"#
);
assert_eq!(
......@@ -1018,21 +1145,24 @@ dynamo_component_testgauge{{dynamo_component="testcomponent",dynamo_namespace="t
assert_eq!(intcounter.get(), 12345);
// Test Prometheus format output for Namespace (int_counter + gauge + histogram)
let namespace_output = namespace.prometheus_metrics_fmt().unwrap();
let namespace_output_raw = namespace.prometheus_metrics_fmt().unwrap();
println!("Namespace output:");
println!("{}", namespace_output);
println!("{}", namespace_output_raw);
// Filter out NATS service metrics for test comparison
let namespace_output =
super::test_helpers::remove_nats_lines(&namespace_output_raw).join("\n");
let expected_namespace_output = format!(
r#"# HELP dynamo_component_testcounter A test counter
# TYPE dynamo_component_testcounter counter
dynamo_component_testcounter{{dynamo_component="testcomponent",dynamo_endpoint="testendpoint",dynamo_namespace="testnamespace"}} 123.456789
dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"}} 123.456789
# HELP dynamo_component_testgauge A test gauge
# TYPE dynamo_component_testgauge gauge
dynamo_component_testgauge{{dynamo_component="testcomponent",dynamo_namespace="testnamespace"}} 50000
dynamo_component_testgauge{{dynamo_component="comp345",dynamo_namespace="ns345"}} 50000
# HELP dynamo_component_testintcounter A test int counter
# TYPE dynamo_component_testintcounter counter
dynamo_component_testintcounter{{dynamo_namespace="testnamespace"}} 12345
"#
dynamo_component_testintcounter{{dynamo_namespace="ns345"}} 12345"#
);
assert_eq!(
......@@ -1044,45 +1174,19 @@ dynamo_component_testintcounter{{dynamo_namespace="testnamespace"}} 12345
expected_namespace_output, namespace_output
);
// Create a histogram with specified buckets. The Prometheus format output will
// lack labels since the DistributedRuntime is unnamed.
let histogram = drt
.create_histogram(
"testhistogram",
"A test histogram",
&[],
Some(vec![1.0, 2.5, 5.0, 10.0]),
)
.unwrap();
histogram.observe(1.5);
histogram.observe(2.5);
histogram.observe(3.5);
// Test CounterVec creation
let countervec = drt
.create_countervec(
"testcountervec",
"A test counter vector",
&["method", "status"],
&[("service", "api")],
)
.unwrap();
countervec.with_label_values(&["GET", "200"]).inc_by(10.0);
countervec.with_label_values(&["POST", "201"]).inc_by(5.0);
// Test IntGauge creation
let intgauge = drt
let intgauge = namespace
.create_intgauge("testintgauge", "A test int gauge", &[])
.unwrap();
intgauge.set(42);
assert_eq!(intgauge.get(), 42);
// Test IntGaugeVec creation
let intgaugevec = drt
let intgaugevec = namespace
.create_intgaugevec(
"testintgaugevec",
"A test int gauge vector",
&["instance", "status"],
&["instance", "service", "status"],
&[("service", "api")],
)
.unwrap();
......@@ -1093,22 +1197,46 @@ dynamo_component_testintcounter{{dynamo_namespace="testnamespace"}} 12345
.with_label_values(&["server2", "inactive"])
.set(0);
// Test Prometheus format output for DRT (which should contain everything)
let drt_output = drt.prometheus_metrics_fmt().unwrap();
// Test CounterVec creation
let countervec = endpoint
.create_countervec(
"testcountervec",
"A test counter vector",
&["method", "status"],
&[("service", "api")],
)
.unwrap();
countervec.with_label_values(&["GET", "200"]).inc_by(10.0);
countervec.with_label_values(&["POST", "201"]).inc_by(5.0);
// Test Histogram creation
let histogram = component
.create_histogram("testhistogram", "A test histogram", &[], None)
.unwrap();
histogram.observe(1.0);
histogram.observe(2.5);
histogram.observe(4.0);
// Test Prometheus format output for DRT (all metrics combined)
let drt_output_raw = drt.prometheus_metrics_fmt().unwrap();
println!("DRT output:");
println!("{}", drt_output);
println!("{}", drt_output_raw);
// Filter out all NATS metrics for comparison
let filtered_drt_output =
super::test_helpers::remove_nats_lines(&drt_output_raw).join("\n");
let expected_drt_output = format!(
r#"# HELP dynamo_component_testcounter A test counter
# TYPE dynamo_component_testcounter counter
dynamo_component_testcounter{{dynamo_component="testcomponent",dynamo_endpoint="testendpoint",dynamo_namespace="testnamespace"}} 123.456789
dynamo_component_testcounter{{dynamo_component="comp345",dynamo_endpoint="ep345",dynamo_namespace="ns345"}} 123.456789
# HELP dynamo_component_testcountervec A test counter vector
# TYPE dynamo_component_testcountervec counter
dynamo_component_testcountervec{{method="GET",service="api",status="200"}} 10
dynamo_component_testcountervec{{method="POST",service="api",status="201"}} 5
# HELP dynamo_component_testgauge A test gauge
# TYPE dynamo_component_testgauge gauge
dynamo_component_testgauge{{dynamo_component="testcomponent",dynamo_namespace="testnamespace"}} 50000
dynamo_component_testgauge{{dynamo_component="comp345",dynamo_namespace="ns345"}} 50000
# HELP dynamo_component_testhistogram A test histogram
# TYPE dynamo_component_testhistogram histogram
dynamo_component_testhistogram_bucket{{le="1"}} 0
......@@ -1120,26 +1248,436 @@ dynamo_component_testhistogram_sum 7.5
dynamo_component_testhistogram_count 3
# HELP dynamo_component_testintcounter A test int counter
# TYPE dynamo_component_testintcounter counter
dynamo_component_testintcounter{{dynamo_namespace="testnamespace"}} 12345
dynamo_component_testintcounter{{dynamo_namespace="ns345"}} 12345
# HELP dynamo_component_testintgauge A test int gauge
# TYPE dynamo_component_testintgauge gauge
dynamo_component_testintgauge 42
# HELP dynamo_component_testintgaugevec A test int gauge vector
# TYPE dynamo_component_testintgaugevec gauge
dynamo_component_testintgaugevec{{instance="server1",service="api",status="active"}} 10
dynamo_component_testintgaugevec{{instance="server2",service="api",status="inactive"}} 0
"#
dynamo_component_testintgaugevec{{instance="server2",service="api",status="inactive"}} 0"#
);
assert_eq!(
filtered_drt_output, expected_drt_output,
"\n=== DRT COMPARISON FAILED ===\n\
Expected:\n{}\n\
Actual:\n{}\n\
Actual (filtered):\n{}\n\
==============================",
expected_drt_output, filtered_drt_output
);
println!("✓ All Prometheus format outputs verified successfully!");
}
/// Runs the three line-filter helpers over one fixed Prometheus snippet and checks how many
/// lines each of them keeps, plus a content check on the kept lines.
#[test]
fn test_refactored_filter_functions() {
    use super::test_helpers::{extract_metrics, extract_nats_lines, remove_nats_lines};

    // Fixture mixing plain metrics, NATS metrics, and help/type comment lines
    let test_input = r#"# HELP dynamo_component_requests Total requests
# TYPE dynamo_component_requests counter
dynamo_component_requests 42
# HELP dynamo_component_nats_connection_state Connection state
# TYPE dynamo_component_nats_connection_state gauge
dynamo_component_nats_connection_state 1
# HELP dynamo_component_latency Response latency
# TYPE dynamo_component_latency histogram
dynamo_component_latency_bucket{le="0.1"} 10
dynamo_component_latency_bucket{le="0.5"} 25
dynamo_component_nats_total_requests 100
dynamo_component_nats_total_errors 5"#;

    // remove_nats_lines: NATS lines are gone, help/type lines of other metrics stay
    let without_nats = remove_nats_lines(test_input);
    assert_eq!(without_nats.len(), 7); // 7 non-NATS lines
    assert!(without_nats.iter().all(|line| !line.contains("nats")));

    // extract_nats_lines: every NATS line, help/type lines included
    let nats_only = extract_nats_lines(test_input);
    assert_eq!(nats_only.len(), 5); // 5 NATS lines
    assert!(nats_only.iter().all(|line| line.contains("nats")));

    // extract_metrics: sample lines only, no help/type lines
    let samples = extract_metrics(test_input);
    assert_eq!(samples.len(), 6); // 6 actual metric lines (excluding help/type)
    for line in samples.iter() {
        assert!(line.starts_with("dynamo_component") && !line.starts_with("#"));
    }

    println!("✓ All refactored filter functions work correctly!");
}
}
#[cfg(feature = "integration")]
#[cfg(test)]
mod test_metricsregistry_nats {
use super::prometheus_names::name_prefix;
use super::prometheus_names::nats as nats_metrics;
use super::prometheus_names::{COMPONENT_NATS_METRICS, DRT_NATS_METRICS};
use super::*;
use crate::pipeline::PushRouter;
use crate::{DistributedRuntime, Runtime};
use tokio::time::{sleep, Duration};
/// Checks that a freshly created DistributedRuntime renders the expected NATS client metric
/// names in its Prometheus output. Only names are compared; values are never inspected.
#[test]
fn test_drt_nats_metrics() {
// Setup real DRT and registry using the test-friendly constructor
let drt = super::test_helpers::create_test_drt();
// Get DRT output which should include NATS client metrics
let drt_output = drt.prometheus_metrics_fmt().unwrap();
println!("DRT output with NATS metrics:");
println!("{}", drt_output);
// Additional checks for NATS client metrics (without checking specific values)
let drt_nats_metrics = super::test_helpers::extract_nats_lines(&drt_output);
// Check that NATS client metrics are present
assert!(
!drt_nats_metrics.is_empty(),
"NATS client metrics should be present"
);
// Check for specific NATS client metric names (without values)
// NOTE(review): extract_metrics keeps every component-prefixed sample line, not only the
// NATS ones, so the equality below assumes nothing else is registered on this DRT.
let drt_metrics = super::test_helpers::extract_metrics(&drt_output);
// NOTE(review): despite the `_sorted` suffix this list is never sorted here; it keeps the
// order of the rendered output — presumably that output is already ordered by metric
// name; confirm.
let actual_drt_nats_metrics_sorted: Vec<&str> = drt_metrics
.iter()
.map(|line| {
// Drop the label set: keep only the text before the first `{`
let without_labels = line.split('{').next().unwrap_or(line);
// Drop the value: keep only the text before the first space
without_labels.split(' ').next().unwrap_or(without_labels)
})
.collect();
// Expected names: every DRT-level NATS metric passed through build_metric_name, sorted
let expect_drt_nats_metrics_sorted = {
let mut temp = DRT_NATS_METRICS
.iter()
.map(|metric| build_metric_name(metric))
.collect::<Vec<_>>();
temp.sort();
temp
};
// Print both lists for comparison
println!(
"actual_drt_nats_metrics_sorted: {:?}",
actual_drt_nats_metrics_sorted
);
println!(
"expect_drt_nats_metrics_sorted: {:?}",
expect_drt_nats_metrics_sorted
);
// Compare the rendered names against the sorted expected names
assert_eq!(
actual_drt_nats_metrics_sorted,
expect_drt_nats_metrics_sorted,
"DRT_NATS_METRICS with prefix and expected_nats_metrics should be identical when sorted"
);
println!("✓ DistributedRuntime NATS metrics integration test passed!");
}
/// Checks that a component renders the expected component-level NATS metric names, and that
/// the DRT output afterwards holds as many samples as there are DRT-level plus
/// component-level NATS metrics.
#[test]
fn test_nats_metric_names() {
// This test only tests the existence of the NATS metrics. It does not check
// the values of the metrics.
// Setup real DRT and registry using the test-friendly constructor
let drt = super::test_helpers::create_test_drt();
// Create a namespace and a component from the DRT
let namespace = drt.namespace("ns789").unwrap();
let components = namespace.component("comp789").unwrap();
// Render the component output and keep only the NATS lines
// (names only; specific values are not checked)
let component_nats_metrics =
super::test_helpers::extract_nats_lines(&components.prometheus_metrics_fmt().unwrap());
println!(
"Component NATS metrics count: {}",
component_nats_metrics.len()
);
// Check that NATS client metrics are present
assert!(
!component_nats_metrics.is_empty(),
"NATS client metrics should be present"
);
// Check for specific NATS client metric names (without values)
// The component output is rendered a second time here rather than reused.
// NOTE(review): extract_metrics keeps every component-prefixed sample line, not only the
// NATS ones, so the equality below assumes the component has registered nothing else.
let component_metrics =
super::test_helpers::extract_metrics(&components.prometheus_metrics_fmt().unwrap());
// NOTE(review): despite the `_sorted` suffix this list is never sorted here; it keeps the
// order of the rendered output — presumably that output is already ordered by metric
// name; confirm.
let actual_component_nats_metrics_sorted: Vec<&str> = component_metrics
.iter()
.map(|line| {
// Drop the label set: keep only the text before the first `{`
let without_labels = line.split('{').next().unwrap_or(line);
// Drop the value: keep only the text before the first space
without_labels.split(' ').next().unwrap_or(without_labels)
})
.collect();
// Expected names: every component-level NATS metric passed through build_metric_name, sorted
let expect_component_nats_metrics_sorted = {
let mut temp = COMPONENT_NATS_METRICS
.iter()
.map(|metric| build_metric_name(metric))
.collect::<Vec<_>>();
temp.sort();
temp
};
// Print both lists for comparison
println!(
"actual_component_nats_metrics_sorted: {:?}",
actual_component_nats_metrics_sorted
);
println!(
"expect_component_nats_metrics_sorted: {:?}",
expect_component_nats_metrics_sorted
);
// Compare the rendered names against the sorted expected names
assert_eq!(
actual_component_nats_metrics_sorted,
expect_component_nats_metrics_sorted,
"COMPONENT_NATS_METRICS with prefix and expected_nats_metrics should be identical when sorted"
);
// Render the DRT output and keep every component-prefixed sample line
let drt_and_component_metrics =
super::test_helpers::extract_metrics(&drt.prometheus_metrics_fmt().unwrap());
println!(
"DRT and component metrics count: {}",
drt_and_component_metrics.len()
);
// The DRT output should now hold one sample per DRT-level and per component-level NATS metric
assert_eq!(
drt_and_component_metrics.len(),
DRT_NATS_METRICS.len() + COMPONENT_NATS_METRICS.len(),
"DRT at this point should have both the DRT and component NATS metrics"
);
println!("✓ Component NATS metrics integration test passed!");
}
/// Tests NATS metric values before and after endpoint activity.
/// Starts an echo endpoint in a background task, checks the initial DRT metric values, then sends
/// ten 2000-byte messages through a `PushRouter` and checks the NATS and work handler metrics
/// against expected [min, max] ranges (byte counts, message counts and processing times).
/// NOTE(review): relies on fixed sleeps (500 ms after spawning the endpoint, 100 ms between messages)
/// and on `DistributedRuntime::from_settings`, so it presumably needs live backing services — confirm.
#[tokio::test]
async fn test_nats_metrics_values() -> anyhow::Result<()> {
// Minimal engine that echoes each request back as a single-item response stream.
struct MessageHandler {}
impl MessageHandler {
fn new() -> std::sync::Arc<Self> {
std::sync::Arc::new(Self {})
}
}
#[async_trait]
impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for MessageHandler {
async fn generate(
&self,
input: SingleIn<String>,
) -> Result<ManyOut<Annotated<String>>, Error> {
let (data, ctx) = input.into_parts();
let response = format!("{}", data);
let stream = stream::iter(vec![Annotated::from_data(response)]);
Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
}
}
println!("\n=== Initializing DistributedRuntime ===");
let runtime = Runtime::from_current()?;
let drt = DistributedRuntime::from_settings(runtime.clone()).await?;
let namespace = drt.namespace("ns123").unwrap();
let component = namespace.component("comp123").unwrap();
let ingress = Ingress::for_engine(MessageHandler::new()).unwrap();
// Serve the "echo" endpoint from a background task; the handle is kept but never awaited.
let _backend_handle = tokio::spawn(async move {
let service = component.service_builder().create().await.unwrap();
let endpoint = service.endpoint("echo").endpoint_builder().handler(ingress);
endpoint.start().await.unwrap();
});
// Fixed delay before reading metrics, giving the spawned endpoint time to start.
sleep(Duration::from_millis(500)).await;
println!("✓ Launched endpoint service in background successfully");
let drt_output = drt.prometheus_metrics_fmt().unwrap();
let parsed_metrics: Vec<_> = drt_output
.lines()
.filter_map(|line| super::test_helpers::parse_prometheus_metric(line))
.collect();
println!("=== Initial DRT metrics output ===");
println!("{}", drt_output);
println!("\n=== Checking Initial Metric Values ===");
// Each entry is (full metric name, inclusive minimum, inclusive maximum).
let initial_expected_metric_values = [
// DRT NATS metrics (ordered to match DRT_NATS_METRICS)
(build_metric_name(nats::CONNECTION_STATE), 1.0, 1.0), // Should be connected
(build_metric_name(nats::CONNECTS), 1.0, 1.0), // Should have 1 connection
(build_metric_name(nats::IN_TOTAL_BYTES), 300.0, 500.0), // ~75% to ~125% of 417
(build_metric_name(nats::IN_MESSAGES), 0.0, 0.0), // No messages yet
(build_metric_name(nats::OUT_OVERHEAD_BYTES), 500.0, 700.0), // ~75% to ~125% of 612 (includes endpoint creation overhead)
(build_metric_name(nats::OUT_MESSAGES), 0.0, 0.0), // No messages yet
// Component NATS metrics (ordered to match COMPONENT_NATS_METRICS)
(build_metric_name(nats::AVG_PROCESSING_MS), 0.0, 0.0), // No processing yet
(build_metric_name(nats::TOTAL_ERRORS), 0.0, 0.0), // No errors yet
(build_metric_name(nats::TOTAL_REQUESTS), 0.0, 0.0), // No requests yet
(build_metric_name(nats::TOTAL_PROCESSING_MS), 0.0, 0.0), // No processing yet
(build_metric_name(nats::ACTIVE_SERVICES), 0.0, 0.0), // No services yet
(build_metric_name(nats::ACTIVE_ENDPOINTS), 0.0, 0.0), // No endpoints yet
];
// Every expected metric must be present (a missing one panics) and lie within its range.
for (metric_name, min_value, max_value) in &initial_expected_metric_values {
let actual_value = parsed_metrics
.iter()
.find(|(name, _, _)| name == metric_name)
.map(|(_, _, value)| *value)
.unwrap_or_else(|| panic!("Could not find expected metric: {}", metric_name));
assert!(
actual_value >= *min_value && actual_value <= *max_value,
"Initial metric {} should be between {} and {}, but got {}",
metric_name,
min_value,
max_value,
actual_value
);
}
println!("\n=== Client Runtime to hit the endpoint ===");
// A second DistributedRuntime acts as the client; the metrics asserted below are still read from `drt`.
let client_runtime = Runtime::from_current()?;
let client_distributed = DistributedRuntime::from_settings(client_runtime.clone()).await?;
let namespace = client_distributed.namespace("ns123")?;
let component = namespace.component("comp123")?;
let client = component.endpoint("echo").client().await?;
client.wait_for_instances().await?;
println!("✓ Connected to endpoint, waiting for instances...");
let router =
PushRouter::<String, Annotated<String>>::from_client(client, Default::default())
.await?;
// Send ten requests; each payload is the single digit `i` repeated 2000 times.
for i in 0..10 {
let msg = i.to_string().repeat(2000); // 2k bytes message
let mut stream = router.random(msg.clone().into()).await?;
while let Some(resp) = stream.next().await {
// Check if response matches the original message
// (the result is only printed, not asserted)
if let Some(data) = &resp.data {
let is_same = data == &msg;
println!(
"Response {}: {} bytes, matches original: {}",
i,
data.len(),
is_same
);
}
}
sleep(Duration::from_millis(100)).await;
}
println!("✓ Sent messages and received responses successfully");
let final_drt_output = drt.prometheus_metrics_fmt().unwrap();
println!("\n=== Final Prometheus DRT output ===");
println!("{}", final_drt_output);
let final_drt_nats_output = super::test_helpers::extract_nats_lines(&final_drt_output);
println!("\n=== Filtered NATS metrics from final DRT output ===");
for line in &final_drt_nats_output {
println!("{}", line);
}
let final_parsed_metrics: Vec<_> = super::test_helpers::extract_metrics(&final_drt_output)
.iter()
.filter_map(|line| super::test_helpers::parse_prometheus_metric(line))
.collect();
// Same (name, min, max) layout as the initial table, now with post-activity ranges.
let post_expected_metric_values = [
// DRT NATS metrics (ordered to match DRT_NATS_METRICS)
(build_metric_name(nats::CONNECTION_STATE), 1.0, 1.0), // Should remain connected
(build_metric_name(nats::CONNECTS), 1.0, 1.0), // Should remain 1 connection
(build_metric_name(nats::IN_TOTAL_BYTES), 22000.0, 28000.0), // ~75% to ~125% of 24977 (10 messages × 2000 bytes + overhead)
(build_metric_name(nats::IN_MESSAGES), 10.0, 12.0), // Allow small drift (callback may run twice)
(build_metric_name(nats::OUT_OVERHEAD_BYTES), 2076.0, 3461.0), // ~75% to ~125% of 2769 (synchronous metrics collection overhead)
(build_metric_name(nats::OUT_MESSAGES), 10.0, 12.0), // Allow small drift (callback may run twice)
// Component NATS metrics (ordered to match COMPONENT_NATS_METRICS)
(build_metric_name(nats::AVG_PROCESSING_MS), 0.0, 1.0), // Should be low processing time
(build_metric_name(nats::TOTAL_ERRORS), 0.0, 0.0), // Should have no errors
(build_metric_name(nats::TOTAL_REQUESTS), 0.0, 0.0), // NATS metrics don't track work handler requests
(build_metric_name(nats::TOTAL_PROCESSING_MS), 0.0, 5.0), // Should be low total processing time
(build_metric_name(nats::ACTIVE_SERVICES), 0.0, 0.0), // NATS metrics don't track work handler services
(build_metric_name(nats::ACTIVE_ENDPOINTS), 0.0, 0.0), // NATS metrics don't track work handler endpoints
// Work handler metrics with ranges
(build_metric_name(work_handler::REQUESTS_TOTAL), 10.0, 10.0), // Exact count (10 messages)
(
build_metric_name(work_handler::REQUEST_BYTES_TOTAL),
21000.0,
26000.0,
), // ~75% to ~125% of 23520 (10 × 2000 bytes + overhead)
(
build_metric_name(work_handler::RESPONSE_BYTES_TOTAL),
18000.0,
23000.0,
), // ~75% to ~125% of 20660 (10 × 2000 bytes + overhead, but response size varies)
// Additional component metrics
(
build_metric_name(work_handler::CONCURRENT_REQUESTS),
0.0,
1.0,
), // Should be 0 or very low
// The duration metric is read through its `_count` and `_sum` series.
(
format!(
"{}_count",
build_metric_name(work_handler::REQUEST_DURATION_SECONDS)
),
10.0,
10.0,
), // Exact count (10 messages)
(
format!(
"{}_sum",
build_metric_name(work_handler::REQUEST_DURATION_SECONDS)
),
0.001,
0.999,
), // Processing time sum (10 messages)
];
println!("\n=== Checking Post-Activity All Metrics (NATS + Work Handler) ===");
for (metric_name, min_value, max_value) in &post_expected_metric_values {
let actual_value = final_parsed_metrics
.iter()
.find(|(name, _, _)| name == metric_name)
.map(|(_, _, value)| *value)
.unwrap_or_else(|| {
panic!(
"Could not find expected post-activity metric: {}",
metric_name
)
});
assert!(
actual_value >= *min_value && actual_value <= *max_value,
"Post-activity metric {} should be between {} and {}, but got {}",
metric_name,
min_value,
max_value,
actual_value
);
println!(
"✓ {}: {} (range: {} to {})",
metric_name, actual_value, min_value, max_value
);
}
// NOTE(review): the summary lines below are informational only; the checks actually performed are
// the range assertions above (e.g. no separate ">= 100 bytes" assertion exists in this test).
println!("✓ All NATS and component metrics parsed successfully!");
println!("✓ Byte metrics verified to be >= 100 bytes!");
println!("✓ Post-activity metrics verified with higher thresholds!");
println!("✓ Work handler metrics reflect increased activity!");
Ok(())
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Prometheus metric name constants
//!
//! This module provides centralized Prometheus metric name constants for various components
//! to ensure consistency and avoid duplication across the codebase.
/// Returns `metric_name` with the component prefix (`name_prefix::COMPONENT`) placed in front.
pub fn build_metric_name(metric_name: &str) -> String {
    [name_prefix::COMPONENT, metric_name].concat()
}
/// Name prefixes shared by the metrics system.
pub mod name_prefix {
    /// Prefix put in front of every component metric name by `build_metric_name`.
    pub const COMPONENT: &str = "dynamo_component_";
    // TODO(keivenc): uncomment below for the frontend
    // pub const FRONTEND: &str = "dynamo_frontend_";
}
/// Automatically inserted Prometheus label names used across the metrics system.
/// These are label *keys*; the label values are supplied where the metrics are created.
pub mod labels {
/// Label key identifying the component
pub const COMPONENT: &str = "dynamo_component";
/// Label key identifying the namespace
pub const NAMESPACE: &str = "dynamo_namespace";
/// Label key identifying the endpoint
pub const ENDPOINT: &str = "dynamo_endpoint";
}
/// NATS Prometheus metric names.
///
/// The constants are bare names; `build_metric_name` prepends the component prefix to
/// produce the full exported name.
pub mod nats {
/// Prefix for all NATS client metrics
pub const PREFIX: &str = "nats_";
// ===== DistributedRuntime metrics =====
/// Total number of bytes received by NATS client
pub const IN_TOTAL_BYTES: &str = "nats_in_total_bytes";
/// Total number of bytes sent by NATS client
pub const OUT_OVERHEAD_BYTES: &str = "nats_out_overhead_bytes";
/// Total number of messages received by NATS client
pub const IN_MESSAGES: &str = "nats_in_messages";
/// Total number of messages sent by NATS client
pub const OUT_MESSAGES: &str = "nats_out_messages";
/// Total number of connections established by NATS client
pub const CONNECTS: &str = "nats_connects";
/// Current connection state of NATS client (0=disconnected, 1=connected, 2=reconnecting)
///
/// NOTE(review): `DRTNatsPrometheusMetrics::set_from_client_stats` only ever writes 0 or 1
/// (pending is reported as 0) — confirm whether 2 is emitted anywhere.
pub const CONNECTION_STATE: &str = "nats_connection_state";
// ===== Component metrics (ordered to match NatsStatsMetrics fields) =====
/// Average processing time in milliseconds (maps to: average_processing_time in ms)
pub const AVG_PROCESSING_MS: &str = "nats_avg_processing_time_ms";
/// Total errors across all endpoints (maps to: num_errors)
pub const TOTAL_ERRORS: &str = "nats_total_errors";
/// Total requests across all endpoints (maps to: num_requests)
pub const TOTAL_REQUESTS: &str = "nats_total_requests";
/// Total processing time in milliseconds (maps to: processing_time in ms)
pub const TOTAL_PROCESSING_MS: &str = "nats_total_processing_time_ms";
/// Number of active services (derived from ServiceSet.services)
pub const ACTIVE_SERVICES: &str = "nats_active_services";
/// Number of active endpoints (derived from ServiceInfo.endpoints)
pub const ACTIVE_ENDPOINTS: &str = "nats_active_endpoints";
}
/// All NATS client Prometheus metric names as an array for iteration/validation.
///
/// Entries are the bare names from the `nats` module; pass each through `build_metric_name`
/// to obtain the full exported name.
pub const DRT_NATS_METRICS: &[&str] = &[
nats::CONNECTION_STATE,
nats::CONNECTS,
nats::IN_TOTAL_BYTES,
nats::IN_MESSAGES,
nats::OUT_OVERHEAD_BYTES,
nats::OUT_MESSAGES,
];
/// All component service Prometheus metric names as an array for iteration/validation
/// (ordered to match NatsStatsMetrics fields).
///
/// Entries are the bare names from the `nats` module; pass each through `build_metric_name`
/// to obtain the full exported name.
pub const COMPONENT_NATS_METRICS: &[&str] = &[
nats::AVG_PROCESSING_MS, // maps to: average_processing_time (NATS reports nanoseconds; exported in milliseconds)
nats::TOTAL_ERRORS, // maps to: num_errors
nats::TOTAL_REQUESTS, // maps to: num_requests
nats::TOTAL_PROCESSING_MS, // maps to: processing_time (NATS reports nanoseconds; exported in milliseconds)
nats::ACTIVE_SERVICES, // derived from ServiceSet.services
nats::ACTIVE_ENDPOINTS, // derived from ServiceInfo.endpoints
];
/// Work handler Prometheus metric names.
///
/// The constants are bare names; `build_metric_name` prepends the component prefix to
/// produce the full exported name.
pub mod work_handler {
/// Total number of requests processed by work handler
pub const REQUESTS_TOTAL: &str = "requests_total";
/// Total number of bytes received in requests by work handler
pub const REQUEST_BYTES_TOTAL: &str = "request_bytes_total";
/// Total number of bytes sent in responses by work handler
pub const RESPONSE_BYTES_TOTAL: &str = "response_bytes_total";
/// Number of requests currently being processed by work handler
pub const CONCURRENT_REQUESTS: &str = "concurrent_requests";
/// Time spent processing requests by work handler (histogram; consumers read it through the
/// `_count` and `_sum` series of this name)
pub const REQUEST_DURATION_SECONDS: &str = "request_duration_seconds";
}
......@@ -19,13 +19,22 @@
// we will want to associate the components cancellation token with the
// component's "service state"
use crate::{error, transports::nats, utils::stream, Result};
use crate::{
component::Component,
error,
metrics::{prometheus_names, MetricsRegistry},
traits::*,
transports::nats,
utils::stream,
DistributedRuntime, Result,
};
use async_nats::Message;
use async_stream::try_stream;
use bytes::Bytes;
use derive_getters::Dissolve;
use futures::stream::{StreamExt, TryStreamExt};
use prometheus;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::time::Duration;
......@@ -39,11 +48,55 @@ impl ServiceClient {
}
}
/// ServiceSet contains a collection of services with their endpoints and metrics
///
/// Tree structure:
/// - ServiceSet
///   - services: Vec<ServiceInfo>
///     - name: String
///     - id: String
///     - version: String
///     - started: String
///     - endpoints: Vec<EndpointInfo>
///       - name: String
///       - subject: String
///       - data: Option<NatsStatsMetrics>
///         - average_processing_time: u64 (nanoseconds)
///         - last_error: String
///         - num_errors: u64
///         - num_requests: u64
///         - processing_time: u64 (nanoseconds)
///         - queue_group: String
///         - data: serde_json::Value (custom stats)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceSet {
services: Vec<ServiceInfo>,
}
/// This is an example JSON response from `nats req '$SRV.STATS.dynamo_backend'`:
/// {
/// "type": "io.nats.micro.v1.stats_response",
/// "name": "dynamo_backend",
/// "id": "bdu7nA8tbhy9mEkxIWlkBA",
/// "version": "0.0.1",
/// "started": "2025-08-08T05:07:17.720783523Z",
/// "endpoints": [
/// {
/// "name": "dynamo_backend-generate-694d988806b92e39",
/// "subject": "dynamo_backend.generate-694d988806b92e39",
/// "num_requests": 0,
/// "num_errors": 0,
/// "processing_time": 0,
/// "average_processing_time": 0,
/// "last_error": "",
/// "data": {
/// "val": 10
/// },
/// "queue_group": "q"
/// }
/// ]
/// }
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceInfo {
pub name: String,
......@@ -53,13 +106,15 @@ pub struct ServiceInfo {
pub endpoints: Vec<EndpointInfo>,
}
/// Each endpoint has name, subject, num_requests, num_errors, processing_time, average_processing_time, last_error, queue_group, and data
#[derive(Debug, Clone, Serialize, Deserialize, Dissolve)]
pub struct EndpointInfo {
pub name: String,
pub subject: String,
/// Extra fields that don't fit in EndpointInfo will be flattened into the NatsStatsMetrics struct.
#[serde(flatten)]
pub data: Option<Metrics>,
pub data: Option<NatsStatsMetrics>,
}
impl EndpointInfo {
......@@ -79,20 +134,21 @@ impl EndpointInfo {
// for easy deserialization. Ideally, this type already exists or can
// be exposed in the library somewhere.
/// Stats structure returned from NATS service API
/// https://github.com/nats-io/nats.rs/blob/main/async-nats/src/service/endpoint.rs
#[derive(Debug, Clone, Serialize, Deserialize, Dissolve)]
pub struct Metrics {
// Standard NATS Service API fields
pub average_processing_time: f64,
pub struct NatsStatsMetrics {
// Standard NATS Stats Service API fields from $SRV.STATS.<service_name> requests
pub average_processing_time: u64, // in nanoseconds according to nats-io
pub last_error: String,
pub num_errors: u64,
pub num_requests: u64,
pub processing_time: u64,
pub processing_time: u64, // in nanoseconds according to nats-io
pub queue_group: String,
// Field containing custom stats handler data
pub data: serde_json::Value,
}
impl Metrics {
impl NatsStatsMetrics {
pub fn decode<T: for<'de> Deserialize<'de>>(self) -> Result<T> {
serde_json::from_value(self.data).map_err(Into::into)
}
......@@ -154,6 +210,11 @@ impl ServiceSet {
.into_iter()
.flat_map(|s| s.endpoints.into_iter())
}
/// Borrow the services held by this ServiceSet as a slice
pub fn services(&self) -> &[ServiceInfo] {
    self.services.as_slice()
}
}
#[cfg(test)]
......@@ -173,8 +234,8 @@ mod tests {
EndpointInfo {
name: "endpoint1".to_string(),
subject: "subject1".to_string(),
data: Some(Metrics {
average_processing_time: 0.1,
data: Some(NatsStatsMetrics {
average_processing_time: 100_000, // 0.1ms = 100,000 nanoseconds
last_error: "none".to_string(),
num_errors: 0,
num_requests: 10,
......@@ -186,8 +247,8 @@ mod tests {
EndpointInfo {
name: "endpoint2-foo".to_string(),
subject: "subject2".to_string(),
data: Some(Metrics {
average_processing_time: 0.1,
data: Some(NatsStatsMetrics {
average_processing_time: 100_000, // 0.1ms = 100,000 nanoseconds
last_error: "none".to_string(),
num_errors: 0,
num_requests: 10,
......@@ -207,8 +268,8 @@ mod tests {
EndpointInfo {
name: "endpoint1".to_string(),
subject: "subject1".to_string(),
data: Some(Metrics {
average_processing_time: 0.1,
data: Some(NatsStatsMetrics {
average_processing_time: 100_000, // 0.1ms = 100,000 nanoseconds
last_error: "none".to_string(),
num_errors: 0,
num_requests: 10,
......@@ -220,8 +281,8 @@ mod tests {
EndpointInfo {
name: "endpoint2-bar".to_string(),
subject: "subject2".to_string(),
data: Some(Metrics {
average_processing_time: 0.1,
data: Some(NatsStatsMetrics {
average_processing_time: 100_000, // 0.1ms = 100,000 nanoseconds
last_error: "none".to_string(),
num_errors: 0,
num_requests: 10,
......@@ -244,3 +305,135 @@ mod tests {
assert_eq!(endpoints.len(), 2);
}
}
/// Prometheus metrics for component service statistics (ordered to match NatsStatsMetrics)
///
/// ⚠️ IMPORTANT: These Prometheus Gauges are COPIES of NATS data, not live references!
///
/// How it works:
/// 1. NATS provides source data via NatsStatsMetrics
/// 2. Metrics callbacks read current NATS values and update these Prometheus Gauges
/// 3. Prometheus scrapes these Gauge values (snapshots, not live data)
///
/// Flow: NATS Service → NatsStatsMetrics (Counters) → Metrics Callback → Prometheus Gauge
/// Note: These are snapshots updated when execute_metrics_callbacks() is called.
///
/// The values are aggregated over every endpoint of every service in a `ServiceSet` by
/// `update_from_service_set`, and cleared by `reset_to_zeros`.
#[derive(Debug, Clone)]
pub struct ComponentNatsPrometheusMetrics {
/// Average processing time per request in milliseconds, computed as the summed
/// `processing_time` divided by the summed `num_requests` (0 when there are no requests)
pub avg_processing_ms: prometheus::Gauge,
/// Total errors across all endpoints (maps to: num_errors)
pub total_errors: prometheus::IntGauge,
/// Total requests across all endpoints (maps to: num_requests)
pub total_requests: prometheus::IntGauge,
/// Total processing time in whole milliseconds (maps to: processing_time, converted from nanoseconds)
pub total_processing_ms: prometheus::IntGauge,
/// Number of active services (derived from ServiceSet.services)
pub active_services: prometheus::IntGauge,
/// Number of active endpoints (derived from ServiceInfo.endpoints)
pub active_endpoints: prometheus::IntGauge,
}
impl ComponentNatsPrometheusMetrics {
    /// Create a new ComponentNatsPrometheusMetrics, registering every gauge through the
    /// `Component`'s Prometheus constructors.
    ///
    /// # Errors
    /// Returns the first error reported by `create_gauge` / `create_intgauge`.
    pub fn new(component: &Component) -> Result<Self> {
        let avg_processing_ms = component.create_gauge(
            prometheus_names::nats::AVG_PROCESSING_MS,
            "Average processing time across all component endpoints in milliseconds",
            &[],
        )?;
        let total_errors = component.create_intgauge(
            prometheus_names::nats::TOTAL_ERRORS,
            "Total number of errors across all component endpoints",
            &[],
        )?;
        let total_requests = component.create_intgauge(
            prometheus_names::nats::TOTAL_REQUESTS,
            "Total number of requests across all component endpoints",
            &[],
        )?;
        let total_processing_ms = component.create_intgauge(
            prometheus_names::nats::TOTAL_PROCESSING_MS,
            "Total processing time across all component endpoints in milliseconds",
            &[],
        )?;
        let active_services = component.create_intgauge(
            prometheus_names::nats::ACTIVE_SERVICES,
            "Number of active services in this component",
            &[],
        )?;
        let active_endpoints = component.create_intgauge(
            prometheus_names::nats::ACTIVE_ENDPOINTS,
            "Number of active endpoints across all services",
            &[],
        )?;
        Ok(Self {
            avg_processing_ms,
            total_errors,
            total_requests,
            total_processing_ms,
            active_services,
            active_endpoints,
        })
    }

    /// Update metrics from scraped ServiceSet data.
    ///
    /// Sums `num_errors`, `num_requests` and `processing_time` over every endpoint that
    /// carries stats, counts services and endpoints, and writes the results to the gauges.
    /// The counters come from remote stats responses, so all accumulation saturates
    /// instead of overflowing, and values are clamped to `i64::MAX` before being stored.
    pub fn update_from_service_set(&self, service_set: &ServiceSet) {
        // Accumulators ordered to match NatsStatsMetrics fields
        let mut total_errors = 0u64; // maps to: num_errors
        let mut total_requests = 0u64; // maps to: num_requests
        let mut total_processing_time_nanos = 0u64; // maps to: processing_time (nanoseconds from NATS)
        let mut endpoint_count = 0u64; // for derived metrics

        for service in service_set.services() {
            for endpoint in &service.endpoints {
                // Endpoints without stats still count as active endpoints.
                endpoint_count = endpoint_count.saturating_add(1);
                if let Some(ref stats) = endpoint.data {
                    total_errors = total_errors.saturating_add(stats.num_errors);
                    total_requests = total_requests.saturating_add(stats.num_requests);
                    total_processing_time_nanos =
                        total_processing_time_nanos.saturating_add(stats.processing_time);
                }
            }
        }

        // Average processing time per request in milliseconds; 0 when nothing was processed.
        let avg_time_ms = if total_requests > 0 {
            let avg_time_nanos = total_processing_time_nanos as f64 / total_requests as f64;
            avg_time_nanos / 1_000_000.0 // Convert nanoseconds to milliseconds
        } else {
            0.0
        };

        self.avg_processing_ms.set(avg_time_ms);
        self.total_errors.set(Self::clamp_to_i64(total_errors)); // maps to: num_errors
        self.total_requests.set(Self::clamp_to_i64(total_requests)); // maps to: num_requests
        // maps to: processing_time (converted to whole milliseconds, truncating)
        self.total_processing_ms
            .set(Self::clamp_to_i64(total_processing_time_nanos / 1_000_000));
        // derived from ServiceSet.services
        self.active_services
            .set(Self::clamp_to_i64(service_set.services().len() as u64));
        // derived from ServiceInfo.endpoints
        self.active_endpoints.set(Self::clamp_to_i64(endpoint_count));
    }

    /// Reset all metrics to zero. Useful when no data is available or to clear stale values.
    pub fn reset_to_zeros(&self) {
        self.avg_processing_ms.set(0.0);
        self.total_errors.set(0);
        self.total_requests.set(0);
        self.total_processing_ms.set(0);
        self.active_services.set(0);
        self.active_endpoints.set(0);
    }

    /// Converts an unsigned counter to the `i64` an `IntGauge` stores, clamping at
    /// `i64::MAX` so an oversized value can never wrap around to a negative gauge.
    fn clamp_to_i64(value: u64) -> i64 {
        value.min(i64::MAX as u64) as i64
    }
}
......@@ -209,6 +209,7 @@ pub async fn spawn_system_status_server(
tracing::error!("System status server error: {}", e);
}
});
Ok((actual_address, handle))
}
......@@ -254,7 +255,18 @@ async fn metrics_handler(state: Arc<SystemStatusState>) -> impl IntoResponse {
// Update the uptime gauge with current value
state.update_uptime_gauge();
// Get metrics from the registry
// Execute all the callbacks starting at the DistributedRuntime level
assert!(state.drt().basename() == "");
let callback_results = state
.drt()
.execute_metrics_callbacks(&state.drt().hierarchy());
for result in callback_results {
if let Err(e) = result {
tracing::error!("Error executing metrics callback: {}", e);
}
}
// Get all metrics from DistributedRuntime (top-level)
match state.drt().prometheus_metrics_fmt() {
Ok(response) => (StatusCode::OK, response),
Err(e) => {
......@@ -341,12 +353,20 @@ mod tests {
let response = runtime_metrics.drt().prometheus_metrics_fmt().unwrap();
println!("Full metrics response:\n{}", response);
// Filter out NATS client metrics for comparison
use crate::metrics::prometheus_names::nats as nats_metrics;
let filtered_response: String = response
.lines()
.filter(|line| !line.contains(nats_metrics::PREFIX))
.collect::<Vec<_>>()
.join("\n");
let expected = "\
# HELP dynamo_component_dynamo_uptime_seconds Total uptime of the DistributedRuntime in seconds
# TYPE dynamo_component_dynamo_uptime_seconds gauge
dynamo_component_dynamo_uptime_seconds 42
";
assert_eq!(response, expected);
dynamo_component_dynamo_uptime_seconds 42";
assert_eq!(filtered_response, expected);
}
#[cfg(feature = "integration")]
......
......@@ -632,15 +632,11 @@ mod tests {
.id();
// Create the key
let result = client
.kv_create(key.to_string(), value.to_vec(), Some(lease_id))
.await;
let result = client.kv_create(key, value.to_vec(), Some(lease_id)).await;
assert!(result.is_ok(), "");
// Try to create the key again - this should fail
let result = client
.kv_create(key.to_string(), value.to_vec(), Some(lease_id))
.await;
let result = client.kv_create(key, value.to_vec(), Some(lease_id)).await;
assert!(result.is_err());
// Create or validate should succeed as the values match
......
......@@ -28,20 +28,23 @@
//! - `NATS_AUTH_CREDENTIALS_FILE`: the path to the credentials file
//!
//! Note: `NATS_AUTH_USERNAME` and `NATS_AUTH_PASSWORD` must be used together.
use crate::Result;
use crate::{metrics::MetricsRegistry, Result};
use async_nats::connection::State;
use async_nats::{client, jetstream, Subscriber};
use bytes::Bytes;
use derive_builder::Builder;
use futures::{StreamExt, TryStreamExt};
use prometheus::{Counter, Gauge, Histogram, HistogramOpts, IntCounter, IntGauge, Opts, Registry};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::sync::atomic::Ordering;
use tokio::fs::File as TokioFile;
use tokio::io::AsyncRead;
use tokio::time;
use url::Url;
use validator::{Validate, ValidationError};
use crate::metrics::prometheus_names::nats as nats_metrics;
pub use crate::slug::Slug;
use tracing as log;
......@@ -504,6 +507,109 @@ impl NatsQueue {
}
}
/// Prometheus metrics that mirror the NATS client statistics (in primitive types)
/// to be used for the System Status Server.
///
/// ⚠️ IMPORTANT: These Prometheus Gauges are COPIES of NATS client data, not live references!
///
/// How it works:
/// 1. NATS client provides source data via client.statistics() and connection_state()
/// 2. set_from_client_stats() reads current NATS values and updates these Prometheus Gauges
/// 3. Prometheus scrapes these Gauge values (snapshots, not live data)
///
/// Flow: NATS Client → Client Statistics → set_from_client_stats() → Prometheus Gauge
/// Note: These are snapshots updated when set_from_client_stats() is called.
#[derive(Debug, Clone)]
pub struct DRTNatsPrometheusMetrics {
// Client whose statistics and connection state are copied into the gauges below.
nats_client: client::Client,
/// Number of bytes received (excluding protocol overhead)
pub in_bytes: IntGauge,
/// Number of bytes sent (excluding protocol overhead); registered under the
/// `OUT_OVERHEAD_BYTES` metric name
pub out_bytes: IntGauge,
/// Number of messages received
pub in_messages: IntGauge,
/// Number of messages sent
pub out_messages: IntGauge,
/// Number of times connection was established
pub connects: IntGauge,
/// Current connection state as written by set_from_client_stats():
/// 1 = connected, 0 = disconnected or pending.
/// NOTE(review): the metric's help text also lists 2 = reconnecting, which is never written — confirm.
pub connection_state: IntGauge,
}
impl DRTNatsPrometheusMetrics {
    /// Build the NATS client gauges through the DistributedRuntime's Prometheus constructors
    /// and keep `nats_client` as the source the gauges are later refreshed from.
    pub fn new(drt: &crate::DistributedRuntime, nats_client: client::Client) -> Result<Self> {
        // Struct fields are evaluated top to bottom, so the gauges are created in the
        // order: in_bytes, out_bytes, in_messages, out_messages, connects, connection_state.
        Ok(Self {
            nats_client,
            in_bytes: drt.create_intgauge(
                nats_metrics::IN_TOTAL_BYTES,
                "Total number of bytes received by NATS client",
                &[],
            )?,
            out_bytes: drt.create_intgauge(
                nats_metrics::OUT_OVERHEAD_BYTES,
                "Total number of bytes sent by NATS client",
                &[],
            )?,
            in_messages: drt.create_intgauge(
                nats_metrics::IN_MESSAGES,
                "Total number of messages received by NATS client",
                &[],
            )?,
            out_messages: drt.create_intgauge(
                nats_metrics::OUT_MESSAGES,
                "Total number of messages sent by NATS client",
                &[],
            )?,
            connects: drt.create_intgauge(
                nats_metrics::CONNECTS,
                "Total number of connections established by NATS client",
                &[],
            )?,
            connection_state: drt.create_intgauge(
                nats_metrics::CONNECTION_STATE,
                "Current connection state of NATS client (0=disconnected, 1=connected, 2=reconnecting)",
                &[],
            )?,
        })
    }

    /// Snapshot the stored NATS client's statistics and connection state into the gauges.
    pub fn set_from_client_stats(&self) {
        let client_stats = self.nats_client.statistics();

        // Read every counter first, then the connection state, before touching any gauge.
        let bytes_received = client_stats.in_bytes.load(Ordering::Relaxed);
        let bytes_sent = client_stats.out_bytes.load(Ordering::Relaxed);
        let messages_received = client_stats.in_messages.load(Ordering::Relaxed);
        let messages_sent = client_stats.out_messages.load(Ordering::Relaxed);
        let connect_count = client_stats.connects.load(Ordering::Relaxed);

        // Only a fully connected client counts as "up"; Disconnected and Pending are "down".
        let is_connected = match self.nats_client.connection_state() {
            State::Connected => true,
            State::Disconnected | State::Pending => false,
        };

        // Gauges (rather than counters) let the absolute values be stored directly.
        self.in_bytes.set(bytes_received as i64);
        self.out_bytes.set(bytes_sent as i64);
        self.in_messages.set(messages_received as i64);
        self.out_messages.set(messages_sent as i64);
        self.connects.set(connect_count as i64);
        self.connection_state.set(i64::from(is_connected));
    }
}
#[cfg(test)]
mod tests {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment