// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //! Python bindings for Prometheus metric name constants //! //! ⚠️ **CRITICAL: SYNC WITH RUST SOURCE AND PYTHON TYPE STUBS** ⚠️ //! This file exposes constants from `lib/runtime/src/metrics/prometheus_names.rs` to Python. //! When the source file is modified, you MUST update BOTH files to match: //! //! 1. **This Rust file** - Update the actual Python bindings implementation //! 2. **Python type stubs** - Update `lib/bindings/python/src/dynamo/_core.pyi` //! The .pyi file provides type hints for IDEs and static type checkers. //! Without updating it, IDEs won't recognize new classes/methods for autocomplete. //! //! The constants here should mirror the structure and values from the Rust source. //! Any changes to metric names in the source must be reflected here immediately. //! //! Files to sync: //! - Source: `lib/runtime/src/metrics/prometheus_names.rs` //! - This file: `lib/bindings/python/rust/prometheus_names.rs` //! - Type stubs: `lib/bindings/python/src/dynamo/_core.pyi` //! //! ## Python Usage Example //! //! ```python //! from dynamo._core import prometheus_names //! //! # Access metrics directly (no constructor call needed!) //! frontend = prometheus_names.frontend //! print(frontend.requests_total) # "dynamo_frontend_requests_total" //! print(frontend.queued_requests) # "dynamo_frontend_queued_requests" //! print(frontend.inflight_requests) # "dynamo_frontend_inflight_requests" //! print(frontend.disconnected_clients) # "dynamo_frontend_disconnected_clients" //! print(frontend.request_duration_seconds) # "dynamo_frontend_request_duration_seconds" //! print(frontend.input_sequence_tokens) # "dynamo_frontend_input_sequence_tokens" //! print(frontend.output_sequence_tokens) # "dynamo_frontend_output_sequence_tokens" //! print(frontend.time_to_first_token_seconds) # "dynamo_frontend_time_to_first_token_seconds" //! print(frontend.inter_token_latency_seconds) # "dynamo_frontend_inter_token_latency_seconds" //! print(frontend.model_context_length) # "dynamo_frontend_model_context_length" //! print(frontend.model_kv_cache_block_size) # "dynamo_frontend_model_kv_cache_block_size" //! print(frontend.model_migration_limit) # "dynamo_frontend_model_migration_limit" //! //! work_handler = prometheus_names.work_handler //! print(work_handler.requests_total) # "dynamo_component_requests_total" //! print(work_handler.request_bytes_total) # "dynamo_component_request_bytes_total" //! print(work_handler.response_bytes_total) # "dynamo_component_response_bytes_total" //! print(work_handler.inflight_requests) # "dynamo_component_inflight_requests" //! print(work_handler.request_duration_seconds) # "dynamo_component_request_duration_seconds" //! print(work_handler.errors_total) # "dynamo_component_errors_total" //! //! kvstats = prometheus_names.kvstats //! print(kvstats.active_blocks) # "kvstats_active_blocks" //! print(kvstats.total_blocks) # "kvstats_total_blocks" //! print(kvstats.gpu_cache_usage_percent) # "kvstats_gpu_cache_usage_percent" //! print(kvstats.gpu_prefix_cache_hit_rate) # "kvstats_gpu_prefix_cache_hit_rate" //! //! # Use in Prometheus queries //! query = f"rate({frontend.requests_total}[5m])" //! pattern = rf'{work_handler.requests_total}\{{[^}}]*model="[^"]*"[^}}]*\}}' //! ``` use dynamo_runtime::metrics::prometheus_names::*; use pyo3::prelude::*; /// Main container for all Prometheus metric name constants #[pyclass] pub struct PrometheusNames; #[pymethods] impl PrometheusNames { /// Frontend service metrics #[getter] fn frontend(&self) -> FrontendService { FrontendService } /// Work handler metrics #[getter] fn work_handler(&self) -> WorkHandler { WorkHandler } /// KV stats metrics #[getter] fn kvstats(&self) -> KvStatsMetrics { KvStatsMetrics } } /// Frontend service metrics (LLM HTTP service) /// These methods return the full metric names with the "dynamo_frontend_" prefix /// /// Note: We use instance methods instead of static methods for better Python ergonomics /// - The `concat!` macro only accepts string literals, not const references /// - We need to combine `name_prefix::FRONTEND` + `frontend_service::*` constants at runtime /// - This ensures we use actual Rust constants rather than hardcoded literals #[pyclass] pub struct FrontendService; #[pymethods] impl FrontendService { /// Total number of LLM requests processed #[getter] fn requests_total(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::REQUESTS_TOTAL ) } /// Number of requests waiting in HTTP queue before receiving the first response #[getter] fn queued_requests(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::QUEUED_REQUESTS ) } /// Number of inflight requests going to the engine (vLLM, SGLang, ...) #[getter] fn inflight_requests(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::INFLIGHT_REQUESTS ) } /// Duration of LLM requests #[getter] fn request_duration_seconds(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::REQUEST_DURATION_SECONDS ) } /// Input sequence length in tokens #[getter] fn input_sequence_tokens(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::INPUT_SEQUENCE_TOKENS ) } /// Output sequence length in tokens #[getter] fn output_sequence_tokens(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::OUTPUT_SEQUENCE_TOKENS ) } /// Time to first token in seconds #[getter] fn time_to_first_token_seconds(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::TIME_TO_FIRST_TOKEN_SECONDS ) } /// Inter-token latency in seconds #[getter] fn inter_token_latency_seconds(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::INTER_TOKEN_LATENCY_SECONDS ) } /// Number of disconnected clients #[getter] fn disconnected_clients(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::DISCONNECTED_CLIENTS ) } /// Model total KV blocks #[getter] fn model_total_kv_blocks(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::MODEL_TOTAL_KV_BLOCKS ) } /// Model max number of sequences #[getter] fn model_max_num_seqs(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::MODEL_MAX_NUM_SEQS ) } /// Model max number of batched tokens #[getter] fn model_max_num_batched_tokens(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::MODEL_MAX_NUM_BATCHED_TOKENS ) } /// Model context length #[getter] fn model_context_length(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::MODEL_CONTEXT_LENGTH ) } /// Model KV cache block size #[getter] fn model_kv_cache_block_size(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::MODEL_KV_CACHE_BLOCK_SIZE ) } /// Model migration limit #[getter] fn model_migration_limit(&self) -> String { format!( "{}_{}", name_prefix::FRONTEND, frontend_service::MODEL_MIGRATION_LIMIT ) } } /// Work handler metrics (component request processing) /// These methods return the full metric names with the "dynamo_component_" prefix #[pyclass] pub struct WorkHandler; #[pymethods] impl WorkHandler { /// Total number of requests processed by work handler #[getter] fn requests_total(&self) -> String { format!( "{}_{}", name_prefix::COMPONENT, work_handler::REQUESTS_TOTAL ) } /// Total number of bytes received in requests by work handler #[getter] fn request_bytes_total(&self) -> String { format!( "{}_{}", name_prefix::COMPONENT, work_handler::REQUEST_BYTES_TOTAL ) } /// Total number of bytes sent in responses by work handler #[getter] fn response_bytes_total(&self) -> String { format!( "{}_{}", name_prefix::COMPONENT, work_handler::RESPONSE_BYTES_TOTAL ) } /// Number of requests currently being processed by work handler #[getter] fn inflight_requests(&self) -> String { format!( "{}_{}", name_prefix::COMPONENT, work_handler::INFLIGHT_REQUESTS ) } /// Time spent processing requests by work handler (histogram) #[getter] fn request_duration_seconds(&self) -> String { format!( "{}_{}", name_prefix::COMPONENT, work_handler::REQUEST_DURATION_SECONDS ) } /// Total number of errors in work handler processing #[getter] fn errors_total(&self) -> String { format!("{}_{}", name_prefix::COMPONENT, work_handler::ERRORS_TOTAL) } } /// KV stats metrics (KV cache statistics) /// These methods return the metric names with the "kvstats_" prefix #[pyclass] pub struct KvStatsMetrics; #[pymethods] impl KvStatsMetrics { /// Number of active KV cache blocks currently in use #[getter] fn active_blocks(&self) -> String { kvstats::ACTIVE_BLOCKS.to_string() } /// Total number of KV cache blocks available #[getter] fn total_blocks(&self) -> String { kvstats::TOTAL_BLOCKS.to_string() } /// GPU cache usage as a percentage (0.0-1.0) #[getter] fn gpu_cache_usage_percent(&self) -> String { kvstats::GPU_CACHE_USAGE_PERCENT.to_string() } /// GPU prefix cache hit rate as a percentage (0.0-1.0) #[getter] fn gpu_prefix_cache_hit_rate(&self) -> String { kvstats::GPU_PREFIX_CACHE_HIT_RATE.to_string() } } /// Add prometheus_names module to the Python bindings pub fn add_to_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; // Add a module-level singleton instance for convenience let prometheus_names_instance = PrometheusNames; m.add("prometheus_names", prometheus_names_instance)?; Ok(()) }