# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Python constants for Prometheus metric names

AUTO-GENERATED from lib/runtime/src/metrics/prometheus_names.rs
DO NOT EDIT THIS FILE MANUALLY

To regenerate this file after modifying lib/runtime/src/metrics/prometheus_names.rs:
    cargo run -p dynamo-codegen --bin gen-python-prometheus-names

This module provides pure Python access to Prometheus metric name constants
without requiring Rust bindings.

Usage (both patterns supported):
    # Pattern 1: Import module
    from dynamo import prometheus_names
    print(prometheus_names.frontend_service.REQUESTS_TOTAL)  # "requests_total"
    print(prometheus_names.work_handler.ERRORS_TOTAL)  # "errors_total"

    # Pattern 2: Import specific classes
    from dynamo.prometheus_names import frontend_service, work_handler
    print(frontend_service.REQUESTS_TOTAL)  # "requests_total"
    print(work_handler.ERRORS_TOTAL)  # "errors_total"
"""

from __future__ import annotations


class component_names:
    """Well-known component names used as values for the `dynamo_component` label."""

    # Component name for the KV router (frontend-side request routing).
    ROUTER = "router"


class distributed_runtime:
    """DistributedRuntime core metrics"""

    # Total uptime of the DistributedRuntime in seconds
    UPTIME_SECONDS = "uptime_seconds"


class frontend_service:
    """Frontend service metrics (LLM HTTP service)"""

    # Environment variable that overrides the default metric prefix
    METRICS_PREFIX_ENV = "DYN_METRICS_PREFIX"
    # Total number of LLM requests processed
    REQUESTS_TOTAL = "requests_total"
    # Number of requests waiting in HTTP queue before receiving the first response (gauge)
    QUEUED_REQUESTS = "queued_requests"
    # Number of inflight/concurrent requests going to the engine (vLLM, SGLang, ...)
    # Note: This is a gauge metric (current state) that can go up and down, so no _total suffix
    INFLIGHT_REQUESTS = "inflight_requests"
    # Number of disconnected clients (gauge that can go up and down)
    DISCONNECTED_CLIENTS = "disconnected_clients"
    # Duration of LLM requests
    REQUEST_DURATION_SECONDS = "request_duration_seconds"
    # Input sequence length in tokens
    INPUT_SEQUENCE_TOKENS = "input_sequence_tokens"
    # Output sequence length in tokens
    OUTPUT_SEQUENCE_TOKENS = "output_sequence_tokens"
    # Predicted KV cache hit rate at routing time (0.0-1.0)
    KV_HIT_RATE = "kv_hit_rate"
    # Number of cached tokens (prefix cache hits) per request
    CACHED_TOKENS = "cached_tokens"
    # Tokenizer latency in milliseconds
    TOKENIZER_LATENCY_MS = "tokenizer_latency_ms"
    # Total number of output tokens generated (counter that updates in real-time)
    OUTPUT_TOKENS_TOTAL = "output_tokens_total"
    # Time to first token in seconds
    TIME_TO_FIRST_TOKEN_SECONDS = "time_to_first_token_seconds"
    # Inter-token latency in seconds
    INTER_TOKEN_LATENCY_SECONDS = "inter_token_latency_seconds"
    # Model configuration metrics
    # Runtime config metrics (from ModelRuntimeConfig):
    # Total KV blocks available for a worker serving the model
    MODEL_TOTAL_KV_BLOCKS = "model_total_kv_blocks"
    # Maximum number of sequences for a worker serving the model (runtime config)
    MODEL_MAX_NUM_SEQS = "model_max_num_seqs"
    # Maximum number of batched tokens for a worker serving the model (runtime config)
    MODEL_MAX_NUM_BATCHED_TOKENS = "model_max_num_batched_tokens"
    # MDC metrics (from ModelDeploymentCard):
    # Maximum context length for a worker serving the model (MDC)
    MODEL_CONTEXT_LENGTH = "model_context_length"
    # KV cache block size for a worker serving the model (MDC)
    MODEL_KV_CACHE_BLOCK_SIZE = "model_kv_cache_block_size"
    # Request migration limit for a worker serving the model (MDC)
    MODEL_MIGRATION_LIMIT = "model_migration_limit"
    # Total number of request migrations due to worker unavailability
    MODEL_MIGRATION_TOTAL = "model_migration_total"
    # Active decode blocks (KV cache blocks) per worker
    # Gauge metric tracking current KV cache block utilization for each worker
    WORKER_ACTIVE_DECODE_BLOCKS = "worker_active_decode_blocks"
    # Active prefill tokens per worker
    # Gauge metric tracking current queued prefill tokens for each worker
    WORKER_ACTIVE_PREFILL_TOKENS = "worker_active_prefill_tokens"
    # Last observed time to first token per worker (in seconds)
    # Gauge metric tracking the most recent TTFT for each worker
    WORKER_LAST_TIME_TO_FIRST_TOKEN_SECONDS = "worker_last_time_to_first_token_seconds"
    # Last observed input sequence tokens per worker
    # Gauge metric tracking the input token count from the same request as WORKER_LAST_TIME_TO_FIRST_TOKEN_SECONDS
    # Updated atomically with TTFT to correlate latency with input size
    WORKER_LAST_INPUT_SEQUENCE_TOKENS = "worker_last_input_sequence_tokens"
    # Last observed inter-token latency per worker (in seconds)
    # Gauge metric tracking the most recent ITL for each worker
    WORKER_LAST_INTER_TOKEN_LATENCY_SECONDS = "worker_last_inter_token_latency_seconds"
    # Number of requests pending in the router's scheduler queue (gauge per worker_type)
    ROUTER_QUEUE_PENDING_REQUESTS = "router_queue_pending_requests"
    # Label name for the type of migration
    MIGRATION_TYPE_LABEL = "migration_type"
    # Label name for tokenizer operation
    OPERATION_LABEL = "operation"


class kvbm:
    """KVBM"""

    # The number of offload blocks from device to host
    OFFLOAD_BLOCKS_D2H = "offload_blocks_d2h"
    # The number of offload blocks from host to disk
    OFFLOAD_BLOCKS_H2D = "offload_blocks_h2d"
    # The number of offload blocks from device to disk (bypassing host memory)
    OFFLOAD_BLOCKS_D2D = "offload_blocks_d2d"
    # The number of onboard blocks from host to device
    ONBOARD_BLOCKS_H2D = "onboard_blocks_h2d"
    # The number of onboard blocks from disk to device
    ONBOARD_BLOCKS_D2D = "onboard_blocks_d2d"
    # The number of matched tokens
    MATCHED_TOKENS = "matched_tokens"
    # Host cache hit rate (0.0-1.0) from the sliding window
    HOST_CACHE_HIT_RATE = "host_cache_hit_rate"
    # Disk cache hit rate (0.0-1.0) from the sliding window
    DISK_CACHE_HIT_RATE = "disk_cache_hit_rate"
    # Object storage cache hit rate (0.0-1.0) from the sliding window
    OBJECT_CACHE_HIT_RATE = "object_cache_hit_rate"
    # Number of blocks offloaded from device to object storage
    OFFLOAD_BLOCKS_D2O = "offload_blocks_d2o"
    # Number of blocks onboarded from object storage to device
    ONBOARD_BLOCKS_O2D = "onboard_blocks_o2d"
    # Bytes transferred to object storage (offload)
    OFFLOAD_BYTES_OBJECT = "offload_bytes_object"
    # Bytes transferred from object storage (onboard)
    ONBOARD_BYTES_OBJECT = "onboard_bytes_object"
    # Number of failed object storage read operations (blocks)
    OBJECT_READ_FAILURES = "object_read_failures"
    # Number of failed object storage write operations (blocks)
    OBJECT_WRITE_FAILURES = "object_write_failures"


class kvrouter:
    """KV router metric names (KV cache events applied to the index)"""

    # Number of KV cache events applied to the index (including status)
    KV_CACHE_EVENTS_APPLIED = "kv_cache_events_applied"


class kvstats:
    """KV cache statistics metric names (block totals and GPU cache usage)"""

    # Total number of KV cache blocks available on the worker
    TOTAL_BLOCKS = "total_blocks"
    # GPU cache usage as a percentage (0.0-1.0)
    GPU_CACHE_USAGE_PERCENT = "gpu_cache_usage_percent"


class labels:
    """Automatically inserted Prometheus label names used across the metrics system"""

    # Label for component identification
    COMPONENT = "dynamo_component"
    # Label for namespace identification
    NAMESPACE = "dynamo_namespace"
    # Label for endpoint identification
    ENDPOINT = "dynamo_endpoint"
    # Label for worker data-parallel rank.
    # Note: this is not an auto-inserted label like `dynamo_namespace`/`dynamo_component`.
    # It is used by worker/load-style metrics that need to disambiguate per-worker series.
    DP_RANK = "dp_rank"
    # Label for worker instance ID (etcd lease ID).
    WORKER_ID = "worker_id"
    # Label for model name/path (OpenAI API standard, injected by Dynamo)
    # This is the standard label name injected by all backends in metrics_labels=[("model", ...)].
    # Ensures compatibility with OpenAI-compatible tooling.
    MODEL = "model"
    # Label for model name/path (alternative/native engine label, injected by Dynamo)
    # Some engines natively use model_name, so we inject both model and model_name
    # to ensure maximum compatibility with both OpenAI standard and engine-native tooling.
    # When a metric already has a label, injection does not overwrite it (original is preserved).
    MODEL_NAME = "model_name"
    # Label for worker type (e.g., "aggregated", "prefill", "decode", "encoder", etc.)
    WORKER_TYPE = "worker_type"
    # Label for router instance (discovery.instance_id() of the frontend)
    ROUTER_ID = "router_id"


class model_info:
    """Model info metric names (model load time)"""

    # Model load time in seconds
    LOAD_TIME_SECONDS = "model_load_time_seconds"


class name_prefix:
    """Metric name prefixes used across the metrics system"""

    # Prefix for all Prometheus metric names.
    COMPONENT = "dynamo_component"
    # Prefix for frontend service metrics
    FRONTEND = "dynamo_frontend"
    # Prefix for KV router metrics (used with router_id label)
    ROUTER = "dynamo_router"


class router:
    """Router request metrics (component-scoped aggregate histograms + counter)"""

    # Total number of requests processed by the router
    REQUESTS_TOTAL = "router_requests_total"
    # Time to first token observed at the router (seconds)
    TIME_TO_FIRST_TOKEN_SECONDS = "router_time_to_first_token_seconds"
    # Average inter-token latency observed at the router (seconds)
    INTER_TOKEN_LATENCY_SECONDS = "router_inter_token_latency_seconds"
    # Input sequence length in tokens observed at the router
    INPUT_SEQUENCE_TOKENS = "router_input_sequence_tokens"
    # Output sequence length in tokens observed at the router
    OUTPUT_SEQUENCE_TOKENS = "router_output_sequence_tokens"


class router_request:
    """Router per-request metrics (component-scoped via `MetricsHierarchy`)."""

    # Prefix prepended to `frontend_service::*` names to form router metric names.
    # e.g. `"router_"` + `frontend_service::REQUESTS_TOTAL` → `"router_requests_total"`.
    METRIC_PREFIX = "router_"


class routing_overhead:
    """Routing overhead phase latency histogram suffixes."""

    # Time spent computing block hashes
    BLOCK_HASHING_MS = "overhead_block_hashing_ms"
    # Time spent in indexer find_matches
    INDEXER_FIND_MATCHES_MS = "overhead_indexer_find_matches_ms"
    # Time spent computing sequence hashes
    SEQ_HASHING_MS = "overhead_seq_hashing_ms"
    # Time spent in scheduler worker selection
    SCHEDULING_MS = "overhead_scheduling_ms"
    # Total routing overhead per request
    TOTAL_MS = "overhead_total_ms"


class task_tracker:
    """Task tracker Prometheus metric name suffixes"""

    # Total number of tasks issued/submitted
    TASKS_ISSUED_TOTAL = "tasks_issued_total"
    # Total number of tasks started
    TASKS_STARTED_TOTAL = "tasks_started_total"
    # Total number of successfully completed tasks
    TASKS_SUCCESS_TOTAL = "tasks_success_total"
    # Total number of cancelled tasks
    TASKS_CANCELLED_TOTAL = "tasks_cancelled_total"
    # Total number of failed tasks
    TASKS_FAILED_TOTAL = "tasks_failed_total"
    # Total number of rejected tasks
    TASKS_REJECTED_TOTAL = "tasks_rejected_total"


class work_handler:
    """Work handler Prometheus metric names"""

    # Total number of requests processed by work handler
    REQUESTS_TOTAL = "requests_total"
    # Total number of bytes received in requests by work handler
    REQUEST_BYTES_TOTAL = "request_bytes_total"
    # Total number of bytes sent in responses by work handler
    RESPONSE_BYTES_TOTAL = "response_bytes_total"
    # Number of requests currently being processed by work handler
    # Note: This is a gauge metric (current state) that can go up and down, so no _total suffix
    INFLIGHT_REQUESTS = "inflight_requests"
    # Time spent processing requests by work handler (histogram)
    REQUEST_DURATION_SECONDS = "request_duration_seconds"
    # Total number of errors in work handler processing
    ERRORS_TOTAL = "errors_total"
    # Label name for error type classification
    ERROR_TYPE_LABEL = "error_type"