# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Python constants for Prometheus metric names

AUTO-GENERATED from lib/runtime/src/metrics/prometheus_names.rs
DO NOT EDIT THIS FILE MANUALLY

To regenerate this file after modifying lib/runtime/src/metrics/prometheus_names.rs:
    cargo run -p dynamo-codegen --bin gen-python-prometheus-names

This module provides pure Python access to Prometheus metric name constants
without requiring Rust bindings.

Usage (both patterns supported):
    # Pattern 1: Import module
    from dynamo import prometheus_names
    print(prometheus_names.frontend_service.REQUESTS_TOTAL)  # "requests_total"
    print(prometheus_names.work_handler.ERRORS_TOTAL)  # "errors_total"

    # Pattern 2: Import specific classes
    from dynamo.prometheus_names import frontend_service, work_handler
    print(frontend_service.REQUESTS_TOTAL)  # "requests_total"
    print(work_handler.ERRORS_TOTAL)  # "errors_total"
"""

from __future__ import annotations


class component_names:
    """Well-known values that may appear in the `dynamo_component` label."""

    # The KV router component, which performs request routing on the frontend side.
    ROUTER = "router"


class distributed_runtime:
    """Metric names for the DistributedRuntime core."""

    # How long the DistributedRuntime has been up, expressed in seconds.
    UPTIME_SECONDS = "uptime_seconds"


class frontend_service:
    """Frontend service metrics (LLM HTTP service)"""

    # Environment variable that overrides the default metric prefix
    METRICS_PREFIX_ENV = "DYN_METRICS_PREFIX"
    # Total number of LLM requests processed
    REQUESTS_TOTAL = "requests_total"
    # Number of requests waiting in HTTP queue before receiving the first response (gauge)
    QUEUED_REQUESTS = "queued_requests"
    # Number of inflight/concurrent requests going to the engine (vLLM, SGLang, ...)
    # Note: This is a gauge metric (current state) that can go up and down, so no _total suffix
    INFLIGHT_REQUESTS = "inflight_requests"
    # Number of disconnected clients (gauge that can go up and down)
    DISCONNECTED_CLIENTS = "disconnected_clients"
    # Duration of LLM requests
    REQUEST_DURATION_SECONDS = "request_duration_seconds"
    # Input sequence length in tokens
    INPUT_SEQUENCE_TOKENS = "input_sequence_tokens"
    # Output sequence length in tokens
    OUTPUT_SEQUENCE_TOKENS = "output_sequence_tokens"
    # Predicted KV cache hit rate at routing time (0.0-1.0)
    KV_HIT_RATE = "kv_hit_rate"
    # Number of cached tokens (prefix cache hits) per request
    CACHED_TOKENS = "cached_tokens"
    # Tokenizer latency in milliseconds
    TOKENIZER_LATENCY_MS = "tokenizer_latency_ms"
    # Total number of output tokens generated (counter that updates in real-time)
    OUTPUT_TOKENS_TOTAL = "output_tokens_total"
    # Time to first token in seconds
    TIME_TO_FIRST_TOKEN_SECONDS = "time_to_first_token_seconds"
    # Inter-token latency in seconds
    INTER_TOKEN_LATENCY_SECONDS = "inter_token_latency_seconds"
    # Model configuration metrics
    # Runtime config metrics (from ModelRuntimeConfig):
    # Total KV blocks available for a worker serving the model
    MODEL_TOTAL_KV_BLOCKS = "model_total_kv_blocks"
    # Maximum number of sequences for a worker serving the model (runtime config)
    MODEL_MAX_NUM_SEQS = "model_max_num_seqs"
    # Maximum number of batched tokens for a worker serving the model (runtime config)
    MODEL_MAX_NUM_BATCHED_TOKENS = "model_max_num_batched_tokens"
    # MDC metrics (from ModelDeploymentCard):
    # Maximum context length for a worker serving the model (MDC)
    MODEL_CONTEXT_LENGTH = "model_context_length"
    # KV cache block size for a worker serving the model (MDC)
    MODEL_KV_CACHE_BLOCK_SIZE = "model_kv_cache_block_size"
    # Request migration limit for a worker serving the model (MDC)
    MODEL_MIGRATION_LIMIT = "model_migration_limit"
    # Total number of request migrations due to worker unavailability
    MODEL_MIGRATION_TOTAL = "model_migration_total"
    # Active decode blocks (KV cache blocks) per worker
    # Gauge metric tracking current KV cache block utilization for each worker
    WORKER_ACTIVE_DECODE_BLOCKS = "worker_active_decode_blocks"
    # Active prefill tokens per worker
    # Gauge metric tracking current queued prefill tokens for each worker
    WORKER_ACTIVE_PREFILL_TOKENS = "worker_active_prefill_tokens"
    # Last observed time to first token per worker (in seconds)
    # Gauge metric tracking the most recent TTFT for each worker
    WORKER_LAST_TIME_TO_FIRST_TOKEN_SECONDS = "worker_last_time_to_first_token_seconds"
    # Last observed input sequence tokens per worker
    # Gauge metric tracking the input token count from the same request as WORKER_LAST_TIME_TO_FIRST_TOKEN_SECONDS
    # Updated atomically with TTFT to correlate latency with input size
    WORKER_LAST_INPUT_SEQUENCE_TOKENS = "worker_last_input_sequence_tokens"
    # Last observed inter-token latency per worker (in seconds)
    # Gauge metric tracking the most recent ITL for each worker
    WORKER_LAST_INTER_TOKEN_LATENCY_SECONDS = "worker_last_inter_token_latency_seconds"
    # Number of requests pending in the router's scheduler queue (gauge per worker_type)
    ROUTER_QUEUE_PENDING_REQUESTS = "router_queue_pending_requests"
    # Label name for the type of migration
    MIGRATION_TYPE_LABEL = "migration_type"
    # Label name for tokenizer operation
    OPERATION_LABEL = "operation"


class kvbm:
    """KVBM metric names.

    Note on abbreviations: the `d2d`/`h2d` suffixes do not mean the same thing
    for offload and onboard metrics; each constant's comment gives the direction.
    """

    # ---- Block offload counts ----
    # Blocks offloaded device -> host.
    OFFLOAD_BLOCKS_D2H = "offload_blocks_d2h"
    # Blocks offloaded host -> disk (here `h2d` stands for host-to-disk).
    OFFLOAD_BLOCKS_H2D = "offload_blocks_h2d"
    # Blocks offloaded device -> disk, bypassing host memory.
    OFFLOAD_BLOCKS_D2D = "offload_blocks_d2d"

    # ---- Block onboard counts ----
    # Blocks onboarded host -> device (here `h2d` stands for host-to-device).
    ONBOARD_BLOCKS_H2D = "onboard_blocks_h2d"
    # Blocks onboarded disk -> device.
    ONBOARD_BLOCKS_D2D = "onboard_blocks_d2d"

    # Count of matched tokens.
    MATCHED_TOKENS = "matched_tokens"

    # ---- Cache hit rates (each 0.0-1.0, taken from the sliding window) ----
    # Hit rate of the host cache.
    HOST_CACHE_HIT_RATE = "host_cache_hit_rate"
    # Hit rate of the disk cache.
    DISK_CACHE_HIT_RATE = "disk_cache_hit_rate"
    # Hit rate of the object storage cache.
    OBJECT_CACHE_HIT_RATE = "object_cache_hit_rate"

    # ---- Object storage ----
    # Blocks offloaded device -> object storage.
    OFFLOAD_BLOCKS_D2O = "offload_blocks_d2o"
    # Blocks onboarded object storage -> device.
    ONBOARD_BLOCKS_O2D = "onboard_blocks_o2d"
    # Bytes sent to object storage (offload direction).
    OFFLOAD_BYTES_OBJECT = "offload_bytes_object"
    # Bytes received from object storage (onboard direction).
    ONBOARD_BYTES_OBJECT = "onboard_bytes_object"
    # Failed object storage reads, counted in blocks.
    OBJECT_READ_FAILURES = "object_read_failures"
    # Failed object storage writes, counted in blocks.
    OBJECT_WRITE_FAILURES = "object_write_failures"


class kvrouter:
    """KV router metric names"""

    # Number of KV cache events applied to the index (including status)
    KV_CACHE_EVENTS_APPLIED = "kv_cache_events_applied"


class kvstats:
    """KV cache statistics metric names"""

    # Total number of KV cache blocks available on the worker
    TOTAL_BLOCKS = "total_blocks"
    # GPU cache usage expressed as a fraction in the range 0.0-1.0
    # (the metric name keeps its `_percent` suffix; the value is not scaled to 0-100)
    GPU_CACHE_USAGE_PERCENT = "gpu_cache_usage_percent"


class labels:
    """Prometheus label names used across the metrics system.

    Some labels are inserted automatically (e.g. `dynamo_namespace`,
    `dynamo_component`); others, such as `dp_rank`, are not; see the
    per-constant comments.
    """

    # Identifies the component.
    COMPONENT = "dynamo_component"
    # Identifies the namespace.
    NAMESPACE = "dynamo_namespace"
    # Identifies the endpoint.
    ENDPOINT = "dynamo_endpoint"

    # Data-parallel rank of the worker.
    # Unlike `dynamo_namespace`/`dynamo_component`, this label is NOT auto-inserted;
    # worker/load-style metrics use it to tell per-worker series apart.
    DP_RANK = "dp_rank"
    # Instance ID of the worker (etcd lease ID).
    WORKER_ID = "worker_id"

    # Model name/path using the OpenAI API standard label name (injected by Dynamo).
    # All backends inject this standard name via metrics_labels=[("model", ...)],
    # which keeps the metrics compatible with OpenAI-compatible tooling.
    MODEL = "model"
    # Model name/path using the alternative, engine-native label name (injected by Dynamo).
    # Because some engines natively use model_name, both `model` and `model_name` are
    # injected for compatibility with OpenAI-standard and engine-native tooling alike.
    # Injection never overwrites a label a metric already has; the original is preserved.
    MODEL_NAME = "model_name"

    # Type of the worker (e.g., "aggregated", "prefill", "decode", "encoder", etc.).
    WORKER_TYPE = "worker_type"
    # Router instance (the frontend's discovery.instance_id()).
    ROUTER_ID = "router_id"


class model_info:
    """Model info metric names"""

    # Model load time in seconds
    LOAD_TIME_SECONDS = "model_load_time_seconds"


class name_prefix:
    """Prefixes for metric names, shared across the metrics system."""

    # Prefix used for all Prometheus metric names.
    COMPONENT = "dynamo_component"
    # Prefix used for frontend service metrics.
    FRONTEND = "dynamo_frontend"
    # Prefix used for KV router metrics (paired with the router_id label).
    ROUTER = "dynamo_router"


class router:
    """Metric names for router requests (component-scoped aggregate histograms plus a counter)."""

    # Count of requests the router has processed (total).
    REQUESTS_TOTAL = "router_requests_total"
    # Time to first token as observed at the router, in seconds.
    TIME_TO_FIRST_TOKEN_SECONDS = "router_time_to_first_token_seconds"
    # Average inter-token latency as observed at the router, in seconds.
    INTER_TOKEN_LATENCY_SECONDS = "router_inter_token_latency_seconds"
    # Input sequence length, in tokens, as observed at the router.
    INPUT_SEQUENCE_TOKENS = "router_input_sequence_tokens"
    # Output sequence length, in tokens, as observed at the router.
    OUTPUT_SEQUENCE_TOKENS = "router_output_sequence_tokens"


class router_request:
    """Per-request router metrics (component-scoped via `MetricsHierarchy`)."""

    # Prefix placed in front of `frontend_service.*` names to build router metric names,
    # e.g. `"router_"` + `frontend_service.REQUESTS_TOTAL` gives `"router_requests_total"`.
    METRIC_PREFIX = "router_"


class routing_overhead:
    """Name suffixes for the per-phase routing overhead latency histograms."""

    # Time taken to compute block hashes.
    BLOCK_HASHING_MS = "overhead_block_hashing_ms"
    # Time taken by the indexer's find_matches.
    INDEXER_FIND_MATCHES_MS = "overhead_indexer_find_matches_ms"
    # Time taken to compute sequence hashes.
    SEQ_HASHING_MS = "overhead_seq_hashing_ms"
    # Time taken by the scheduler to select a worker.
    SCHEDULING_MS = "overhead_scheduling_ms"
    # Overall routing overhead for each request.
    TOTAL_MS = "overhead_total_ms"


class task_tracker:
    """Name suffixes for the task tracker's Prometheus metrics."""

    # Count of tasks issued/submitted (total).
    TASKS_ISSUED_TOTAL = "tasks_issued_total"
    # Count of tasks that were started (total).
    TASKS_STARTED_TOTAL = "tasks_started_total"
    # Count of tasks that completed successfully (total).
    TASKS_SUCCESS_TOTAL = "tasks_success_total"
    # Count of tasks that were cancelled (total).
    TASKS_CANCELLED_TOTAL = "tasks_cancelled_total"
    # Count of tasks that failed (total).
    TASKS_FAILED_TOTAL = "tasks_failed_total"
    # Count of tasks that were rejected (total).
    TASKS_REJECTED_TOTAL = "tasks_rejected_total"


class work_handler:
    """Prometheus metric names for the work handler."""

    # Count of requests the work handler has processed (total).
    REQUESTS_TOTAL = "requests_total"
    # Count of request bytes the work handler has received (total).
    REQUEST_BYTES_TOTAL = "request_bytes_total"
    # Count of response bytes the work handler has sent (total).
    RESPONSE_BYTES_TOTAL = "response_bytes_total"
    # Gauge: requests the work handler is processing right now.
    # Being a gauge, the value rises and falls, which is why there is no `_total` suffix.
    INFLIGHT_REQUESTS = "inflight_requests"
    # Histogram of the time the work handler spends processing requests.
    REQUEST_DURATION_SECONDS = "request_duration_seconds"
    # Count of errors raised during work handler processing (total).
    ERRORS_TOTAL = "errors_total"

    # Name of the label that classifies the error type.
    ERROR_TYPE_LABEL = "error_type"