prometheus.py 2.78 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

import os
import tempfile
from typing import Optional

from prometheus_client import REGISTRY, CollectorRegistry, multiprocess

from vllm.logger import init_logger

logger = init_logger(__name__)

# Global temporary directory for prometheus multiprocessing
_prometheus_multiproc_dir: Optional[tempfile.TemporaryDirectory] = None


def setup_multiprocess_prometheus():
    """Ensure ``PROMETHEUS_MULTIPROC_DIR`` is set for this process.

    If the variable is missing from the environment, a new
    ``tempfile.TemporaryDirectory`` is created, stored in the module-level
    ``_prometheus_multiproc_dir`` and its path exported via ``os.environ``.
    If the variable is already present, nothing is created and a warning
    about stale metric files is logged instead.
    """
    global _prometheus_multiproc_dir

    if "PROMETHEUS_MULTIPROC_DIR" in os.environ:
        # The directory was chosen by the user, so wiping it between runs
        # is their responsibility; we only warn.
        logger.warning("Found PROMETHEUS_MULTIPROC_DIR was set by user. "
                       "This directory must be wiped between vLLM runs or "
                       "you will find inaccurate metrics. Unset the variable "
                       "and vLLM will properly handle cleanup.")
        return

    # Hold the TemporaryDirectory in a module global: the object removes its
    # directory when finalized, so it must outlive this call and will be
    # cleaned up automatically at interpreter exit.
    tmp_dir = tempfile.TemporaryDirectory()
    _prometheus_multiproc_dir = tmp_dir
    os.environ["PROMETHEUS_MULTIPROC_DIR"] = tmp_dir.name
    logger.debug("Created PROMETHEUS_MULTIPROC_DIR at %s", tmp_dir.name)


39
def get_prometheus_registry() -> CollectorRegistry:
    """Return the prometheus registry matching the current configuration.

    Returns:
        CollectorRegistry: a newly created registry with a
        ``MultiProcessCollector`` attached when ``PROMETHEUS_MULTIPROC_DIR``
        is set in the environment; otherwise the global default
        ``REGISTRY``.
    """
    if os.getenv("PROMETHEUS_MULTIPROC_DIR") is None:
        # Single-process mode: the default global registry is sufficient.
        return REGISTRY

    logger.debug("Using multiprocess registry for prometheus metrics")
    multiproc_registry = CollectorRegistry()
    # The collector is constructed only for its effect on the registry that
    # is passed in; the instance itself is not kept.
    multiprocess.MultiProcessCollector(multiproc_registry)
    return multiproc_registry


def unregister_vllm_metrics():
    """Remove vLLM collectors from the global prometheus ``REGISTRY``.

    Intended for tests and CI/CD, where the same metrics may otherwise be
    registered several times across runs. It is also needed in the
    multiprocess case, where the metrics must be removed from the global
    registry.

    A collector is treated as a vLLM collector when it has a ``_name``
    attribute containing the substring ``"vllm"``.
    """
    # Take a snapshot first: unregister() modifies the mapping being walked.
    for candidate in tuple(REGISTRY._collector_to_names):
        if not hasattr(candidate, "_name"):
            continue
        if "vllm" in candidate._name:
            REGISTRY.unregister(candidate)


def shutdown_prometheus():
    """Mark this process's prometheus multiprocess metrics as dead.

    Does nothing unless this module created the multiprocess directory
    itself, i.e. ``_prometheus_multiproc_dir`` is set. Cleanup is
    best-effort: any failure is logged and never raised, so it cannot
    interrupt shutdown.
    """
    multiproc_dir = _prometheus_multiproc_dir
    if multiproc_dir is None:
        return

    try:
        pid = os.getpid()
        # mark_process_dead() needs a filesystem path. Pass the directory's
        # name: the TemporaryDirectory wrapper itself is not path-like and
        # would make the call fail with a TypeError.
        multiprocess.mark_process_dead(pid, multiproc_dir.name)
        logger.debug("Marked Prometheus metrics for process %d as dead", pid)
    except Exception as e:
        logger.error("Error during metrics cleanup: %s", str(e))