[V1][Metrics] Add request_success_total counter, labelled with finish reason (#12579)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>

[V1][Metrics] Add request_success_total counter, labelled with finish reason (#12579)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
233df6f5 · Mark McLoughlin · GitHub · 18016a5e · 233df6f5 · 233df6f5
Unverified commit 233df6f5, authored Feb 05, 2025 by Mark McLoughlin; committed by GitHub on Feb 04, 2025
7 changed files
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -205,6 +205,7 @@ EXPECTED_METRICS_V1 = [
    "vllm:gpu_cache_usage_perc",
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
+    "vllm:request_success_total",
    "vllm:request_prompt_tokens_sum",
    "vllm:request_prompt_tokens_bucket",
    "vllm:request_prompt_tokens_count",

--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -15,6 +15,23 @@ if TYPE_CHECKING:
    from vllm.sampling_params import SamplingParams
+class RequestFinishedReason(enum.IntEnum):
+    """
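+    Reason a request finished - stop, length, or abort.
+
+    stop - a stop string was emitted
+    length - max_tokens was consumed, or max_model_len was reached
+    abort - aborted for another reason
+
+    Note: STOP has the integer value 0 and is therefore falsy, so callers
+    must test `finish_reason is not None` rather than relying on truthiness.
+    str() of a member yields its lowercase name (e.g. "stop").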
+    """
+    STOP = 0
+    LENGTH = 1
+    ABORT = 2
+    def __str__(self):
+        return self.name.lower()
 @dataclass
 class EngineCoreRequest:
@@ -45,7 +62,7 @@ class EngineCoreOutput(
    request_id: str
    new_token_ids: List[int]
    finished: bool
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[RequestFinishedReason] = None
    stop_reason: Union[int, str, None] = None
@@ -56,7 +73,7 @@ class EngineCoreOutputs(
        gc=False):  # type: ignore[call-arg]
    #NOTE(Nick): We could consider ways to make this more compact,
-    # e.g. columnwise layout and using an int enum for finish/stop reason
+    # e.g. columnwise layout
    # [num_reqs]
    outputs: List[EngineCoreOutput]

--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -8,7 +8,8 @@ from vllm.logger import init_logger
 from vllm.sampling_params import RequestOutputKind
 from vllm.transformers_utils.detokenizer_utils import (
    AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
-from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest,
+                            RequestFinishedReason)
 logger = init_logger(__name__)
@@ -18,7 +19,7 @@ class DetokenizerOutput:
    output_text: str
    token_ids: List[int]
    finished: bool
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[RequestFinishedReason] = None
    stop_reason: Union[int, str, None] = None
@@ -147,13 +148,13 @@ class IncrementalDetokenizer:
                stop_str, truncate_to = stop
                if truncate_to != -1:
                    self.output_text = self.output_text[:truncate_to]
-                finish_reason = "stop"  # TODO: use constant
+                finish_reason = RequestFinishedReason.STOP
                stop_reason = stop_str
        # TODO: handle stop_token_ids here too?
        # 3) Update the RequestOutput object with the new text.
-        finished = bool(finish_reason)
+        finished = finish_reason is not None
        if self.output_kind == RequestOutputKind.FINAL_ONLY \
            and not finished:
            return None

--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -161,8 +161,10 @@ class OutputProcessor:
                engine_core_output)
            # 3) Create and handle RequestOutput objects.
-            if request_output := self._make_request_output(
+            if detokenizer_output is not None:
-                    req_state, detokenizer_output):
+                request_output = self._make_request_output(
+                    req_state, detokenizer_output)
                if req_state.queue is not None:
                    # AsyncLLM: put into queue for handling by generate().
                    req_state.queue.put_nowait(request_output)
@@ -172,6 +174,8 @@ class OutputProcessor:
                # Free completed requests.
                if request_output.finished:
+                    assert detokenizer_output.finish_reason is not None
                    self.request_states.pop(req_id)
                    if not engine_core_output.finished:
                        # If req not finished in EngineCore, but Detokenizer
@@ -180,7 +184,8 @@ class OutputProcessor:
                    # Track per-request stats
                    iteration_stats.update_from_finished_request(
-                        request_output, req_state.stats)
+                        detokenizer_output.finish_reason, request_output,
+                        req_state.stats)
        return OutputProcessorOutput(
            request_outputs=request_outputs,
@@ -191,12 +196,8 @@ class OutputProcessor:
    @staticmethod
    def _make_request_output(
        request_state: RequestState,
-        detokenizer_output: Optional[DetokenizerOutput],
+        detokenizer_output: DetokenizerOutput,
-    ) -> Optional[RequestOutput]:
+    ) -> RequestOutput:
-        if detokenizer_output is None:
-            return None
        request_output = RequestOutput.new(
            request_state.request_id,
            request_state.prompt,
@@ -207,7 +208,8 @@ class OutputProcessor:
        )
        if detokenizer_output.finished:
            completion_output = request_output.outputs[0]
-            completion_output.finish_reason = detokenizer_output.finish_reason
+            completion_output.finish_reason = str(
+                detokenizer_output.finish_reason)
            completion_output.stop_reason = detokenizer_output.stop_reason
        return request_output
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -2,13 +2,14 @@
 import time
 from abc import ABC, abstractmethod
-from typing import List
+from typing import Dict, List
 import numpy as np
 import prometheus_client
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.v1.engine import RequestFinishedReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 logger = init_logger(__name__)
@@ -116,6 +117,17 @@ class PrometheusStatLogger(StatLoggerBase):
            documentation="Number of generation tokens processed.",
            labelnames=labelnames).labels(*labelvalues)
+        self.counter_request_success: Dict[RequestFinishedReason,
+                                           prometheus_client.Counter] = {}
+        counter_request_success_base = prometheus_client.Counter(
+            name="vllm:request_success_total",
+            documentation="Count of successfully processed requests.",
+            labelnames=labelnames + ["finished_reason"])
+        for reason in RequestFinishedReason:
+            self.counter_request_success[
+                reason] = counter_request_success_base.labels(*(labelvalues +
+                                                                [str(reason)]))
        self.histogram_num_prompt_tokens_request = \
            prometheus_client.Histogram(
                name="vllm:request_prompt_tokens",
@@ -163,6 +175,7 @@ class PrometheusStatLogger(StatLoggerBase):
            iteration_stats.num_generation_tokens)
        for finished_request in iteration_stats.finished_requests:
+            self.counter_request_success[finished_request.finish_reason].inc()
            self.histogram_num_prompt_tokens_request.observe(
                finished_request.num_prompt_tokens)
            self.histogram_num_generation_tokens_request.observe(

--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List
 if TYPE_CHECKING:
    from vllm.outputs import RequestOutput
-    from vllm.v1.engine import EngineCoreOutput
+    from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason
 @dataclass
@@ -32,6 +32,7 @@ class RequestStateStats:
 class FinishedRequestStats:
    """Stats associated with a finished request."""
+    finish_reason: "RequestFinishedReason"
    num_prompt_tokens: int = 0
    num_generation_tokens: int = 0
@@ -73,8 +74,11 @@ class IterationStats:
        request_state_stats.num_generation_tokens += num_new_generation_tokens
        request_state_stats.last_token_time = now
-    def update_from_finished_request(self, request_output: "RequestOutput",
+    def update_from_finished_request(self,
+                                     finish_reason: "RequestFinishedReason",
+                                     request_output: "RequestOutput",
                                     request_state_stats: RequestStateStats):
        self.finished_requests.append(
-            FinishedRequestStats(len(request_output.prompt_token_ids),
+            FinishedRequestStats(finish_reason,
+                                 len(request_output.prompt_token_ids),
                                 request_state_stats.num_generation_tokens))
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List, Optional, Union
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import RequestMetrics
-from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason
 from vllm.v1.utils import ConstantList
 if TYPE_CHECKING:
@@ -109,7 +109,7 @@ class Request:
    def is_finished(self) -> bool:
        return RequestStatus.is_finished(self.status)
-    def get_finished_reason(self) -> Union[str, None]:
+    def get_finished_reason(self) -> Union[RequestFinishedReason, None]:
        return RequestStatus.get_finished_reason(self.status)
    def has_encoder_inputs(self) -> bool:
@@ -149,7 +149,8 @@ class RequestStatus(enum.IntEnum):
        return status > RequestStatus.PREEMPTED
    @staticmethod
-    def get_finished_reason(status: "RequestStatus") -> Union[str, None]:
+    def get_finished_reason(
+            status: "RequestStatus") -> Union[RequestFinishedReason, None]:
        return _FINISHED_REASON_MAP.get(status)
@@ -158,8 +159,8 @@ class RequestStatus(enum.IntEnum):
 # are longer than the model's length cap. Therefore, the stop
 # reason should also be "length" as in OpenAI API.
 _FINISHED_REASON_MAP = {
-    RequestStatus.FINISHED_STOPPED: "stop",
+    RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP,
-    RequestStatus.FINISHED_LENGTH_CAPPED: "length",
+    RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH,
-    RequestStatus.FINISHED_ABORTED: "abort",
+    RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT,
-    RequestStatus.FINISHED_IGNORED: "length",
+    RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH,
 }