Unverified commit 233df6f5, authored by Mark McLoughlin, committed by GitHub
Browse files

[V1][Metrics] Add request_success_total counter, labelled with finish reason (#12579)


Signed-off-by: Mark McLoughlin <markmc@redhat.com>
parent 18016a5e
...@@ -205,6 +205,7 @@ EXPECTED_METRICS_V1 = [ ...@@ -205,6 +205,7 @@ EXPECTED_METRICS_V1 = [
"vllm:gpu_cache_usage_perc", "vllm:gpu_cache_usage_perc",
"vllm:prompt_tokens_total", "vllm:prompt_tokens_total",
"vllm:generation_tokens_total", "vllm:generation_tokens_total",
"vllm:request_success_total",
"vllm:request_prompt_tokens_sum", "vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket", "vllm:request_prompt_tokens_bucket",
"vllm:request_prompt_tokens_count", "vllm:request_prompt_tokens_count",
......
...@@ -15,6 +15,23 @@ if TYPE_CHECKING: ...@@ -15,6 +15,23 @@ if TYPE_CHECKING:
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
class RequestFinishedReason(enum.IntEnum):
    """
    Reason a request finished - stop, length, or abort.

    Members are integers rather than strings; call ``str()`` on a member
    to obtain the lower-case textual form ("stop", "length", "abort").

    stop - a stop string was emitted
    length - max_tokens was consumed, or max_model_len was reached
    abort - aborted for another reason
    """
    STOP = 0
    LENGTH = 1
    ABORT = 2

    def __str__(self) -> str:
        # Lower-case member name, e.g. RequestFinishedReason.STOP -> "stop".
        return self.name.lower()
@dataclass @dataclass
class EngineCoreRequest: class EngineCoreRequest:
...@@ -45,7 +62,7 @@ class EngineCoreOutput( ...@@ -45,7 +62,7 @@ class EngineCoreOutput(
request_id: str request_id: str
new_token_ids: List[int] new_token_ids: List[int]
finished: bool finished: bool
finish_reason: Optional[str] = None finish_reason: Optional[RequestFinishedReason] = None
stop_reason: Union[int, str, None] = None stop_reason: Union[int, str, None] = None
...@@ -56,7 +73,7 @@ class EngineCoreOutputs( ...@@ -56,7 +73,7 @@ class EngineCoreOutputs(
gc=False): # type: ignore[call-arg] gc=False): # type: ignore[call-arg]
#NOTE(Nick): We could consider ways to make this more compact, #NOTE(Nick): We could consider ways to make this more compact,
# e.g. columnwise layout and using an int enum for finish/stop reason # e.g. columnwise layout
# [num_reqs] # [num_reqs]
outputs: List[EngineCoreOutput] outputs: List[EngineCoreOutput]
......
...@@ -8,7 +8,8 @@ from vllm.logger import init_logger ...@@ -8,7 +8,8 @@ from vllm.logger import init_logger
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from vllm.transformers_utils.detokenizer_utils import ( from vllm.transformers_utils.detokenizer_utils import (
AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest,
RequestFinishedReason)
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -18,7 +19,7 @@ class DetokenizerOutput: ...@@ -18,7 +19,7 @@ class DetokenizerOutput:
output_text: str output_text: str
token_ids: List[int] token_ids: List[int]
finished: bool finished: bool
finish_reason: Optional[str] = None finish_reason: Optional[RequestFinishedReason] = None
stop_reason: Union[int, str, None] = None stop_reason: Union[int, str, None] = None
...@@ -147,13 +148,13 @@ class IncrementalDetokenizer: ...@@ -147,13 +148,13 @@ class IncrementalDetokenizer:
stop_str, truncate_to = stop stop_str, truncate_to = stop
if truncate_to != -1: if truncate_to != -1:
self.output_text = self.output_text[:truncate_to] self.output_text = self.output_text[:truncate_to]
finish_reason = "stop" # TODO: use constant finish_reason = RequestFinishedReason.STOP
stop_reason = stop_str stop_reason = stop_str
# TODO: handle stop_token_ids here too? # TODO: handle stop_token_ids here too?
# 3) Update the RequestOutput object with the new text. # 3) Update the RequestOutput object with the new text.
finished = bool(finish_reason) finished = finish_reason is not None
if self.output_kind == RequestOutputKind.FINAL_ONLY \ if self.output_kind == RequestOutputKind.FINAL_ONLY \
and not finished: and not finished:
return None return None
......
...@@ -161,8 +161,10 @@ class OutputProcessor: ...@@ -161,8 +161,10 @@ class OutputProcessor:
engine_core_output) engine_core_output)
# 3) Create and handle RequestOutput objects. # 3) Create and handle RequestOutput objects.
if request_output := self._make_request_output( if detokenizer_output is not None:
req_state, detokenizer_output): request_output = self._make_request_output(
req_state, detokenizer_output)
if req_state.queue is not None: if req_state.queue is not None:
# AsyncLLM: put into queue for handling by generate(). # AsyncLLM: put into queue for handling by generate().
req_state.queue.put_nowait(request_output) req_state.queue.put_nowait(request_output)
...@@ -172,6 +174,8 @@ class OutputProcessor: ...@@ -172,6 +174,8 @@ class OutputProcessor:
# Free completed requests. # Free completed requests.
if request_output.finished: if request_output.finished:
assert detokenizer_output.finish_reason is not None
self.request_states.pop(req_id) self.request_states.pop(req_id)
if not engine_core_output.finished: if not engine_core_output.finished:
# If req not finished in EngineCore, but Detokenizer # If req not finished in EngineCore, but Detokenizer
...@@ -180,7 +184,8 @@ class OutputProcessor: ...@@ -180,7 +184,8 @@ class OutputProcessor:
# Track per-request stats # Track per-request stats
iteration_stats.update_from_finished_request( iteration_stats.update_from_finished_request(
request_output, req_state.stats) detokenizer_output.finish_reason, request_output,
req_state.stats)
return OutputProcessorOutput( return OutputProcessorOutput(
request_outputs=request_outputs, request_outputs=request_outputs,
...@@ -191,12 +196,8 @@ class OutputProcessor: ...@@ -191,12 +196,8 @@ class OutputProcessor:
@staticmethod @staticmethod
def _make_request_output( def _make_request_output(
request_state: RequestState, request_state: RequestState,
detokenizer_output: Optional[DetokenizerOutput], detokenizer_output: DetokenizerOutput,
) -> Optional[RequestOutput]: ) -> RequestOutput:
if detokenizer_output is None:
return None
request_output = RequestOutput.new( request_output = RequestOutput.new(
request_state.request_id, request_state.request_id,
request_state.prompt, request_state.prompt,
...@@ -207,7 +208,8 @@ class OutputProcessor: ...@@ -207,7 +208,8 @@ class OutputProcessor:
) )
if detokenizer_output.finished: if detokenizer_output.finished:
completion_output = request_output.outputs[0] completion_output = request_output.outputs[0]
completion_output.finish_reason = detokenizer_output.finish_reason completion_output.finish_reason = str(
detokenizer_output.finish_reason)
completion_output.stop_reason = detokenizer_output.stop_reason completion_output.stop_reason = detokenizer_output.stop_reason
return request_output return request_output
...@@ -2,13 +2,14 @@ ...@@ -2,13 +2,14 @@
import time import time
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import List from typing import Dict, List
import numpy as np import numpy as np
import prometheus_client import prometheus_client
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.v1.engine import RequestFinishedReason
from vllm.v1.metrics.stats import IterationStats, SchedulerStats from vllm.v1.metrics.stats import IterationStats, SchedulerStats
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -116,6 +117,17 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -116,6 +117,17 @@ class PrometheusStatLogger(StatLoggerBase):
documentation="Number of generation tokens processed.", documentation="Number of generation tokens processed.",
labelnames=labelnames).labels(*labelvalues) labelnames=labelnames).labels(*labelvalues)
self.counter_request_success: Dict[RequestFinishedReason,
prometheus_client.Counter] = {}
counter_request_success_base = prometheus_client.Counter(
name="vllm:request_success_total",
documentation="Count of successfully processed requests.",
labelnames=labelnames + ["finished_reason"])
for reason in RequestFinishedReason:
self.counter_request_success[
reason] = counter_request_success_base.labels(*(labelvalues +
[str(reason)]))
self.histogram_num_prompt_tokens_request = \ self.histogram_num_prompt_tokens_request = \
prometheus_client.Histogram( prometheus_client.Histogram(
name="vllm:request_prompt_tokens", name="vllm:request_prompt_tokens",
...@@ -163,6 +175,7 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -163,6 +175,7 @@ class PrometheusStatLogger(StatLoggerBase):
iteration_stats.num_generation_tokens) iteration_stats.num_generation_tokens)
for finished_request in iteration_stats.finished_requests: for finished_request in iteration_stats.finished_requests:
self.counter_request_success[finished_request.finish_reason].inc()
self.histogram_num_prompt_tokens_request.observe( self.histogram_num_prompt_tokens_request.observe(
finished_request.num_prompt_tokens) finished_request.num_prompt_tokens)
self.histogram_num_generation_tokens_request.observe( self.histogram_num_generation_tokens_request.observe(
......
...@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List ...@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.v1.engine import EngineCoreOutput from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason
@dataclass @dataclass
...@@ -32,6 +32,7 @@ class RequestStateStats: ...@@ -32,6 +32,7 @@ class RequestStateStats:
class FinishedRequestStats: class FinishedRequestStats:
"""Stats associated with a finished request.""" """Stats associated with a finished request."""
finish_reason: "RequestFinishedReason"
num_prompt_tokens: int = 0 num_prompt_tokens: int = 0
num_generation_tokens: int = 0 num_generation_tokens: int = 0
...@@ -73,8 +74,11 @@ class IterationStats: ...@@ -73,8 +74,11 @@ class IterationStats:
request_state_stats.num_generation_tokens += num_new_generation_tokens request_state_stats.num_generation_tokens += num_new_generation_tokens
request_state_stats.last_token_time = now request_state_stats.last_token_time = now
def update_from_finished_request(self, request_output: "RequestOutput", def update_from_finished_request(self,
finish_reason: "RequestFinishedReason",
request_output: "RequestOutput",
request_state_stats: RequestStateStats): request_state_stats: RequestStateStats):
self.finished_requests.append( self.finished_requests.append(
FinishedRequestStats(len(request_output.prompt_token_ids), FinishedRequestStats(finish_reason,
len(request_output.prompt_token_ids),
request_state_stats.num_generation_tokens)) request_state_stats.num_generation_tokens))
...@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List, Optional, Union ...@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List, Optional, Union
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.sequence import RequestMetrics from vllm.sequence import RequestMetrics
from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason
from vllm.v1.utils import ConstantList from vllm.v1.utils import ConstantList
if TYPE_CHECKING: if TYPE_CHECKING:
...@@ -109,7 +109,7 @@ class Request: ...@@ -109,7 +109,7 @@ class Request:
def is_finished(self) -> bool: def is_finished(self) -> bool:
return RequestStatus.is_finished(self.status) return RequestStatus.is_finished(self.status)
def get_finished_reason(self) -> Union[str, None]: def get_finished_reason(self) -> Union[RequestFinishedReason, None]:
return RequestStatus.get_finished_reason(self.status) return RequestStatus.get_finished_reason(self.status)
def has_encoder_inputs(self) -> bool: def has_encoder_inputs(self) -> bool:
...@@ -149,7 +149,8 @@ class RequestStatus(enum.IntEnum): ...@@ -149,7 +149,8 @@ class RequestStatus(enum.IntEnum):
return status > RequestStatus.PREEMPTED return status > RequestStatus.PREEMPTED
@staticmethod @staticmethod
def get_finished_reason(status: "RequestStatus") -> Union[str, None]: def get_finished_reason(
status: "RequestStatus") -> Union[RequestFinishedReason, None]:
return _FINISHED_REASON_MAP.get(status) return _FINISHED_REASON_MAP.get(status)
...@@ -158,8 +159,8 @@ class RequestStatus(enum.IntEnum): ...@@ -158,8 +159,8 @@ class RequestStatus(enum.IntEnum):
# are longer than the model's length cap. Therefore, the stop # are longer than the model's length cap. Therefore, the stop
# reason should also be "length" as in OpenAI API. # reason should also be "length" as in OpenAI API.
_FINISHED_REASON_MAP = { _FINISHED_REASON_MAP = {
RequestStatus.FINISHED_STOPPED: "stop", RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP,
RequestStatus.FINISHED_LENGTH_CAPPED: "length", RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH,
RequestStatus.FINISHED_ABORTED: "abort", RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT,
RequestStatus.FINISHED_IGNORED: "length", RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH,
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment