Unverified commit a0b782f9, authored by SungMinCho and committed by GitHub
Browse files

[Metrics] Model FLOPs Utilization estimation (#30738)


Signed-off-by: SungMinCho <tjdals4565@gmail.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
parent ed2897f3
This diff is collapsed.
...@@ -64,6 +64,9 @@ class ObservabilityConfig: ...@@ -64,6 +64,9 @@ class ObservabilityConfig:
module in the model and attach informations such as input/output shapes to module in the model and attach informations such as input/output shapes to
nvtx range markers. Noted that this doesn't work with CUDA graphs enabled.""" nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""
enable_mfu_metrics: bool = False
"""Enable Model FLOPs Utilization (MFU) metrics."""
@cached_property @cached_property
def collect_model_forward_time(self) -> bool: def collect_model_forward_time(self) -> bool:
"""Whether to collect model forward time for the request.""" """Whether to collect model forward time for the request."""
......
...@@ -523,6 +523,7 @@ class EngineArgs: ...@@ -523,6 +523,7 @@ class EngineArgs:
enable_layerwise_nvtx_tracing: bool = ( enable_layerwise_nvtx_tracing: bool = (
ObservabilityConfig.enable_layerwise_nvtx_tracing ObservabilityConfig.enable_layerwise_nvtx_tracing
) )
enable_mfu_metrics: bool = ObservabilityConfig.enable_mfu_metrics
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
...@@ -1042,6 +1043,10 @@ class EngineArgs: ...@@ -1042,6 +1043,10 @@ class EngineArgs:
"--enable-layerwise-nvtx-tracing", "--enable-layerwise-nvtx-tracing",
**observability_kwargs["enable_layerwise_nvtx_tracing"], **observability_kwargs["enable_layerwise_nvtx_tracing"],
) )
observability_group.add_argument(
"--enable-mfu-metrics",
**observability_kwargs["enable_mfu_metrics"],
)
# Scheduler arguments # Scheduler arguments
scheduler_kwargs = get_kwargs(SchedulerConfig) scheduler_kwargs = get_kwargs(SchedulerConfig)
...@@ -1689,6 +1694,7 @@ class EngineArgs: ...@@ -1689,6 +1694,7 @@ class EngineArgs:
kv_cache_metrics_sample=self.kv_cache_metrics_sample, kv_cache_metrics_sample=self.kv_cache_metrics_sample,
cudagraph_metrics=self.cudagraph_metrics, cudagraph_metrics=self.cudagraph_metrics,
enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing, enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing,
enable_mfu_metrics=self.enable_mfu_metrics,
) )
# Compilation config overrides # Compilation config overrides
......
...@@ -244,6 +244,7 @@ if TYPE_CHECKING: ...@@ -244,6 +244,7 @@ if TYPE_CHECKING:
VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256 VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary" VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
VLLM_USE_V2_MODEL_RUNNER: bool = False VLLM_USE_V2_MODEL_RUNNER: bool = False
VLLM_DEBUG_MFU_METRICS: bool = False
def get_default_cache_root(): def get_default_cache_root():
...@@ -1565,6 +1566,10 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1565,6 +1566,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_V2_MODEL_RUNNER": lambda: bool( "VLLM_USE_V2_MODEL_RUNNER": lambda: bool(
int(os.getenv("VLLM_USE_V2_MODEL_RUNNER", "0")) int(os.getenv("VLLM_USE_V2_MODEL_RUNNER", "0"))
), ),
# Debug logging for --enable-mfu-metrics
"VLLM_DEBUG_MFU_METRICS": lambda: bool(
int(os.getenv("VLLM_DEBUG_MFU_METRICS", "0"))
),
} }
# --8<-- [end:env-vars-definition] # --8<-- [end:env-vars-definition]
......
...@@ -43,6 +43,7 @@ from vllm.v1.core.sched.request_queue import SchedulingPolicy, create_request_qu ...@@ -43,6 +43,7 @@ from vllm.v1.core.sched.request_queue import SchedulingPolicy, create_request_qu
from vllm.v1.core.sched.utils import check_stop, remove_all from vllm.v1.core.sched.utils import check_stop, remove_all
from vllm.v1.engine import EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs from vllm.v1.engine import EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs
from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.metrics.perf import ModelMetrics, PerfStats
from vllm.v1.metrics.stats import ( from vllm.v1.metrics.stats import (
PrefixCacheStats, PrefixCacheStats,
SchedulerStats, SchedulerStats,
...@@ -219,6 +220,10 @@ class Scheduler(SchedulerInterface): ...@@ -219,6 +220,10 @@ class Scheduler(SchedulerInterface):
self.use_pp = self.parallel_config.pipeline_parallel_size > 1 self.use_pp = self.parallel_config.pipeline_parallel_size > 1
self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER
self.perf_metrics: ModelMetrics | None = None
if self.log_stats and vllm_config.observability_config.enable_mfu_metrics:
self.perf_metrics = ModelMetrics(vllm_config)
def schedule(self) -> SchedulerOutput: def schedule(self) -> SchedulerOutput:
# NOTE(woosuk) on the scheduling algorithm: # NOTE(woosuk) on the scheduling algorithm:
# There's no "decoding phase" nor "prefill phase" in the scheduler. # There's no "decoding phase" nor "prefill phase" in the scheduler.
...@@ -1066,6 +1071,10 @@ class Scheduler(SchedulerInterface): ...@@ -1066,6 +1071,10 @@ class Scheduler(SchedulerInterface):
kv_connector_output = model_runner_output.kv_connector_output kv_connector_output = model_runner_output.kv_connector_output
cudagraph_stats = model_runner_output.cudagraph_stats cudagraph_stats = model_runner_output.cudagraph_stats
perf_stats: PerfStats | None = None
if self.perf_metrics and self.perf_metrics.is_enabled():
perf_stats = self.perf_metrics.get_step_perf_stats_per_gpu(scheduler_output)
outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list) outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
spec_decoding_stats: SpecDecodingStats | None = None spec_decoding_stats: SpecDecodingStats | None = None
kv_connector_stats: KVConnectorStats | None = ( kv_connector_stats: KVConnectorStats | None = (
...@@ -1262,7 +1271,7 @@ class Scheduler(SchedulerInterface): ...@@ -1262,7 +1271,7 @@ class Scheduler(SchedulerInterface):
if ( if (
stats := self.make_stats( stats := self.make_stats(
spec_decoding_stats, kv_connector_stats, cudagraph_stats spec_decoding_stats, kv_connector_stats, cudagraph_stats, perf_stats
) )
) is not None: ) is not None:
# Return stats to only one of the front-ends. # Return stats to only one of the front-ends.
...@@ -1485,6 +1494,7 @@ class Scheduler(SchedulerInterface): ...@@ -1485,6 +1494,7 @@ class Scheduler(SchedulerInterface):
spec_decoding_stats: SpecDecodingStats | None = None, spec_decoding_stats: SpecDecodingStats | None = None,
kv_connector_stats: KVConnectorStats | None = None, kv_connector_stats: KVConnectorStats | None = None,
cudagraph_stats: CUDAGraphStat | None = None, cudagraph_stats: CUDAGraphStat | None = None,
perf_stats: PerfStats | None = None,
) -> SchedulerStats | None: ) -> SchedulerStats | None:
if not self.log_stats: if not self.log_stats:
return None return None
...@@ -1510,6 +1520,7 @@ class Scheduler(SchedulerInterface): ...@@ -1510,6 +1520,7 @@ class Scheduler(SchedulerInterface):
spec_decoding_stats=spec_stats, spec_decoding_stats=spec_stats,
kv_connector_stats=connector_stats_payload, kv_connector_stats=connector_stats_payload,
cudagraph_stats=cudagraph_stats, cudagraph_stats=cudagraph_stats,
perf_stats=perf_stats,
) )
def make_spec_decoding_stats( def make_spec_decoding_stats(
......
...@@ -19,6 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( ...@@ -19,6 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group
from vllm.v1.engine import FinishReason from vllm.v1.engine import FinishReason
from vllm.v1.metrics.perf import PerfMetricsLogging
from vllm.v1.metrics.prometheus import unregister_vllm_metrics from vllm.v1.metrics.prometheus import unregister_vllm_metrics
from vllm.v1.metrics.stats import ( from vllm.v1.metrics.stats import (
CachingMetrics, CachingMetrics,
...@@ -118,6 +119,9 @@ class LoggingStatLogger(StatLoggerBase): ...@@ -118,6 +119,9 @@ class LoggingStatLogger(StatLoggerBase):
self.engine_is_idle = False self.engine_is_idle = False
self.aggregated = False self.aggregated = False
if self._enable_perf_stats():
self.perf_metrics_logging = PerfMetricsLogging(vllm_config)
def _reset(self, now): def _reset(self, now):
self.last_log_time = now self.last_log_time = now
...@@ -127,6 +131,9 @@ class LoggingStatLogger(StatLoggerBase): ...@@ -127,6 +131,9 @@ class LoggingStatLogger(StatLoggerBase):
self.num_corrupted_reqs: int = 0 self.num_corrupted_reqs: int = 0
self.num_preemptions: int = 0 self.num_preemptions: int = 0
def _enable_perf_stats(self) -> bool:
return self.vllm_config.observability_config.enable_mfu_metrics
def _track_iteration_stats(self, iteration_stats: IterationStats): def _track_iteration_stats(self, iteration_stats: IterationStats):
# Save tracked stats for token counters. # Save tracked stats for token counters.
self.num_prompt_tokens += iteration_stats.num_prompt_tokens self.num_prompt_tokens += iteration_stats.num_prompt_tokens
...@@ -175,6 +182,8 @@ class LoggingStatLogger(StatLoggerBase): ...@@ -175,6 +182,8 @@ class LoggingStatLogger(StatLoggerBase):
self.cudagraph_logging.observe(scheduler_stats.cudagraph_stats) self.cudagraph_logging.observe(scheduler_stats.cudagraph_stats)
if not self.aggregated: if not self.aggregated:
self.last_scheduler_stats = scheduler_stats self.last_scheduler_stats = scheduler_stats
if (perf_stats := scheduler_stats.perf_stats) and self._enable_perf_stats():
self.perf_metrics_logging.observe(perf_stats)
if mm_cache_stats: if mm_cache_stats:
self.mm_caching_metrics.observe(mm_cache_stats) self.mm_caching_metrics.observe(mm_cache_stats)
...@@ -211,7 +220,7 @@ class LoggingStatLogger(StatLoggerBase): ...@@ -211,7 +220,7 @@ class LoggingStatLogger(StatLoggerBase):
"Running: %d reqs", "Running: %d reqs",
"Waiting: %d reqs", "Waiting: %d reqs",
] ]
log_args = [ log_args: list[int | float | str] = [
self.last_prompt_throughput, self.last_prompt_throughput,
self.last_generation_throughput, self.last_generation_throughput,
self.last_scheduler_stats.num_running_reqs, self.last_scheduler_stats.num_running_reqs,
...@@ -254,6 +263,8 @@ class LoggingStatLogger(StatLoggerBase): ...@@ -254,6 +263,8 @@ class LoggingStatLogger(StatLoggerBase):
self.kv_connector_logging.log(log_fn=log_fn) self.kv_connector_logging.log(log_fn=log_fn)
if self.cudagraph_logging is not None: if self.cudagraph_logging is not None:
self.cudagraph_logging.log(log_fn=log_fn) self.cudagraph_logging.log(log_fn=log_fn)
if self._enable_perf_stats():
self.perf_metrics_logging.log(log_fn=log_fn, log_prefix=self.log_prefix)
def log_engine_initialized(self): def log_engine_initialized(self):
if self.vllm_config.cache_config.num_gpu_blocks: if self.vllm_config.cache_config.num_gpu_blocks:
...@@ -282,6 +293,10 @@ class AggregatedLoggingStatLogger(LoggingStatLogger, AggregateStatLoggerBase): ...@@ -282,6 +293,10 @@ class AggregatedLoggingStatLogger(LoggingStatLogger, AggregateStatLoggerBase):
def log_prefix(self): def log_prefix(self):
return "{} Engines Aggregated: ".format(len(self.engine_indexes)) return "{} Engines Aggregated: ".format(len(self.engine_indexes))
def _enable_perf_stats(self) -> bool:
# Adding per_gpu perf stats across engines can lead to misleading numbers.
return False
def record( def record(
self, self,
scheduler_stats: SchedulerStats | None, scheduler_stats: SchedulerStats | None,
......
This diff is collapsed.
...@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Any ...@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Any
import vllm.envs as envs import vllm.envs as envs
from vllm.compilation.cuda_graph import CUDAGraphStat from vllm.compilation.cuda_graph import CUDAGraphStat
from vllm.v1.metrics.perf import PerfStats
from vllm.v1.spec_decode.metrics import SpecDecodingStats from vllm.v1.spec_decode.metrics import SpecDecodingStats
if TYPE_CHECKING: if TYPE_CHECKING:
...@@ -186,6 +187,8 @@ class SchedulerStats: ...@@ -186,6 +187,8 @@ class SchedulerStats:
cudagraph_stats: CUDAGraphStat | None = None cudagraph_stats: CUDAGraphStat | None = None
perf_stats: PerfStats | None = None
@dataclass @dataclass
class RequestStateStats: class RequestStateStats:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment