Unverified Commit a0b782f9 authored by SungMinCho's avatar SungMinCho Committed by GitHub
Browse files

[Metrics] Model FLOPs Utilization estimation (#30738)


Signed-off-by: default avatarSungMinCho <tjdals4565@gmail.com>
Signed-off-by: default avatarMark McLoughlin <markmc@redhat.com>
Co-authored-by: default avatarMark McLoughlin <markmc@redhat.com>
parent ed2897f3
This diff is collapsed.
......@@ -64,6 +64,9 @@ class ObservabilityConfig:
module in the model and attach informations such as input/output shapes to
nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""
enable_mfu_metrics: bool = False
"""Enable Model FLOPs Utilization (MFU) metrics."""
@cached_property
def collect_model_forward_time(self) -> bool:
"""Whether to collect model forward time for the request."""
......
......@@ -523,6 +523,7 @@ class EngineArgs:
enable_layerwise_nvtx_tracing: bool = (
ObservabilityConfig.enable_layerwise_nvtx_tracing
)
enable_mfu_metrics: bool = ObservabilityConfig.enable_mfu_metrics
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
......@@ -1042,6 +1043,10 @@ class EngineArgs:
"--enable-layerwise-nvtx-tracing",
**observability_kwargs["enable_layerwise_nvtx_tracing"],
)
observability_group.add_argument(
"--enable-mfu-metrics",
**observability_kwargs["enable_mfu_metrics"],
)
# Scheduler arguments
scheduler_kwargs = get_kwargs(SchedulerConfig)
......@@ -1689,6 +1694,7 @@ class EngineArgs:
kv_cache_metrics_sample=self.kv_cache_metrics_sample,
cudagraph_metrics=self.cudagraph_metrics,
enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing,
enable_mfu_metrics=self.enable_mfu_metrics,
)
# Compilation config overrides
......
......@@ -244,6 +244,7 @@ if TYPE_CHECKING:
VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
VLLM_USE_V2_MODEL_RUNNER: bool = False
VLLM_DEBUG_MFU_METRICS: bool = False
def get_default_cache_root():
......@@ -1565,6 +1566,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_V2_MODEL_RUNNER": lambda: bool(
int(os.getenv("VLLM_USE_V2_MODEL_RUNNER", "0"))
),
# Debug logging for --enable-mfu-metrics
"VLLM_DEBUG_MFU_METRICS": lambda: bool(
int(os.getenv("VLLM_DEBUG_MFU_METRICS", "0"))
),
}
# --8<-- [end:env-vars-definition]
......
......@@ -43,6 +43,7 @@ from vllm.v1.core.sched.request_queue import SchedulingPolicy, create_request_qu
from vllm.v1.core.sched.utils import check_stop, remove_all
from vllm.v1.engine import EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.metrics.perf import ModelMetrics, PerfStats
from vllm.v1.metrics.stats import (
PrefixCacheStats,
SchedulerStats,
......@@ -219,6 +220,10 @@ class Scheduler(SchedulerInterface):
self.use_pp = self.parallel_config.pipeline_parallel_size > 1
self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER
self.perf_metrics: ModelMetrics | None = None
if self.log_stats and vllm_config.observability_config.enable_mfu_metrics:
self.perf_metrics = ModelMetrics(vllm_config)
def schedule(self) -> SchedulerOutput:
# NOTE(woosuk) on the scheduling algorithm:
# There's no "decoding phase" nor "prefill phase" in the scheduler.
......@@ -1066,6 +1071,10 @@ class Scheduler(SchedulerInterface):
kv_connector_output = model_runner_output.kv_connector_output
cudagraph_stats = model_runner_output.cudagraph_stats
perf_stats: PerfStats | None = None
if self.perf_metrics and self.perf_metrics.is_enabled():
perf_stats = self.perf_metrics.get_step_perf_stats_per_gpu(scheduler_output)
outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
spec_decoding_stats: SpecDecodingStats | None = None
kv_connector_stats: KVConnectorStats | None = (
......@@ -1262,7 +1271,7 @@ class Scheduler(SchedulerInterface):
if (
stats := self.make_stats(
spec_decoding_stats, kv_connector_stats, cudagraph_stats
spec_decoding_stats, kv_connector_stats, cudagraph_stats, perf_stats
)
) is not None:
# Return stats to only one of the front-ends.
......@@ -1485,6 +1494,7 @@ class Scheduler(SchedulerInterface):
spec_decoding_stats: SpecDecodingStats | None = None,
kv_connector_stats: KVConnectorStats | None = None,
cudagraph_stats: CUDAGraphStat | None = None,
perf_stats: PerfStats | None = None,
) -> SchedulerStats | None:
if not self.log_stats:
return None
......@@ -1510,6 +1520,7 @@ class Scheduler(SchedulerInterface):
spec_decoding_stats=spec_stats,
kv_connector_stats=connector_stats_payload,
cudagraph_stats=cudagraph_stats,
perf_stats=perf_stats,
)
def make_spec_decoding_stats(
......
......@@ -19,6 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
from vllm.logger import init_logger
from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group
from vllm.v1.engine import FinishReason
from vllm.v1.metrics.perf import PerfMetricsLogging
from vllm.v1.metrics.prometheus import unregister_vllm_metrics
from vllm.v1.metrics.stats import (
CachingMetrics,
......@@ -118,6 +119,9 @@ class LoggingStatLogger(StatLoggerBase):
self.engine_is_idle = False
self.aggregated = False
if self._enable_perf_stats():
self.perf_metrics_logging = PerfMetricsLogging(vllm_config)
def _reset(self, now):
self.last_log_time = now
......@@ -127,6 +131,9 @@ class LoggingStatLogger(StatLoggerBase):
self.num_corrupted_reqs: int = 0
self.num_preemptions: int = 0
def _enable_perf_stats(self) -> bool:
return self.vllm_config.observability_config.enable_mfu_metrics
def _track_iteration_stats(self, iteration_stats: IterationStats):
# Save tracked stats for token counters.
self.num_prompt_tokens += iteration_stats.num_prompt_tokens
......@@ -175,6 +182,8 @@ class LoggingStatLogger(StatLoggerBase):
self.cudagraph_logging.observe(scheduler_stats.cudagraph_stats)
if not self.aggregated:
self.last_scheduler_stats = scheduler_stats
if (perf_stats := scheduler_stats.perf_stats) and self._enable_perf_stats():
self.perf_metrics_logging.observe(perf_stats)
if mm_cache_stats:
self.mm_caching_metrics.observe(mm_cache_stats)
......@@ -211,7 +220,7 @@ class LoggingStatLogger(StatLoggerBase):
"Running: %d reqs",
"Waiting: %d reqs",
]
log_args = [
log_args: list[int | float | str] = [
self.last_prompt_throughput,
self.last_generation_throughput,
self.last_scheduler_stats.num_running_reqs,
......@@ -254,6 +263,8 @@ class LoggingStatLogger(StatLoggerBase):
self.kv_connector_logging.log(log_fn=log_fn)
if self.cudagraph_logging is not None:
self.cudagraph_logging.log(log_fn=log_fn)
if self._enable_perf_stats():
self.perf_metrics_logging.log(log_fn=log_fn, log_prefix=self.log_prefix)
def log_engine_initialized(self):
if self.vllm_config.cache_config.num_gpu_blocks:
......@@ -282,6 +293,10 @@ class AggregatedLoggingStatLogger(LoggingStatLogger, AggregateStatLoggerBase):
def log_prefix(self):
return "{} Engines Aggregated: ".format(len(self.engine_indexes))
def _enable_perf_stats(self) -> bool:
# Adding per_gpu perf stats across engines can lead to misleading numbers.
return False
def record(
self,
scheduler_stats: SchedulerStats | None,
......
This diff is collapsed.
......@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Any
import vllm.envs as envs
from vllm.compilation.cuda_graph import CUDAGraphStat
from vllm.v1.metrics.perf import PerfStats
from vllm.v1.spec_decode.metrics import SpecDecodingStats
if TYPE_CHECKING:
......@@ -186,6 +187,8 @@ class SchedulerStats:
cudagraph_stats: CUDAGraphStat | None = None
perf_stats: PerfStats | None = None
@dataclass
class RequestStateStats:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment