Unverified Commit 24523a1c authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

feat(vllm): add self-benchmark mode to InstrumentedScheduler (#7779)

parent a873045c
...@@ -273,6 +273,41 @@ def update_engine_config_with_dynamo( ...@@ -273,6 +273,41 @@ def update_engine_config_with_dynamo(
f"--scheduler-cls or subclass InstrumentedScheduler." f"--scheduler-cls or subclass InstrumentedScheduler."
) )
if dynamo_config.benchmark_mode is not None:
if dynamo_config.multimodal_worker or dynamo_config.multimodal_decode_worker:
logger.warning(
"--benchmark-mode is not supported for multimodal workers. "
"Benchmark data will be collected but not served via endpoint."
)
existing_cls = getattr(engine_config, "scheduler_cls", None)
if existing_cls is None and not envs.is_set("DYN_FORWARDPASS_METRIC_PORT"):
defaults[
"scheduler_cls"
] = "dynamo.vllm.instrumented_scheduler.InstrumentedScheduler"
logger.info("Benchmark mode: auto-enabling InstrumentedScheduler")
elif existing_cls is not None and "InstrumentedScheduler" not in str(
existing_cls
):
raise ValueError(
f"--benchmark-mode requires InstrumentedScheduler but "
f"--scheduler-cls is set to '{existing_cls}'. Either remove "
f"--scheduler-cls or use a subclass of InstrumentedScheduler."
)
dynamo_config._benchmark_additional_config = { # type: ignore[attr-defined]
"mode": dynamo_config.benchmark_mode,
"prefill_isl_granularity": dynamo_config.benchmark_prefill_granularity,
"decode_length_granularity": dynamo_config.benchmark_decode_length_granularity,
"decode_batch_size_granularity": dynamo_config.benchmark_decode_batch_granularity,
"warmup_iterations": dynamo_config.benchmark_warmup_iterations,
"output_path": dynamo_config.benchmark_output_path,
"timeout": dynamo_config.benchmark_timeout,
}
logger.info(
"Benchmark mode=%s configured (output=%s)",
dynamo_config.benchmark_mode,
dynamo_config.benchmark_output_path,
)
logger.debug("Setting Dynamo defaults for vLLM") logger.debug("Setting Dynamo defaults for vLLM")
for key, value in defaults.items(): for key, value in defaults.items():
if hasattr(engine_config, key): if hasattr(engine_config, key):
......
...@@ -171,6 +171,78 @@ class DynamoVllmArgGroup(ArgGroup): ...@@ -171,6 +171,78 @@ class DynamoVllmArgGroup(ArgGroup):
), ),
) )
# Benchmark / self-profiling
add_argument(
g,
flag_name="--benchmark-mode",
env_var="DYN_BENCHMARK_MODE",
default=None,
choices=["prefill", "decode", "agg"],
help=(
"Run self-benchmark on startup before accepting requests. "
"Sweeps prefill ISLs and/or decode (context_length x batch_size) "
"points, collecting ForwardPassMetrics at each operating point."
),
)
add_argument(
g,
flag_name="--benchmark-prefill-granularity",
env_var="DYN_BENCHMARK_PREFILL_GRANULARITY",
default=16,
type=int,
help="Number of ISL sample points for prefill sweep (default: 16).",
)
add_argument(
g,
flag_name="--benchmark-decode-length-granularity",
env_var="DYN_BENCHMARK_DECODE_LENGTH_GRANULARITY",
default=6,
type=int,
help=(
"Number of context length sample points for decode sweep "
"(default: 6)."
),
)
add_argument(
g,
flag_name="--benchmark-decode-batch-granularity",
env_var="DYN_BENCHMARK_DECODE_BATCH_GRANULARITY",
default=6,
type=int,
help=(
"Number of batch size sample points per context length " "(default: 6)."
),
)
add_argument(
g,
flag_name="--benchmark-warmup-iterations",
env_var="DYN_BENCHMARK_WARMUP_ITERATIONS",
default=5,
type=int,
help="Warmup iterations before benchmark (default: 5).",
)
add_argument(
g,
flag_name="--benchmark-output-path",
env_var="DYN_BENCHMARK_OUTPUT_PATH",
default="/tmp/benchmark_results.json",
help=(
"Path to write benchmark results JSON "
"(default: /tmp/benchmark_results.json)."
),
)
add_argument(
g,
flag_name="--benchmark-timeout",
env_var="DYN_BENCHMARK_TIMEOUT",
default=300,
type=int,
help=(
"Maximum seconds to wait for benchmark to complete "
"(default: 300). Worker startup fails if exceeded."
),
)
# @dataclass() # @dataclass()
class DynamoVllmConfig(ConfigBase): class DynamoVllmConfig(ConfigBase):
...@@ -204,6 +276,15 @@ class DynamoVllmConfig(ConfigBase): ...@@ -204,6 +276,15 @@ class DynamoVllmConfig(ConfigBase):
# GMS shadow mode # GMS shadow mode
gms_shadow_mode: bool = False gms_shadow_mode: bool = False
# Benchmark / self-profiling
benchmark_mode: Optional[str] = None
benchmark_prefill_granularity: int = 16
benchmark_decode_length_granularity: int = 6
benchmark_decode_batch_granularity: int = 6
benchmark_warmup_iterations: int = 5
benchmark_output_path: str = "/tmp/benchmark_results.json"
benchmark_timeout: int = 300
def validate(self) -> None: def validate(self) -> None:
"""Validate vLLM wrapper configuration.""" """Validate vLLM wrapper configuration."""
self._resolve_disaggregation_mode() self._resolve_disaggregation_mode()
......
...@@ -354,6 +354,8 @@ class BaseWorkerHandler(ABC, Generic[RequestT, ResponseT]): ...@@ -354,6 +354,8 @@ class BaseWorkerHandler(ABC, Generic[RequestT, ResponseT]):
Request handler for the generate and clear_kv_blocks endpoints. Request handler for the generate and clear_kv_blocks endpoints.
""" """
_benchmark_results: Optional[dict] = None
def __init__( def __init__(
self, self,
runtime, runtime,
...@@ -683,6 +685,14 @@ class BaseWorkerHandler(ABC, Generic[RequestT, ResponseT]): ...@@ -683,6 +685,14 @@ class BaseWorkerHandler(ABC, Generic[RequestT, ResponseT]):
except Exception as e: except Exception as e:
yield {"status": "error", "message": str(e)} yield {"status": "error", "message": str(e)}
async def get_perf_metrics(self, request=None):
    """Yield the stored self-benchmark results, or one error payload.

    The ``request`` argument is accepted but not read. Exactly one item is
    yielded: the value of ``_benchmark_results`` when it is set, otherwise a
    dict with ``status`` set to ``"error"``.
    """
    benchmark_data = getattr(self, "_benchmark_results", None)
    if benchmark_data is not None:
        yield benchmark_data
        return
    # Nothing was recorded for this handler.
    yield {"status": "error", "message": "no benchmark data"}
def add_temp_dir(self, temp_dir: tempfile.TemporaryDirectory) -> None: def add_temp_dir(self, temp_dir: tempfile.TemporaryDirectory) -> None:
"""Add a temporary directory to be cleaned up later.""" """Add a temporary directory to be cleaned up later."""
if temp_dir is not None: if temp_dir is not None:
......
...@@ -556,6 +556,15 @@ def setup_vllm_engine( ...@@ -556,6 +556,15 @@ def setup_vllm_engine(
if fpm_worker_id is not None: if fpm_worker_id is not None:
vllm_config.additional_config["fpm_worker_id"] = fpm_worker_id vllm_config.additional_config["fpm_worker_id"] = fpm_worker_id
# Pass benchmark config to InstrumentedScheduler via additional_config.
if hasattr(config, "_benchmark_additional_config"):
bench = config._benchmark_additional_config
if fpm_worker_id and bench["output_path"] == "/tmp/benchmark_results.json":
short_id = fpm_worker_id[-8:]
bench["output_path"] = f"/tmp/benchmark_results_{short_id}.json"
vllm_config.additional_config["benchmark"] = bench
logger.info("Benchmark config injected into additional_config")
factory = [] factory = []
if stat_logger: if stat_logger:
factory.append(stat_logger) factory.append(stat_logger)
......
...@@ -596,3 +596,128 @@ class TestVllmOmniOptionalDependency: ...@@ -596,3 +596,128 @@ class TestVllmOmniOptionalDependency:
sys.modules.pop(mod, None) sys.modules.pop(mod, None)
# Restore original state # Restore original state
sys.modules.update(saved) sys.modules.update(saved)
# ---------------------------------------------------------------------------
# Benchmark mode unit tests
# ---------------------------------------------------------------------------
class TestBenchmarkConfig:
    """Unit tests covering construction of ``BenchmarkConfig``."""

    def test_benchmark_config_defaults(self):
        """A config built with no arguments exposes the expected defaults."""
        from dynamo.vllm.instrumented_scheduler import BenchmarkConfig

        config = BenchmarkConfig()
        expected_defaults = {
            "mode": "agg",
            "prefill_isl_granularity": 16,
            "decode_length_granularity": 6,
            "decode_batch_size_granularity": 6,
            "warmup_iterations": 5,
            "output_path": "/tmp/benchmark_results.json",
        }
        for field_name, expected in expected_defaults.items():
            assert getattr(config, field_name) == expected

    def test_benchmark_config_from_dict(self):
        """Explicitly supplied values replace the defaults."""
        from dynamo.vllm.instrumented_scheduler import BenchmarkConfig

        overrides = {
            "mode": "decode",
            "prefill_isl_granularity": 4,
            "decode_length_granularity": 3,
            "decode_batch_size_granularity": 3,
            "warmup_iterations": 2,
            "output_path": "/tmp/test.json",
        }
        config = BenchmarkConfig(**overrides)
        assert config.mode == "decode"
        assert config.prefill_isl_granularity == 4

    def test_benchmark_config_kwargs_unpack(self):
        """A partial mapping overrides only the keys it names."""
        from dynamo.vllm.instrumented_scheduler import BenchmarkConfig

        partial = {"mode": "prefill", "warmup_iterations": 1}
        config = BenchmarkConfig(**partial)
        assert config.mode == "prefill"
        assert config.warmup_iterations == 1
        # A field absent from the mapping keeps its default.
        assert config.prefill_isl_granularity == 16
class TestBenchmarkGrid:
    """GPU-free checks of benchmark sweep-grid construction.

    The grid generators exercised here are defined locally in
    ``_make_grid_helper``; they are not imported from the scheduler module.
    """

    def _make_grid_helper(self):
        """Build ``(prefill_grid_fn, decode_grid_fn)`` working on plain numbers."""
        import numpy as np

        def _sample_points(low, high, count):
            # Evenly spaced samples cast to int, with duplicates collapsed.
            return np.unique(np.linspace(low, high, count, dtype=int))

        def generate_prefill_grid(max_num_scheduled_tokens, granularity):
            samples = _sample_points(10, max_num_scheduled_tokens, granularity)
            return [int(isl) for isl in samples]

        def generate_decode_grid(
            block_size,
            max_model_len,
            max_num_running_reqs,
            num_gpu_blocks,
            length_granularity,
            batch_granularity,
        ):
            kv_capacity = num_gpu_blocks * block_size
            grid = []
            for raw_len in _sample_points(
                block_size, max_model_len, length_granularity
            ):
                ctx_len = int(raw_len)
                batch_cap = min(max_num_running_reqs, kv_capacity // ctx_len)
                # Context lengths that cannot fit even one request are dropped.
                if batch_cap >= 1:
                    grid.extend(
                        (ctx_len, int(batch))
                        for batch in _sample_points(1, batch_cap, batch_granularity)
                    )
            return grid

        return generate_prefill_grid, generate_decode_grid

    def test_prefill_grid_count(self):
        """The prefill sweep spans 10..max with the requested point count."""
        prefill_grid, _ = self._make_grid_helper()
        sampled = prefill_grid(max_num_scheduled_tokens=8192, granularity=10)
        assert len(sampled) == 10
        assert sampled[0] == 10
        assert sampled[-1] == 8192

    def test_prefill_grid_dedup(self):
        """Requesting more points than distinct integers yields no repeats."""
        prefill_grid, _ = self._make_grid_helper()
        sampled = prefill_grid(max_num_scheduled_tokens=20, granularity=100)
        assert len(sampled) == len(set(sampled))

    def test_decode_grid_batch_capped(self):
        """Batch sizes stay within the request limit and the KV capacity."""
        _, decode_grid = self._make_grid_helper()
        grid = decode_grid(
            block_size=16,
            max_model_len=4096,
            max_num_running_reqs=64,
            num_gpu_blocks=256,
            length_granularity=3,
            batch_granularity=3,
        )
        kv_capacity = 256 * 16
        for ctx_len, batch in grid:
            assert batch <= min(64, kv_capacity // ctx_len)
            assert batch >= 1

    def test_decode_grid_skips_large_ctx(self):
        """Context lengths beyond the KV capacity never appear in the grid."""
        _, decode_grid = self._make_grid_helper()
        grid = decode_grid(
            block_size=16,
            max_model_len=100000,
            max_num_running_reqs=64,
            num_gpu_blocks=100,
            length_granularity=5,
            batch_granularity=3,
        )
        kv_capacity = 100 * 16
        for ctx_len, _batch in grid:
            assert ctx_len <= kv_capacity
...@@ -4,9 +4,12 @@ ...@@ -4,9 +4,12 @@
"""Worker initialization factory for vLLM workers.""" """Worker initialization factory for vLLM workers."""
import asyncio import asyncio
import json
import logging import logging
import os import os
import time as _time
from collections.abc import Awaitable, Callable from collections.abc import Awaitable, Callable
from pathlib import Path
from typing import Any, Optional from typing import Any, Optional
from vllm.config import VllmConfig from vllm.config import VllmConfig
...@@ -20,7 +23,7 @@ from dynamo.runtime import DistributedRuntime ...@@ -20,7 +23,7 @@ from dynamo.runtime import DistributedRuntime
from .args import Config from .args import Config
from .constants import DisaggregationMode from .constants import DisaggregationMode
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler from .handlers import DecodeWorkerHandler, PrefillWorkerHandler, get_dp_range_for_worker
from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
from .multimodal_handlers import EncodeWorkerHandler from .multimodal_handlers import EncodeWorkerHandler
from .publisher import StatLoggerFactory from .publisher import StatLoggerFactory
...@@ -30,6 +33,66 @@ logger = logging.getLogger(__name__) ...@@ -30,6 +33,66 @@ logger = logging.getLogger(__name__)
# (engine_client, vllm_config, default_sampling_params, prometheus_temp_dir, component_gauges) # (engine_client, vllm_config, default_sampling_params, prometheus_temp_dir, component_gauges)
EngineSetupResult = tuple[AsyncLLM, VllmConfig, Any, Any, LLMBackendMetrics] EngineSetupResult = tuple[AsyncLLM, VllmConfig, Any, Any, LLMBackendMetrics]
async def _load_benchmark_file(path: Path, deadline: float, timeout: int) -> dict:
    """Poll until *path* holds a complete JSON document, then return it.

    A file that exists but cannot be decoded yet is treated like a missing
    file: the writer may still be in the middle of producing it.

    Raises:
        TimeoutError: if the file is still missing or undecodable once
            ``deadline`` (a ``time.monotonic()`` value) has passed.
    """
    while True:
        try:
            with open(path, encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            problem, cause = "Missing", None
        except json.JSONDecodeError as exc:
            problem, cause = "Incomplete or invalid JSON", exc
        if _time.monotonic() > deadline:
            raise TimeoutError(
                f"Benchmark did not complete within {timeout}s. {problem}: {path}"
            ) from cause
        await asyncio.sleep(0.1)


async def _wait_and_load_benchmark(bench_cfg: dict, vllm_config: VllmConfig) -> dict:
    """Wait for benchmark result files and aggregate across DP ranks.

    Args:
        bench_cfg: Benchmark settings. ``"output_path"`` is required;
            ``"timeout"`` (seconds) is optional and defaults to 300.
        vllm_config: Passed to ``get_dp_range_for_worker`` to find the DP
            ranks to wait for. If that lookup raises, a single rank 0 is
            assumed and a warning is logged.

    Returns:
        The JSON document of the first rank, with the ``"results"`` entries
        of every further rank appended. Each result's ``["point"]["dp_rank"]``
        is set to the rank it was read for.

    Raises:
        TimeoutError: if any rank's file is missing or not yet valid JSON
            when the timeout, shared across all ranks, expires.
    """
    base_path = Path(bench_cfg["output_path"])
    timeout = int(bench_cfg.get("timeout", 300))

    try:
        dp_start, dp_size = get_dp_range_for_worker(vllm_config)
    except Exception:
        logger.warning(
            "Could not determine DP range, assuming single rank",
            exc_info=True,
        )
        dp_start, dp_size = 0, 1

    # Rank 0 uses the configured path as-is; other ranks insert "_dp<rank>"
    # before the extension.
    stem, ext = os.path.splitext(str(base_path))
    rank_paths = [
        base_path if dp_rank == 0 else Path(f"{stem}_dp{dp_rank}{ext}")
        for dp_rank in range(dp_start, dp_start + dp_size)
    ]

    logger.info(
        "Waiting for benchmark to complete (files: %s, timeout: %ds)...",
        rank_paths,
        timeout,
    )

    # NOTE(review): a result file left at the same path by an earlier run
    # would be accepted immediately -- confirm the writer replaces it.
    deadline = _time.monotonic() + timeout
    merged: dict = {}
    for offset, path in enumerate(rank_paths):
        data = await _load_benchmark_file(path, deadline, timeout)
        rank_results = data.get("results", [])
        for result in rank_results:
            result["point"]["dp_rank"] = dp_start + offset
        if offset == 0:
            merged = data
        else:
            merged.setdefault("results", []).extend(rank_results)

    logger.info(
        "Benchmark complete, %d points across %d rank(s)",
        len(merged.get("results", [])),
        len(rank_paths),
    )
    return merged
SetupVllmEngineFn = Callable[..., EngineSetupResult] SetupVllmEngineFn = Callable[..., EngineSetupResult]
SetupKvEventPublisherFn = Callable[..., Optional[Any]] SetupKvEventPublisherFn = Callable[..., Optional[Any]]
RegisterVllmModelFn = Callable[..., Awaitable[None]] RegisterVllmModelFn = Callable[..., Awaitable[None]]
...@@ -64,6 +127,9 @@ class WorkerFactory: ...@@ -64,6 +127,9 @@ class WorkerFactory:
) -> None: ) -> None:
"""Create the appropriate multimodal worker based on config flags.""" """Create the appropriate multimodal worker based on config flags."""
# NOTE: --benchmark-mode is only supported for prefill/decode workers.
# The encode worker path does not wire benchmark waiting or
# the get_perf_metrics endpoint.
if config.disaggregation_mode == DisaggregationMode.ENCODE: if config.disaggregation_mode == DisaggregationMode.ENCODE:
await self._create_multimodal_encode_worker( await self._create_multimodal_encode_worker(
runtime, config, shutdown_event, shutdown_endpoints runtime, config, shutdown_event, shutdown_endpoints
...@@ -296,6 +362,13 @@ class WorkerFactory: ...@@ -296,6 +362,13 @@ class WorkerFactory:
handler._quiesce_controller.mark_resumed() handler._quiesce_controller.mark_resumed()
logger.info("[Shadow] Engine awake, registering with discovery") logger.info("[Shadow] Engine awake, registering with discovery")
# Wait for self-benchmark to complete before registering.
bench_cfg = vllm_config.additional_config.get("benchmark")
if bench_cfg:
handler._benchmark_results = await _wait_and_load_benchmark(
bench_cfg, vllm_config
)
await self.register_vllm_model( await self.register_vllm_model(
model_input, model_input,
model_type, model_type,
...@@ -309,6 +382,11 @@ class WorkerFactory: ...@@ -309,6 +382,11 @@ class WorkerFactory:
engine_client, use_text_input=config.use_vllm_tokenizer engine_client, use_text_input=config.use_vllm_tokenizer
).to_dict() ).to_dict()
perf_endpoint = runtime.endpoint(
f"{config.namespace}.{config.component}.get_perf_metrics"
)
shutdown_endpoints.append(perf_endpoint)
try: try:
logger.debug("Starting serve_endpoint for decode worker") logger.debug("Starting serve_endpoint for decode worker")
...@@ -336,6 +414,10 @@ class WorkerFactory: ...@@ -336,6 +414,10 @@ class WorkerFactory:
handler.clear_kv_blocks, handler.clear_kv_blocks,
metrics_labels=model_metrics_labels, metrics_labels=model_metrics_labels,
), ),
perf_endpoint.serve_endpoint(
handler.get_perf_metrics,
metrics_labels=model_metrics_labels,
),
] ]
if lora_enabled: if lora_enabled:
...@@ -467,7 +549,17 @@ class WorkerFactory: ...@@ -467,7 +549,17 @@ class WorkerFactory:
"Registered engine routes: /engine/sleep, /engine/wake_up, /engine/scale_elastic_ep" "Registered engine routes: /engine/sleep, /engine/wake_up, /engine/scale_elastic_ep"
) )
shutdown_endpoints[:] = [generate_endpoint, clear_endpoint] # Wait for self-benchmark to complete before registering.
bench_cfg = vllm_config.additional_config.get("benchmark")
if bench_cfg:
handler._benchmark_results = await _wait_and_load_benchmark(
bench_cfg, vllm_config
)
perf_endpoint = runtime.endpoint(
f"{config.namespace}.{config.component}.get_perf_metrics"
)
shutdown_endpoints[:] = [generate_endpoint, clear_endpoint, perf_endpoint]
# Register prefill model with ModelType.Prefill # Register prefill model with ModelType.Prefill
model_input = ( model_input = (
...@@ -486,18 +578,7 @@ class WorkerFactory: ...@@ -486,18 +578,7 @@ class WorkerFactory:
engine_client, use_text_input=config.use_vllm_tokenizer engine_client, use_text_input=config.use_vllm_tokenizer
).to_dict() ).to_dict()
try: prefill_metrics_labels = [
logger.debug("Starting serve_endpoint for prefill worker")
await asyncio.gather(
# for prefill, we want to shutdown the engine after all prefill requests are finished because
# (temp reason): we don't support re-routing prefill requests
# (long-term reason): prefill engine should pull from a global queue so there is
# only a few in-flight requests that can be quickly finished
generate_endpoint.serve_endpoint(
handler.generate, # type: ignore
graceful_shutdown=True,
# In practice config.served_model_name is always set, but mypy needs the "or" here.
metrics_labels=[
( (
prometheus_names.labels.MODEL, prometheus_names.labels.MODEL,
config.served_model_name or config.model, config.served_model_name or config.model,
...@@ -506,21 +587,24 @@ class WorkerFactory: ...@@ -506,21 +587,24 @@ class WorkerFactory:
prometheus_names.labels.MODEL_NAME, prometheus_names.labels.MODEL_NAME,
config.served_model_name or config.model, config.served_model_name or config.model,
), ),
], ]
try:
logger.debug("Starting serve_endpoint for prefill worker")
await asyncio.gather(
generate_endpoint.serve_endpoint(
handler.generate, # type: ignore
graceful_shutdown=True,
metrics_labels=prefill_metrics_labels,
health_check_payload=health_check_payload, health_check_payload=health_check_payload,
), ),
clear_endpoint.serve_endpoint( clear_endpoint.serve_endpoint(
handler.clear_kv_blocks, # type: ignore handler.clear_kv_blocks, # type: ignore
metrics_labels=[ metrics_labels=prefill_metrics_labels,
(
prometheus_names.labels.MODEL,
config.served_model_name or config.model,
),
(
prometheus_names.labels.MODEL_NAME,
config.served_model_name or config.model,
), ),
], perf_endpoint.serve_endpoint(
handler.get_perf_metrics,
metrics_labels=prefill_metrics_labels,
), ),
) )
logger.debug("serve_endpoint completed for prefill worker") logger.debug("serve_endpoint completed for prefill worker")
......
# vLLM RFC: Per-Iteration Forward Pass Metrics via ZMQ
> For submission to https://github.com/vllm-project/vllm/issues/new?template=750-RFC.yml
---
## Title
`[RFC]: Per-iteration forward pass metrics with accurate engine-level timing`
---
## Motivation
**Problem: orchestration systems need per-iteration scheduler telemetry, but vLLM only exposes aggregated Prometheus metrics.**
Inference orchestrators (autoscalers, routers, disaggregated serving planners) need to understand the *per-iteration* cost structure of a running vLLM engine:
- How many prefill vs decode requests were in each batch?
- What was the KV cache depth distribution across decode requests?
- How many tokens were computed vs cache-hit?
- How long did the GPU forward pass actually take?
- How many requests are queued and waiting?
Today, vLLM exposes Prometheus gauge/histogram metrics that are **scraped asynchronously** by an external collector. This has fundamental limitations for per-iteration telemetry:
1. **Lossy**: Prometheus scraping is pull-based at a configurable interval. With iteration times of 10-100ms, the scraper can miss 90%+ of iterations. Gauge values reflect only the most recent state at scrape time, not the full distribution. Aggregated metrics inevitably lose information.
2. **Unsynchronized**: The scraper runs on a separate timer from the engine loop. Metrics from different gauges may reflect different iterations, making it impossible to correlate prefill/decode counts with wall time for the same batch.
3. **No per-iteration history**: There is no way to reconstruct the sequence of batch compositions over time. An autoscaler cannot build a cost model from Prometheus data because it only sees snapshots.
4. **Latency**: Push-based Prometheus (Pushgateway) uses HTTP, adding latency and overhead proportional to push frequency. For per-iteration emission at 100+ iterations/second, this is prohibitive.
**Why this matters for the ecosystem:**
- **NVIDIA Dynamo** currently implements this as an out-of-tree `--scheduler-cls` subclass ([InstrumentedScheduler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/instrumented_scheduler.py)), but measuring wall time from the scheduler is inherently imprecise because the scheduler cannot observe the GPU forward pass boundary (see Proposed Change).
- **Autoscalers** (Kubernetes HPA, custom planners) need per-iteration throughput signals to make scaling decisions within seconds, not minutes.
---
## Proposed Change
### 1. Add `wall_time` measurement in EngineCore
Measure the GPU forward pass time at the exact boundary -- around `future.result()` in `EngineCore.step()` / `step_with_batch_queue()`:
```python
# In EngineCore.step():
scheduler_output = self.scheduler.schedule()
future = self.model_executor.execute_model(scheduler_output, non_block=True)
...
t_start = time.monotonic()
model_output = future.result() # blocks until GPU finishes
wall_time = time.monotonic() - t_start
...
self.scheduler.update_from_output(scheduler_output, model_output, wall_time=wall_time)
```
This is the **only** place in the codebase with direct access to both the GPU wait boundary and the scheduler output. The scheduler cannot measure this accurately because:
- In sync mode: `schedule()` returns before `execute_model` runs
- In async mode: `schedule(N+1)` runs concurrently with GPU batch N, so scheduler-side timestamps include overlap from adjacent batches
Pass `wall_time` to `update_from_output()` as a new optional kwarg so the scheduler can include it in metrics.
### 2. Define a per-iteration metrics struct
A compact, versioned struct emitted once per forward pass:
```python
class ForwardPassMetrics(msgspec.Struct, frozen=True):
version: int = 1 # can include more info in later versions
# Identity
worker_id: str = "" # unique engine instance identifier
dp_rank: int = 0 # data parallel rank
counter_id: int = 0 # monotonic sequence number
# Timing (measured in EngineCore)
wall_time: float = 0.0 # seconds, GPU forward pass time
# Scheduled batch composition
num_prefill_requests: int = 0
sum_prefill_tokens: int = 0 # tokens being computed this iteration
var_prefill_length: float = 0.0 # variance of total prompt lengths
sum_prefill_kv_tokens: int = 0 # KV tokens read (cache hits + prior chunks)
num_decode_requests: int = 0
sum_decode_kv_tokens: int = 0 # total KV depth across decode requests
var_decode_kv_tokens: float = 0.0
# Queue state
num_queued_prefill: int = 0
sum_queued_prefill_tokens: int = 0
num_queued_decode: int = 0 # preempted requests waiting
sum_queued_decode_kv_tokens: int = 0
```
**Why these specific fields:**
- An autoscaler needs `wall_time` + `num_prefill_requests` + `num_decode_requests` + token counts to build a cost model of the form `latency = f(prefill_tokens, decode_batch_size, kv_depth)`.
- Variance fields enable detecting heterogeneous batches (mix of short and long sequences) which affect padding overhead and CUDA graph efficiency.
- Queue metrics enable load-aware routing and backpressure signals.
- `msgspec.Struct` is zero-copy serializable and already used by vLLM for KV cache events.
### 3. Emit via ZMQ PUB/SUB (not Prometheus)
Publish the struct over a ZMQ PUB socket bound to a configurable localhost port, using msgpack serialization:
```
ZMQ message: [topic_bytes, sequence_bytes, msgpack_payload]
```
**Why ZMQ over Prometheus:**
| | ZMQ PUB/SUB | Prometheus |
|---|---|---|
| **Delivery** | Push, every iteration | Pull, scraper interval |
| **Completeness** | Every iteration captured | 90%+ iterations missed |
| **Correlation** | All fields from same iteration in one message | Gauges may reflect different iterations |
| **Latency** | ~10us per message (IPC) | HTTP round-trip per scrape |
| **CPU overhead** | Background thread, non-blocking send | Metric registry lock contention |
| **Consumers** | Multiple SUB sockets, zero-copy | One scraper endpoint |
| **Format** | Versioned, typed, extensible (msgspec) | Flat key-value gauges |
The ZMQ publisher runs in a background daemon thread (same pattern as vLLM's existing `ZmqEventPublisher` for KV cache events). The scheduler hot path only pays for `queue.put_nowait()` on a bounded queue -- no serialization, no I/O.
**Backward compatibility: Prometheus "most recent" gauges.** For users who only need approximate metrics via existing Prometheus infrastructure, we can optionally expose the most recent `ForwardPassMetrics` as Prometheus gauges (updated in-place each iteration, scraped at whatever interval the collector uses). This is strictly less capable than the ZMQ stream but maintains compatibility with existing monitoring dashboards.
### 4. Data parallel support
Each DP rank runs its own EngineCore with its own scheduler. Each rank binds its own ZMQ PUB socket on `base_port + dp_rank`, emitting independent FPM streams tagged with `dp_rank`.
**Attention DP (non-MoE):** Each rank is fully independent (`dp_size=1` locally). Each rank emits its own FPM stream. No cross-rank coordination needed -- the consumer (autoscaler, planner) subscribes to each rank's ZMQ port independently and aggregates as needed.
**DP+EP (MoE):** Each rank has its own scheduler and emits its own FPM. Although the GPU forward pass is synchronized across ranks via collectives (`coordinate_batch_across_dp`), each rank's `wall_time` is measured locally at its own `future.result()` boundary. The measurements are nearly identical across ranks (collectives force sync), so any rank's data is representative. Consumers can average or use rank 0's data.
This is the **same approach used by KV cache events** today: each DP rank publishes to its own ZMQ port, and the relay/consumer layer handles multi-rank aggregation outside the engine.
### 5. Activation
Controlled by a new engine argument:
```
--forward-pass-metrics-port PORT # 0 = disabled (default), >0 = ZMQ PUB base port
```
For DP deployments, rank N binds on `PORT + N`. When enabled, the scheduler base class (or a thin mixin) handles metric extraction and ZMQ publishing. No subclass override needed -- this should work with any scheduler implementation.
### 6. Wire format and versioning
- **Serialization**: msgpack via `msgspec.msgpack.Encoder` (same as KV cache events)
- **ZMQ multipart**: `[b"", seq.to_bytes(8, "big"), msgpack_payload]`
- Empty topic allows future topic-based filtering
- 8-byte big-endian sequence number for ordering / gap detection
- msgpack payload is the serialized `ForwardPassMetrics`
- **Versioning**: `version` field in the struct. Consumers must check version before interpreting fields. Bump on incompatible changes.
### 7. Implementation scope
| Component | Change |
|-----------|--------|
| `EngineCore.step()` / `step_with_batch_queue()` | Measure `wall_time` around `future.result()`, pass to `update_from_output()` |
| `Scheduler.update_from_output()` | Accept optional `wall_time` kwarg |
| `SchedulerInterface` | New optional method `get_forward_pass_metrics()` or mixin |
| New: `ForwardPassMetrics` struct | In `vllm/v1/metrics/` or `vllm/v1/core/sched/` |
| New: `FpmPublisher` (ZMQ background thread) | Modeled after existing `ZmqEventPublisher` |
| `AsyncEngineArgs` | New `--forward-pass-metrics-port` argument |
| Optional: Prometheus stat logger | Expose most-recent FPM fields as gauges |
---
## Feedback Period
2 weeks.
---
## CC List
@simon-mo @youkaichao @WoosukKwon @robertgshaw2-redhat
---
## Any Other Things
**Reference implementation:** NVIDIA Dynamo's [InstrumentedScheduler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/instrumented_scheduler.py) implements this as an out-of-tree scheduler subclass with scheduler-side timing. Moving the timing into EngineCore and the ZMQ publisher into vLLM core would:
1. Eliminate the need for `--scheduler-cls` overrides for metrics
2. Provide accurate GPU timing (not scheduler-approximate)
3. Allow any orchestration system (not just Dynamo) to consume per-iteration metrics
4. Reuse existing ZMQ infrastructure from KV cache events
**Existing ZMQ precedent in vLLM:** The KV cache event system (`KVEventsConfig`, `ZmqEventPublisher`) already uses this exact pattern -- ZMQ PUB on localhost, msgpack serialization, background thread. Forward pass metrics would follow the same architecture.
**Not in scope:** How consumers (Dynamo, custom autoscalers, etc.) subscribe, relay, or aggregate these metrics. That is consumer-side logic. This RFC only covers emission from vLLM.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment