feat(vllm): add self-benchmark mode to InstrumentedScheduler (#7779)

24523a1c · Hongkuan Zhou · GitHub · a873045c · 24523a1c · 24523a1c
Unverified Commit 24523a1c authored Apr 01, 2026 by Hongkuan Zhou Committed by GitHub Apr 02, 2026
8 changed files
--- a/components/src/dynamo/vllm/args.py
+++ b/components/src/dynamo/vllm/args.py
@@ -273,6 +273,41 @@ def update_engine_config_with_dynamo(
                f"--scheduler-cls or subclass InstrumentedScheduler."
            )
+    if dynamo_config.benchmark_mode is not None:
+        if dynamo_config.multimodal_worker or dynamo_config.multimodal_decode_worker:
+            logger.warning(
+                "--benchmark-mode is not supported for multimodal workers. "
+                "Benchmark data will be collected but not served via endpoint."
+            )
+        existing_cls = getattr(engine_config, "scheduler_cls", None)
+        if existing_cls is None and not envs.is_set("DYN_FORWARDPASS_METRIC_PORT"):
+            defaults[
+                "scheduler_cls"
+            ] = "dynamo.vllm.instrumented_scheduler.InstrumentedScheduler"
+            logger.info("Benchmark mode: auto-enabling InstrumentedScheduler")
+        elif existing_cls is not None and "InstrumentedScheduler" not in str(
+            existing_cls
+        ):
+            raise ValueError(
+                f"--benchmark-mode requires InstrumentedScheduler but "
+                f"--scheduler-cls is set to '{existing_cls}'. Either remove "
+                f"--scheduler-cls or use a subclass of InstrumentedScheduler."
+            )
+        dynamo_config._benchmark_additional_config = {  # type: ignore[attr-defined]
+            "mode": dynamo_config.benchmark_mode,
+            "prefill_isl_granularity": dynamo_config.benchmark_prefill_granularity,
+            "decode_length_granularity": dynamo_config.benchmark_decode_length_granularity,
+            "decode_batch_size_granularity": dynamo_config.benchmark_decode_batch_granularity,
+            "warmup_iterations": dynamo_config.benchmark_warmup_iterations,
+            "output_path": dynamo_config.benchmark_output_path,
+            "timeout": dynamo_config.benchmark_timeout,
+        }
+        logger.info(
+            "Benchmark mode=%s configured (output=%s)",
+            dynamo_config.benchmark_mode,
+            dynamo_config.benchmark_output_path,
+        )
    logger.debug("Setting Dynamo defaults for vLLM")
    for key, value in defaults.items():
        if hasattr(engine_config, key):

--- a/components/src/dynamo/vllm/backend_args.py
+++ b/components/src/dynamo/vllm/backend_args.py
@@ -171,6 +171,78 @@ class DynamoVllmArgGroup(ArgGroup):
            ),
        )
+        # Benchmark / self-profiling
+        add_argument(
+            g,
+            flag_name="--benchmark-mode",
+            env_var="DYN_BENCHMARK_MODE",
+            default=None,
+            choices=["prefill", "decode", "agg"],
+            help=(
+                "Run self-benchmark on startup before accepting requests. "
+                "Sweeps prefill ISLs and/or decode (context_length x batch_size) "
+                "points, collecting ForwardPassMetrics at each operating point."
+            ),
+        )
+        add_argument(
+            g,
+            flag_name="--benchmark-prefill-granularity",
+            env_var="DYN_BENCHMARK_PREFILL_GRANULARITY",
+            default=16,
+            type=int,
+            help="Number of ISL sample points for prefill sweep (default: 16).",
+        )
+        add_argument(
+            g,
+            flag_name="--benchmark-decode-length-granularity",
+            env_var="DYN_BENCHMARK_DECODE_LENGTH_GRANULARITY",
+            default=6,
+            type=int,
+            help=(
+                "Number of context length sample points for decode sweep "
+                "(default: 6)."
+            ),
+        )
+        add_argument(
+            g,
+            flag_name="--benchmark-decode-batch-granularity",
+            env_var="DYN_BENCHMARK_DECODE_BATCH_GRANULARITY",
+            default=6,
+            type=int,
+            help=(
+                "Number of batch size sample points per context length " "(default: 6)."
+            ),
+        )
+        add_argument(
+            g,
+            flag_name="--benchmark-warmup-iterations",
+            env_var="DYN_BENCHMARK_WARMUP_ITERATIONS",
+            default=5,
+            type=int,
+            help="Warmup iterations before benchmark (default: 5).",
+        )
+        add_argument(
+            g,
+            flag_name="--benchmark-output-path",
+            env_var="DYN_BENCHMARK_OUTPUT_PATH",
+            default="/tmp/benchmark_results.json",
+            help=(
+                "Path to write benchmark results JSON "
+                "(default: /tmp/benchmark_results.json)."
+            ),
+        )
+        add_argument(
+            g,
+            flag_name="--benchmark-timeout",
+            env_var="DYN_BENCHMARK_TIMEOUT",
+            default=300,
+            type=int,
+            help=(
+                "Maximum seconds to wait for benchmark to complete "
+                "(default: 300). Worker startup fails if exceeded."
+            ),
+        )
 # @dataclass()
 class DynamoVllmConfig(ConfigBase):
@@ -204,6 +276,15 @@ class DynamoVllmConfig(ConfigBase):
    # GMS shadow mode
    gms_shadow_mode: bool = False
+    # Benchmark / self-profiling
+    benchmark_mode: Optional[str] = None
+    benchmark_prefill_granularity: int = 16
+    benchmark_decode_length_granularity: int = 6
+    benchmark_decode_batch_granularity: int = 6
+    benchmark_warmup_iterations: int = 5
+    benchmark_output_path: str = "/tmp/benchmark_results.json"
+    benchmark_timeout: int = 300
    def validate(self) -> None:
        """Validate vLLM wrapper configuration."""
        self._resolve_disaggregation_mode()

--- a/components/src/dynamo/vllm/handlers.py
+++ b/components/src/dynamo/vllm/handlers.py
@@ -354,6 +354,8 @@ class BaseWorkerHandler(ABC, Generic[RequestT, ResponseT]):
    Request handler for the generate and clear_kv_blocks endpoints.
    """
+    _benchmark_results: Optional[dict] = None
    def __init__(
        self,
        runtime,
@@ -683,6 +685,14 @@ class BaseWorkerHandler(ABC, Generic[RequestT, ResponseT]):
        except Exception as e:
            yield {"status": "error", "message": str(e)}
+    async def get_perf_metrics(self, request=None):
+        """Return self-benchmark FPM results, or an error dict if none."""
+        result = getattr(self, "_benchmark_results", None)
+        if result is None:
+            yield {"status": "error", "message": "no benchmark data"}
+        else:
+            yield result
    def add_temp_dir(self, temp_dir: tempfile.TemporaryDirectory) -> None:
        """Add a temporary directory to be cleaned up later."""
        if temp_dir is not None:

--- a/components/src/dynamo/vllm/instrumented_scheduler.py
+++ b/components/src/dynamo/vllm/instrumented_scheduler.py
--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -556,6 +556,15 @@ def setup_vllm_engine(
    if fpm_worker_id is not None:
        vllm_config.additional_config["fpm_worker_id"] = fpm_worker_id
+    # Pass benchmark config to InstrumentedScheduler via additional_config.
+    if hasattr(config, "_benchmark_additional_config"):
+        bench = config._benchmark_additional_config
+        if fpm_worker_id and bench["output_path"] == "/tmp/benchmark_results.json":
+            short_id = fpm_worker_id[-8:]
+            bench["output_path"] = f"/tmp/benchmark_results_{short_id}.json"
+        vllm_config.additional_config["benchmark"] = bench
+        logger.info("Benchmark config injected into additional_config")
    factory = []
    if stat_logger:
        factory.append(stat_logger)

--- a/components/src/dynamo/vllm/tests/test_vllm_unit.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_unit.py
@@ -596,3 +596,128 @@ class TestVllmOmniOptionalDependency:
                    sys.modules.pop(mod, None)
            # Restore original state
            sys.modules.update(saved)
+# ---------------------------------------------------------------------------
+# Benchmark mode unit tests
+# ---------------------------------------------------------------------------
+class TestBenchmarkConfig:
+    """Tests for BenchmarkConfig dataclass and grid generation."""
+    def test_benchmark_config_defaults(self):
+        from dynamo.vllm.instrumented_scheduler import BenchmarkConfig
+        cfg = BenchmarkConfig()
+        assert cfg.mode == "agg"
+        assert cfg.prefill_isl_granularity == 16
+        assert cfg.decode_length_granularity == 6
+        assert cfg.decode_batch_size_granularity == 6
+        assert cfg.warmup_iterations == 5
+        assert cfg.output_path == "/tmp/benchmark_results.json"
+    def test_benchmark_config_from_dict(self):
+        from dynamo.vllm.instrumented_scheduler import BenchmarkConfig
+        cfg = BenchmarkConfig(
+            mode="decode",
+            prefill_isl_granularity=4,
+            decode_length_granularity=3,
+            decode_batch_size_granularity=3,
+            warmup_iterations=2,
+            output_path="/tmp/test.json",
+        )
+        assert cfg.mode == "decode"
+        assert cfg.prefill_isl_granularity == 4
+    def test_benchmark_config_kwargs_unpack(self):
+        from dynamo.vllm.instrumented_scheduler import BenchmarkConfig
+        d = {"mode": "prefill", "warmup_iterations": 1}
+        cfg = BenchmarkConfig(**d)
+        assert cfg.mode == "prefill"
+        assert cfg.warmup_iterations == 1
+        assert cfg.prefill_isl_granularity == 16
+class TestBenchmarkGrid:
+    """Tests for benchmark grid generation logic (no GPU required)."""
+    def _make_grid_helper(self):
+        """Return (prefill_grid_fn, decode_grid_fn) that operate on plain params."""
+        import numpy as np
+        def generate_prefill_grid(max_num_scheduled_tokens, granularity):
+            isls = np.unique(
+                np.linspace(10, max_num_scheduled_tokens, granularity, dtype=int)
+            )
+            return [int(x) for x in isls]
+        def generate_decode_grid(
+            block_size,
+            max_model_len,
+            max_num_running_reqs,
+            num_gpu_blocks,
+            length_granularity,
+            batch_granularity,
+        ):
+            total_kv_tokens = num_gpu_blocks * block_size
+            ctx_lens = np.unique(
+                np.linspace(block_size, max_model_len, length_granularity, dtype=int)
+            )
+            points = []
+            for ctx_len in ctx_lens:
+                ctx_len = int(ctx_len)
+                max_batch = min(max_num_running_reqs, total_kv_tokens // ctx_len)
+                if max_batch < 1:
+                    continue
+                batch_sizes = np.unique(
+                    np.linspace(1, max_batch, batch_granularity, dtype=int)
+                )
+                for bs in batch_sizes:
+                    points.append((ctx_len, int(bs)))
+            return points
+        return generate_prefill_grid, generate_decode_grid
+    def test_prefill_grid_count(self):
+        gen_prefill, _ = self._make_grid_helper()
+        isls = gen_prefill(max_num_scheduled_tokens=8192, granularity=10)
+        assert len(isls) == 10
+        assert isls[0] == 10
+        assert isls[-1] == 8192
+    def test_prefill_grid_dedup(self):
+        gen_prefill, _ = self._make_grid_helper()
+        isls = gen_prefill(max_num_scheduled_tokens=20, granularity=100)
+        assert len(isls) == len(set(isls))
+    def test_decode_grid_batch_capped(self):
+        _, gen_decode = self._make_grid_helper()
+        points = gen_decode(
+            block_size=16,
+            max_model_len=4096,
+            max_num_running_reqs=64,
+            num_gpu_blocks=256,
+            length_granularity=3,
+            batch_granularity=3,
+        )
+        total_kv = 256 * 16
+        for ctx_len, bs in points:
+            assert bs <= min(64, total_kv // ctx_len)
+            assert bs >= 1
+    def test_decode_grid_skips_large_ctx(self):
+        _, gen_decode = self._make_grid_helper()
+        points = gen_decode(
+            block_size=16,
+            max_model_len=100000,
+            max_num_running_reqs=64,
+            num_gpu_blocks=100,
+            length_granularity=5,
+            batch_granularity=3,
+        )
+        total_kv = 100 * 16
+        for ctx_len, bs in points:
+            assert ctx_len <= total_kv
--- a/components/src/dynamo/vllm/worker_factory.py
+++ b/components/src/dynamo/vllm/worker_factory.py
@@ -4,9 +4,12 @@
 """Worker initialization factory for vLLM workers."""
 import asyncio
+import json
 import logging
 import os
+import time as _time
 from collections.abc import Awaitable, Callable
+from pathlib import Path
 from typing import Any, Optional
 from vllm.config import VllmConfig
@@ -20,7 +23,7 @@ from dynamo.runtime import DistributedRuntime
 from .args import Config
 from .constants import DisaggregationMode
-from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
+from .handlers import DecodeWorkerHandler, PrefillWorkerHandler, get_dp_range_for_worker
 from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
 from .multimodal_handlers import EncodeWorkerHandler
 from .publisher import StatLoggerFactory
@@ -30,6 +33,66 @@ logger = logging.getLogger(__name__)
 # (engine_client, vllm_config, default_sampling_params, prometheus_temp_dir, component_gauges)
 EngineSetupResult = tuple[AsyncLLM, VllmConfig, Any, Any, LLMBackendMetrics]
+async def _wait_and_load_benchmark(bench_cfg: dict, vllm_config: VllmConfig) -> dict:
+    """Wait for benchmark result files and aggregate across DP ranks."""
+    base_path = Path(bench_cfg["output_path"])
+    timeout = int(bench_cfg.get("timeout", 300))
+    try:
+        dp_start, dp_size = get_dp_range_for_worker(vllm_config)
+    except Exception:
+        logger.warning(
+            "Could not determine DP range, assuming single rank",
+            exc_info=True,
+        )
+        dp_start, dp_size = 0, 1
+    rank_paths = []
+    for dp_rank in range(dp_start, dp_start + dp_size):
+        if dp_rank == 0:
+            rank_paths.append(base_path)
+        else:
+            stem, ext = os.path.splitext(str(base_path))
+            rank_paths.append(Path(f"{stem}_dp{dp_rank}{ext}"))
+    logger.info(
+        "Waiting for benchmark to complete (files: %s, timeout: %ds)...",
+        rank_paths,
+        timeout,
+    )
+    deadline = _time.monotonic() + timeout
+    for p in rank_paths:
+        while not p.exists():
+            if _time.monotonic() > deadline:
+                raise TimeoutError(
+                    f"Benchmark did not complete within {timeout}s. " f"Missing: {p}"
+                )
+            await asyncio.sleep(0.1)
+    merged: dict = {}
+    for i, p in enumerate(rank_paths):
+        with open(p) as f:
+            data = json.load(f)
+        if i == 0:
+            merged = data
+            for r in merged.get("results", []):
+                r["point"]["dp_rank"] = dp_start
+        else:
+            dp_rank = dp_start + i
+            for r in data.get("results", []):
+                r["point"]["dp_rank"] = dp_rank
+            merged.setdefault("results", []).extend(data.get("results", []))
+    logger.info(
+        "Benchmark complete, %d points across %d rank(s)",
+        len(merged.get("results", [])),
+        len(rank_paths),
+    )
+    return merged
 SetupVllmEngineFn = Callable[..., EngineSetupResult]
 SetupKvEventPublisherFn = Callable[..., Optional[Any]]
 RegisterVllmModelFn = Callable[..., Awaitable[None]]
@@ -64,6 +127,9 @@ class WorkerFactory:
    ) -> None:
        """Create the appropriate multimodal worker based on config flags."""
+        # NOTE: --benchmark-mode is only supported for prefill/decode workers.
+        # The encode worker path does not wire benchmark waiting or
+        # the get_perf_metrics endpoint.
        if config.disaggregation_mode == DisaggregationMode.ENCODE:
            await self._create_multimodal_encode_worker(
                runtime, config, shutdown_event, shutdown_endpoints
@@ -296,6 +362,13 @@ class WorkerFactory:
            handler._quiesce_controller.mark_resumed()
            logger.info("[Shadow] Engine awake, registering with discovery")
+        # Wait for self-benchmark to complete before registering.
+        bench_cfg = vllm_config.additional_config.get("benchmark")
+        if bench_cfg:
+            handler._benchmark_results = await _wait_and_load_benchmark(
+                bench_cfg, vllm_config
+            )
        await self.register_vllm_model(
            model_input,
            model_type,
@@ -309,6 +382,11 @@ class WorkerFactory:
            engine_client, use_text_input=config.use_vllm_tokenizer
        ).to_dict()
+        perf_endpoint = runtime.endpoint(
+            f"{config.namespace}.{config.component}.get_perf_metrics"
+        )
+        shutdown_endpoints.append(perf_endpoint)
        try:
            logger.debug("Starting serve_endpoint for decode worker")
@@ -336,6 +414,10 @@ class WorkerFactory:
                    handler.clear_kv_blocks,
                    metrics_labels=model_metrics_labels,
                ),
+                perf_endpoint.serve_endpoint(
+                    handler.get_perf_metrics,
+                    metrics_labels=model_metrics_labels,
+                ),
            ]
            if lora_enabled:
@@ -467,7 +549,17 @@ class WorkerFactory:
            "Registered engine routes: /engine/sleep, /engine/wake_up, /engine/scale_elastic_ep"
        )
-        shutdown_endpoints[:] = [generate_endpoint, clear_endpoint]
+        # Wait for self-benchmark to complete before registering.
+        bench_cfg = vllm_config.additional_config.get("benchmark")
+        if bench_cfg:
+            handler._benchmark_results = await _wait_and_load_benchmark(
+                bench_cfg, vllm_config
+            )
+        perf_endpoint = runtime.endpoint(
+            f"{config.namespace}.{config.component}.get_perf_metrics"
+        )
+        shutdown_endpoints[:] = [generate_endpoint, clear_endpoint, perf_endpoint]
        # Register prefill model with ModelType.Prefill
        model_input = (
@@ -486,18 +578,7 @@ class WorkerFactory:
            engine_client, use_text_input=config.use_vllm_tokenizer
        ).to_dict()
-        try:
+        prefill_metrics_labels = [
-            logger.debug("Starting serve_endpoint for prefill worker")
-            await asyncio.gather(
-                # for prefill, we want to shutdown the engine after all prefill requests are finished because
-                #     (temp reason): we don't support re-routing prefill requests
-                #     (long-term reason): prefill engine should pull from a global queue so there is
-                #                         only a few in-flight requests that can be quickly finished
-                generate_endpoint.serve_endpoint(
-                    handler.generate,  # type: ignore
-                    graceful_shutdown=True,
-                    # In practice config.served_model_name is always set, but mypy needs the "or" here.
-                    metrics_labels=[
            (
                prometheus_names.labels.MODEL,
                config.served_model_name or config.model,
@@ -506,21 +587,24 @@ class WorkerFactory:
                prometheus_names.labels.MODEL_NAME,
                config.served_model_name or config.model,
            ),
-                    ],
+        ]
+        try:
+            logger.debug("Starting serve_endpoint for prefill worker")
+            await asyncio.gather(
+                generate_endpoint.serve_endpoint(
+                    handler.generate,  # type: ignore
+                    graceful_shutdown=True,
+                    metrics_labels=prefill_metrics_labels,
                    health_check_payload=health_check_payload,
                ),
                clear_endpoint.serve_endpoint(
                    handler.clear_kv_blocks,  # type: ignore
-                    metrics_labels=[
+                    metrics_labels=prefill_metrics_labels,
-                        (
-                            prometheus_names.labels.MODEL,
-                            config.served_model_name or config.model,
-                        ),
-                        (
-                            prometheus_names.labels.MODEL_NAME,
-                            config.served_model_name or config.model,
                ),
-                    ],
+                perf_endpoint.serve_endpoint(
+                    handler.get_perf_metrics,
+                    metrics_labels=prefill_metrics_labels,
                ),
            )
            logger.debug("serve_endpoint completed for prefill worker")

--- a/docs/proposals/vllm-rfc-forward-pass-metrics.md
+++ b/docs/proposals/vllm-rfc-forward-pass-metrics.md
+# vLLM RFC: Per-Iteration Forward Pass Metrics via ZMQ
+> For submission to https://github.com/vllm-project/vllm/issues/new?template=750-RFC.yml
+---
+## Title
+`[RFC]: Per-iteration forward pass metrics with accurate engine-level timing`
+---
+## Motivation
+**Problem: orchestration systems need per-iteration scheduler telemetry, but vLLM only exposes aggregated Prometheus metrics.**
+Inference orchestrators (autoscalers, routers, disaggregated serving planners) need to understand the *per-iteration* cost structure of a running vLLM engine:
+- How many prefill vs decode requests were in each batch?
+- What was the KV cache depth distribution across decode requests?
+- How many tokens were computed vs cache-hit?
+- How long did the GPU forward pass actually take?
+- How many requests are queued and waiting?
+Today, vLLM exposes Prometheus gauge/histogram metrics that are **scraped asynchronously** by an external collector. This has fundamental limitations for per-iteration telemetry:
+1. **Lossy**: Prometheus scraping is pull-based at a configurable interval. With iteration times of 10-100ms, the scraper can miss 90%+ of iterations. Gauge values reflect only the most recent state at scrape time, not the full distribution. Aggregated metrics inevitably lose information.
+2. **Unsynchronized**: The scraper runs on a separate timer from the engine loop. Metrics from different gauges may reflect different iterations, making it impossible to correlate prefill/decode counts with wall time for the same batch.
+3. **No per-iteration history**: There is no way to reconstruct the sequence of batch compositions over time. An autoscaler cannot build a cost model from Prometheus data because it only sees snapshots.
+4. **Latency**: Push-based Prometheus (Pushgateway) uses HTTP, adding latency and overhead proportional to push frequency. For per-iteration emission at 100+ iterations/second, this is prohibitive.
+**Why this matters for the ecosystem:**
+- **NVIDIA Dynamo** currently implements this as an out-of-tree `--scheduler-cls` subclass ([InstrumentedScheduler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/instrumented_scheduler.py)), but measuring wall time from the scheduler is inherently imprecise because the scheduler cannot observe the GPU forward pass boundary (see Proposed Change).
+- **Autoscalers** (Kubernetes HPA, custom planners) need per-iteration throughput signals to make scaling decisions within seconds, not minutes.
+---
+## Proposed Change
+### 1. Add `wall_time` measurement in EngineCore
+Measure the GPU forward pass time at the exact boundary -- around `future.result()` in `EngineCore.step()` / `step_with_batch_queue()`:
+```python
+# In EngineCore.step():
+scheduler_output = self.scheduler.schedule()
+future = self.model_executor.execute_model(scheduler_output, non_block=True)
+...
+t_start = time.monotonic()
+model_output = future.result()   # blocks until GPU finishes
+wall_time = time.monotonic() - t_start
+...
+self.scheduler.update_from_output(scheduler_output, model_output, wall_time=wall_time)
+```
+This is the **only** place in the codebase with direct access to both the GPU wait boundary and the scheduler output. The scheduler cannot measure this accurately because:
+- In sync mode: `schedule()` returns before `execute_model` runs
+- In async mode: `schedule(N+1)` runs concurrently with GPU batch N, so scheduler-side timestamps include overlap from adjacent batches
+Pass `wall_time` to `update_from_output()` as a new optional kwarg so the scheduler can include it in metrics.
+### 2. Define a per-iteration metrics struct
+A compact, versioned struct emitted once per forward pass:
+```python
+class ForwardPassMetrics(msgspec.Struct, frozen=True):
+    version: int = 1             # can include more info in later versions
+    # Identity
+    worker_id: str = ""          # unique engine instance identifier
+    dp_rank: int = 0             # data parallel rank
+    counter_id: int = 0          # monotonic sequence number
+    # Timing (measured in EngineCore)
+    wall_time: float = 0.0       # seconds, GPU forward pass time
+    # Scheduled batch composition
+    num_prefill_requests: int = 0
+    sum_prefill_tokens: int = 0       # tokens being computed this iteration
+    var_prefill_length: float = 0.0   # variance of total prompt lengths
+    sum_prefill_kv_tokens: int = 0    # KV tokens read (cache hits + prior chunks)
+    num_decode_requests: int = 0
+    sum_decode_kv_tokens: int = 0     # total KV depth across decode requests
+    var_decode_kv_tokens: float = 0.0
+    # Queue state
+    num_queued_prefill: int = 0
+    sum_queued_prefill_tokens: int = 0
+    num_queued_decode: int = 0        # preempted requests waiting
+    sum_queued_decode_kv_tokens: int = 0
+```
+**Why these specific fields:**
+- An autoscaler needs `wall_time` + `num_prefill_requests` + `num_decode_requests` + token counts to build a cost model of the form `latency = f(prefill_tokens, decode_batch_size, kv_depth)`.
+- Variance fields enable detecting heterogeneous batches (mix of short and long sequences) which affect padding overhead and CUDA graph efficiency.
+- Queue metrics enable load-aware routing and backpressure signals.
+- `msgspec.Struct` is zero-copy serializable and already used by vLLM for KV cache events.
+### 3. Emit via ZMQ PUB/SUB (not Prometheus)
+Publish the struct over a ZMQ PUB socket bound to a configurable localhost port, using msgpack serialization:
+```
+ZMQ message: [topic_bytes, sequence_bytes, msgpack_payload]
+```
+**Why ZMQ over Prometheus:**
+| | ZMQ PUB/SUB | Prometheus |
+|---|---|---|
+| **Delivery** | Push, every iteration | Pull, scraper interval |
+| **Completeness** | Every iteration captured | 90%+ iterations missed |
+| **Correlation** | All fields from same iteration in one message | Gauges may reflect different iterations |
+| **Latency** | ~10us per message (localhost socket) | HTTP round-trip per scrape |
+| **CPU overhead** | Background thread, non-blocking send | Metric registry lock contention |
+| **Consumers** | Multiple SUB sockets, zero-copy | One scraper endpoint |
+| **Format** | Versioned, typed, extensible (msgspec) | Flat key-value gauges |
+The ZMQ publisher runs in a background daemon thread (same pattern as vLLM's existing `ZmqEventPublisher` for KV cache events). The scheduler hot path only pays for `queue.put_nowait()` on a bounded queue -- no serialization, no I/O.
+**Backward compatibility: Prometheus "most recent" gauges.** For users who only need approximate metrics via existing Prometheus infrastructure, we can optionally expose the most recent `ForwardPassMetrics` as Prometheus gauges (updated in-place each iteration, scraped at whatever interval the collector uses). This is strictly less capable than the ZMQ stream but maintains compatibility with existing monitoring dashboards.
+### 4. Data parallel support
+Each DP rank runs its own EngineCore with its own scheduler. Each rank binds its own ZMQ PUB socket on `base_port + dp_rank`, emitting independent FPM streams tagged with `dp_rank`.
+**Attention DP (non-MoE):** Each rank is fully independent (`dp_size=1` locally). Each rank emits its own FPM stream. No cross-rank coordination needed -- the consumer (autoscaler, planner) subscribes to each rank's ZMQ port independently and aggregates as needed.
+**DP+EP (MoE):** Each rank has its own scheduler and emits its own FPM. Although the GPU forward pass is synchronized across ranks via collectives (`coordinate_batch_across_dp`), each rank's `wall_time` is measured locally at its own `future.result()` boundary. The measurements are nearly identical across ranks (collectives force sync), so any rank's data is representative. Consumers can average or use rank 0's data.
+This is the **same approach used by KV cache events** today: each DP rank publishes to its own ZMQ port, and the relay/consumer layer handles multi-rank aggregation outside the engine.
+### 5. Activation
+Controlled by a new engine argument:
+```
+--forward-pass-metrics-port PORT   # 0 = disabled (default), >0 = ZMQ PUB base port
+```
+For DP deployments, rank N binds on `PORT + N`. When enabled, the scheduler base class (or a thin mixin) handles metric extraction and ZMQ publishing. No subclass override needed -- this should work with any scheduler implementation.
+### 6. Wire format and versioning
+- **Serialization**: msgpack via `msgspec.msgpack.Encoder` (same as KV cache events)
+- **ZMQ multipart**: `[b"", seq.to_bytes(8, "big"), msgpack_payload]`
+  - Empty topic allows future topic-based filtering
+  - 8-byte big-endian sequence number for ordering / gap detection
+  - msgpack payload is the serialized `ForwardPassMetrics`
+- **Versioning**: `version` field in the struct. Consumers must check version before interpreting fields. Bump on incompatible changes.
+### 7. Implementation scope
+| Component | Change |
+|-----------|--------|
+| `EngineCore.step()` / `step_with_batch_queue()` | Measure `wall_time` around `future.result()`, pass to `update_from_output()` |
+| `Scheduler.update_from_output()` | Accept optional `wall_time` kwarg |
+| `SchedulerInterface` | New optional method `get_forward_pass_metrics()` or mixin |
+| New: `ForwardPassMetrics` struct | In `vllm/v1/metrics/` or `vllm/v1/core/sched/` |
+| New: `FpmPublisher` (ZMQ background thread) | Modeled after existing `ZmqEventPublisher` |
+| `AsyncEngineArgs` | New `--forward-pass-metrics-port` argument |
+| Optional: Prometheus stat logger | Expose most-recent FPM fields as gauges |
+---
+## Feedback Period
+2 weeks.
+---
+## CC List
+@simon-mo @youkaichao @WoosukKwon @robertgshaw2-redhat
+---
+## Any Other Things
+**Reference implementation:** NVIDIA Dynamo's [InstrumentedScheduler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/instrumented_scheduler.py) implements this as an out-of-tree scheduler subclass with scheduler-side timing. Moving the timing into EngineCore and the ZMQ publisher into vLLM core would:
+1. Eliminate the need for `--scheduler-cls` overrides for metrics
+2. Provide accurate GPU timing (not scheduler-approximate)
+3. Allow any orchestration system (not just Dynamo) to consume per-iteration metrics
+4. Reuse existing ZMQ infrastructure from KV cache events
+**Existing ZMQ precedent in vLLM:** The KV cache event system (`KVEventsConfig`, `ZmqEventPublisher`) already uses this exact pattern -- ZMQ PUB on localhost, msgpack serialization, background thread. Forward pass metrics would follow the same architecture.
+**Not in scope:** How consumers (Dynamo, custom autoscalers, etc.) subscribe, relay, or aggregate these metrics. That is consumer-side logic. This RFC only covers emission from vLLM.