feat(mocker): add multi-worker replay and router startup fixes (#7553)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

feat(mocker): add multi-worker replay and router startup fixes (#7553)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
b7fe46b1 · Yan Ru Pei · GitHub · 82794761 · b7fe46b1 · b7fe46b1
Unverified Commit b7fe46b1 authored Mar 23, 2026 by Yan Ru Pei Committed by GitHub Mar 23, 2026
20 changed files
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2082,7 +2082,6 @@ dependencies = [
 "derive-getters",
 "derive_builder",
 "dynamo-kv-router",
- "dynamo-runtime",
 "dynamo-tokens",
 "ndarray 0.16.1",
 "ndarray-interp",
@@ -2092,6 +2091,7 @@ dependencies = [
 "serde",
 "serde_json",
 "slotmap",
+ "tempfile",
 "tokio",
 "tokio-timerfd",
 "tokio-util",

--- a/components/src/dynamo/common/configuration/groups/kv_router_args.py
+++ b/components/src/dynamo/common/configuration/groups/kv_router_args.py
@@ -3,7 +3,7 @@
 """Shared KV router configuration ArgGroup.
-Defines the 17 KvRouterConfig parameters once so that both
+Defines the shared KvRouterConfig parameters once so that both
 ``dynamo.frontend`` and ``dynamo.router`` can reuse them without duplication.
 Field names on ``KvRouterConfigBase`` match the ``KvRouterConfig`` Python
 constructor kwargs 1:1, so ``kv_router_kwargs()`` returns a dict that can be
@@ -34,13 +34,14 @@ _KV_ROUTER_FIELDS: tuple[str, ...] = (
    "router_queue_threshold",
    "router_event_threads",
    "router_enable_cache_control",
+    "min_initial_workers",
    "router_queue_policy",
    "remote_indexer_component",
 )
 class KvRouterConfigBase(ConfigBase):
-    """Mixin carrying the 17 KvRouterConfig fields."""
+    """Mixin carrying the shared KvRouterConfig fields."""
    overlap_score_weight: float
    router_temperature: float
@@ -58,6 +59,7 @@ class KvRouterConfigBase(ConfigBase):
    router_queue_threshold: Optional[float]
    router_event_threads: int
    router_enable_cache_control: bool
+    min_initial_workers: int
    router_queue_policy: str
    remote_indexer_component: Optional[str]
@@ -67,7 +69,7 @@ class KvRouterConfigBase(ConfigBase):
 class KvRouterArgGroup(ArgGroup):
-    """CLI arguments for the 17 KvRouterConfig parameters."""
+    """CLI arguments for the shared KvRouterConfig parameters."""
    def add_arguments(self, parser) -> None:
        g = parser.add_argument_group("KV Router Options")
@@ -226,7 +228,7 @@ class KvRouterArgGroup(ArgGroup):
            g,
            flag_name="--router-queue-threshold",
            env_var="DYN_ROUTER_QUEUE_THRESHOLD",
-            default=2.0,
+            default=4.0,
            help=(
                "KV Router: Queue threshold fraction for prefill token capacity. "
                "Requests are queued if all workers exceed this fraction of "
@@ -258,6 +260,18 @@ class KvRouterArgGroup(ArgGroup):
                "requests with nvext.cache_control."
            ),
        )
+        add_argument(
+            g,
+            flag_name="--router-min-initial-workers",
+            env_var="DYN_ROUTER_MIN_INITIAL_WORKERS",
+            default=1,
+            help=(
+                "KV Router: Minimum number of workers that must be discovered before "
+                "router startup continues. Ignored when skip_initial_worker_wait is enabled."
+            ),
+            arg_type=int,
+            dest="min_initial_workers",
+        )
        add_argument(
            g,
            flag_name="--router-queue-policy",

--- a/components/src/dynamo/mocker/args.py
+++ b/components/src/dynamo/mocker/args.py
@@ -2,7 +2,6 @@
 #  SPDX-License-Identifier: Apache-2.0
 import argparse
-import json
 import logging
 import os
 import tempfile
@@ -90,92 +89,6 @@ def resolve_planner_profile_data(
    )
-def create_temp_engine_args_file(args: argparse.Namespace) -> Path:
-    """
-    Create a temporary JSON file with MockEngineArgs from CLI arguments.
-    Returns the path to the temporary file.
-    """
-    engine_args = {}
-    # Only include non-None values that differ from defaults
-    # Note: argparse converts hyphens to underscores in attribute names
-    # Extract all potential engine arguments, using None as default for missing attributes
-    engine_args = {
-        "num_gpu_blocks": getattr(args, "num_gpu_blocks", None),
-        "block_size": getattr(args, "block_size", None),
-        "max_num_seqs": getattr(args, "max_num_seqs", None),
-        "max_num_batched_tokens": getattr(args, "max_num_batched_tokens", None),
-        "enable_prefix_caching": getattr(args, "enable_prefix_caching", None),
-        "enable_chunked_prefill": getattr(args, "enable_chunked_prefill", None),
-        "preemption_mode": getattr(args, "preemption_mode", None),
-        "speedup_ratio": getattr(args, "speedup_ratio", None),
-        "decode_speedup_ratio": getattr(args, "decode_speedup_ratio", None),
-        "dp_size": getattr(args, "dp_size", None),
-        "startup_time": getattr(args, "startup_time", None),
-        "planner_profile_data": (
-            str(getattr(args, "planner_profile_data", None))
-            if getattr(args, "planner_profile_data", None)
-            else None
-        ),
-        "is_prefill": getattr(args, "is_prefill_worker", None),
-        "is_decode": getattr(args, "is_decode_worker", None),
-        "enable_local_indexer": not getattr(args, "durable_kv_events", False),
-        # Note: bootstrap_port and zmq_kv_events_port are NOT included here
-        # - they are per-worker and set in launch_workers()
-        # Note: kv_bytes_per_token and kv_cache_dtype are NOT included here
-        # - kv_bytes_per_token is auto-computed in main.py after model prefetch,
-        # - kv_cache_dtype is only used Python-side for the auto-computation.
-        "kv_transfer_bandwidth": getattr(args, "kv_transfer_bandwidth", None),
-        "engine_type": getattr(args, "engine_type", None),
-    }
-    # If --aic-perf-model is set, add AIC fields
-    if getattr(args, "aic_perf_model", False):
-        engine_type = getattr(args, "engine_type", None) or "vllm"
-        engine_args["aic_backend"] = engine_type
-        if getattr(args, "aic_system", None):
-            engine_args["aic_system"] = args.aic_system
-        if getattr(args, "aic_backend_version", None):
-            engine_args["aic_backend_version"] = args.aic_backend_version
-        if getattr(args, "aic_tp_size", None):
-            engine_args["aic_tp_size"] = args.aic_tp_size
-        if getattr(args, "model_path", None):
-            engine_args["aic_model_path"] = args.model_path
-    # Parse --reasoning JSON string into a nested object
-    reasoning_str = getattr(args, "reasoning", None)
-    if reasoning_str:
-        engine_args["reasoning"] = json.loads(reasoning_str)
-    # Build nested sglang config from individual CLI flags
-    sglang_args = {
-        "schedule_policy": getattr(args, "sglang_schedule_policy", None),
-        "page_size": getattr(args, "sglang_page_size", None),
-        "max_prefill_tokens": getattr(args, "sglang_max_prefill_tokens", None),
-        "chunked_prefill_size": getattr(args, "sglang_chunked_prefill_size", None),
-        "clip_max_new_tokens": getattr(args, "sglang_clip_max_new_tokens", None),
-        "schedule_conservativeness": getattr(
-            args, "sglang_schedule_conservativeness", None
-        ),
-    }
-    sglang_args = {k: v for k, v in sglang_args.items() if v is not None}
-    if sglang_args:
-        engine_args["sglang"] = sglang_args
-    # Remove None values to only include explicitly set arguments
-    engine_args = {k: v for k, v in engine_args.items() if v is not None}
-    # Create temporary file
-    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
-        json.dump(engine_args, f, indent=2)
-        temp_path = Path(f.name)
-    logger.debug(f"Created temporary MockEngineArgs file at {temp_path}")
-    logger.debug(f"MockEngineArgs: {engine_args}")
-    return temp_path
 def validate_worker_type_args(args: argparse.Namespace) -> None:
    """
    Resolve disaggregation mode from --disaggregation-mode or legacy boolean flags.
@@ -261,31 +174,13 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
        default=None,
        help="Model name for API responses (default: derived from model-path)",
    )
-    parser.add_argument(
-        "--trace-file",
-        type=Path,
-        default=None,
-        help="Run offline trace replay from a Mooncake-style JSONL trace file.",
-    )
-    parser.add_argument(
-        "--output-file",
-        type=Path,
-        default=None,
-        help="Write replay metrics JSON to this path. Defaults to a replay JSON next to the trace file.",
-    )
-    parser.add_argument(
-        "--replay-concurrency",
-        type=int,
-        default=None,
-        help="Run offline replay in closed-loop concurrency mode with this many in-flight requests.",
-    )
    # MockEngineArgs parameters (similar to vLLM style)
    parser.add_argument(
        "--num-gpu-blocks-override",
        type=int,
        dest="num_gpu_blocks",  # Maps to num_gpu_blocks in MockEngineArgs
-        default=None,
+        default=16384,
        help="Number of GPU blocks for KV cache (default: 16384)",
    )
    parser.add_argument(
@@ -297,20 +192,20 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    parser.add_argument(
        "--max-num-seqs",
        type=int,
-        default=None,
+        default=256,
        help="Maximum number of sequences per iteration (default: 256)",
    )
    parser.add_argument(
        "--max-num-batched-tokens",
        type=int,
-        default=None,
+        default=8192,
        help="Maximum number of batched tokens per iteration (default: 8192)",
    )
    parser.add_argument(
        "--enable-prefix-caching",
        action="store_true",
        dest="enable_prefix_caching",
-        default=None,
+        default=True,
        help="Enable automatic prefix caching (default: True)",
    )
    parser.add_argument(
@@ -324,7 +219,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
        "--enable-chunked-prefill",
        action="store_true",
        dest="enable_chunked_prefill",
-        default=None,
+        default=True,
        help="Enable chunked prefill (default: True)",
    )
    parser.add_argument(
@@ -337,7 +232,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    parser.add_argument(
        "--preemption-mode",
        type=str,
-        default=None,
+        default="lifo",
        choices=["lifo", "fifo"],
        help="Preemption mode for decode eviction under memory pressure. "
        "'lifo' (default) evicts the newest request (matches vLLM v1), "
@@ -346,13 +241,13 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    parser.add_argument(
        "--speedup-ratio",
        type=float,
-        default=None,
+        default=1.0,
        help="Speedup ratio for mock execution (default: 1.0). Use 0 for infinite speedup (no simulation delays).",
    )
    parser.add_argument(
        "--decode-speedup-ratio",
        type=float,
-        default=None,
+        default=1.0,
        help="Additional speedup multiplier applied only to decode steps (default: 1.0). "
        "Models speculative decoding (e.g. Eagle) where decode throughput improves "
        "without affecting prefill latency. Effective decode speedup is speedup_ratio * decode_speedup_ratio.",
@@ -361,7 +256,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
        "--data-parallel-size",
        type=int,
        dest="dp_size",
-        default=None,
+        default=1,
        help="Number of data parallel replicas (default: 1)",
    )
    parser.add_argument(
@@ -426,7 +321,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    parser.add_argument(
        "--engine-type",
        type=str,
-        default=None,
+        default="vllm",
        choices=["vllm", "sglang"],
        help="Engine simulation type: 'vllm' (default) or 'sglang'.",
    )
@@ -604,9 +499,6 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    args = parser.parse_args(argv)
    validate_worker_type_args(args)
-    if args.replay_concurrency is not None and args.trace_file is None:
-        raise ValueError("--replay-concurrency requires --trace-file")
    # Validate num_workers
    if args.num_workers < 1:
        raise ValueError(f"--num-workers must be at least 1, got {args.num_workers}")

--- a/components/src/dynamo/mocker/config.py
+++ b/components/src/dynamo/mocker/config.py
+#  SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+import argparse
+import json
+import os
+import socket
+from dynamo.llm import MockEngineArgs, ModelRuntimeConfig, ReasoningConfig, SglangArgs
+_DEFAULT_NUM_GPU_BLOCKS = 16384
+_DEFAULT_MAX_NUM_SEQS = 256
+_DEFAULT_MAX_NUM_BATCHED_TOKENS = 8192
+def _parse_reasoning_config(reasoning_json: str | None) -> ReasoningConfig | None:
+    """Convert a JSON-encoded reasoning spec into a ``ReasoningConfig``.
+
+    Args:
+        reasoning_json: JSON text, or ``None``/empty when no reasoning
+            configuration was supplied.
+
+    Returns:
+        ``None`` when ``reasoning_json`` is falsy; otherwise a
+        ``ReasoningConfig`` built from the ``start_thinking_token_id``,
+        ``end_thinking_token_id`` and ``thinking_ratio`` entries.
+
+    Errors from ``json.loads`` and from looking up an absent entry are not
+    caught here and propagate to the caller.
+    """
+    if reasoning_json:
+        parsed = json.loads(reasoning_json)
+        return ReasoningConfig(
+            start_thinking_token_id=parsed["start_thinking_token_id"],
+            end_thinking_token_id=parsed["end_thinking_token_id"],
+            thinking_ratio=parsed["thinking_ratio"],
+        )
+    return None
+def _build_sglang_args(args: argparse.Namespace) -> SglangArgs | None:
+    """Gather the ``sglang_*`` namespace attributes into an ``SglangArgs``.
+
+    Each attribute is read with a ``None`` fallback, so a namespace that
+    lacks one of them is treated the same as one where it is ``None``.
+
+    Returns:
+        ``None`` when every collected value is ``None``; otherwise an
+        ``SglangArgs`` receiving all six values (including the ``None`` ones)
+        as keyword arguments.
+    """
+    # SglangArgs keyword -> attribute name on the parsed namespace.
+    attr_by_field = {
+        "schedule_policy": "sglang_schedule_policy",
+        "page_size": "sglang_page_size",
+        "max_prefill_tokens": "sglang_max_prefill_tokens",
+        "chunked_prefill_size": "sglang_chunked_prefill_size",
+        "clip_max_new_tokens": "sglang_clip_max_new_tokens",
+        "schedule_conservativeness": "sglang_schedule_conservativeness",
+    }
+    collected = {
+        field: getattr(args, attr, None) for field, attr in attr_by_field.items()
+    }
+    if all(value is None for value in collected.values()):
+        return None
+    return SglangArgs(**collected)
+def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs:
+    """Assemble a ``MockEngineArgs`` from a parsed argument namespace.
+
+    Attributes are read with ``getattr`` fallbacks, so the namespace does not
+    need to define every one of them. The AIC fields are populated only when
+    ``args.aic_perf_model`` is truthy and are ``None`` otherwise.
+
+    Args:
+        args: Namespace carrying the mocker options.
+
+    Returns:
+        The ``MockEngineArgs`` built from those options.
+    """
+    # A missing or falsy engine type falls back to "vllm"; the same value
+    # doubles as the AIC backend when the AIC perf model is enabled.
+    engine_type = getattr(args, "engine_type", None) or "vllm"
+    use_aic = getattr(args, "aic_perf_model", False)
+    # Prefill takes precedence over decode; neither flag means aggregated.
+    if getattr(args, "is_prefill_worker", False):
+        worker_type = "prefill"
+    elif getattr(args, "is_decode_worker", False):
+        worker_type = "decode"
+    else:
+        worker_type = "aggregated"
+    return MockEngineArgs(
+        engine_type=engine_type,
+        num_gpu_blocks=getattr(args, "num_gpu_blocks", _DEFAULT_NUM_GPU_BLOCKS),
+        # A missing, None or zero block size is passed through as 0.
+        block_size=getattr(args, "block_size", 0) or 0,
+        max_num_seqs=getattr(args, "max_num_seqs", _DEFAULT_MAX_NUM_SEQS),
+        max_num_batched_tokens=getattr(
+            args, "max_num_batched_tokens", _DEFAULT_MAX_NUM_BATCHED_TOKENS
+        ),
+        enable_prefix_caching=getattr(args, "enable_prefix_caching", True),
+        enable_chunked_prefill=getattr(args, "enable_chunked_prefill", True),
+        preemption_mode=getattr(args, "preemption_mode", "lifo"),
+        speedup_ratio=getattr(args, "speedup_ratio", 1.0),
+        decode_speedup_ratio=getattr(args, "decode_speedup_ratio", 1.0),
+        dp_size=getattr(args, "dp_size", 1),
+        startup_time=getattr(args, "startup_time", None),
+        worker_type=worker_type,
+        aic_backend=engine_type if use_aic else None,
+        aic_system=getattr(args, "aic_system", None) if use_aic else None,
+        aic_backend_version=(
+            getattr(args, "aic_backend_version", None) if use_aic else None
+        ),
+        aic_tp_size=getattr(args, "aic_tp_size", None) if use_aic else None,
+        aic_model_path=getattr(args, "model_path", None) if use_aic else None,
+        # The local indexer is enabled exactly when durable KV events are off.
+        enable_local_indexer=not getattr(args, "durable_kv_events", False),
+        kv_transfer_bandwidth=getattr(args, "kv_transfer_bandwidth", None),
+        reasoning=_parse_reasoning_config(getattr(args, "reasoning", None)),
+        sglang=_build_sglang_args(args),
+    )
+def load_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs:
+    """Return the engine args for this run.
+
+    When ``args.extra_engine_args`` is truthy its text is read and handed to
+    ``MockEngineArgs.from_json``; otherwise the engine args are built from
+    the namespace by ``build_mocker_engine_args``.
+    """
+    json_path = args.extra_engine_args
+    if not json_path:
+        return build_mocker_engine_args(args)
+    return MockEngineArgs.from_json(json_path.read_text())
+def apply_worker_engine_args_overrides(
+    engine_args: MockEngineArgs,
+    *,
+    kv_bytes_per_token: int | None = None,
+    bootstrap_port: int | None = None,
+    zmq_kv_events_port: int | None = None,
+    zmq_replay_port: int | None = None,
+) -> MockEngineArgs:
+    """Forward per-worker override values to ``engine_args.with_overrides``.
+
+    All four keyword-only values are passed through unchanged, ``None``
+    included; how ``None`` is interpreted is up to ``with_overrides``.
+
+    Returns:
+        Whatever ``engine_args.with_overrides`` returns.
+    """
+    overrides = {
+        "bootstrap_port": bootstrap_port,
+        "zmq_kv_events_port": zmq_kv_events_port,
+        "zmq_replay_port": zmq_replay_port,
+        "kv_bytes_per_token": kv_bytes_per_token,
+    }
+    return engine_args.with_overrides(**overrides)
+def build_runtime_config(
+    engine_args: MockEngineArgs,
+) -> tuple[int, ModelRuntimeConfig]:
+    """Derive a ``ModelRuntimeConfig`` from the given engine args.
+
+    Args:
+        engine_args: Engine args whose fields populate the runtime config.
+
+    Returns:
+        A ``(engine_args.block_size, runtime_config)`` tuple.
+
+    For a prefill worker with a bootstrap port, the disaggregated endpoint is
+    set using the ``DYN_HTTP_RPC_HOST`` environment variable as host, falling
+    back to the resolved local hostname only when the variable is unset.
+    """
+    rc = ModelRuntimeConfig()
+    rc.total_kv_blocks = engine_args.num_gpu_blocks
+    rc.max_num_seqs = engine_args.max_num_seqs
+    if rc.max_num_seqs is None:
+        rc.max_num_seqs = _DEFAULT_MAX_NUM_SEQS
+    rc.max_num_batched_tokens = engine_args.max_num_batched_tokens
+    if rc.max_num_batched_tokens is None:
+        rc.max_num_batched_tokens = _DEFAULT_MAX_NUM_BATCHED_TOKENS
+    # The local indexer is never enabled on decode workers.
+    rc.enable_local_indexer = (
+        engine_args.enable_local_indexer and not engine_args.is_decode()
+    )
+    rc.data_parallel_size = engine_args.dp_size
+    bootstrap_port = engine_args.bootstrap_port
+    if engine_args.is_prefill() and bootstrap_port is not None:
+        # Resolve the hostname lazily: a default passed to os.environ.get()
+        # is evaluated even when the variable is set, so the lookup would run
+        # (and could raise) although its result is discarded.
+        host = os.environ.get("DYN_HTTP_RPC_HOST")
+        if host is None:
+            host = socket.gethostbyname(socket.gethostname())
+        rc.set_disaggregated_endpoint(
+            bootstrap_host=host, bootstrap_port=bootstrap_port
+        )
+    return engine_args.block_size, rc
--- a/components/src/dynamo/mocker/main.py
+++ b/components/src/dynamo/mocker/main.py
@@ -6,12 +6,9 @@
 import argparse
 import asyncio
-import json
 import logging
 import os
 import signal
-import socket
-import tempfile
 from pathlib import Path
 import uvloop
@@ -19,18 +16,15 @@ import uvloop
 os.environ.setdefault("DYN_COMPUTE_THREADS", "0")
 from dynamo.common.utils.runtime import create_runtime
-from dynamo.llm import (
+from dynamo.llm import EngineType, EntrypointArgs, fetch_model, make_engine, run_input
-    EngineType,
-    EntrypointArgs,
-    ModelRuntimeConfig,
-    fetch_model,
-    make_engine,
-    run_input,
-)
 from dynamo.runtime.logging import configure_dynamo_logging
-from .args import create_temp_engine_args_file, parse_args, resolve_planner_profile_data
+from .args import parse_args, resolve_planner_profile_data
-from .replay import run_trace_replay
+from .config import (
+    apply_worker_engine_args_overrides,
+    build_runtime_config,
+    load_mocker_engine_args,
+)
 from .utils.kv_cache import compute_kv_bytes_per_token
 configure_dynamo_logging()
@@ -77,73 +71,32 @@ async def worker():
    profile_data_result = resolve_planner_profile_data(args.planner_profile_data)
    args.planner_profile_data = profile_data_result.npz_path
-    # Offline replay does not need planner profile conversion or runtime setup.
-    if args.trace_file is not None:
-        if args.extra_engine_args:
-            extra_engine_args_path = args.extra_engine_args
-            logger.info(f"Using provided MockEngineArgs from {extra_engine_args_path}")
-        else:
-            extra_engine_args_path = create_temp_engine_args_file(args)
-            logger.info("Created MockEngineArgs from CLI arguments")
-        try:
-            run_trace_replay(
-                trace_file=args.trace_file,
-                output_file=args.output_file,
-                extra_engine_args=extra_engine_args_path,
-                num_workers=args.num_workers,
-                replay_concurrency=args.replay_concurrency,
-            )
-            return
-        finally:
-            if not args.extra_engine_args and extra_engine_args_path.exists():
-                try:
-                    extra_engine_args_path.unlink()
-                    logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
-                except Exception as e:
-                    logger.warning(f"Failed to clean up temporary file: {e}")
-    # Handle extra_engine_args: either use provided file or create from CLI args
-    if args.extra_engine_args:
-        # User provided explicit JSON file
-        extra_engine_args_path = args.extra_engine_args
-        logger.info(f"Using provided MockEngineArgs from {extra_engine_args_path}")
-    else:
-        # Create temporary JSON file from CLI arguments
-        extra_engine_args_path = create_temp_engine_args_file(args)
-        logger.info("Created MockEngineArgs from CLI arguments")
    try:
        # Pre-fetch model once to avoid HuggingFace rate limiting when launching many workers
        if args.num_workers > 1 and args.model_path:
            await prefetch_model(args.model_path)
+        engine_args = load_mocker_engine_args(args)
+        logger.info(
+            "Loaded MockEngineArgs from JSON file"
+            if args.extra_engine_args
+            else "Created MockEngineArgs from CLI arguments"
+        )
        # Auto-compute kv_bytes_per_token from model config if not explicitly set
        if args.kv_bytes_per_token is None and args.model_path:
            args.kv_bytes_per_token = compute_kv_bytes_per_token(
                args.model_path, args.kv_cache_dtype
            )
+        engine_args = apply_worker_engine_args_overrides(
-        # Inject kv_bytes_per_token into engine args JSON (computed after model prefetch)
+            engine_args, kv_bytes_per_token=args.kv_bytes_per_token
-        if args.kv_bytes_per_token is not None and not args.extra_engine_args:
+        )
-            with open(extra_engine_args_path) as f:
-                engine_args = json.load(f)
-            engine_args["kv_bytes_per_token"] = args.kv_bytes_per_token
-            with open(extra_engine_args_path, "w") as f:
-                json.dump(engine_args, f, indent=2)
        logger.info(
            f"Launching {args.num_workers} mocker worker(s) with isolated DistributedRuntime instances"
        )
-        await launch_workers(args, extra_engine_args_path)
+        await launch_workers(args, engine_args)
    finally:
-        # Clean up temporary file if we created one
-        if not args.extra_engine_args and extra_engine_args_path.exists():
-            try:
-                extra_engine_args_path.unlink()
-                logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
-            except Exception as e:
-                logger.warning(f"Failed to clean up temporary file: {e}")
        if profile_data_result is not None:
            del profile_data_result  # Triggers tmpdir cleanup via __del__
@@ -170,47 +123,7 @@ def compute_stagger_delay(num_workers: int, stagger_delay: float) -> float:
        return 0.2
-def _build_runtime_config(
+async def launch_workers(args: argparse.Namespace, base_engine_args):
-    engine_args: dict,
-) -> tuple[int, ModelRuntimeConfig]:
-    """Build a ModelRuntimeConfig from the engine args dict.
-    Returns (kv_cache_block_size, runtime_config). Defaults match
-    the Rust MockEngineArgsBuilder so hand-crafted JSON files that
-    omit fields behave identically.
-    """
-    is_prefill = engine_args.get("is_prefill", False)
-    is_decode = engine_args.get("is_decode", False)
-    rc = ModelRuntimeConfig()
-    rc.total_kv_blocks = engine_args.get("num_gpu_blocks", 16384)
-    if (v := engine_args.get("max_num_seqs")) is not None:
-        rc.max_num_seqs = v
-    if (v := engine_args.get("max_num_batched_tokens")) is not None:
-        rc.max_num_batched_tokens = v
-    rc.enable_local_indexer = (
-        engine_args.get("enable_local_indexer", False) and not is_decode
-    )
-    rc.data_parallel_size = engine_args.get("dp_size", 1)
-    bootstrap_port = engine_args.get("bootstrap_port")
-    if is_prefill and bootstrap_port is not None:
-        host = os.environ.get(
-            "DYN_HTTP_RPC_HOST", socket.gethostbyname(socket.gethostname())
-        )
-        rc.set_disaggregated_endpoint(
-            bootstrap_host=host, bootstrap_port=bootstrap_port
-        )
-        logger.info(
-            "Mocker prefill worker: publishing bootstrap endpoint to discovery "
-            f"(bootstrap_port={bootstrap_port})"
-        )
-    block_size = engine_args.get("block_size", 64)
-    return block_size, rc
-async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path):
    """Launch mocker worker(s) with isolated DistributedRuntime instances.
    Each worker gets its own DistributedRuntime, which means:
@@ -221,7 +134,6 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path)
    """
    futures = []
    runtimes = []
-    per_worker_temp_files: list[Path] = []
    stagger_delay = compute_stagger_delay(args.num_workers, args.stagger_delay)
    batch_size = 32
@@ -238,11 +150,7 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path)
            f"(estimated total: {total_time:.1f}s)"
        )
-    # Always load base engine args for runtime config construction
+    needs_per_worker_overrides = bool(
-    with open(extra_engine_args_path) as f:
-        base_engine_args = json.load(f)
-    needs_per_worker_args = bool(
        args.bootstrap_ports_list
        or args.zmq_kv_events_ports_list
        or args.zmq_replay_ports_list
@@ -261,30 +169,29 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path)
        )
        runtimes.append(runtime)
-        # Determine which engine args file and dict to use
+        if needs_per_worker_overrides:
-        worker_engine_args_path: Path | str
+            worker_engine_args = apply_worker_engine_args_overrides(
-        if needs_per_worker_args:
+                base_engine_args,
-            worker_args = base_engine_args.copy()
+                bootstrap_port=(
-            if args.bootstrap_ports_list:
+                    args.bootstrap_ports_list[worker_id]
-                worker_args["bootstrap_port"] = args.bootstrap_ports_list[worker_id]
+                    if args.bootstrap_ports_list
-            if args.zmq_kv_events_ports_list:
+                    else None
-                worker_args["zmq_kv_events_port"] = args.zmq_kv_events_ports_list[
+                ),
-                    worker_id
+                zmq_kv_events_port=(
-                ]
+                    args.zmq_kv_events_ports_list[worker_id]
-            if args.zmq_replay_ports_list:
+                    if args.zmq_kv_events_ports_list
-                worker_args["zmq_replay_port"] = args.zmq_replay_ports_list[worker_id]
+                    else None
-            with tempfile.NamedTemporaryFile(
+                ),
-                mode="w", suffix=".json", delete=False
+                zmq_replay_port=(
-            ) as tmp:
+                    args.zmq_replay_ports_list[worker_id]
-                json.dump(worker_args, tmp)
+                    if args.zmq_replay_ports_list
-                worker_engine_args_path = Path(tmp.name)
+                    else None
-            per_worker_temp_files.append(worker_engine_args_path)
+                ),
-            logger.debug(f"Worker {worker_id}: per-worker args {worker_args}")
+            )
        else:
-            worker_args = base_engine_args
+            worker_engine_args = base_engine_args
-            worker_engine_args_path = extra_engine_args_path
-        kv_cache_block_size, runtime_config = _build_runtime_config(worker_args)
+        kv_cache_block_size, runtime_config = build_runtime_config(worker_engine_args)
        # Create EntrypointArgs for this worker
        entrypoint_args = EntrypointArgs(
@@ -293,7 +200,8 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path)
            model_name=args.model_name,
            endpoint_id=args.endpoint,
            context_length=0,
-            extra_engine_args=str(worker_engine_args_path),
+            extra_engine_args=None,
+            mocker_engine_args=worker_engine_args,
            runtime_config=runtime_config,
            kv_cache_block_size=kv_cache_block_size,
            is_prefill=args.is_prefill_worker,
@@ -337,13 +245,6 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path)
        for runtime in runtimes:
            runtime.shutdown()
-        # Clean up per-worker temp files
-        for temp_file in per_worker_temp_files:
-            try:
-                temp_file.unlink()
-            except Exception:
-                pass
 def main():
    uvloop.run(worker())

--- a/components/src/dynamo/mocker/replay.py
+++ b/components/src/dynamo/mocker/replay.py
-#  SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#  SPDX-License-Identifier: Apache-2.0
-import json
-from pathlib import Path
-from typing import Any
-from dynamo.llm import run_mocker_trace_replay
-def default_replay_output_path(trace_file: Path) -> Path:
-    return trace_file.with_name(f"{trace_file.stem}.replay.json")
-def format_table(headers: list[str], rows: list[list[str]]) -> str:
-    widths = [len(header) for header in headers]
-    for row in rows:
-        for idx, cell in enumerate(row):
-            widths[idx] = max(widths[idx], len(cell))
-    def format_row(row: list[str]) -> str:
-        return " | ".join(cell.ljust(widths[idx]) for idx, cell in enumerate(row))
-    separator = "-+-".join("-" * width for width in widths)
-    return "\n".join(
-        [format_row(headers), separator, *(format_row(row) for row in rows)]
-    )
-def format_ms(value: float | None) -> str:
-    if value is None:
-        return "-"
-    return f"{value:.3f}"
-def format_number(value: float | None) -> str:
-    if value is None:
-        return "-"
-    return f"{value:.3f}"
-def print_replay_summary(report: dict[str, Any], output_file: Path) -> None:
-    scalar_rows = [
-        ["Request count", str(report["num_requests"])],
-        ["Completed requests", str(report["completed_requests"])],
-        ["Virtual duration (ms)", f"{report['duration_ms']:.3f}"],
-        ["Wall time (ms)", f"{report['wall_time_ms']:.3f}"],
-        ["Input tokens", str(report["total_input_tokens"])],
-        ["Output tokens", str(report["total_output_tokens"])],
-        ["Request throughput (req/s)", f"{report['request_throughput_rps']:.3f}"],
-        ["Input throughput (tok/s)", f"{report['input_throughput_tok_s']:.3f}"],
-        ["Output throughput (tok/s)", f"{report['output_throughput_tok_s']:.3f}"],
-        ["Total throughput (tok/s)", f"{report['total_throughput_tok_s']:.3f}"],
-        ["Prefix cache reused ratio", f"{report['prefix_cache_reused_ratio']:.6f}"],
-    ]
-    latency_rows = [
-        [
-            "TTFT",
-            format_ms(report["mean_ttft_ms"]),
-            format_ms(report["min_ttft_ms"]),
-            format_ms(report["max_ttft_ms"]),
-            format_ms(report["p99_ttft_ms"]),
-            format_ms(report["p90_ttft_ms"]),
-            format_ms(report["median_ttft_ms"]),
-            format_ms(report["p75_ttft_ms"]),
-            format_ms(report["std_ttft_ms"]),
-        ],
-        [
-            "TTST",
-            format_ms(report["mean_ttst_ms"]),
-            format_ms(report["min_ttst_ms"]),
-            format_ms(report["max_ttst_ms"]),
-            format_ms(report["p99_ttst_ms"]),
-            format_ms(report["p90_ttst_ms"]),
-            format_ms(report["median_ttst_ms"]),
-            format_ms(report["p75_ttst_ms"]),
-            format_ms(report["std_ttst_ms"]),
-        ],
-        [
-            "TPOT",
-            format_ms(report["mean_tpot_ms"]),
-            format_ms(report["min_tpot_ms"]),
-            format_ms(report["max_tpot_ms"]),
-            format_ms(report["p99_tpot_ms"]),
-            format_ms(report["p90_tpot_ms"]),
-            format_ms(report["median_tpot_ms"]),
-            format_ms(report["p75_tpot_ms"]),
-            format_ms(report["std_tpot_ms"]),
-        ],
-        [
-            "ITL",
-            format_ms(report["mean_itl_ms"]),
-            format_ms(report["min_itl_ms"]),
-            format_ms(report["max_itl_ms"]),
-            format_ms(report["p99_itl_ms"]),
-            format_ms(report["p90_itl_ms"]),
-            format_ms(report["median_itl_ms"]),
-            format_ms(report["p75_itl_ms"]),
-            format_ms(report["std_itl_ms"]),
-        ],
-        [
-            "E2E latency",
-            format_ms(report["mean_e2e_latency_ms"]),
-            format_ms(report["min_e2e_latency_ms"]),
-            format_ms(report["max_e2e_latency_ms"]),
-            format_ms(report["p99_e2e_latency_ms"]),
-            format_ms(report["p90_e2e_latency_ms"]),
-            format_ms(report["median_e2e_latency_ms"]),
-            format_ms(report["p75_e2e_latency_ms"]),
-            format_ms(report["std_e2e_latency_ms"]),
-        ],
-        [
-            "Output TPS/User",
-            format_number(report["mean_output_token_throughput_per_user"]),
-            format_number(report["min_output_token_throughput_per_user"]),
-            format_number(report["max_output_token_throughput_per_user"]),
-            format_number(report["p99_output_token_throughput_per_user"]),
-            format_number(report["p90_output_token_throughput_per_user"]),
-            format_number(report["median_output_token_throughput_per_user"]),
-            format_number(report["p75_output_token_throughput_per_user"]),
-            format_number(report["std_output_token_throughput_per_user"]),
-        ],
-    ]
-    lines = [
-        "Replay Summary",
-        format_table(["Metric", "Value"], scalar_rows),
-        "",
-        format_table(
-            ["Metric", "avg", "min", "max", "p99", "p90", "p50", "p75", "std"],
-            latency_rows,
-        ),
-        f"JSON report: {output_file}",
-    ]
-    print("\n".join(lines))
-def write_replay_report(report: dict[str, Any], output_file: Path) -> None:
-    output_file.parent.mkdir(parents=True, exist_ok=True)
-    with open(output_file, "w") as f:
-        json.dump(report, f, indent=2, sort_keys=True)
-def run_trace_replay(
-    trace_file: Path,
-    output_file: Path | None,
-    extra_engine_args: Path,
-    num_workers: int,
-    replay_concurrency: int | None,
-) -> None:
-    resolved_output_file = output_file or default_replay_output_path(trace_file)
-    report = run_mocker_trace_replay(
-        trace_file=trace_file,
-        extra_engine_args=extra_engine_args,
-        num_workers=num_workers,
-        replay_concurrency=replay_concurrency,
-    )
-    write_replay_report(report, resolved_output_file)
-    print_replay_summary(report, resolved_output_file)
--- a/components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
@@ -190,10 +190,11 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler[SglangMultimodalRequest, s
            MultiModalGroup(multimodal_input=MultiModalInput(image_url=url))
            for url in image_urls
        ]
+        preprocessed_request = PreprocessedRequest.model_validate(raw_request)
        # Build SglangMultimodalRequest from the pre-tokenized request
        request = SglangMultimodalRequest(
-            request=PreprocessedRequest(**raw_request),
+            request=preprocessed_request,
            multimodal_inputs=multimodal_groups,
        )

--- a/docs/benchmarks/mocker-trace-replay.md
+++ b/docs/benchmarks/mocker-trace-replay.md
 ---
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-title: Mocker Offline Trace Replay
+title: Mocker Trace Replay
-subtitle: Replay Mooncake-style traces offline without launching a runtime or router
+subtitle: Replay Mooncake-style traces through the mocker in offline or online mode
 ---
-This guide covers the mocker's offline trace replay mode, which replays a Mooncake-style JSONL trace directly through the mock scheduler and writes a metrics report. Unlike normal `dynamo.mocker` usage, this mode does not launch workers, register endpoints, or require NATS, etcd, or a frontend.
+This guide covers the mocker's trace replay support for Mooncake-style JSONL traces. The replay
+surface is available in two forms:
+- `python -m dynamo.mocker --trace-file ...`, which writes a report file and prints a replay summary
+- `python -m dynamo.replay ...`, which prints the replay report JSON to stdout and exposes
+  `offline|online`, `round_robin|kv_router`, `arrival_speedup_ratio`, and synthetic replay inputs
+  directly
+Unlike normal `dynamo.mocker` usage, offline replay does not launch workers, register endpoints, or
+require NATS, etcd, or a frontend. Online replay does exercise the live mock-worker runtime path.
 Use this when you want to:
@@ -15,7 +24,31 @@ Use this when you want to:
 ## Quick Start
-Run offline replay by passing `--trace-file`:
+Run offline replay through the dedicated replay CLI:
+```bash
+python -m dynamo.replay /path/to/mooncake_trace.jsonl \
+    --num-workers 4 \
+    --replay-mode offline \
+    --router-mode round_robin \
+    --extra-engine-args /path/to/mocker_args.json
+```
+Run synthetic replay through the same CLI when you want fixed request shapes without a trace file:
+```bash
+python -m dynamo.replay \
+    --input-tokens 5000 \
+    --output-tokens 500 \
+    --request-count 1000 \
+    --arrival-interval-ms 1.0 \
+    --num-workers 1 \
+    --replay-mode offline \
+    --replay-concurrency 100 \
+    --extra-engine-args /path/to/mocker_args.json
+```
+You can also run replay through the mocker CLI by passing `--trace-file`:
 ```bash
 python -m dynamo.mocker \
@@ -29,7 +62,8 @@ This writes a JSON report next to the trace file by default:
 /path/to/mooncake_trace.replay.json
 ```
-The CLI also prints a `Replay Summary` table to stdout with request counts, throughput, and latency statistics.
+`python -m dynamo.replay` prints the replay report JSON directly to stdout. The mocker CLI prints a
+`Replay Summary` table to stdout and writes the report JSON to disk.
 ## Input Format
@@ -46,37 +80,180 @@ Example:
 {"timestamp": 0, "input_length": 6755, "output_length": 500, "hash_ids": [0, 1, 2, 3]}
 ```
-The mocker synthesizes token blocks from `hash_ids` using the configured `--block-size`, so the replay block size should match the block size used when the trace was generated.
+The mocker synthesizes token blocks from `hash_ids` using the configured `--block-size`, so the
+replay block size must match the block size used when the trace was generated. Public Mooncake
+traces are commonly block-level hashes at `512` tokens per hash ID, so replaying them with the
+default mocker `block_size=64` will fail once `input_length > len(hash_ids) * 64`. For
+`engine_type=sglang`, replay still uses canonical `block_size` internally; `sglang.page_size` is
+accepted as a compatibility alias and is normalized into `block_size` before replay starts.
+## Replay Surfaces
+### `python -m dynamo.replay`
+The dedicated replay CLI exposes:
+- either a positional `trace_file`, or all of `--input-tokens`, `--output-tokens`, and `--request-count`
+- `--replay-mode offline|online`
+- `--router-mode round_robin|kv_router`
+- `--router-queue-policy fcfs|wspt|lcfs`
+- `--num-workers`
+- `--replay-concurrency`
+- `--arrival-interval-ms`
+- `--arrival-speedup-ratio`
+- `--extra-engine-args`
+- `--extra-engine-args-json`
+- `--router-config`
+- `--router-config-json`
+Example:
+```bash
+python -m dynamo.replay /path/to/mooncake_trace.jsonl \
+    --replay-mode online \
+    --router-mode kv_router \
+    --num-workers 4 \
+    --arrival-speedup-ratio 10 \
+    --extra-engine-args-json '{"block_size":64,"speedup_ratio":1000.0}' \
+    --router-config-json '{"router_queue_policy":"fcfs","router_temperature":0.0}'
+```
+SGLang replay uses the same CLI surface. A minimal extra-engine-args file can use either
+`block_size` directly or the compatibility alias `sglang.page_size`:
+```json
+{
+  "engine_type": "sglang",
+  "num_gpu_blocks": 512,
+  "speedup_ratio": 1000.0,
+  "sglang": {
+    "page_size": 2
+  }
+}
+```
+For both `--extra-engine-args-json` and `--router-config-json`, replay accepts partial JSON
+objects. Unspecified fields fall back to the same defaults used by `MockEngineArgs::default()`
+and `KvRouterConfig::default()`.
+### `python -m dynamo.mocker --trace-file`
+The mocker CLI supports offline replay and remains useful when you want the historical
+`Replay Summary` output and report-file workflow.
+### Synthetic Replay
+Synthetic replay bypasses trace loading and generates in-memory requests with fixed input/output
+lengths and optional synthetic arrival spacing:
+```bash
+python -m dynamo.replay \
+    --input-tokens 5000 \
+    --output-tokens 500 \
+    --request-count 200 \
+    --arrival-interval-ms 0.5 \
+    --replay-mode offline \
+    --replay-concurrency 50 \
+    --extra-engine-args /path/to/mocker_args.json
+```
+This is useful for parameter sweeps where Mooncake-style prefix structure is not required.
 ## Modes
 ### Fixed-Schedule Replay
-Default replay mode preserves the timestamps from the trace and simulates arrivals in virtual time:
+Default trace replay preserves the timestamps from the trace and simulates arrivals according to
+those timestamps:
 ```bash
-python -m dynamo.mocker \
+python -m dynamo.replay /path/to/mooncake_trace.jsonl \
-    --trace-file /path/to/mooncake_trace.jsonl \
+    --replay-mode offline \
-    --model-path Qwen/Qwen3-0.6B \
+    --num-workers 4 \
-    --block-size 512
+    --extra-engine-args /path/to/mocker_args.json
 ```
 This is the right mode when you want deterministic replay of the original arrival pattern.
 ### Closed-Loop Concurrency Replay
-Use `--replay-concurrency` to ignore trace arrival timing and keep a fixed number of requests in flight:
+Use `--replay-concurrency` to ignore trace arrival timing and keep a fixed number of requests in
+flight:
 ```bash
-python -m dynamo.mocker \
+python -m dynamo.replay /path/to/mooncake_trace.jsonl \
-    --trace-file /path/to/mooncake_trace.jsonl \
+    --replay-mode offline \
-    --model-path Qwen/Qwen3-0.6B \
+    --num-workers 4 \
-    --block-size 512 \
    --replay-concurrency 16
 ```
 This mode is useful when you want to compare scheduler behavior under a fixed offered concurrency rather than the original trace schedule.
+### Online Replay
+Online replay launches the mock workers and replays the trace against the live runtime path. This
+is useful when you want the replay to include live request dispatch, live output handling, and the
+same async KV-event propagation model used by the current router integration.
+```bash
+python -m dynamo.replay /path/to/mooncake_trace.jsonl \
+    --replay-mode online \
+    --router-mode kv_router \
+    --num-workers 4 \
+    --arrival-speedup-ratio 10 \
+    --extra-engine-args /path/to/mocker_args.json
+```
+### Arrival Speedup
+Use `--arrival-speedup-ratio` to compress or stretch the trace arrival process without changing the
+mocker compute model. Larger values make arrivals happen sooner relative to the original trace.
+```bash
+python -m dynamo.replay /path/to/mooncake_trace.jsonl \
+    --replay-mode offline \
+    --num-workers 4 \
+    --arrival-speedup-ratio 5 \
+    --extra-engine-args /path/to/mocker_args.json
+```
+### Router Modes
+Replay currently supports:
+- `round_robin`
+- `kv_router`
+`kv_router` uses the shared local scheduler and an in-process KV indexer. In offline replay:
+- `kv_router` is supported only when `num_workers > 1`
+- router queueing is enabled and uses simulation time rather than wall-clock time
+- KV visibility is delayed slightly relative to request lifecycle events
+- queue admission is driven by router lifecycle edges (`add_request`, `mark_prefill_completed`, and `free`)
+- transient in-pass prefill occupancy is still approximated at the router level rather than modeled exactly
+To compare queue policies manually, keep the same trace and engine args fixed and swap only
+`--router-queue-policy`:
+```bash
+python -m dynamo.replay /path/to/mooncake_trace.jsonl \
+    --replay-mode offline \
+    --router-mode kv_router \
+    --router-queue-policy fcfs \
+    --num-workers 4 \
+    --extra-engine-args /path/to/mocker_args.json
+python -m dynamo.replay /path/to/mooncake_trace.jsonl \
+    --replay-mode offline \
+    --router-mode kv_router \
+    --router-queue-policy lcfs \
+    --num-workers 4 \
+    --extra-engine-args /path/to/mocker_args.json
+```
+`lcfs` is intentionally a poor policy under saturation and exists as a comparison baseline; use it
+for experiments, not as an expected production default.
 ## Output
 Use `--output-file` to override the default report location:
@@ -88,7 +265,7 @@ python -m dynamo.mocker \
    --output-file /tmp/replay-report.json
 ```
-If `--output-file` is not set, the report path defaults to `<trace stem>.replay.json` in the same directory as the input trace.
+If `--output-file` is not set, the report path defaults to `TRACE_STEM.replay.json` in the same directory as the input trace.
 The report contains:
@@ -100,23 +277,41 @@ The report contains:
 - TTFT, TTST, TPOT, ITL, and end-to-end latency summaries
 - output-token-throughput-per-user summaries
+The dedicated replay CLI emits a report with the same schema as the one returned by the Python APIs
+`dynamo.replay.run_trace_replay(...)` and `dynamo.replay.run_synthetic_trace_replay(...)`.
 ## Replay Constraints
-Offline replay currently supports only this configuration:
+Shared replay constraints:
- `--num-workers 1`
 - aggregated mode
- `--engine-type vllm`
+- `--engine-type vllm|sglang`
 - `--data-parallel-size 1`
+Additional offline constraints:
+- offline `kv_router` requires `num_workers > 1`
+- public single-worker offline replay still uses the legacy single-worker runtime for `vllm`
+  while `sglang` goes through the shared multi-worker replay runtime even when `num_workers=1`
+Additional online constraints:
+- the current live replay path is also limited to aggregated workers
 If you violate those constraints, replay fails immediately with a validation error.
 ## Practical Notes
- `--replay-concurrency` requires `--trace-file`
+- `python -m dynamo.replay` requires exactly one input source:
+  either a trace file, or all of `--input-tokens`, `--output-tokens`, and `--request-count`
+- `--replay-concurrency` works with both trace replay and synthetic replay
 - `--speedup-ratio` still affects simulated timing
+- `--arrival-speedup-ratio` affects trace timestamps, not worker compute speed
+- `--arrival-interval-ms` only applies to synthetic replay
 - `--extra-engine-args` can be used to provide a full mocker config JSON instead of individual CLI flags
- offline replay does not need planner runtime setup, router registration, or event transport
+- offline replay does not need planner runtime setup, router registration, or external event transport
+- the replay block size must match the trace block size, because token synthesis expands `hash_ids`
+  using the configured block size
 ## When To Use This vs AIPerf

--- a/docs/components/frontend/configuration.md
+++ b/docs/components/frontend/configuration.md
@@ -43,8 +43,8 @@ The Rust HTTP server also reads these environment variables (not exposed as CLI
 | `--router-assume-kv-reuse` / `--no-router-assume-kv-reuse` | `DYN_ROUTER_ASSUME_KV_REUSE` | `true` | Assume KV cache reuse when tracking active blocks |
 | `--router-track-output-blocks` / `--no-router-track-output-blocks` | `DYN_ROUTER_TRACK_OUTPUT_BLOCKS` | `false` | Track output blocks with fractional decay during generation |
 | `--router-event-threads` | `DYN_ROUTER_EVENT_THREADS` | `4` | Event processing threads. >1 enables concurrent radix tree |
-| `--router-queue-threshold` | `DYN_ROUTER_QUEUE_THRESHOLD` | `2.0` | Queue threshold fraction of prefill capacity. Enables priority scheduling |
+| `--router-queue-threshold` | `DYN_ROUTER_QUEUE_THRESHOLD` | `4.0` | Queue threshold fraction of prefill capacity. Enables priority scheduling |
-| `--router-queue-policy` | `DYN_ROUTER_QUEUE_POLICY` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT) or `wspt` (avg TTFT) |
+| `--router-queue-policy` | `DYN_ROUTER_QUEUE_POLICY` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
 | `--enable-cache-control` / `--no-enable-cache-control` | `DYN_ENABLE_CACHE_CONTROL` | `false` | Enable TTL-based cache pinning (requires `--router-mode=kv`) |
 | `--decode-fallback` / `--no-decode-fallback` | `DYN_DECODE_FALLBACK` | `false` | Fall back to aggregated mode when prefill workers unavailable |

--- a/docs/components/router/README.md
+++ b/docs/components/router/README.md
@@ -21,8 +21,8 @@ For Kubernetes, set `DYN_ROUTER_MODE=kv` on the Frontend service. Workers automa
 | `--router-mode kv` | `round_robin` | Enable KV cache-aware routing |
 | `--router-kv-overlap-score-weight` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
 | `--no-router-kv-events` | enabled | Fall back to approximate routing (no event consumption from workers) |
-| `--router-queue-threshold` | `2.0` | Backpressure queue threshold; enables priority scheduling via `nvext.agent_hints.priority` |
+| `--router-queue-threshold` | `4.0` | Backpressure queue threshold; enables priority scheduling via `nvext.agent_hints.priority` |
-| `--router-queue-policy` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT) or `wspt` (avg TTFT) |
+| `--router-queue-policy` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
 ### Standalone Router

--- a/docs/components/router/router-guide.md
+++ b/docs/components/router/router-guide.md
@@ -87,8 +87,8 @@ Backend workers register themselves using the `register_model` API, after which
 | `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) |
 | `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking |
 | `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
-| `--router-queue-threshold <float>` | `2.0` | Queue threshold fraction; enables priority scheduling via `priority` |
+| `--router-queue-threshold <float>` | `4.0` | Queue threshold fraction; enables priority scheduling via `priority` |
-| `--router-queue-policy <str>` | `fcfs` | Scheduling policy for the queue: `fcfs` (tail TTFT) or `wspt` (avg TTFT) |
+| `--router-queue-policy <str>` | `fcfs` | Scheduling policy for the queue: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
 For all available options: `python -m dynamo.frontend --help`
@@ -231,10 +231,11 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na
 - `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness.
- `--router-queue-threshold`: Queue threshold fraction for prefill token capacity (default: 2.0). The router holds incoming requests in a priority queue while all workers exceed this fraction of `max_num_batched_tokens`, releasing them when capacity frees up. This defers dispatch (not rejection) so that routing decisions use the most up-to-date load metrics at the moment the request is actually sent to a worker. It also enables **priority scheduling** via `priority` hints in `nvext.agent_hints` — higher values shift a request's effective arrival time earlier in the queue, giving it priority over lower-valued requests. Must be > 0. Set to None to disable queueing (requests are dispatched immediately).
+- `--router-queue-threshold`: Queue threshold fraction for prefill token capacity (default: 4.0). The router holds incoming requests in a priority queue while all workers exceed this fraction of `max_num_batched_tokens`, releasing them when capacity frees up. This defers dispatch (not rejection) so that routing decisions use the most up-to-date load metrics at the moment the request is actually sent to a worker. It also enables **priority scheduling** via `priority` hints in `nvext.agent_hints` — higher values shift a request's effective arrival time earlier in the queue, giving it priority over lower-valued requests. Must be > 0. Set to None to disable queueing (requests are dispatched immediately).
- `--router-queue-policy`: Scheduling policy for the router queue (default: `fcfs`). Two policies are available:
+- `--router-queue-policy`: Scheduling policy for the router queue (default: `fcfs`). Three policies are available:
  - **`fcfs`** (first-come first-served): Orders by adjusted arrival time (`priority_jump - arrival_offset`). Optimizes **tail TTFT** — no request waits longer than necessary.
+  - **`lcfs`** (last-come first-served): Orders by adjusted reverse arrival time (`priority_jump + arrival_offset`). Intentionally favors newer arrivals under saturation and is mainly useful for controlled comparison experiments.
  - **`wspt`** (weighted shortest processing time, Smith's rule): Orders by `(1 + priority_jump) / isl_tokens`. Optimizes **average TTFT** — short or high-priority requests are scheduled before long low-priority ones, minimizing total weighted completion time.
 ### KV Event Transport and Persistence
@@ -281,7 +282,7 @@ Use `--no-router-assume-kv-reuse` in disaggregated setups where the decode worke
 Use `--router-track-output-blocks` **(experimental)** when your workload is output-heavy and you want the router to account for output-side KV cache growth in load balancing. This is useful in two scenarios: (1) workloads with long output sequences and little multi-turn reuse, where output blocks dominate the KV cache footprint; (2) agentic schedulers (e.g. NAT or other LLM routers) that can accurately predict the expected output sequence length per request. When enabled, the router adds placeholder blocks as tokens are generated. If you additionally pass `nvext.agent_hints.osl` (expected output sequence length in tokens) per request, the router applies fractional decay to output blocks — each output block's weight starts at 1.0 and decays linearly toward 0.0 as generation approaches the expected OSL. This lets the router predict that a request nearing completion will soon free its blocks, effectively modeling the future load trajectory rather than just the current snapshot. Without `osl`, output blocks are added at full weight with no decay. The flag requires `--router-track-active-blocks` (the default).
-The `--router-queue-threshold` (default: 2.0) controls when incoming requests are held in a priority queue. The router holds requests while all workers exceed the given fraction of `max_num_batched_tokens`, releasing them as capacity frees up. This defers the routing decision so it is made with the freshest load metrics, rather than dispatching into an already-saturated system. It also enables priority scheduling via `nvext.agent_hints.priority`. Set to None to disable queueing entirely.
+The `--router-queue-threshold` (default: 4.0) controls when incoming requests are held in a priority queue. The router holds requests while all workers exceed the given fraction of `max_num_batched_tokens`, releasing them as capacity frees up. This defers the routing decision so it is made with the freshest load metrics, rather than dispatching into an already-saturated system. It also enables priority scheduling via `nvext.agent_hints.priority`. Set to None to disable queueing entirely.
 Use `--router-queue-policy wspt` when your workload has a mix of short and long requests and you want to minimize **average** TTFT. WSPT (Smith's rule) schedules short or high-priority requests first, reducing mean latency across the batch. Use the default `fcfs` when you want to minimize **tail** TTFT — no request waits longer than necessary, since ordering is purely by (adjusted) arrival time.

--- a/docs/kubernetes/tilt-dev-setup.md
+++ b/docs/kubernetes/tilt-dev-setup.md
@@ -58,7 +58,7 @@ EOF
 tilt up
 ```
-Tilt opens a terminal UI and a web dashboard at <http://localhost:10350>.
+Tilt opens a terminal UI and a web dashboard at [http://localhost:10350](http://localhost:10350).
 The dashboard shows resource status, build logs, and port-forwards.
 Press **Space** in the terminal to open the web UI. Press **Ctrl-C** to
@@ -246,7 +246,7 @@ REGISTRY=ghcr.io/myorg tilt up
 ## Tilt UI
-The web UI at <http://localhost:10350> shows:
+The web UI at [http://localhost:10350](http://localhost:10350) shows:
 - **Resource status** — green/red/pending for each resource
 - **Build logs** — compilation output and errors

--- a/docs/mocker/mocker.md
+++ b/docs/mocker/mocker.md
@@ -11,7 +11,7 @@ The Mocker is a lightweight, high-fidelity simulation of an LLM inference engine
 The mocker simulates:
 - **Block-based KV cache management** with LRU eviction
- **Continuous batching scheduler** with watermark-based admission control
+- **Engine-specific continuous batching schedulers** for vLLM and SGLang
 - **Prefix caching** with hash-based block deduplication
 - **Chunked prefill** for better batching efficiency
 - **Realistic timing models** for prefill and decode phases
@@ -74,10 +74,10 @@ python -m dynamo.mocker \
 | `--endpoint` | Auto-derived | Dynamo endpoint string. Defaults are namespace-dependent, and prefill workers use a different default endpoint than aggregated/decode workers |
 | `--model-name` | Derived from model-path | Model name for API responses |
 | `--trace-file` | None | Run offline trace replay from a Mooncake-style JSONL trace file |
-| `--output-file` | `<trace stem>.replay.json` | Write replay metrics JSON to this path |
+| `--output-file` | `TRACE_STEM.replay.json` | Write replay metrics JSON to this path |
 | `--replay-concurrency` | None | Run offline replay in closed-loop concurrency mode with this many in-flight requests |
 | `--num-gpu-blocks-override` | 16384 | Number of KV cache blocks |
-| `--block-size` | 64 | Tokens per KV cache block |
+| `--block-size` | 64 (`vllm`) / engine-specific | Tokens per KV cache block. For `sglang`, if omitted, the effective page/block size defaults to `--sglang-page-size` when provided, otherwise to 1 |
 | `--max-num-seqs` | 256 | Maximum concurrent sequences |
 | `--max-num-batched-tokens` | 8192 | Maximum tokens per batch |
 | `--enable-prefix-caching` | True | Enable prefix caching |
@@ -85,7 +85,6 @@ python -m dynamo.mocker \
 | `--enable-chunked-prefill` | True | Enable chunked prefill |
 | `--no-enable-chunked-prefill` | - | Disable chunked prefill |
 | `--preemption-mode` | `lifo` | Decode eviction policy under memory pressure: `lifo` (vLLM v1 style) or `fifo` |
-| `--watermark` | 0.01 | KV cache watermark (fraction reserved) |
 | `--speedup-ratio` | 1.0 | Timing speedup factor |
 | `--decode-speedup-ratio` | 1.0 | Decode-only speedup multiplier (e.g. for Eagle speculation) |
 | `--data-parallel-size` | 1 | Number of DP replicas |
@@ -95,7 +94,7 @@ python -m dynamo.mocker \
 | `--reasoning` | None | JSON config for emitting reasoning token spans, with `start_thinking_token_id`, `end_thinking_token_id`, and `thinking_ratio` |
 | `--engine-type` | `vllm` | Engine simulation type: `vllm` or `sglang` |
 | `--sglang-schedule-policy` | `fifo` / `fcfs` | SGLang scheduling policy override |
-| `--sglang-page-size` | 1 | SGLang radix-cache page size in tokens |
+| `--sglang-page-size` | 1 | SGLang radix-cache page size in tokens. Also becomes the effective block size when `--engine-type sglang` and `--block-size` is omitted |
 | `--sglang-max-prefill-tokens` | 16384 | SGLang max prefill-token budget per batch |
 | `--sglang-chunked-prefill-size` | 8192 | SGLang chunked-prefill chunk size |
 | `--sglang-clip-max-new-tokens` | 4096 | SGLang admission-budget cap for max new tokens |
@@ -126,9 +125,12 @@ python -m dynamo.mocker \
 > **Note:** For local scale tests and router benchmarks, prefer `--num-workers` over launching many separate mocker processes. All workers share one tokio runtime and thread pool, which is both lighter weight and closer to how the test harnesses exercise the mocker.
-## Offline Trace Replay
+## Trace Replay
-The mocker also supports an offline replay mode for Mooncake-style traces:
+The mocker also supports replaying Mooncake-style traces through both the original mocker CLI and
+the dedicated replay harness.
+For the original mocker CLI flow:
 ```bash
 python -m dynamo.mocker \
@@ -136,9 +138,41 @@ python -m dynamo.mocker \
    --model-path Qwen/Qwen3-0.6B
 ```
-This mode writes a replay report JSON and prints a `Replay Summary` table without launching a runtime or router.
+The standalone replay CLI exposes `offline|online`, `round_robin|kv_router`,
+`arrival_speedup_ratio`, `router_queue_policy`, and the synthetic replay path directly:
+```bash
+python -m dynamo.replay /path/to/mooncake_trace.jsonl \
+    --num-workers 4 \
+    --replay-mode offline \
+    --router-mode kv_router \
+    --router-queue-policy fcfs \
+    --arrival-speedup-ratio 5 \
+    --extra-engine-args /path/to/mocker_args.json
+```
+The same CLI also supports synthetic replay without a trace file:
+```bash
+python -m dynamo.replay \
+    --input-tokens 5000 \
+    --output-tokens 500 \
+    --request-count 1000 \
+    --arrival-interval-ms 1.0 \
+    --num-workers 1 \
+    --replay-mode offline \
+    --replay-concurrency 100 \
+    --extra-engine-args /path/to/mocker_args.json
+```
+The standalone replay CLI prints the replay report JSON directly to stdout. The `dynamo.mocker`
+trace-file flow still writes a report file and prints a `Replay Summary` table.
+For full usage, constraints, and benchmarking guidance, see [Mocker Trace Replay](../benchmarks/mocker-trace-replay.md).
-For full usage, constraints, and benchmarking guidance, see [Mocker Offline Trace Replay](../benchmarks/mocker-trace-replay.md).
+Replay supports aggregated `vllm` and `sglang` engine configs. Internally, replay uses the canonical
+`block_size`; for `sglang`, `sglang.page_size` is still accepted as a compatibility alias as long
+as it matches `block_size` when both are provided.
 ## Performance Modeling Setup
@@ -225,15 +259,21 @@ The mocker is organized into several cooperating components that mirror the inte
 ### Scheduler
-The scheduler implements continuous batching, maintaining three logical queues:
+The mocker now has two scheduler shapes rather than one generic queue model:
-1. **Waiting Queue** - Newly arrived requests awaiting scheduling
+- **vLLM mocker** uses an upstream-style `waiting + running` scheduler. Each request tracks
-2. **Prefill Queue** - Requests scheduled for prefill
+  computed tokens, the scheduler spends one token budget across the running set first, and decode
-3. **Decode Queue** - Requests actively decoding (ordered by age for preemption)
+  pressure triggers inline preemption of running requests.
+- **SGLang mocker** uses a cache-aware waiting/running scheduler around a radix-style prefix cache.
+  It batches prefill work with decode-state awareness and handles pressure primarily through decode
+  retraction while preserving cached prefixes.
-Each iteration, the scheduler receives incoming requests, moves eligible requests from waiting to prefill based on available memory and compute budgets, simulates the prefill phase for queued requests, runs one decode step for all active sequences, and publishes metrics about current resource utilization.
+Both schedulers simulate continuous batching, prefix reuse, chunked prefill, memory pressure, and
+decode token emission while publishing metrics about current resource utilization.
-When resources become constrained, the scheduler employs preemption: the oldest decoding request is evicted back to the waiting queue, its KV blocks are freed, and it will be rescheduled later. This mirrors how real engines handle memory pressure.
+When resources become constrained, the mocker simulates the engine's real recovery path:
+- vLLM-style decode preemption and recompute
+- SGLang-style decode retraction plus prefix-preserving cache updates
 ### KV Block Manager

--- a/docs/reference/feature-matrix.md
+++ b/docs/reference/feature-matrix.md
@@ -122,10 +122,10 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
 [tools]: ../user-guides/tool-calling
 {/* Multimodal */}
-[mm]: ../user-guides/multimodal
+[mm]: ../features/multimodal/README.md
-[mm-vllm]: ../user-guides/multimodal/multimodal-vllm
+[mm-vllm]: ../features/multimodal/multimodal-vllm.md
-[mm-trtllm]: ../user-guides/multimodal/multimodal-trtllm
+[mm-trtllm]: ../features/multimodal/multimodal-trtllm.md
-[mm-sglang]: ../user-guides/multimodal/multimodal-sglang
+[mm-sglang]: ../features/multimodal/multimodal-sglang.md
 {/* Feature-specific */}
 [lora]: ../kubernetes-deployment/deployment-guide/managing-models-with-dynamo-model

--- a/docs/reference/release-artifacts.md
+++ b/docs/reference/release-artifacts.md
@@ -185,11 +185,11 @@ For a complete list of known issues, refer to the release notes for each version
 | `v1.0.1` | Mar 16, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v1.0.1) | [Docs](https://docs.dynamo.nvidia.com/dynamo) | |
 | `v1.0.0` | Mar 12, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v1.0.0) | [Docs](https://docs.dynamo.nvidia.com/dynamo) | |
 | `v0.9.1` | Mar 4, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.9.1) | [Docs](https://docs.dynamo.nvidia.com/dynamo) |
-| `v0.9.0` | Feb 11, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.9.0) | [Docs](https://docs.dynamo.nvidia.com/dynamo/v-0-9-0/) |
+| `v0.9.0` | Feb 11, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.9.0) | Archived docs unavailable |
-| `v0.8.1` | Jan 23, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.8.1) | [Docs](https://docs.nvidia.com/dynamo/v-0-8-1/) |
+| `v0.8.1` | Jan 23, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.8.1) | Archived docs unavailable |
-| `v0.8.0` | Jan 15, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.8.0) | [Docs](https://docs.nvidia.com/dynamo/v-0-8-0/) |
+| `v0.8.0` | Jan 15, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.8.0) | Archived docs unavailable |
-| `v0.7.1` | Dec 15, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.7.1) | [Docs](https://docs.nvidia.com/dynamo/v-0-7-1/) |
+| `v0.7.1` | Dec 15, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.7.1) | Archived docs unavailable |
-| `v0.7.0` | Nov 26, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.7.0) | [Docs](https://docs.nvidia.com/dynamo/v-0-7-0/) |
+| `v0.7.0` | Nov 26, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.7.0) | Archived docs unavailable |
 | `v0.6.1` | Nov 6, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.6.1) | — |
 | `v0.6.0` | Oct 28, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.6.0) | — |

--- a/lib/bench/kv_router/active_sequences_bench.rs
+++ b/lib/bench/kv_router/active_sequences_bench.rs
@@ -9,7 +9,7 @@ use clap::Parser;
 use common::NoopSequencePublisher;
 use dynamo_kv_router::protocols::WorkerWithDpRank;
 use dynamo_kv_router::{ActiveSequencesMultiWorker, OverlapScores, SequenceRequest};
-use dynamo_mocker::common::protocols::{DirectRequest, OutputSignal};
+use dynamo_mocker::common::protocols::{DirectRequest, KvEventPublishers, OutputSignal};
 use dynamo_mocker::scheduler::Scheduler;
 use dynamo_mocker::scheduler::SchedulerHandle;
 use dynamo_tokens::SequenceHash;
@@ -101,7 +101,13 @@ async fn generate_sequence_events(
            let (output_tx, mut output_rx) = mpsc::unbounded_channel::<OutputSignal>();
            // No KvCacheEventSink — we only need output signals
-            let scheduler = Scheduler::new(sched_args, 0, Some(output_tx), None, None);
+            let scheduler = Scheduler::new(
+                sched_args,
+                0,
+                Some(output_tx),
+                KvEventPublishers::default(),
+                None,
+            );
            // Pre-compute metadata for each request before submission
            let mut metadata: HashMap<Uuid, RequestMetadata> = HashMap::new();

--- a/lib/bench/kv_router/common/mod.rs
+++ b/lib/bench/kv_router/common/mod.rs
@@ -11,7 +11,9 @@ use dynamo_kv_router::protocols::{
    KvCacheStoredBlockData, RouterEvent, WorkerId, XXH3_SEED, compute_seq_hash_for_block,
 };
 pub use dynamo_kv_router::test_utils::{NoopSequencePublisher, SimpleWorkerConfig};
-use dynamo_mocker::common::protocols::{DirectRequest, KvCacheEventSink, MockEngineArgs};
+use dynamo_mocker::common::protocols::{
+    DirectRequest, KvCacheEventSink, KvEventPublishers, MockEngineArgs,
+};
 use dynamo_mocker::scheduler::Scheduler;
 use dynamo_mocker::scheduler::SchedulerHandle;
 use dynamo_tokens::compute_hash_v2;
@@ -122,11 +124,7 @@ impl EventCollector {
 }
 impl KvCacheEventSink for EventCollector {
-    fn publish(
+    fn publish(&self, event: KvCacheEvent) -> anyhow::Result<()> {
-        &self,
-        event: KvCacheEvent,
-        _block_token_ids: Option<&[Vec<u32>]>,
-    ) -> anyhow::Result<()> {
        let timestamp = Instant::now();
        if let Some(events) = self.events.lock().unwrap().as_mut() {
            events.push((event, timestamp));
@@ -361,7 +359,13 @@ pub async fn generate_kv_events(
        tasks.push(tokio::spawn(async move {
            let collector = EventCollector::new();
-            let scheduler = Scheduler::new(sched_args, 0, None, Some(collector.clone()), None);
+            let scheduler = Scheduler::new(
+                sched_args,
+                0,
+                None,
+                KvEventPublishers::new(Some(collector.clone()), None),
+                None,
+            );
            let mut i = 0;
            let mut target = Instant::now();

--- a/lib/bindings/kvbm/Cargo.lock
+++ b/lib/bindings/kvbm/Cargo.lock
@@ -1707,7 +1707,6 @@ dependencies = [
 "derive-getters",
 "derive_builder",
 "dynamo-kv-router",
- "dynamo-runtime",
 "dynamo-tokens",
 "ndarray",
 "ndarray-interp",

--- a/lib/bindings/python/Cargo.lock
+++ b/lib/bindings/python/Cargo.lock
@@ -1723,7 +1723,6 @@ dependencies = [
 "derive-getters",
 "derive_builder",
 "dynamo-kv-router",
- "dynamo-runtime",
 "dynamo-tokens",
 "ndarray",
 "ndarray-interp",
@@ -1783,6 +1782,7 @@ dependencies = [
 "tokio-util",
 "tracing",
 "tracing-subscriber",
+ "uuid",
 ]
 [[package]]

--- a/lib/bindings/python/Cargo.toml
+++ b/lib/bindings/python/Cargo.toml
@@ -46,6 +46,7 @@ tokio = { version = "1.46.0", features = ["full"] }
 tokio-stream = { version = "0" }
 tokio-util = { version = "0.7", features = ["rt"] }
 tracing = { version = "0" }
+uuid = { version = "1.18.1" }
 # kv-indexer / shared kv-router types
 dynamo-kv-router = { path = "../../kv-router", features = ["standalone-indexer"] }