"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "4ae77dfd42041dc2defe21f6ccf76aecb4478812"
Unverified Commit b7fe46b1 authored by Yan Ru Pei, committed by GitHub
Browse files

feat(mocker): add multi-worker replay and router startup fixes (#7553)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 82794761
...@@ -2082,7 +2082,6 @@ dependencies = [ ...@@ -2082,7 +2082,6 @@ dependencies = [
"derive-getters", "derive-getters",
"derive_builder", "derive_builder",
"dynamo-kv-router", "dynamo-kv-router",
"dynamo-runtime",
"dynamo-tokens", "dynamo-tokens",
"ndarray 0.16.1", "ndarray 0.16.1",
"ndarray-interp", "ndarray-interp",
...@@ -2092,6 +2091,7 @@ dependencies = [ ...@@ -2092,6 +2091,7 @@ dependencies = [
"serde", "serde",
"serde_json", "serde_json",
"slotmap", "slotmap",
"tempfile",
"tokio", "tokio",
"tokio-timerfd", "tokio-timerfd",
"tokio-util", "tokio-util",
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
"""Shared KV router configuration ArgGroup. """Shared KV router configuration ArgGroup.
Defines the 17 KvRouterConfig parameters once so that both Defines the shared KvRouterConfig parameters once so that both
``dynamo.frontend`` and ``dynamo.router`` can reuse them without duplication. ``dynamo.frontend`` and ``dynamo.router`` can reuse them without duplication.
Field names on ``KvRouterConfigBase`` match the ``KvRouterConfig`` Python Field names on ``KvRouterConfigBase`` match the ``KvRouterConfig`` Python
constructor kwargs 1:1, so ``kv_router_kwargs()`` returns a dict that can be constructor kwargs 1:1, so ``kv_router_kwargs()`` returns a dict that can be
...@@ -34,13 +34,14 @@ _KV_ROUTER_FIELDS: tuple[str, ...] = ( ...@@ -34,13 +34,14 @@ _KV_ROUTER_FIELDS: tuple[str, ...] = (
"router_queue_threshold", "router_queue_threshold",
"router_event_threads", "router_event_threads",
"router_enable_cache_control", "router_enable_cache_control",
"min_initial_workers",
"router_queue_policy", "router_queue_policy",
"remote_indexer_component", "remote_indexer_component",
) )
class KvRouterConfigBase(ConfigBase): class KvRouterConfigBase(ConfigBase):
"""Mixin carrying the 17 KvRouterConfig fields.""" """Mixin carrying the shared KvRouterConfig fields."""
overlap_score_weight: float overlap_score_weight: float
router_temperature: float router_temperature: float
...@@ -58,6 +59,7 @@ class KvRouterConfigBase(ConfigBase): ...@@ -58,6 +59,7 @@ class KvRouterConfigBase(ConfigBase):
router_queue_threshold: Optional[float] router_queue_threshold: Optional[float]
router_event_threads: int router_event_threads: int
router_enable_cache_control: bool router_enable_cache_control: bool
min_initial_workers: int
router_queue_policy: str router_queue_policy: str
remote_indexer_component: Optional[str] remote_indexer_component: Optional[str]
...@@ -67,7 +69,7 @@ class KvRouterConfigBase(ConfigBase): ...@@ -67,7 +69,7 @@ class KvRouterConfigBase(ConfigBase):
class KvRouterArgGroup(ArgGroup): class KvRouterArgGroup(ArgGroup):
"""CLI arguments for the 17 KvRouterConfig parameters.""" """CLI arguments for the shared KvRouterConfig parameters."""
def add_arguments(self, parser) -> None: def add_arguments(self, parser) -> None:
g = parser.add_argument_group("KV Router Options") g = parser.add_argument_group("KV Router Options")
...@@ -226,7 +228,7 @@ class KvRouterArgGroup(ArgGroup): ...@@ -226,7 +228,7 @@ class KvRouterArgGroup(ArgGroup):
g, g,
flag_name="--router-queue-threshold", flag_name="--router-queue-threshold",
env_var="DYN_ROUTER_QUEUE_THRESHOLD", env_var="DYN_ROUTER_QUEUE_THRESHOLD",
default=2.0, default=4.0,
help=( help=(
"KV Router: Queue threshold fraction for prefill token capacity. " "KV Router: Queue threshold fraction for prefill token capacity. "
"Requests are queued if all workers exceed this fraction of " "Requests are queued if all workers exceed this fraction of "
...@@ -258,6 +260,18 @@ class KvRouterArgGroup(ArgGroup): ...@@ -258,6 +260,18 @@ class KvRouterArgGroup(ArgGroup):
"requests with nvext.cache_control." "requests with nvext.cache_control."
), ),
) )
add_argument(
g,
flag_name="--router-min-initial-workers",
env_var="DYN_ROUTER_MIN_INITIAL_WORKERS",
default=1,
help=(
"KV Router: Minimum number of workers that must be discovered before "
"router startup continues. Ignored when skip_initial_worker_wait is enabled."
),
arg_type=int,
dest="min_initial_workers",
)
add_argument( add_argument(
g, g,
flag_name="--router-queue-policy", flag_name="--router-queue-policy",
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import argparse import argparse
import json
import logging import logging
import os import os
import tempfile import tempfile
...@@ -90,92 +89,6 @@ def resolve_planner_profile_data( ...@@ -90,92 +89,6 @@ def resolve_planner_profile_data(
) )
def create_temp_engine_args_file(args: argparse.Namespace) -> Path:
"""
Create a temporary JSON file with MockEngineArgs from CLI arguments.
Returns the path to the temporary file.
"""
engine_args = {}
# Only include non-None values that differ from defaults
# Note: argparse converts hyphens to underscores in attribute names
# Extract all potential engine arguments, using None as default for missing attributes
engine_args = {
"num_gpu_blocks": getattr(args, "num_gpu_blocks", None),
"block_size": getattr(args, "block_size", None),
"max_num_seqs": getattr(args, "max_num_seqs", None),
"max_num_batched_tokens": getattr(args, "max_num_batched_tokens", None),
"enable_prefix_caching": getattr(args, "enable_prefix_caching", None),
"enable_chunked_prefill": getattr(args, "enable_chunked_prefill", None),
"preemption_mode": getattr(args, "preemption_mode", None),
"speedup_ratio": getattr(args, "speedup_ratio", None),
"decode_speedup_ratio": getattr(args, "decode_speedup_ratio", None),
"dp_size": getattr(args, "dp_size", None),
"startup_time": getattr(args, "startup_time", None),
"planner_profile_data": (
str(getattr(args, "planner_profile_data", None))
if getattr(args, "planner_profile_data", None)
else None
),
"is_prefill": getattr(args, "is_prefill_worker", None),
"is_decode": getattr(args, "is_decode_worker", None),
"enable_local_indexer": not getattr(args, "durable_kv_events", False),
# Note: bootstrap_port and zmq_kv_events_port are NOT included here
# - they are per-worker and set in launch_workers()
# Note: kv_bytes_per_token and kv_cache_dtype are NOT included here
# - kv_bytes_per_token is auto-computed in main.py after model prefetch,
# - kv_cache_dtype is only used Python-side for the auto-computation.
"kv_transfer_bandwidth": getattr(args, "kv_transfer_bandwidth", None),
"engine_type": getattr(args, "engine_type", None),
}
# If --aic-perf-model is set, add AIC fields
if getattr(args, "aic_perf_model", False):
engine_type = getattr(args, "engine_type", None) or "vllm"
engine_args["aic_backend"] = engine_type
if getattr(args, "aic_system", None):
engine_args["aic_system"] = args.aic_system
if getattr(args, "aic_backend_version", None):
engine_args["aic_backend_version"] = args.aic_backend_version
if getattr(args, "aic_tp_size", None):
engine_args["aic_tp_size"] = args.aic_tp_size
if getattr(args, "model_path", None):
engine_args["aic_model_path"] = args.model_path
# Parse --reasoning JSON string into a nested object
reasoning_str = getattr(args, "reasoning", None)
if reasoning_str:
engine_args["reasoning"] = json.loads(reasoning_str)
# Build nested sglang config from individual CLI flags
sglang_args = {
"schedule_policy": getattr(args, "sglang_schedule_policy", None),
"page_size": getattr(args, "sglang_page_size", None),
"max_prefill_tokens": getattr(args, "sglang_max_prefill_tokens", None),
"chunked_prefill_size": getattr(args, "sglang_chunked_prefill_size", None),
"clip_max_new_tokens": getattr(args, "sglang_clip_max_new_tokens", None),
"schedule_conservativeness": getattr(
args, "sglang_schedule_conservativeness", None
),
}
sglang_args = {k: v for k, v in sglang_args.items() if v is not None}
if sglang_args:
engine_args["sglang"] = sglang_args
# Remove None values to only include explicitly set arguments
engine_args = {k: v for k, v in engine_args.items() if v is not None}
# Create temporary file
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
json.dump(engine_args, f, indent=2)
temp_path = Path(f.name)
logger.debug(f"Created temporary MockEngineArgs file at {temp_path}")
logger.debug(f"MockEngineArgs: {engine_args}")
return temp_path
def validate_worker_type_args(args: argparse.Namespace) -> None: def validate_worker_type_args(args: argparse.Namespace) -> None:
""" """
Resolve disaggregation mode from --disaggregation-mode or legacy boolean flags. Resolve disaggregation mode from --disaggregation-mode or legacy boolean flags.
...@@ -261,31 +174,13 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: ...@@ -261,31 +174,13 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
default=None, default=None,
help="Model name for API responses (default: derived from model-path)", help="Model name for API responses (default: derived from model-path)",
) )
parser.add_argument(
"--trace-file",
type=Path,
default=None,
help="Run offline trace replay from a Mooncake-style JSONL trace file.",
)
parser.add_argument(
"--output-file",
type=Path,
default=None,
help="Write replay metrics JSON to this path. Defaults to a replay JSON next to the trace file.",
)
parser.add_argument(
"--replay-concurrency",
type=int,
default=None,
help="Run offline replay in closed-loop concurrency mode with this many in-flight requests.",
)
# MockEngineArgs parameters (similar to vLLM style) # MockEngineArgs parameters (similar to vLLM style)
parser.add_argument( parser.add_argument(
"--num-gpu-blocks-override", "--num-gpu-blocks-override",
type=int, type=int,
dest="num_gpu_blocks", # Maps to num_gpu_blocks in MockEngineArgs dest="num_gpu_blocks", # Maps to num_gpu_blocks in MockEngineArgs
default=None, default=16384,
help="Number of GPU blocks for KV cache (default: 16384)", help="Number of GPU blocks for KV cache (default: 16384)",
) )
parser.add_argument( parser.add_argument(
...@@ -297,20 +192,20 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: ...@@ -297,20 +192,20 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"--max-num-seqs", "--max-num-seqs",
type=int, type=int,
default=None, default=256,
help="Maximum number of sequences per iteration (default: 256)", help="Maximum number of sequences per iteration (default: 256)",
) )
parser.add_argument( parser.add_argument(
"--max-num-batched-tokens", "--max-num-batched-tokens",
type=int, type=int,
default=None, default=8192,
help="Maximum number of batched tokens per iteration (default: 8192)", help="Maximum number of batched tokens per iteration (default: 8192)",
) )
parser.add_argument( parser.add_argument(
"--enable-prefix-caching", "--enable-prefix-caching",
action="store_true", action="store_true",
dest="enable_prefix_caching", dest="enable_prefix_caching",
default=None, default=True,
help="Enable automatic prefix caching (default: True)", help="Enable automatic prefix caching (default: True)",
) )
parser.add_argument( parser.add_argument(
...@@ -324,7 +219,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: ...@@ -324,7 +219,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
"--enable-chunked-prefill", "--enable-chunked-prefill",
action="store_true", action="store_true",
dest="enable_chunked_prefill", dest="enable_chunked_prefill",
default=None, default=True,
help="Enable chunked prefill (default: True)", help="Enable chunked prefill (default: True)",
) )
parser.add_argument( parser.add_argument(
...@@ -337,7 +232,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: ...@@ -337,7 +232,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"--preemption-mode", "--preemption-mode",
type=str, type=str,
default=None, default="lifo",
choices=["lifo", "fifo"], choices=["lifo", "fifo"],
help="Preemption mode for decode eviction under memory pressure. " help="Preemption mode for decode eviction under memory pressure. "
"'lifo' (default) evicts the newest request (matches vLLM v1), " "'lifo' (default) evicts the newest request (matches vLLM v1), "
...@@ -346,13 +241,13 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: ...@@ -346,13 +241,13 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"--speedup-ratio", "--speedup-ratio",
type=float, type=float,
default=None, default=1.0,
help="Speedup ratio for mock execution (default: 1.0). Use 0 for infinite speedup (no simulation delays).", help="Speedup ratio for mock execution (default: 1.0). Use 0 for infinite speedup (no simulation delays).",
) )
parser.add_argument( parser.add_argument(
"--decode-speedup-ratio", "--decode-speedup-ratio",
type=float, type=float,
default=None, default=1.0,
help="Additional speedup multiplier applied only to decode steps (default: 1.0). " help="Additional speedup multiplier applied only to decode steps (default: 1.0). "
"Models speculative decoding (e.g. Eagle) where decode throughput improves " "Models speculative decoding (e.g. Eagle) where decode throughput improves "
"without affecting prefill latency. Effective decode speedup is speedup_ratio * decode_speedup_ratio.", "without affecting prefill latency. Effective decode speedup is speedup_ratio * decode_speedup_ratio.",
...@@ -361,7 +256,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: ...@@ -361,7 +256,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
"--data-parallel-size", "--data-parallel-size",
type=int, type=int,
dest="dp_size", dest="dp_size",
default=None, default=1,
help="Number of data parallel replicas (default: 1)", help="Number of data parallel replicas (default: 1)",
) )
parser.add_argument( parser.add_argument(
...@@ -426,7 +321,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: ...@@ -426,7 +321,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"--engine-type", "--engine-type",
type=str, type=str,
default=None, default="vllm",
choices=["vllm", "sglang"], choices=["vllm", "sglang"],
help="Engine simulation type: 'vllm' (default) or 'sglang'.", help="Engine simulation type: 'vllm' (default) or 'sglang'.",
) )
...@@ -604,9 +499,6 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: ...@@ -604,9 +499,6 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
args = parser.parse_args(argv) args = parser.parse_args(argv)
validate_worker_type_args(args) validate_worker_type_args(args)
if args.replay_concurrency is not None and args.trace_file is None:
raise ValueError("--replay-concurrency requires --trace-file")
# Validate num_workers # Validate num_workers
if args.num_workers < 1: if args.num_workers < 1:
raise ValueError(f"--num-workers must be at least 1, got {args.num_workers}") raise ValueError(f"--num-workers must be at least 1, got {args.num_workers}")
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import os
import socket
from dynamo.llm import MockEngineArgs, ModelRuntimeConfig, ReasoningConfig, SglangArgs
# Fallback engine limits. build_mocker_engine_args() applies them when the
# parsed namespace has no such attribute; the latter two are also applied by
# build_runtime_config() when the engine args report None for the field.
_DEFAULT_NUM_GPU_BLOCKS = 16384
_DEFAULT_MAX_NUM_SEQS = 256
_DEFAULT_MAX_NUM_BATCHED_TOKENS = 8192
def _parse_reasoning_config(reasoning_json: str | None) -> ReasoningConfig | None:
if not reasoning_json:
return None
reasoning = json.loads(reasoning_json)
return ReasoningConfig(
start_thinking_token_id=reasoning["start_thinking_token_id"],
end_thinking_token_id=reasoning["end_thinking_token_id"],
thinking_ratio=reasoning["thinking_ratio"],
)
def _build_sglang_args(args: argparse.Namespace) -> SglangArgs | None:
sglang_args = {
"schedule_policy": getattr(args, "sglang_schedule_policy", None),
"page_size": getattr(args, "sglang_page_size", None),
"max_prefill_tokens": getattr(args, "sglang_max_prefill_tokens", None),
"chunked_prefill_size": getattr(args, "sglang_chunked_prefill_size", None),
"clip_max_new_tokens": getattr(args, "sglang_clip_max_new_tokens", None),
"schedule_conservativeness": getattr(
args, "sglang_schedule_conservativeness", None
),
}
if not any(value is not None for value in sglang_args.values()):
return None
return SglangArgs(**sglang_args)
def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs:
    """Assemble a ``MockEngineArgs`` from a parsed CLI namespace.

    Every field is read with ``getattr`` so a namespace that lacks an
    attribute falls back to the default written next to it below.

    Args:
        args: Parsed command-line namespace.

    Returns:
        A ``MockEngineArgs`` populated from ``args``.
    """
    engine_type = getattr(args, "engine_type", None) or "vllm"

    # The AIC fields are only populated when --aic-perf-model style flag
    # (``aic_perf_model``) is truthy; otherwise all of them stay None.
    use_aic = getattr(args, "aic_perf_model", False)
    aic_fields = {
        "aic_backend": engine_type if use_aic else None,
        "aic_system": getattr(args, "aic_system", None) if use_aic else None,
        "aic_backend_version": (
            getattr(args, "aic_backend_version", None) if use_aic else None
        ),
        "aic_tp_size": getattr(args, "aic_tp_size", None) if use_aic else None,
        "aic_model_path": getattr(args, "model_path", None) if use_aic else None,
    }

    # Prefill takes precedence over decode; neither flag means aggregated.
    if getattr(args, "is_prefill_worker", False):
        worker_type = "prefill"
    elif getattr(args, "is_decode_worker", False):
        worker_type = "decode"
    else:
        worker_type = "aggregated"

    return MockEngineArgs(
        engine_type=engine_type,
        num_gpu_blocks=getattr(args, "num_gpu_blocks", _DEFAULT_NUM_GPU_BLOCKS),
        # A missing or None block size is forwarded as 0.
        block_size=getattr(args, "block_size", 0) or 0,
        max_num_seqs=getattr(args, "max_num_seqs", _DEFAULT_MAX_NUM_SEQS),
        max_num_batched_tokens=getattr(
            args, "max_num_batched_tokens", _DEFAULT_MAX_NUM_BATCHED_TOKENS
        ),
        enable_prefix_caching=getattr(args, "enable_prefix_caching", True),
        enable_chunked_prefill=getattr(args, "enable_chunked_prefill", True),
        preemption_mode=getattr(args, "preemption_mode", "lifo"),
        speedup_ratio=getattr(args, "speedup_ratio", 1.0),
        decode_speedup_ratio=getattr(args, "decode_speedup_ratio", 1.0),
        dp_size=getattr(args, "dp_size", 1),
        startup_time=getattr(args, "startup_time", None),
        worker_type=worker_type,
        # The local indexer is enabled unless durable KV events were requested.
        enable_local_indexer=not getattr(args, "durable_kv_events", False),
        kv_transfer_bandwidth=getattr(args, "kv_transfer_bandwidth", None),
        reasoning=_parse_reasoning_config(getattr(args, "reasoning", None)),
        sglang=_build_sglang_args(args),
        **aic_fields,
    )
def load_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs:
    """Load engine args from a JSON file if one was given, else from CLI flags.

    Args:
        args: Parsed command-line namespace. ``args.extra_engine_args`` is
            either falsy or an object exposing ``read_text()`` whose content
            is handed to ``MockEngineArgs.from_json``.

    Returns:
        The ``MockEngineArgs`` parsed from the file when
        ``args.extra_engine_args`` is truthy; otherwise the result of
        ``build_mocker_engine_args(args)``.
    """
    extra_engine_args = args.extra_engine_args
    if extra_engine_args:
        # Decode explicitly as UTF-8: without an encoding, read_text() uses the
        # process locale, which can mis-decode a JSON file on non-UTF-8 systems.
        return MockEngineArgs.from_json(extra_engine_args.read_text(encoding="utf-8"))
    return build_mocker_engine_args(args)
def apply_worker_engine_args_overrides(
    engine_args: MockEngineArgs,
    *,
    kv_bytes_per_token: int | None = None,
    bootstrap_port: int | None = None,
    zmq_kv_events_port: int | None = None,
    zmq_replay_port: int | None = None,
) -> MockEngineArgs:
    """Forward the per-worker settings to ``engine_args.with_overrides``.

    All four values are passed through as keyword arguments exactly as
    received, including any that are ``None``.

    Returns:
        Whatever ``engine_args.with_overrides(...)`` returns.
    """
    overrides = {
        "bootstrap_port": bootstrap_port,
        "zmq_kv_events_port": zmq_kv_events_port,
        "zmq_replay_port": zmq_replay_port,
        "kv_bytes_per_token": kv_bytes_per_token,
    }
    return engine_args.with_overrides(**overrides)
def build_runtime_config(
    engine_args: MockEngineArgs,
) -> tuple[int, ModelRuntimeConfig]:
    """Derive a ``ModelRuntimeConfig`` from the given engine args.

    Args:
        engine_args: Engine args whose block/sequence/token limits, indexer
            flag, data-parallel size and bootstrap port are copied over.

    Returns:
        ``(engine_args.block_size, runtime_config)``.

    Raises:
        OSError: If the bootstrap host has to be resolved from the local
            hostname and that lookup fails.
    """
    rc = ModelRuntimeConfig()
    rc.total_kv_blocks = engine_args.num_gpu_blocks

    # Fall back to the module defaults when the config reads back None.
    rc.max_num_seqs = engine_args.max_num_seqs
    if rc.max_num_seqs is None:
        rc.max_num_seqs = _DEFAULT_MAX_NUM_SEQS
    rc.max_num_batched_tokens = engine_args.max_num_batched_tokens
    if rc.max_num_batched_tokens is None:
        rc.max_num_batched_tokens = _DEFAULT_MAX_NUM_BATCHED_TOKENS

    # The local indexer is never enabled on decode workers.
    rc.enable_local_indexer = (
        engine_args.enable_local_indexer and not engine_args.is_decode()
    )
    rc.data_parallel_size = engine_args.dp_size

    bootstrap_port = engine_args.bootstrap_port
    if engine_args.is_prefill() and bootstrap_port is not None:
        # Resolve the hostname lazily: passing the lookup as the default of
        # os.environ.get() would run (and possibly fail) a name resolution
        # even when DYN_HTTP_RPC_HOST already supplies the host.
        host = os.environ.get("DYN_HTTP_RPC_HOST")
        if host is None:
            host = socket.gethostbyname(socket.gethostname())
        rc.set_disaggregated_endpoint(
            bootstrap_host=host, bootstrap_port=bootstrap_port
        )
    return engine_args.block_size, rc
...@@ -6,12 +6,9 @@ ...@@ -6,12 +6,9 @@
import argparse import argparse
import asyncio import asyncio
import json
import logging import logging
import os import os
import signal import signal
import socket
import tempfile
from pathlib import Path from pathlib import Path
import uvloop import uvloop
...@@ -19,18 +16,15 @@ import uvloop ...@@ -19,18 +16,15 @@ import uvloop
os.environ.setdefault("DYN_COMPUTE_THREADS", "0") os.environ.setdefault("DYN_COMPUTE_THREADS", "0")
from dynamo.common.utils.runtime import create_runtime from dynamo.common.utils.runtime import create_runtime
from dynamo.llm import ( from dynamo.llm import EngineType, EntrypointArgs, fetch_model, make_engine, run_input
EngineType,
EntrypointArgs,
ModelRuntimeConfig,
fetch_model,
make_engine,
run_input,
)
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
from .args import create_temp_engine_args_file, parse_args, resolve_planner_profile_data from .args import parse_args, resolve_planner_profile_data
from .replay import run_trace_replay from .config import (
apply_worker_engine_args_overrides,
build_runtime_config,
load_mocker_engine_args,
)
from .utils.kv_cache import compute_kv_bytes_per_token from .utils.kv_cache import compute_kv_bytes_per_token
configure_dynamo_logging() configure_dynamo_logging()
...@@ -77,73 +71,32 @@ async def worker(): ...@@ -77,73 +71,32 @@ async def worker():
profile_data_result = resolve_planner_profile_data(args.planner_profile_data) profile_data_result = resolve_planner_profile_data(args.planner_profile_data)
args.planner_profile_data = profile_data_result.npz_path args.planner_profile_data = profile_data_result.npz_path
# Offline replay does not need planner profile conversion or runtime setup.
if args.trace_file is not None:
if args.extra_engine_args:
extra_engine_args_path = args.extra_engine_args
logger.info(f"Using provided MockEngineArgs from {extra_engine_args_path}")
else:
extra_engine_args_path = create_temp_engine_args_file(args)
logger.info("Created MockEngineArgs from CLI arguments")
try:
run_trace_replay(
trace_file=args.trace_file,
output_file=args.output_file,
extra_engine_args=extra_engine_args_path,
num_workers=args.num_workers,
replay_concurrency=args.replay_concurrency,
)
return
finally:
if not args.extra_engine_args and extra_engine_args_path.exists():
try:
extra_engine_args_path.unlink()
logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
except Exception as e:
logger.warning(f"Failed to clean up temporary file: {e}")
# Handle extra_engine_args: either use provided file or create from CLI args
if args.extra_engine_args:
# User provided explicit JSON file
extra_engine_args_path = args.extra_engine_args
logger.info(f"Using provided MockEngineArgs from {extra_engine_args_path}")
else:
# Create temporary JSON file from CLI arguments
extra_engine_args_path = create_temp_engine_args_file(args)
logger.info("Created MockEngineArgs from CLI arguments")
try: try:
# Pre-fetch model once to avoid HuggingFace rate limiting when launching many workers # Pre-fetch model once to avoid HuggingFace rate limiting when launching many workers
if args.num_workers > 1 and args.model_path: if args.num_workers > 1 and args.model_path:
await prefetch_model(args.model_path) await prefetch_model(args.model_path)
engine_args = load_mocker_engine_args(args)
logger.info(
"Loaded MockEngineArgs from JSON file"
if args.extra_engine_args
else "Created MockEngineArgs from CLI arguments"
)
# Auto-compute kv_bytes_per_token from model config if not explicitly set # Auto-compute kv_bytes_per_token from model config if not explicitly set
if args.kv_bytes_per_token is None and args.model_path: if args.kv_bytes_per_token is None and args.model_path:
args.kv_bytes_per_token = compute_kv_bytes_per_token( args.kv_bytes_per_token = compute_kv_bytes_per_token(
args.model_path, args.kv_cache_dtype args.model_path, args.kv_cache_dtype
) )
engine_args = apply_worker_engine_args_overrides(
# Inject kv_bytes_per_token into engine args JSON (computed after model prefetch) engine_args, kv_bytes_per_token=args.kv_bytes_per_token
if args.kv_bytes_per_token is not None and not args.extra_engine_args: )
with open(extra_engine_args_path) as f:
engine_args = json.load(f)
engine_args["kv_bytes_per_token"] = args.kv_bytes_per_token
with open(extra_engine_args_path, "w") as f:
json.dump(engine_args, f, indent=2)
logger.info( logger.info(
f"Launching {args.num_workers} mocker worker(s) with isolated DistributedRuntime instances" f"Launching {args.num_workers} mocker worker(s) with isolated DistributedRuntime instances"
) )
await launch_workers(args, extra_engine_args_path) await launch_workers(args, engine_args)
finally: finally:
# Clean up temporary file if we created one
if not args.extra_engine_args and extra_engine_args_path.exists():
try:
extra_engine_args_path.unlink()
logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
except Exception as e:
logger.warning(f"Failed to clean up temporary file: {e}")
if profile_data_result is not None: if profile_data_result is not None:
del profile_data_result # Triggers tmpdir cleanup via __del__ del profile_data_result # Triggers tmpdir cleanup via __del__
...@@ -170,47 +123,7 @@ def compute_stagger_delay(num_workers: int, stagger_delay: float) -> float: ...@@ -170,47 +123,7 @@ def compute_stagger_delay(num_workers: int, stagger_delay: float) -> float:
return 0.2 return 0.2
def _build_runtime_config( async def launch_workers(args: argparse.Namespace, base_engine_args):
engine_args: dict,
) -> tuple[int, ModelRuntimeConfig]:
"""Build a ModelRuntimeConfig from the engine args dict.
Returns (kv_cache_block_size, runtime_config). Defaults match
the Rust MockEngineArgsBuilder so hand-crafted JSON files that
omit fields behave identically.
"""
is_prefill = engine_args.get("is_prefill", False)
is_decode = engine_args.get("is_decode", False)
rc = ModelRuntimeConfig()
rc.total_kv_blocks = engine_args.get("num_gpu_blocks", 16384)
if (v := engine_args.get("max_num_seqs")) is not None:
rc.max_num_seqs = v
if (v := engine_args.get("max_num_batched_tokens")) is not None:
rc.max_num_batched_tokens = v
rc.enable_local_indexer = (
engine_args.get("enable_local_indexer", False) and not is_decode
)
rc.data_parallel_size = engine_args.get("dp_size", 1)
bootstrap_port = engine_args.get("bootstrap_port")
if is_prefill and bootstrap_port is not None:
host = os.environ.get(
"DYN_HTTP_RPC_HOST", socket.gethostbyname(socket.gethostname())
)
rc.set_disaggregated_endpoint(
bootstrap_host=host, bootstrap_port=bootstrap_port
)
logger.info(
"Mocker prefill worker: publishing bootstrap endpoint to discovery "
f"(bootstrap_port={bootstrap_port})"
)
block_size = engine_args.get("block_size", 64)
return block_size, rc
async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path):
"""Launch mocker worker(s) with isolated DistributedRuntime instances. """Launch mocker worker(s) with isolated DistributedRuntime instances.
Each worker gets its own DistributedRuntime, which means: Each worker gets its own DistributedRuntime, which means:
...@@ -221,7 +134,6 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path) ...@@ -221,7 +134,6 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path)
""" """
futures = [] futures = []
runtimes = [] runtimes = []
per_worker_temp_files: list[Path] = []
stagger_delay = compute_stagger_delay(args.num_workers, args.stagger_delay) stagger_delay = compute_stagger_delay(args.num_workers, args.stagger_delay)
batch_size = 32 batch_size = 32
...@@ -238,11 +150,7 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path) ...@@ -238,11 +150,7 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path)
f"(estimated total: {total_time:.1f}s)" f"(estimated total: {total_time:.1f}s)"
) )
# Always load base engine args for runtime config construction needs_per_worker_overrides = bool(
with open(extra_engine_args_path) as f:
base_engine_args = json.load(f)
needs_per_worker_args = bool(
args.bootstrap_ports_list args.bootstrap_ports_list
or args.zmq_kv_events_ports_list or args.zmq_kv_events_ports_list
or args.zmq_replay_ports_list or args.zmq_replay_ports_list
...@@ -261,30 +169,29 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path) ...@@ -261,30 +169,29 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path)
) )
runtimes.append(runtime) runtimes.append(runtime)
# Determine which engine args file and dict to use if needs_per_worker_overrides:
worker_engine_args_path: Path | str worker_engine_args = apply_worker_engine_args_overrides(
if needs_per_worker_args: base_engine_args,
worker_args = base_engine_args.copy() bootstrap_port=(
if args.bootstrap_ports_list: args.bootstrap_ports_list[worker_id]
worker_args["bootstrap_port"] = args.bootstrap_ports_list[worker_id] if args.bootstrap_ports_list
if args.zmq_kv_events_ports_list: else None
worker_args["zmq_kv_events_port"] = args.zmq_kv_events_ports_list[ ),
worker_id zmq_kv_events_port=(
] args.zmq_kv_events_ports_list[worker_id]
if args.zmq_replay_ports_list: if args.zmq_kv_events_ports_list
worker_args["zmq_replay_port"] = args.zmq_replay_ports_list[worker_id] else None
with tempfile.NamedTemporaryFile( ),
mode="w", suffix=".json", delete=False zmq_replay_port=(
) as tmp: args.zmq_replay_ports_list[worker_id]
json.dump(worker_args, tmp) if args.zmq_replay_ports_list
worker_engine_args_path = Path(tmp.name) else None
per_worker_temp_files.append(worker_engine_args_path) ),
logger.debug(f"Worker {worker_id}: per-worker args {worker_args}") )
else: else:
worker_args = base_engine_args worker_engine_args = base_engine_args
worker_engine_args_path = extra_engine_args_path
kv_cache_block_size, runtime_config = _build_runtime_config(worker_args) kv_cache_block_size, runtime_config = build_runtime_config(worker_engine_args)
# Create EntrypointArgs for this worker # Create EntrypointArgs for this worker
entrypoint_args = EntrypointArgs( entrypoint_args = EntrypointArgs(
...@@ -293,7 +200,8 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path) ...@@ -293,7 +200,8 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path)
model_name=args.model_name, model_name=args.model_name,
endpoint_id=args.endpoint, endpoint_id=args.endpoint,
context_length=0, context_length=0,
extra_engine_args=str(worker_engine_args_path), extra_engine_args=None,
mocker_engine_args=worker_engine_args,
runtime_config=runtime_config, runtime_config=runtime_config,
kv_cache_block_size=kv_cache_block_size, kv_cache_block_size=kv_cache_block_size,
is_prefill=args.is_prefill_worker, is_prefill=args.is_prefill_worker,
...@@ -337,13 +245,6 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path) ...@@ -337,13 +245,6 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path)
for runtime in runtimes: for runtime in runtimes:
runtime.shutdown() runtime.shutdown()
# Clean up per-worker temp files
for temp_file in per_worker_temp_files:
try:
temp_file.unlink()
except Exception:
pass
def main(): def main():
uvloop.run(worker()) uvloop.run(worker())
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import json
from pathlib import Path
from typing import Any
from dynamo.llm import run_mocker_trace_replay
def default_replay_output_path(trace_file: Path) -> Path:
    """Return the sibling path ``<stem>.replay.json`` for ``trace_file``.

    Only the final suffix of the trace file name is dropped before
    ``.replay.json`` is appended; the directory is kept.
    """
    replay_name = trace_file.stem + ".replay.json"
    return trace_file.with_name(replay_name)
def format_table(headers: list[str], rows: list[list[str]]) -> str:
    """Render ``headers`` and ``rows`` as a left-aligned plain-text table.

    Each column is as wide as its longest cell (header included). Cells are
    joined with ``" | "``, and a dashed separator line joined with ``"-+-"``
    sits between the header and the data rows.

    Raises:
        IndexError: If a row has more cells than there are headers.
    """
    column_widths = list(map(len, headers))
    for cells in rows:
        for col, text in enumerate(cells):
            if len(text) > column_widths[col]:
                column_widths[col] = len(text)

    def format_row(row: list[str]) -> str:
        padded = [cell.ljust(column_widths[idx]) for idx, cell in enumerate(row)]
        return " | ".join(padded)

    lines = [format_row(headers), "-+-".join("-" * width for width in column_widths)]
    lines.extend(format_row(row) for row in rows)
    return "\n".join(lines)
def format_ms(value: float | None) -> str:
if value is None:
return "-"
return f"{value:.3f}"
def format_number(value: float | None) -> str:
if value is None:
return "-"
return f"{value:.3f}"
def print_replay_summary(report: dict[str, Any], output_file: Path) -> None:
    """Print a human-readable summary of a replay *report* to stdout.

    The output consists of a ``Replay Summary`` heading, a two-column table
    of scalar metrics, a table of per-metric latency statistics, and a final
    line naming *output_file* as the JSON report location.

    Args:
        report: Replay report; every key referenced below must be present.
        output_file: Path echoed on the closing ``JSON report:`` line.

    Raises:
        KeyError: If *report* lacks one of the expected keys.
    """
    three_places = "{:.3f}".format
    six_places = "{:.6f}".format

    # (row label, report key, renderer) in display order.
    scalar_layout = (
        ("Request count", "num_requests", str),
        ("Completed requests", "completed_requests", str),
        ("Virtual duration (ms)", "duration_ms", three_places),
        ("Wall time (ms)", "wall_time_ms", three_places),
        ("Input tokens", "total_input_tokens", str),
        ("Output tokens", "total_output_tokens", str),
        ("Request throughput (req/s)", "request_throughput_rps", three_places),
        ("Input throughput (tok/s)", "input_throughput_tok_s", three_places),
        ("Output throughput (tok/s)", "output_throughput_tok_s", three_places),
        ("Total throughput (tok/s)", "total_throughput_tok_s", three_places),
        ("Prefix cache reused ratio", "prefix_cache_reused_ratio", six_places),
    )
    scalar_rows = [[label, render(report[key])] for label, key, render in scalar_layout]

    # Statistic prefixes, in the same order as the column headers below.
    stat_prefixes = ("mean", "min", "max", "p99", "p90", "median", "p75", "std")
    # (row label, report key suffix, renderer) in display order.
    latency_layout = (
        ("TTFT", "ttft_ms", format_ms),
        ("TTST", "ttst_ms", format_ms),
        ("TPOT", "tpot_ms", format_ms),
        ("ITL", "itl_ms", format_ms),
        ("E2E latency", "e2e_latency_ms", format_ms),
        ("Output TPS/User", "output_token_throughput_per_user", format_number),
    )
    latency_rows = [
        [label, *(render(report[f"{stat}_{suffix}"]) for stat in stat_prefixes)]
        for label, suffix, render in latency_layout
    ]

    scalar_table = format_table(["Metric", "Value"], scalar_rows)
    latency_table = format_table(
        ["Metric", "avg", "min", "max", "p99", "p90", "p50", "p75", "std"],
        latency_rows,
    )
    summary = "\n".join(
        [
            "Replay Summary",
            scalar_table,
            "",
            latency_table,
            f"JSON report: {output_file}",
        ]
    )
    print(summary)
def write_replay_report(report: dict[str, Any], output_file: Path) -> None:
    """Write *report* to *output_file* as indented, key-sorted JSON.

    Missing parent directories of *output_file* are created. The report is
    serialized before the filesystem is touched, so a report that cannot be
    encoded fails without creating directories or leaving a truncated file.

    Args:
        report: JSON-serializable replay report.
        output_file: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If *report* contains a value ``json`` cannot serialize.
        OSError: If the directory or file cannot be created or written.
    """
    # Serialize first: opening the file in "w" mode truncates it immediately,
    # so encoding errors must surface before that point.
    payload = json.dumps(report, indent=2, sort_keys=True)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding keeps the report independent of the platform locale.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(payload)
def run_trace_replay(
    trace_file: Path,
    output_file: Path | None,
    extra_engine_args: Path,
    num_workers: int,
    replay_concurrency: int | None,
) -> None:
    """Replay *trace_file* through the mocker, then save and print the report.

    Args:
        trace_file: Trace to replay.
        output_file: Report destination; when not given, the path returned by
            ``default_replay_output_path(trace_file)`` is used.
        extra_engine_args: Forwarded unchanged to ``run_mocker_trace_replay``.
        num_workers: Forwarded unchanged to ``run_mocker_trace_replay``.
        replay_concurrency: Forwarded unchanged to ``run_mocker_trace_replay``.
    """
    report_path = output_file if output_file else default_replay_output_path(trace_file)
    replay_kwargs = {
        "trace_file": trace_file,
        "extra_engine_args": extra_engine_args,
        "num_workers": num_workers,
        "replay_concurrency": replay_concurrency,
    }
    replay_report = run_mocker_trace_replay(**replay_kwargs)
    # Persist first so the path printed in the summary already exists.
    write_replay_report(replay_report, report_path)
    print_replay_summary(replay_report, report_path)
...@@ -190,10 +190,11 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler[SglangMultimodalRequest, s ...@@ -190,10 +190,11 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler[SglangMultimodalRequest, s
MultiModalGroup(multimodal_input=MultiModalInput(image_url=url)) MultiModalGroup(multimodal_input=MultiModalInput(image_url=url))
for url in image_urls for url in image_urls
] ]
preprocessed_request = PreprocessedRequest.model_validate(raw_request)
# Build SglangMultimodalRequest from the pre-tokenized request # Build SglangMultimodalRequest from the pre-tokenized request
request = SglangMultimodalRequest( request = SglangMultimodalRequest(
request=PreprocessedRequest(**raw_request), request=preprocessed_request,
multimodal_inputs=multimodal_groups, multimodal_inputs=multimodal_groups,
) )
......
--- ---
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
title: Mocker Offline Trace Replay title: Mocker Trace Replay
subtitle: Replay Mooncake-style traces offline without launching a runtime or router subtitle: Replay Mooncake-style traces through the mocker in offline or online mode
--- ---
This guide covers the mocker's offline trace replay mode, which replays a Mooncake-style JSONL trace directly through the mock scheduler and writes a metrics report. Unlike normal `dynamo.mocker` usage, this mode does not launch workers, register endpoints, or require NATS, etcd, or a frontend. This guide covers the mocker's trace replay support for Mooncake-style JSONL traces. The replay
surface is available in two forms:
- `python -m dynamo.mocker --trace-file ...`, which writes a report file and prints a replay summary
- `python -m dynamo.replay ...`, which prints the replay report JSON to stdout and directly exposes
`--replay-mode offline|online`, `--router-mode round_robin|kv_router`, `--arrival-speedup-ratio`,
and the synthetic replay inputs
Unlike normal `dynamo.mocker` usage, offline replay does not launch workers, register endpoints, or
require NATS, etcd, or a frontend. Online replay does exercise the live mock-worker runtime path.
Use this when you want to: Use this when you want to:
...@@ -15,7 +24,31 @@ Use this when you want to: ...@@ -15,7 +24,31 @@ Use this when you want to:
## Quick Start ## Quick Start
Run offline replay by passing `--trace-file`: Run offline replay through the dedicated replay CLI:
```bash
python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--num-workers 4 \
--replay-mode offline \
--router-mode round_robin \
--extra-engine-args /path/to/mocker_args.json
```
Run synthetic replay through the same CLI when you want fixed request shapes without a trace file:
```bash
python -m dynamo.replay \
--input-tokens 5000 \
--output-tokens 500 \
--request-count 1000 \
--arrival-interval-ms 1.0 \
--num-workers 1 \
--replay-mode offline \
--replay-concurrency 100 \
--extra-engine-args /path/to/mocker_args.json
```
You can also run replay through the mocker CLI by passing `--trace-file`:
```bash ```bash
python -m dynamo.mocker \ python -m dynamo.mocker \
...@@ -29,7 +62,8 @@ This writes a JSON report next to the trace file by default: ...@@ -29,7 +62,8 @@ This writes a JSON report next to the trace file by default:
/path/to/mooncake_trace.replay.json /path/to/mooncake_trace.replay.json
``` ```
The CLI also prints a `Replay Summary` table to stdout with request counts, throughput, and latency statistics. `python -m dynamo.replay` prints the replay report JSON directly to stdout. The mocker CLI prints a
`Replay Summary` table to stdout and writes the report JSON to disk.
## Input Format ## Input Format
...@@ -46,37 +80,180 @@ Example: ...@@ -46,37 +80,180 @@ Example:
{"timestamp": 0, "input_length": 6755, "output_length": 500, "hash_ids": [0, 1, 2, 3]} {"timestamp": 0, "input_length": 6755, "output_length": 500, "hash_ids": [0, 1, 2, 3]}
``` ```
The mocker synthesizes token blocks from `hash_ids` using the configured `--block-size`, so the replay block size should match the block size used when the trace was generated. The mocker synthesizes token blocks from `hash_ids` using the configured `--block-size`, so the
replay block size must match the block size used when the trace was generated. Public Mooncake
traces are commonly block-level hashes at `512` tokens per hash ID, so replaying them with the
default mocker `block_size=64` will fail once `input_length > len(hash_ids) * 64`. For
`engine_type=sglang`, replay still uses canonical `block_size` internally; `sglang.page_size` is
accepted as a compatibility alias and is normalized into `block_size` before replay starts.
## Replay Surfaces
### `python -m dynamo.replay`
The dedicated replay CLI exposes:
- either a positional `trace_file`, or all of `--input-tokens`, `--output-tokens`, and `--request-count`
- `--replay-mode offline|online`
- `--router-mode round_robin|kv_router`
- `--router-queue-policy fcfs|wspt|lcfs`
- `--num-workers`
- `--replay-concurrency`
- `--arrival-interval-ms`
- `--arrival-speedup-ratio`
- `--extra-engine-args`
- `--extra-engine-args-json`
- `--router-config`
- `--router-config-json`
Example:
```bash
python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--replay-mode online \
--router-mode kv_router \
--num-workers 4 \
--arrival-speedup-ratio 10 \
--extra-engine-args-json '{"block_size":64,"speedup_ratio":1000.0}' \
--router-config-json '{"router_queue_policy":"fcfs","router_temperature":0.0}'
```
SGLang replay uses the same CLI surface. A minimal extra-engine-args file can use either
`block_size` directly or the compatibility alias `sglang.page_size`:
```json
{
"engine_type": "sglang",
"num_gpu_blocks": 512,
"speedup_ratio": 1000.0,
"sglang": {
"page_size": 2
}
}
```
For both `--extra-engine-args-json` and `--router-config-json`, replay accepts partial JSON
objects. Unspecified fields fall back to the same defaults used by `MockEngineArgs::default()`
and `KvRouterConfig::default()`.
### `python -m dynamo.mocker --trace-file`
The mocker CLI supports offline replay and remains useful when you want the historical
`Replay Summary` output and report-file workflow.
### Synthetic Replay
Synthetic replay bypasses trace loading and generates in-memory requests with fixed input/output
lengths and optional synthetic arrival spacing:
```bash
python -m dynamo.replay \
--input-tokens 5000 \
--output-tokens 500 \
--request-count 200 \
--arrival-interval-ms 0.5 \
--replay-mode offline \
--replay-concurrency 50 \
--extra-engine-args /path/to/mocker_args.json
```
This is useful for parameter sweeps where Mooncake-style prefix structure is not required.
## Modes ## Modes
### Fixed-Schedule Replay ### Fixed-Schedule Replay
Default replay mode preserves the timestamps from the trace and simulates arrivals in virtual time: Default trace replay preserves the timestamps from the trace and simulates arrivals according to
those timestamps:
```bash ```bash
python -m dynamo.mocker \ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--trace-file /path/to/mooncake_trace.jsonl \ --replay-mode offline \
--model-path Qwen/Qwen3-0.6B \ --num-workers 4 \
--block-size 512 --extra-engine-args /path/to/mocker_args.json
``` ```
This is the right mode when you want deterministic replay of the original arrival pattern. This is the right mode when you want deterministic replay of the original arrival pattern.
### Closed-Loop Concurrency Replay ### Closed-Loop Concurrency Replay
Use `--replay-concurrency` to ignore trace arrival timing and keep a fixed number of requests in flight: Use `--replay-concurrency` to ignore trace arrival timing and keep a fixed number of requests in
flight:
```bash ```bash
python -m dynamo.mocker \ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--trace-file /path/to/mooncake_trace.jsonl \ --replay-mode offline \
--model-path Qwen/Qwen3-0.6B \ --num-workers 4 \
--block-size 512 \
--replay-concurrency 16 --replay-concurrency 16
``` ```
This mode is useful when you want to compare scheduler behavior under a fixed offered concurrency rather than the original trace schedule. This mode is useful when you want to compare scheduler behavior under a fixed offered concurrency rather than the original trace schedule.
### Online Replay
Online replay launches the mock workers and replays the trace against the live runtime path. This
is useful when you want the replay to include live request dispatch, live output handling, and the
same async KV-event propagation model used by the current router integration.
```bash
python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--replay-mode online \
--router-mode kv_router \
--num-workers 4 \
--arrival-speedup-ratio 10 \
--extra-engine-args /path/to/mocker_args.json
```
### Arrival Speedup
Use `--arrival-speedup-ratio` to compress or stretch the trace arrival process without changing the
mocker compute model. Larger values make arrivals happen sooner relative to the original trace.
```bash
python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--replay-mode offline \
--num-workers 4 \
--arrival-speedup-ratio 5 \
--extra-engine-args /path/to/mocker_args.json
```
### Router Modes
Replay currently supports:
- `round_robin`
- `kv_router`
`kv_router` uses the shared local scheduler and an in-process KV indexer. In offline replay:
- `kv_router` is supported only when `num_workers > 1`
- router queueing is enabled and uses simulation time rather than wall-clock time
- KV visibility is delayed slightly relative to request lifecycle events
- queue admission is driven by router lifecycle edges (`add_request`, `mark_prefill_completed`, and `free`)
- transient in-pass prefill occupancy is still approximated at the router level rather than modeled exactly
To compare queue policies manually, keep the same trace and engine args fixed and swap only
`--router-queue-policy`:
```bash
python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--replay-mode offline \
--router-mode kv_router \
--router-queue-policy fcfs \
--num-workers 4 \
--extra-engine-args /path/to/mocker_args.json
python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--replay-mode offline \
--router-mode kv_router \
--router-queue-policy lcfs \
--num-workers 4 \
--extra-engine-args /path/to/mocker_args.json
```
`lcfs` is intentionally a worse comparison policy under saturation; use it for experiments, not as
an expected production default.
## Output ## Output
Use `--output-file` to override the default report location: Use `--output-file` to override the default report location:
...@@ -88,7 +265,7 @@ python -m dynamo.mocker \ ...@@ -88,7 +265,7 @@ python -m dynamo.mocker \
--output-file /tmp/replay-report.json --output-file /tmp/replay-report.json
``` ```
If `--output-file` is not set, the report path defaults to `<trace stem>.replay.json` in the same directory as the input trace. If `--output-file` is not set, the report path defaults to `TRACE_STEM.replay.json` in the same directory as the input trace.
The report contains: The report contains:
...@@ -100,23 +277,41 @@ The report contains: ...@@ -100,23 +277,41 @@ The report contains:
- TTFT, TTST, TPOT, ITL, and end-to-end latency summaries - TTFT, TTST, TPOT, ITL, and end-to-end latency summaries
- output-token-throughput-per-user summaries - output-token-throughput-per-user summaries
The dedicated replay CLI returns the same report schema as the Python APIs
`dynamo.replay.run_trace_replay(...)` and `dynamo.replay.run_synthetic_trace_replay(...)`.
## Replay Constraints ## Replay Constraints
Offline replay currently supports only this configuration: Shared replay constraints:
- `--num-workers 1`
- aggregated mode - aggregated mode
- `--engine-type vllm` - `--engine-type vllm|sglang`
- `--data-parallel-size 1` - `--data-parallel-size 1`
Additional offline constraints:
- offline `kv_router` requires `num_workers > 1`
- the public single-worker offline replay path still uses the legacy single-worker runtime for
`vllm`, while `sglang` goes through the shared multi-worker replay runtime even when `num_workers=1`
Additional online constraints:
- the current live replay path is also limited to aggregated workers
If you violate those constraints, replay fails immediately with a validation error. If you violate those constraints, replay fails immediately with a validation error.
## Practical Notes ## Practical Notes
- `--replay-concurrency` requires `--trace-file` - `python -m dynamo.replay` requires exactly one of:
either a trace file, or all of `--input-tokens`, `--output-tokens`, and `--request-count`
- `--replay-concurrency` works with both trace replay and synthetic replay
- `--speedup-ratio` still affects simulated timing - `--speedup-ratio` still affects simulated timing
- `--arrival-speedup-ratio` affects trace timestamps, not worker compute speed
- `--arrival-interval-ms` only applies to synthetic replay
- `--extra-engine-args` can be used to provide a full mocker config JSON instead of individual CLI flags - `--extra-engine-args` can be used to provide a full mocker config JSON instead of individual CLI flags
- offline replay does not need planner runtime setup, router registration, or event transport - offline replay does not need planner runtime setup, router registration, or external event transport
- the replay block size must match the trace block size, because token synthesis expands `hash_ids`
using the configured block size and replay fails when the expanded blocks cannot cover `input_length`
## When To Use This vs AIPerf ## When To Use This vs AIPerf
......
...@@ -43,8 +43,8 @@ The Rust HTTP server also reads these environment variables (not exposed as CLI ...@@ -43,8 +43,8 @@ The Rust HTTP server also reads these environment variables (not exposed as CLI
| `--router-assume-kv-reuse` / `--no-router-assume-kv-reuse` | `DYN_ROUTER_ASSUME_KV_REUSE` | `true` | Assume KV cache reuse when tracking active blocks | | `--router-assume-kv-reuse` / `--no-router-assume-kv-reuse` | `DYN_ROUTER_ASSUME_KV_REUSE` | `true` | Assume KV cache reuse when tracking active blocks |
| `--router-track-output-blocks` / `--no-router-track-output-blocks` | `DYN_ROUTER_TRACK_OUTPUT_BLOCKS` | `false` | Track output blocks with fractional decay during generation | | `--router-track-output-blocks` / `--no-router-track-output-blocks` | `DYN_ROUTER_TRACK_OUTPUT_BLOCKS` | `false` | Track output blocks with fractional decay during generation |
| `--router-event-threads` | `DYN_ROUTER_EVENT_THREADS` | `4` | Event processing threads. >1 enables concurrent radix tree | | `--router-event-threads` | `DYN_ROUTER_EVENT_THREADS` | `4` | Event processing threads. >1 enables concurrent radix tree |
| `--router-queue-threshold` | `DYN_ROUTER_QUEUE_THRESHOLD` | `2.0` | Queue threshold fraction of prefill capacity. Enables priority scheduling | | `--router-queue-threshold` | `DYN_ROUTER_QUEUE_THRESHOLD` | `4.0` | Queue threshold fraction of prefill capacity. Enables priority scheduling |
| `--router-queue-policy` | `DYN_ROUTER_QUEUE_POLICY` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT) or `wspt` (avg TTFT) | | `--router-queue-policy` | `DYN_ROUTER_QUEUE_POLICY` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
| `--enable-cache-control` / `--no-enable-cache-control` | `DYN_ENABLE_CACHE_CONTROL` | `false` | Enable TTL-based cache pinning (requires `--router-mode=kv`) | | `--enable-cache-control` / `--no-enable-cache-control` | `DYN_ENABLE_CACHE_CONTROL` | `false` | Enable TTL-based cache pinning (requires `--router-mode=kv`) |
| `--decode-fallback` / `--no-decode-fallback` | `DYN_DECODE_FALLBACK` | `false` | Fall back to aggregated mode when prefill workers unavailable | | `--decode-fallback` / `--no-decode-fallback` | `DYN_DECODE_FALLBACK` | `false` | Fall back to aggregated mode when prefill workers unavailable |
......
...@@ -21,8 +21,8 @@ For Kubernetes, set `DYN_ROUTER_MODE=kv` on the Frontend service. Workers automa ...@@ -21,8 +21,8 @@ For Kubernetes, set `DYN_ROUTER_MODE=kv` on the Frontend service. Workers automa
| `--router-mode kv` | `round_robin` | Enable KV cache-aware routing | | `--router-mode kv` | `round_robin` | Enable KV cache-aware routing |
| `--router-kv-overlap-score-weight` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) | | `--router-kv-overlap-score-weight` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
| `--no-router-kv-events` | enabled | Fall back to approximate routing (no event consumption from workers) | | `--no-router-kv-events` | enabled | Fall back to approximate routing (no event consumption from workers) |
| `--router-queue-threshold` | `2.0` | Backpressure queue threshold; enables priority scheduling via `nvext.agent_hints.priority` | | `--router-queue-threshold` | `4.0` | Backpressure queue threshold; enables priority scheduling via `nvext.agent_hints.priority` |
| `--router-queue-policy` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT) or `wspt` (avg TTFT) | | `--router-queue-policy` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
### Standalone Router ### Standalone Router
......
...@@ -87,8 +87,8 @@ Backend workers register themselves using the `register_model` API, after which ...@@ -87,8 +87,8 @@ Backend workers register themselves using the `register_model` API, after which
| `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) | | `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) |
| `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking | | `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking |
| `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) | | `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
| `--router-queue-threshold <float>` | `2.0` | Queue threshold fraction; enables priority scheduling via `priority` | | `--router-queue-threshold <float>` | `4.0` | Queue threshold fraction; enables priority scheduling via `priority` |
| `--router-queue-policy <str>` | `fcfs` | Scheduling policy for the queue: `fcfs` (tail TTFT) or `wspt` (avg TTFT) | | `--router-queue-policy <str>` | `fcfs` | Scheduling policy for the queue: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
For all available options: `python -m dynamo.frontend --help` For all available options: `python -m dynamo.frontend --help`
...@@ -231,10 +231,11 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na ...@@ -231,10 +231,11 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na
- `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness. - `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness.
- `--router-queue-threshold`: Queue threshold fraction for prefill token capacity (default: 2.0). The router holds incoming requests in a priority queue while all workers exceed this fraction of `max_num_batched_tokens`, releasing them when capacity frees up. This defers dispatch (not rejection) so that routing decisions use the most up-to-date load metrics at the moment the request is actually sent to a worker. It also enables **priority scheduling** via `priority` hints in `nvext.agent_hints` — higher values shift a request's effective arrival time earlier in the queue, giving it priority over lower-valued requests. Must be > 0. Set to None to disable queueing (requests are dispatched immediately). - `--router-queue-threshold`: Queue threshold fraction for prefill token capacity (default: 4.0). The router holds incoming requests in a priority queue while all workers exceed this fraction of `max_num_batched_tokens`, releasing them when capacity frees up. This defers dispatch (not rejection) so that routing decisions use the most up-to-date load metrics at the moment the request is actually sent to a worker. It also enables **priority scheduling** via `priority` hints in `nvext.agent_hints` — higher values shift a request's effective arrival time earlier in the queue, giving it priority over lower-valued requests. Must be > 0. Set to None to disable queueing (requests are dispatched immediately).
- `--router-queue-policy`: Scheduling policy for the router queue (default: `fcfs`). Two policies are available: - `--router-queue-policy`: Scheduling policy for the router queue (default: `fcfs`). Three policies are available:
- **`fcfs`** (first-come first-served): Orders by adjusted arrival time (`priority_jump - arrival_offset`). Optimizes **tail TTFT** — no request waits longer than necessary. - **`fcfs`** (first-come first-served): Orders by adjusted arrival time (`priority_jump - arrival_offset`). Optimizes **tail TTFT** — no request waits longer than necessary.
- **`lcfs`** (last-come first-served): Orders by adjusted reverse arrival time (`priority_jump + arrival_offset`). Intentionally favors newer arrivals under saturation and is mainly useful for controlled comparison experiments.
- **`wspt`** (weighted shortest processing time, Smith's rule): Orders by `(1 + priority_jump) / isl_tokens`. Optimizes **average TTFT** — short or high-priority requests are scheduled before long low-priority ones, minimizing total weighted completion time. - **`wspt`** (weighted shortest processing time, Smith's rule): Orders by `(1 + priority_jump) / isl_tokens`. Optimizes **average TTFT** — short or high-priority requests are scheduled before long low-priority ones, minimizing total weighted completion time.
### KV Event Transport and Persistence ### KV Event Transport and Persistence
...@@ -281,7 +282,7 @@ Use `--no-router-assume-kv-reuse` in disaggregated setups where the decode worke ...@@ -281,7 +282,7 @@ Use `--no-router-assume-kv-reuse` in disaggregated setups where the decode worke
Use `--router-track-output-blocks` **(experimental)** when your workload is output-heavy and you want the router to account for output-side KV cache growth in load balancing. This is useful in two scenarios: (1) workloads with long output sequences and little multi-turn reuse, where output blocks dominate the KV cache footprint; (2) agentic schedulers (e.g. NAT or other LLM routers) that can accurately predict the expected output sequence length per request. When enabled, the router adds placeholder blocks as tokens are generated. If you additionally pass `nvext.agent_hints.osl` (expected output sequence length in tokens) per request, the router applies fractional decay to output blocks — each output block's weight starts at 1.0 and decays linearly toward 0.0 as generation approaches the expected OSL. This lets the router predict that a request nearing completion will soon free its blocks, effectively modeling the future load trajectory rather than just the current snapshot. Without `osl`, output blocks are added at full weight with no decay. The flag requires `--router-track-active-blocks` (the default). Use `--router-track-output-blocks` **(experimental)** when your workload is output-heavy and you want the router to account for output-side KV cache growth in load balancing. This is useful in two scenarios: (1) workloads with long output sequences and little multi-turn reuse, where output blocks dominate the KV cache footprint; (2) agentic schedulers (e.g. NAT or other LLM routers) that can accurately predict the expected output sequence length per request. When enabled, the router adds placeholder blocks as tokens are generated. If you additionally pass `nvext.agent_hints.osl` (expected output sequence length in tokens) per request, the router applies fractional decay to output blocks — each output block's weight starts at 1.0 and decays linearly toward 0.0 as generation approaches the expected OSL. 
This lets the router predict that a request nearing completion will soon free its blocks, effectively modeling the future load trajectory rather than just the current snapshot. Without `osl`, output blocks are added at full weight with no decay. The flag requires `--router-track-active-blocks` (the default).
The `--router-queue-threshold` (default: 2.0) controls when incoming requests are held in a priority queue. The router holds requests while all workers exceed the given fraction of `max_num_batched_tokens`, releasing them as capacity frees up. This defers the routing decision so it is made with the freshest load metrics, rather than dispatching into an already-saturated system. It also enables priority scheduling via `nvext.agent_hints.priority`. Set to None to disable queueing entirely. The `--router-queue-threshold` (default: 4.0) controls when incoming requests are held in a priority queue. The router holds requests while all workers exceed the given fraction of `max_num_batched_tokens`, releasing them as capacity frees up. This defers the routing decision so it is made with the freshest load metrics, rather than dispatching into an already-saturated system. It also enables priority scheduling via `nvext.agent_hints.priority`. Set to None to disable queueing entirely.
Use `--router-queue-policy wspt` when your workload has a mix of short and long requests and you want to minimize **average** TTFT. WSPT (Smith's rule) schedules short or high-priority requests first, reducing mean latency across the batch. Use the default `fcfs` when you want to minimize **tail** TTFT — no request waits longer than necessary, since ordering is purely by (adjusted) arrival time. Use `--router-queue-policy wspt` when your workload has a mix of short and long requests and you want to minimize **average** TTFT. WSPT (Smith's rule) schedules short or high-priority requests first, reducing mean latency across the batch. Use the default `fcfs` when you want to minimize **tail** TTFT — no request waits longer than necessary, since ordering is purely by (adjusted) arrival time.
......
...@@ -58,7 +58,7 @@ EOF ...@@ -58,7 +58,7 @@ EOF
tilt up tilt up
``` ```
Tilt opens a terminal UI and a web dashboard at <http://localhost:10350>. Tilt opens a terminal UI and a web dashboard at [http://localhost:10350](http://localhost:10350).
The dashboard shows resource status, build logs, and port-forwards. The dashboard shows resource status, build logs, and port-forwards.
Press **Space** in the terminal to open the web UI. Press **Ctrl-C** to Press **Space** in the terminal to open the web UI. Press **Ctrl-C** to
...@@ -246,7 +246,7 @@ REGISTRY=ghcr.io/myorg tilt up ...@@ -246,7 +246,7 @@ REGISTRY=ghcr.io/myorg tilt up
## Tilt UI ## Tilt UI
The web UI at <http://localhost:10350> shows: The web UI at [http://localhost:10350](http://localhost:10350) shows:
- **Resource status** — green/red/pending for each resource - **Resource status** — green/red/pending for each resource
- **Build logs** — compilation output and errors - **Build logs** — compilation output and errors
......
...@@ -11,7 +11,7 @@ The Mocker is a lightweight, high-fidelity simulation of an LLM inference engine ...@@ -11,7 +11,7 @@ The Mocker is a lightweight, high-fidelity simulation of an LLM inference engine
The mocker simulates: The mocker simulates:
- **Block-based KV cache management** with LRU eviction - **Block-based KV cache management** with LRU eviction
- **Continuous batching scheduler** with watermark-based admission control - **Engine-specific continuous batching schedulers** for vLLM and SGLang
- **Prefix caching** with hash-based block deduplication - **Prefix caching** with hash-based block deduplication
- **Chunked prefill** for better batching efficiency - **Chunked prefill** for better batching efficiency
- **Realistic timing models** for prefill and decode phases - **Realistic timing models** for prefill and decode phases
...@@ -74,10 +74,10 @@ python -m dynamo.mocker \ ...@@ -74,10 +74,10 @@ python -m dynamo.mocker \
| `--endpoint` | Auto-derived | Dynamo endpoint string. Defaults are namespace-dependent, and prefill workers use a different default endpoint than aggregated/decode workers | | `--endpoint` | Auto-derived | Dynamo endpoint string. Defaults are namespace-dependent, and prefill workers use a different default endpoint than aggregated/decode workers |
| `--model-name` | Derived from model-path | Model name for API responses | | `--model-name` | Derived from model-path | Model name for API responses |
| `--trace-file` | None | Run offline trace replay from a Mooncake-style JSONL trace file | | `--trace-file` | None | Run offline trace replay from a Mooncake-style JSONL trace file |
| `--output-file` | `<trace stem>.replay.json` | Write replay metrics JSON to this path | | `--output-file` | `TRACE_STEM.replay.json` | Write replay metrics JSON to this path |
| `--replay-concurrency` | None | Run offline replay in closed-loop concurrency mode with this many in-flight requests | | `--replay-concurrency` | None | Run offline replay in closed-loop concurrency mode with this many in-flight requests |
| `--num-gpu-blocks-override` | 16384 | Number of KV cache blocks | | `--num-gpu-blocks-override` | 16384 | Number of KV cache blocks |
| `--block-size` | 64 | Tokens per KV cache block | | `--block-size` | 64 (`vllm`) / engine-specific | Tokens per KV cache block. For `sglang`, if omitted, the effective page/block size defaults to 1 or to `--sglang-page-size` when provided |
| `--max-num-seqs` | 256 | Maximum concurrent sequences | | `--max-num-seqs` | 256 | Maximum concurrent sequences |
| `--max-num-batched-tokens` | 8192 | Maximum tokens per batch | | `--max-num-batched-tokens` | 8192 | Maximum tokens per batch |
| `--enable-prefix-caching` | True | Enable prefix caching | | `--enable-prefix-caching` | True | Enable prefix caching |
...@@ -85,7 +85,6 @@ python -m dynamo.mocker \ ...@@ -85,7 +85,6 @@ python -m dynamo.mocker \
| `--enable-chunked-prefill` | True | Enable chunked prefill | | `--enable-chunked-prefill` | True | Enable chunked prefill |
| `--no-enable-chunked-prefill` | - | Disable chunked prefill | | `--no-enable-chunked-prefill` | - | Disable chunked prefill |
| `--preemption-mode` | `lifo` | Decode eviction policy under memory pressure: `lifo` (vLLM v1 style) or `fifo` | | `--preemption-mode` | `lifo` | Decode eviction policy under memory pressure: `lifo` (vLLM v1 style) or `fifo` |
| `--watermark` | 0.01 | KV cache watermark (fraction reserved) |
| `--speedup-ratio` | 1.0 | Timing speedup factor | | `--speedup-ratio` | 1.0 | Timing speedup factor |
| `--decode-speedup-ratio` | 1.0 | Decode-only speedup multiplier (e.g. for Eagle speculation) | | `--decode-speedup-ratio` | 1.0 | Decode-only speedup multiplier (e.g. for Eagle speculation) |
| `--data-parallel-size` | 1 | Number of DP replicas | | `--data-parallel-size` | 1 | Number of DP replicas |
...@@ -95,7 +94,7 @@ python -m dynamo.mocker \ ...@@ -95,7 +94,7 @@ python -m dynamo.mocker \
| `--reasoning` | None | JSON config for emitting reasoning token spans, with `start_thinking_token_id`, `end_thinking_token_id`, and `thinking_ratio` | | `--reasoning` | None | JSON config for emitting reasoning token spans, with `start_thinking_token_id`, `end_thinking_token_id`, and `thinking_ratio` |
| `--engine-type` | `vllm` | Engine simulation type: `vllm` or `sglang` | | `--engine-type` | `vllm` | Engine simulation type: `vllm` or `sglang` |
| `--sglang-schedule-policy` | `fifo` / `fcfs` | SGLang scheduling policy override | | `--sglang-schedule-policy` | `fifo` / `fcfs` | SGLang scheduling policy override |
| `--sglang-page-size` | 1 | SGLang radix-cache page size in tokens | | `--sglang-page-size` | 1 | SGLang radix-cache page size in tokens. Also becomes the effective block size when `--engine-type sglang` and `--block-size` is omitted |
| `--sglang-max-prefill-tokens` | 16384 | SGLang max prefill-token budget per batch | | `--sglang-max-prefill-tokens` | 16384 | SGLang max prefill-token budget per batch |
| `--sglang-chunked-prefill-size` | 8192 | SGLang chunked-prefill chunk size | | `--sglang-chunked-prefill-size` | 8192 | SGLang chunked-prefill chunk size |
| `--sglang-clip-max-new-tokens` | 4096 | SGLang admission-budget cap for max new tokens | | `--sglang-clip-max-new-tokens` | 4096 | SGLang admission-budget cap for max new tokens |
...@@ -126,9 +125,12 @@ python -m dynamo.mocker \ ...@@ -126,9 +125,12 @@ python -m dynamo.mocker \
> **Note:** For local scale tests and router benchmarks, prefer `--num-workers` over launching many separate mocker processes. All workers share one tokio runtime and thread pool, which is both lighter weight and closer to how the test harnesses exercise the mocker. > **Note:** For local scale tests and router benchmarks, prefer `--num-workers` over launching many separate mocker processes. All workers share one tokio runtime and thread pool, which is both lighter weight and closer to how the test harnesses exercise the mocker.
## Offline Trace Replay ## Trace Replay
The mocker also supports an offline replay mode for Mooncake-style traces: The mocker also supports replaying Mooncake-style traces through both the original mocker CLI and
the dedicated replay harness.
For the original mocker CLI flow:
```bash ```bash
python -m dynamo.mocker \ python -m dynamo.mocker \
...@@ -136,9 +138,41 @@ python -m dynamo.mocker \ ...@@ -136,9 +138,41 @@ python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B --model-path Qwen/Qwen3-0.6B
``` ```
This mode writes a replay report JSON and prints a `Replay Summary` table without launching a runtime or router. For the standalone replay CLI, which exposes `offline|online`, `round_robin|kv_router`,
`arrival_speedup_ratio`, `router_queue_policy`, and the synthetic replay path directly:
```bash
python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--num-workers 4 \
--replay-mode offline \
--router-mode kv_router \
--router-queue-policy fcfs \
--arrival-speedup-ratio 5 \
--extra-engine-args /path/to/mocker_args.json
```
The same CLI also supports synthetic replay without a trace file:
```bash
python -m dynamo.replay \
--input-tokens 5000 \
--output-tokens 500 \
--request-count 1000 \
--arrival-interval-ms 1.0 \
--num-workers 1 \
--replay-mode offline \
--replay-concurrency 100 \
--extra-engine-args /path/to/mocker_args.json
```
The standalone replay CLI prints the replay report JSON directly to stdout. The `dynamo.mocker`
trace-file flow still writes a report file and prints a `Replay Summary` table.
For full usage, constraints, and benchmarking guidance, see [Mocker Trace Replay](../benchmarks/mocker-trace-replay.md).
For full usage, constraints, and benchmarking guidance, see [Mocker Offline Trace Replay](../benchmarks/mocker-trace-replay.md). Replay supports aggregated `vllm` and `sglang` engine configs. Internally replay uses canonical
`block_size`; for `sglang`, `sglang.page_size` is still accepted as a compatibility alias as long
as it matches `block_size` when both are provided.
## Performance Modeling Setup ## Performance Modeling Setup
...@@ -225,15 +259,21 @@ The mocker is organized into several cooperating components that mirror the inte ...@@ -225,15 +259,21 @@ The mocker is organized into several cooperating components that mirror the inte
### Scheduler ### Scheduler
The scheduler implements continuous batching, maintaining three logical queues: The mocker now has two scheduler shapes rather than one generic queue model:
1. **Waiting Queue** - Newly arrived requests awaiting scheduling - **vLLM mocker** uses an upstream-style `waiting + running` scheduler. Each request tracks
2. **Prefill Queue** - Requests scheduled for prefill computed tokens, the scheduler spends one token budget across the running set first, and decode
3. **Decode Queue** - Requests actively decoding (ordered by age for preemption) pressure triggers inline preemption of running requests.
- **SGLang mocker** uses a cache-aware waiting/running scheduler around a radix-style prefix cache.
It batches prefill work with decode-state awareness and handles pressure primarily through decode
retraction while preserving cached prefixes.
Each iteration, the scheduler receives incoming requests, moves eligible requests from waiting to prefill based on available memory and compute budgets, simulates the prefill phase for queued requests, runs one decode step for all active sequences, and publishes metrics about current resource utilization. Both schedulers simulate continuous batching, prefix reuse, chunked prefill, memory pressure, and
decode token emission while publishing metrics about current resource utilization.
When resources become constrained, the scheduler employs preemption: the oldest decoding request is evicted back to the waiting queue, its KV blocks are freed, and it will be rescheduled later. This mirrors how real engines handle memory pressure. When resources become constrained, the mocker simulates the engine's real recovery path:
- vLLM-style decode preemption and recompute
- SGLang-style decode retraction plus prefix-preserving cache updates
### KV Block Manager ### KV Block Manager
......
...@@ -122,10 +122,10 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full ...@@ -122,10 +122,10 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
[tools]: ../user-guides/tool-calling [tools]: ../user-guides/tool-calling
{/* Multimodal */} {/* Multimodal */}
[mm]: ../user-guides/multimodal [mm]: ../features/multimodal/README.md
[mm-vllm]: ../user-guides/multimodal/multimodal-vllm [mm-vllm]: ../features/multimodal/multimodal-vllm.md
[mm-trtllm]: ../user-guides/multimodal/multimodal-trtllm [mm-trtllm]: ../features/multimodal/multimodal-trtllm.md
[mm-sglang]: ../user-guides/multimodal/multimodal-sglang [mm-sglang]: ../features/multimodal/multimodal-sglang.md
{/* Feature-specific */} {/* Feature-specific */}
[lora]: ../kubernetes-deployment/deployment-guide/managing-models-with-dynamo-model [lora]: ../kubernetes-deployment/deployment-guide/managing-models-with-dynamo-model
......
...@@ -185,11 +185,11 @@ For a complete list of known issues, refer to the release notes for each version ...@@ -185,11 +185,11 @@ For a complete list of known issues, refer to the release notes for each version
| `v1.0.1` | Mar 16, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v1.0.1) | [Docs](https://docs.dynamo.nvidia.com/dynamo) | | | `v1.0.1` | Mar 16, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v1.0.1) | [Docs](https://docs.dynamo.nvidia.com/dynamo) | |
| `v1.0.0` | Mar 12, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v1.0.0) | [Docs](https://docs.dynamo.nvidia.com/dynamo) | | | `v1.0.0` | Mar 12, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v1.0.0) | [Docs](https://docs.dynamo.nvidia.com/dynamo) | |
| `v0.9.1` | Mar 4, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.9.1) | [Docs](https://docs.dynamo.nvidia.com/dynamo) | | `v0.9.1` | Mar 4, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.9.1) | [Docs](https://docs.dynamo.nvidia.com/dynamo) |
| `v0.9.0` | Feb 11, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.9.0) | [Docs](https://docs.dynamo.nvidia.com/dynamo/v-0-9-0/) | | `v0.9.0` | Feb 11, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.9.0) | Archived docs unavailable |
| `v0.8.1` | Jan 23, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.8.1) | [Docs](https://docs.nvidia.com/dynamo/v-0-8-1/) | | `v0.8.1` | Jan 23, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.8.1) | Archived docs unavailable |
| `v0.8.0` | Jan 15, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.8.0) | [Docs](https://docs.nvidia.com/dynamo/v-0-8-0/) | | `v0.8.0` | Jan 15, 2026 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.8.0) | Archived docs unavailable |
| `v0.7.1` | Dec 15, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.7.1) | [Docs](https://docs.nvidia.com/dynamo/v-0-7-1/) | | `v0.7.1` | Dec 15, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.7.1) | Archived docs unavailable |
| `v0.7.0` | Nov 26, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.7.0) | [Docs](https://docs.nvidia.com/dynamo/v-0-7-0/) | | `v0.7.0` | Nov 26, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.7.0) | Archived docs unavailable |
| `v0.6.1` | Nov 6, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.6.1) | — | | `v0.6.1` | Nov 6, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.6.1) | — |
| `v0.6.0` | Oct 28, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.6.0) | — | | `v0.6.0` | Oct 28, 2025 | [Release](https://github.com/ai-dynamo/dynamo/releases/tag/v0.6.0) | — |
......
...@@ -9,7 +9,7 @@ use clap::Parser; ...@@ -9,7 +9,7 @@ use clap::Parser;
use common::NoopSequencePublisher; use common::NoopSequencePublisher;
use dynamo_kv_router::protocols::WorkerWithDpRank; use dynamo_kv_router::protocols::WorkerWithDpRank;
use dynamo_kv_router::{ActiveSequencesMultiWorker, OverlapScores, SequenceRequest}; use dynamo_kv_router::{ActiveSequencesMultiWorker, OverlapScores, SequenceRequest};
use dynamo_mocker::common::protocols::{DirectRequest, OutputSignal}; use dynamo_mocker::common::protocols::{DirectRequest, KvEventPublishers, OutputSignal};
use dynamo_mocker::scheduler::Scheduler; use dynamo_mocker::scheduler::Scheduler;
use dynamo_mocker::scheduler::SchedulerHandle; use dynamo_mocker::scheduler::SchedulerHandle;
use dynamo_tokens::SequenceHash; use dynamo_tokens::SequenceHash;
...@@ -101,7 +101,13 @@ async fn generate_sequence_events( ...@@ -101,7 +101,13 @@ async fn generate_sequence_events(
let (output_tx, mut output_rx) = mpsc::unbounded_channel::<OutputSignal>(); let (output_tx, mut output_rx) = mpsc::unbounded_channel::<OutputSignal>();
// No KvCacheEventSink — we only need output signals // No KvCacheEventSink — we only need output signals
let scheduler = Scheduler::new(sched_args, 0, Some(output_tx), None, None); let scheduler = Scheduler::new(
sched_args,
0,
Some(output_tx),
KvEventPublishers::default(),
None,
);
// Pre-compute metadata for each request before submission // Pre-compute metadata for each request before submission
let mut metadata: HashMap<Uuid, RequestMetadata> = HashMap::new(); let mut metadata: HashMap<Uuid, RequestMetadata> = HashMap::new();
......
...@@ -11,7 +11,9 @@ use dynamo_kv_router::protocols::{ ...@@ -11,7 +11,9 @@ use dynamo_kv_router::protocols::{
KvCacheStoredBlockData, RouterEvent, WorkerId, XXH3_SEED, compute_seq_hash_for_block, KvCacheStoredBlockData, RouterEvent, WorkerId, XXH3_SEED, compute_seq_hash_for_block,
}; };
pub use dynamo_kv_router::test_utils::{NoopSequencePublisher, SimpleWorkerConfig}; pub use dynamo_kv_router::test_utils::{NoopSequencePublisher, SimpleWorkerConfig};
use dynamo_mocker::common::protocols::{DirectRequest, KvCacheEventSink, MockEngineArgs}; use dynamo_mocker::common::protocols::{
DirectRequest, KvCacheEventSink, KvEventPublishers, MockEngineArgs,
};
use dynamo_mocker::scheduler::Scheduler; use dynamo_mocker::scheduler::Scheduler;
use dynamo_mocker::scheduler::SchedulerHandle; use dynamo_mocker::scheduler::SchedulerHandle;
use dynamo_tokens::compute_hash_v2; use dynamo_tokens::compute_hash_v2;
...@@ -122,11 +124,7 @@ impl EventCollector { ...@@ -122,11 +124,7 @@ impl EventCollector {
} }
impl KvCacheEventSink for EventCollector { impl KvCacheEventSink for EventCollector {
fn publish( fn publish(&self, event: KvCacheEvent) -> anyhow::Result<()> {
&self,
event: KvCacheEvent,
_block_token_ids: Option<&[Vec<u32>]>,
) -> anyhow::Result<()> {
let timestamp = Instant::now(); let timestamp = Instant::now();
if let Some(events) = self.events.lock().unwrap().as_mut() { if let Some(events) = self.events.lock().unwrap().as_mut() {
events.push((event, timestamp)); events.push((event, timestamp));
...@@ -361,7 +359,13 @@ pub async fn generate_kv_events( ...@@ -361,7 +359,13 @@ pub async fn generate_kv_events(
tasks.push(tokio::spawn(async move { tasks.push(tokio::spawn(async move {
let collector = EventCollector::new(); let collector = EventCollector::new();
let scheduler = Scheduler::new(sched_args, 0, None, Some(collector.clone()), None); let scheduler = Scheduler::new(
sched_args,
0,
None,
KvEventPublishers::new(Some(collector.clone()), None),
None,
);
let mut i = 0; let mut i = 0;
let mut target = Instant::now(); let mut target = Instant::now();
......
...@@ -1707,7 +1707,6 @@ dependencies = [ ...@@ -1707,7 +1707,6 @@ dependencies = [
"derive-getters", "derive-getters",
"derive_builder", "derive_builder",
"dynamo-kv-router", "dynamo-kv-router",
"dynamo-runtime",
"dynamo-tokens", "dynamo-tokens",
"ndarray", "ndarray",
"ndarray-interp", "ndarray-interp",
......
...@@ -1723,7 +1723,6 @@ dependencies = [ ...@@ -1723,7 +1723,6 @@ dependencies = [
"derive-getters", "derive-getters",
"derive_builder", "derive_builder",
"dynamo-kv-router", "dynamo-kv-router",
"dynamo-runtime",
"dynamo-tokens", "dynamo-tokens",
"ndarray", "ndarray",
"ndarray-interp", "ndarray-interp",
...@@ -1783,6 +1782,7 @@ dependencies = [ ...@@ -1783,6 +1782,7 @@ dependencies = [
"tokio-util", "tokio-util",
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",
"uuid",
] ]
[[package]] [[package]]
......
...@@ -46,6 +46,7 @@ tokio = { version = "1.46.0", features = ["full"] } ...@@ -46,6 +46,7 @@ tokio = { version = "1.46.0", features = ["full"] }
tokio-stream = { version = "0" } tokio-stream = { version = "0" }
tokio-util = { version = "0.7", features = ["rt"] } tokio-util = { version = "0.7", features = ["rt"] }
tracing = { version = "0" } tracing = { version = "0" }
uuid = { version = "1.18.1" }
# kv-indexer / shared kv-router types # kv-indexer / shared kv-router types
dynamo-kv-router = { path = "../../kv-router", features = ["standalone-indexer"] } dynamo-kv-router = { path = "../../kv-router", features = ["standalone-indexer"] }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment