Unverified Commit c3908a36 authored by Yan Ru Pei's avatar Yan Ru Pei Committed by GitHub
Browse files

feat(mocker): add offline trace replay mode [DYN-2502] (#7543)


Signed-off-by: default avatarPeaBrane <yanrpei@gmail.com>
parent 6e56bad6
...@@ -11,12 +11,6 @@ from pathlib import Path ...@@ -11,12 +11,6 @@ from pathlib import Path
from dynamo.common.utils.namespace import get_worker_namespace from dynamo.common.utils.namespace import get_worker_namespace
from . import __version__ from . import __version__
from .utils.kv_cache import DEFAULT_KV_TRANSFER_BANDWIDTH_GBPS
from .utils.planner_profiler_perf_data_converter import (
convert_profile_results_to_npz,
is_mocker_format_npz,
is_profile_results_dir,
)
DYN_NAMESPACE = get_worker_namespace() DYN_NAMESPACE = get_worker_namespace()
DEFAULT_ENDPOINT = f"dyn://{DYN_NAMESPACE}.backend.generate" DEFAULT_ENDPOINT = f"dyn://{DYN_NAMESPACE}.backend.generate"
...@@ -62,6 +56,12 @@ def resolve_planner_profile_data( ...@@ -62,6 +56,12 @@ def resolve_planner_profile_data(
Raises: Raises:
FileNotFoundError: If path doesn't contain valid profile data in any supported format. FileNotFoundError: If path doesn't contain valid profile data in any supported format.
""" """
from .utils.planner_profiler_perf_data_converter import (
convert_profile_results_to_npz,
is_mocker_format_npz,
is_profile_results_dir,
)
if planner_profile_data is None: if planner_profile_data is None:
return ProfileDataResult(npz_path=None, tmpdir=None) return ProfileDataResult(npz_path=None, tmpdir=None)
...@@ -216,7 +216,7 @@ def parse_bootstrap_ports(ports_str: str | None) -> list[int]: ...@@ -216,7 +216,7 @@ def parse_bootstrap_ports(ports_str: str | None) -> list[int]:
return [int(p.strip()) for p in ports_str.split(",")] return [int(p.strip()) for p in ports_str.split(",")]
def parse_args() -> argparse.Namespace: def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
"""Parse command-line arguments for the Dynamo mocker engine. """Parse command-line arguments for the Dynamo mocker engine.
Returns: Returns:
...@@ -248,6 +248,24 @@ def parse_args() -> argparse.Namespace: ...@@ -248,6 +248,24 @@ def parse_args() -> argparse.Namespace:
default=None, default=None,
help="Model name for API responses (default: derived from model-path)", help="Model name for API responses (default: derived from model-path)",
) )
parser.add_argument(
"--trace-file",
type=Path,
default=None,
help="Run offline trace replay from a Mooncake-style JSONL trace file.",
)
parser.add_argument(
"--output-file",
type=Path,
default=None,
help="Write replay metrics JSON to this path. Defaults to a replay JSON next to the trace file.",
)
parser.add_argument(
"--replay-concurrency",
type=int,
default=None,
help="Run offline replay in closed-loop concurrency mode with this many in-flight requests.",
)
# MockEngineArgs parameters (similar to vLLM style) # MockEngineArgs parameters (similar to vLLM style)
parser.add_argument( parser.add_argument(
...@@ -481,7 +499,7 @@ def parse_args() -> argparse.Namespace: ...@@ -481,7 +499,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"--kv-transfer-bandwidth", "--kv-transfer-bandwidth",
type=float, type=float,
default=DEFAULT_KV_TRANSFER_BANDWIDTH_GBPS, default=_default_kv_transfer_bandwidth_gbps(),
help="KV cache transfer bandwidth in GB/s for disaggregated serving latency simulation. " help="KV cache transfer bandwidth in GB/s for disaggregated serving latency simulation. "
"Default: 64.0 (inter-node InfiniBand). Set to 0 to disable KV transfer delay. " "Default: 64.0 (inter-node InfiniBand). Set to 0 to disable KV transfer delay. "
"For intra-node NVLink, typical value is ~450.", "For intra-node NVLink, typical value is ~450.",
...@@ -543,9 +561,12 @@ def parse_args() -> argparse.Namespace: ...@@ -543,9 +561,12 @@ def parse_args() -> argparse.Namespace:
help="Determines how events are published [nats|zmq]", help="Determines how events are published [nats|zmq]",
) )
args = parser.parse_args() args = parser.parse_args(argv)
validate_worker_type_args(args) validate_worker_type_args(args)
if args.replay_concurrency is not None and args.trace_file is None:
raise ValueError("--replay-concurrency requires --trace-file")
# Validate num_workers # Validate num_workers
if args.num_workers < 1: if args.num_workers < 1:
raise ValueError(f"--num-workers must be at least 1, got {args.num_workers}") raise ValueError(f"--num-workers must be at least 1, got {args.num_workers}")
...@@ -587,5 +608,10 @@ def parse_args() -> argparse.Namespace: ...@@ -587,5 +608,10 @@ def parse_args() -> argparse.Namespace:
else: else:
args.endpoint = DEFAULT_ENDPOINT args.endpoint = DEFAULT_ENDPOINT
logger.debug(f"Using default endpoint: {args.endpoint}") logger.debug(f"Using default endpoint: {args.endpoint}")
return args return args
def _default_kv_transfer_bandwidth_gbps() -> float:
    """Return the default value for ``--kv-transfer-bandwidth`` in GB/s.

    The constant is imported inside the function body, so ``.utils.kv_cache``
    is only loaded when this function is actually called.
    """
    from .utils.kv_cache import DEFAULT_KV_TRANSFER_BANDWIDTH_GBPS as default_gbps

    return default_gbps
...@@ -30,6 +30,7 @@ from dynamo.llm import ( ...@@ -30,6 +30,7 @@ from dynamo.llm import (
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
from .args import create_temp_engine_args_file, parse_args, resolve_planner_profile_data from .args import create_temp_engine_args_file, parse_args, resolve_planner_profile_data
from .replay import run_trace_replay
from .utils.kv_cache import compute_kv_bytes_per_token from .utils.kv_cache import compute_kv_bytes_per_token
configure_dynamo_logging() configure_dynamo_logging()
...@@ -72,6 +73,33 @@ async def worker(): ...@@ -72,6 +73,33 @@ async def worker():
while still sharing the same event loop and tokio runtime. while still sharing the same event loop and tokio runtime.
""" """
args = parse_args() args = parse_args()
profile_data_result = None
# Offline replay does not need planner profile conversion or runtime setup.
if args.trace_file is not None:
if args.extra_engine_args:
extra_engine_args_path = args.extra_engine_args
logger.info(f"Using provided MockEngineArgs from {extra_engine_args_path}")
else:
extra_engine_args_path = create_temp_engine_args_file(args)
logger.info("Created MockEngineArgs from CLI arguments")
try:
run_trace_replay(
trace_file=args.trace_file,
output_file=args.output_file,
extra_engine_args=extra_engine_args_path,
num_workers=args.num_workers,
replay_concurrency=args.replay_concurrency,
)
return
finally:
if not args.extra_engine_args and extra_engine_args_path.exists():
try:
extra_engine_args_path.unlink()
logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
except Exception as e:
logger.warning(f"Failed to clean up temporary file: {e}")
# Resolve planner-profile-data: convert profile results dir to NPZ if needed # Resolve planner-profile-data: convert profile results dir to NPZ if needed
profile_data_result = resolve_planner_profile_data(args.planner_profile_data) profile_data_result = resolve_planner_profile_data(args.planner_profile_data)
...@@ -87,6 +115,7 @@ async def worker(): ...@@ -87,6 +115,7 @@ async def worker():
extra_engine_args_path = create_temp_engine_args_file(args) extra_engine_args_path = create_temp_engine_args_file(args)
logger.info("Created MockEngineArgs from CLI arguments") logger.info("Created MockEngineArgs from CLI arguments")
try:
# Pre-fetch model once to avoid HuggingFace rate limiting when launching many workers # Pre-fetch model once to avoid HuggingFace rate limiting when launching many workers
if args.num_workers > 1 and args.model_path: if args.num_workers > 1 and args.model_path:
await prefetch_model(args.model_path) await prefetch_model(args.model_path)
...@@ -105,7 +134,6 @@ async def worker(): ...@@ -105,7 +134,6 @@ async def worker():
with open(extra_engine_args_path, "w") as f: with open(extra_engine_args_path, "w") as f:
json.dump(engine_args, f, indent=2) json.dump(engine_args, f, indent=2)
try:
logger.info( logger.info(
f"Launching {args.num_workers} mocker worker(s) with isolated DistributedRuntime instances" f"Launching {args.num_workers} mocker worker(s) with isolated DistributedRuntime instances"
) )
...@@ -118,7 +146,7 @@ async def worker(): ...@@ -118,7 +146,7 @@ async def worker():
logger.debug(f"Cleaned up temporary file {extra_engine_args_path}") logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
except Exception as e: except Exception as e:
logger.warning(f"Failed to clean up temporary file: {e}") logger.warning(f"Failed to clean up temporary file: {e}")
if profile_data_result is not None:
del profile_data_result # Triggers tmpdir cleanup via __del__ del profile_data_result # Triggers tmpdir cleanup via __del__
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import json
from pathlib import Path
from typing import Any
from dynamo.llm import run_mocker_trace_replay
def default_replay_output_path(trace_file: Path) -> Path:
    """Derive the default report path for a trace file.

    The report sits in the same directory as the trace and is named
    ``<trace stem>.replay.json``.
    """
    report_name = trace_file.stem + ".replay.json"
    return trace_file.with_name(report_name)
def format_table(headers: list[str], rows: list[list[str]]) -> str:
    """Render headers and rows as a plain-text table.

    Every column is left-justified to the width of its longest cell (header
    included). Cells are joined with ``" | "`` and a dashed rule joined with
    ``"-+-"`` separates the header line from the data lines.
    """
    col_widths = list(map(len, headers))
    for cells in rows:
        for col, text in enumerate(cells):
            if len(text) > col_widths[col]:
                col_widths[col] = len(text)

    def render(cells: list[str]) -> str:
        padded = [text.ljust(col_widths[col]) for col, text in enumerate(cells)]
        return " | ".join(padded)

    rule = "-+-".join("-" * width for width in col_widths)
    output = [render(headers), rule]
    output.extend(render(cells) for cells in rows)
    return "\n".join(output)
def format_ms(value: float | None) -> str:
if value is None:
return "-"
return f"{value:.3f}"
def format_number(value: float | None) -> str:
if value is None:
return "-"
return f"{value:.3f}"
def print_replay_summary(report: dict[str, Any], output_file: Path) -> None:
    """Print a human-readable summary of a replay report to stdout.

    Two tables are printed: scalar totals/throughputs, then per-metric
    distribution statistics. The last line names the JSON report path.

    Args:
        report: Replay report; every key read below must be present.
        output_file: Path shown on the trailing ``JSON report:`` line.
    """
    # (label, report key, format spec); a spec of None means plain str().
    scalar_specs = [
        ("Request count", "num_requests", None),
        ("Completed requests", "completed_requests", None),
        ("Virtual duration (ms)", "duration_ms", ".3f"),
        ("Wall time (ms)", "wall_time_ms", ".3f"),
        ("Input tokens", "total_input_tokens", None),
        ("Output tokens", "total_output_tokens", None),
        ("Request throughput (req/s)", "request_throughput_rps", ".3f"),
        ("Input throughput (tok/s)", "input_throughput_tok_s", ".3f"),
        ("Output throughput (tok/s)", "output_throughput_tok_s", ".3f"),
        ("Total throughput (tok/s)", "total_throughput_tok_s", ".3f"),
        ("Prefix cache reused ratio", "prefix_cache_reused_ratio", ".6f"),
    ]
    scalar_rows = [
        [label, str(report[key]) if spec is None else format(report[key], spec)]
        for label, key, spec in scalar_specs
    ]

    # Statistic prefixes in column order; they line up with the header list below.
    stat_prefixes = ("mean", "min", "max", "p99", "p90", "median", "p75", "std")
    # (label, report key suffix, cell formatter)
    distribution_specs = [
        ("TTFT", "ttft_ms", format_ms),
        ("TTST", "ttst_ms", format_ms),
        ("TPOT", "tpot_ms", format_ms),
        ("ITL", "itl_ms", format_ms),
        ("E2E latency", "e2e_latency_ms", format_ms),
        ("Output TPS/User", "output_token_throughput_per_user", format_number),
    ]
    latency_rows = [
        [label, *(render(report[f"{stat}_{suffix}"]) for stat in stat_prefixes)]
        for label, suffix, render in distribution_specs
    ]

    scalar_table = format_table(["Metric", "Value"], scalar_rows)
    latency_table = format_table(
        ["Metric", "avg", "min", "max", "p99", "p90", "p50", "p75", "std"],
        latency_rows,
    )
    summary = "\n".join(
        [
            "Replay Summary",
            scalar_table,
            "",
            latency_table,
            f"JSON report: {output_file}",
        ]
    )
    print(summary)
def write_replay_report(report: dict[str, Any], output_file: Path) -> None:
    """Write the replay report to ``output_file`` as indented, key-sorted JSON.

    Missing parent directories are created first.
    """
    target_dir = output_file.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with output_file.open("w") as handle:
        json.dump(report, handle, sort_keys=True, indent=2)
def run_trace_replay(
    trace_file: Path,
    output_file: Path | None,
    extra_engine_args: Path,
    num_workers: int,
    replay_concurrency: int | None,
) -> None:
    """Run an offline trace replay, then write and print its report.

    Args:
        trace_file: Trace file handed to ``run_mocker_trace_replay``.
        output_file: Destination for the JSON report; when not given, the
            path from ``default_replay_output_path(trace_file)`` is used.
        extra_engine_args: Engine-args file forwarded to the replay call.
        num_workers: Worker count forwarded to the replay call.
        replay_concurrency: Forwarded to the replay call unchanged.
    """
    # Resolve the destination before replaying, matching the original ordering.
    report_path = output_file if output_file else default_replay_output_path(trace_file)
    replay_kwargs = {
        "trace_file": trace_file,
        "extra_engine_args": extra_engine_args,
        "num_workers": num_workers,
        "replay_concurrency": replay_concurrency,
    }
    metrics = run_mocker_trace_replay(**replay_kwargs)
    write_replay_report(metrics, report_path)
    print_replay_summary(metrics, report_path)
---
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
title: Mocker Offline Trace Replay
subtitle: Replay Mooncake-style traces offline without launching a runtime or router
---
This guide covers the mocker's offline trace replay mode, which replays a Mooncake-style JSONL trace directly through the mock scheduler and writes a metrics report. Unlike normal `dynamo.mocker` usage, this mode does not launch workers, register endpoints, or require NATS, etcd, or a frontend.
Use this when you want to:
- benchmark scheduler behavior from a saved trace
- compare timing and cache behavior across mocker configurations
- validate replay logic in CI without bringing up a distributed stack
## Quick Start
Run offline replay by passing `--trace-file`:
```bash
python -m dynamo.mocker \
--trace-file /path/to/mooncake_trace.jsonl \
--model-path Qwen/Qwen3-0.6B
```
This writes a JSON report next to the trace file by default:
```text
/path/to/mooncake_trace.replay.json
```
The CLI also prints a `Replay Summary` table to stdout with request counts, throughput, and latency statistics.
## Input Format
The trace file must be Mooncake-style JSONL. Each line should contain:
- `timestamp` or `created_time`
- `input_length` or `input_tokens`
- `output_length` or `output_tokens`
- `hash_ids`
Example:
```json
{"timestamp": 0, "input_length": 6755, "output_length": 500, "hash_ids": [0, 1, 2, 3]}
```
The mocker synthesizes token blocks from `hash_ids` using the configured `--block-size`, so the replay block size should match the block size used when the trace was generated.
## Modes
### Fixed-Schedule Replay
Default replay mode preserves the timestamps from the trace and simulates arrivals in virtual time:
```bash
python -m dynamo.mocker \
--trace-file /path/to/mooncake_trace.jsonl \
--model-path Qwen/Qwen3-0.6B \
--block-size 512
```
This is the right mode when you want deterministic replay of the original arrival pattern.
### Closed-Loop Concurrency Replay
Use `--replay-concurrency` to ignore trace arrival timing and keep a fixed number of requests in flight:
```bash
python -m dynamo.mocker \
--trace-file /path/to/mooncake_trace.jsonl \
--model-path Qwen/Qwen3-0.6B \
--block-size 512 \
--replay-concurrency 16
```
This mode is useful when you want to compare scheduler behavior under a fixed offered concurrency rather than the original trace schedule.
## Output
Use `--output-file` to override the default report location:
```bash
python -m dynamo.mocker \
--trace-file /path/to/mooncake_trace.jsonl \
--model-path Qwen/Qwen3-0.6B \
--output-file /tmp/replay-report.json
```
If `--output-file` is not set, the report path defaults to `<trace stem>.replay.json` in the same directory as the input trace.
The report contains:
- request counts
- input and output token totals
- virtual duration and wall-clock runtime
- request and token throughput
- prefix cache reuse ratio
- TTFT, TTST, TPOT, ITL, and end-to-end latency summaries
- output-token-throughput-per-user summaries
## Replay Constraints
Offline replay currently supports only this configuration:
- `--num-workers 1`
- aggregated mode
- `--engine-type vllm`
- `--data-parallel-size 1`
If you violate those constraints, replay fails immediately with a validation error.
## Practical Notes
- `--replay-concurrency` requires `--trace-file`
- `--speedup-ratio` still affects simulated timing
- `--extra-engine-args` can be used to provide a full mocker config JSON instead of individual CLI flags
- offline replay does not need planner runtime setup, router registration, or event transport
## When To Use This vs AIPerf
Use offline replay when:
- you want a fast scheduler-only simulation
- you want deterministic CI coverage of replay behavior
- you do not need HTTP serving, frontend behavior, or network effects
Use [Dynamo Benchmarking](benchmarking.md) when:
- you want end-to-end benchmarking against a live endpoint
- you need frontend, transport, or cluster-level behavior
- you want AIPerf dashboards and endpoint-facing metrics
...@@ -101,6 +101,8 @@ navigation: ...@@ -101,6 +101,8 @@ navigation:
path: components/kvbm/kvbm-guide.md path: components/kvbm/kvbm-guide.md
- page: Dynamo Benchmarking - page: Dynamo Benchmarking
path: benchmarks/benchmarking.md path: benchmarks/benchmarking.md
- page: Mocker Offline Trace Replay
path: benchmarks/mocker-trace-replay.md
- section: Testing - section: Testing
contents: contents:
- page: Mocker - page: Mocker
......
...@@ -73,6 +73,9 @@ python -m dynamo.mocker \ ...@@ -73,6 +73,9 @@ python -m dynamo.mocker \
| `--model-path` | Required | HuggingFace model ID or local path for tokenizer | | `--model-path` | Required | HuggingFace model ID or local path for tokenizer |
| `--endpoint` | Auto-derived | Dynamo endpoint string. Defaults are namespace-dependent, and prefill workers use a different default endpoint than aggregated/decode workers | | `--endpoint` | Auto-derived | Dynamo endpoint string. Defaults are namespace-dependent, and prefill workers use a different default endpoint than aggregated/decode workers |
| `--model-name` | Derived from model-path | Model name for API responses | | `--model-name` | Derived from model-path | Model name for API responses |
| `--trace-file` | None | Run offline trace replay from a Mooncake-style JSONL trace file |
| `--output-file` | `<trace stem>.replay.json` | Write replay metrics JSON to this path |
| `--replay-concurrency` | None | Run offline replay in closed-loop concurrency mode with this many in-flight requests |
| `--num-gpu-blocks-override` | 16384 | Number of KV cache blocks | | `--num-gpu-blocks-override` | 16384 | Number of KV cache blocks |
| `--block-size` | 64 | Tokens per KV cache block | | `--block-size` | 64 | Tokens per KV cache block |
| `--max-num-seqs` | 256 | Maximum concurrent sequences | | `--max-num-seqs` | 256 | Maximum concurrent sequences |
...@@ -119,6 +122,20 @@ python -m dynamo.mocker \ ...@@ -119,6 +122,20 @@ python -m dynamo.mocker \
> **Note:** For local scale tests and router benchmarks, prefer `--num-workers` over launching many separate mocker processes. All workers share one tokio runtime and thread pool, which is both lighter weight and closer to how the test harnesses exercise the mocker. > **Note:** For local scale tests and router benchmarks, prefer `--num-workers` over launching many separate mocker processes. All workers share one tokio runtime and thread pool, which is both lighter weight and closer to how the test harnesses exercise the mocker.
## Offline Trace Replay
The mocker also supports an offline replay mode for Mooncake-style traces:
```bash
python -m dynamo.mocker \
--trace-file /path/to/mooncake_trace.jsonl \
--model-path Qwen/Qwen3-0.6B
```
This mode writes a replay report JSON and prints a `Replay Summary` table without launching a runtime or router.
For full usage, constraints, and benchmarking guidance, see [Mocker Offline Trace Replay](../benchmarks/mocker-trace-replay.md).
## Performance Modeling Setup ## Performance Modeling Setup
By default, the mocker uses hardcoded polynomial formulas to estimate prefill and decode timing. For more realistic simulations, pass `--planner-profile-data` with either: By default, the mocker uses hardcoded polynomial formulas to estimate prefill and decode timing. For more realistic simulations, pass `--planner-profile-data` with either:
......
...@@ -176,6 +176,7 @@ async fn generate_sequence_events( ...@@ -176,6 +176,7 @@ async fn generate_sequence_events(
max_output_tokens: worker_trace[i].output_length as usize, max_output_tokens: worker_trace[i].output_length as usize,
uuid: Some(worker_trace[i].uuid), uuid: Some(worker_trace[i].uuid),
dp_rank: 0, dp_rank: 0,
arrival_timestamp_ms: None,
}); });
i += 1; i += 1;
...@@ -187,6 +188,7 @@ async fn generate_sequence_events( ...@@ -187,6 +188,7 @@ async fn generate_sequence_events(
max_output_tokens: worker_trace[i].output_length as usize, max_output_tokens: worker_trace[i].output_length as usize,
uuid: Some(worker_trace[i].uuid), uuid: Some(worker_trace[i].uuid),
dp_rank: 0, dp_rank: 0,
arrival_timestamp_ms: None,
}); });
i += 1; i += 1;
} }
......
...@@ -373,6 +373,7 @@ pub async fn generate_kv_events( ...@@ -373,6 +373,7 @@ pub async fn generate_kv_events(
max_output_tokens: worker_trace[i].output_length as usize, max_output_tokens: worker_trace[i].output_length as usize,
uuid: Some(worker_trace[i].uuid), uuid: Some(worker_trace[i].uuid),
dp_rank: 0, dp_rank: 0,
arrival_timestamp_ms: None,
}); });
i += 1; i += 1;
...@@ -384,6 +385,7 @@ pub async fn generate_kv_events( ...@@ -384,6 +385,7 @@ pub async fn generate_kv_events(
max_output_tokens: worker_trace[i].output_length as usize, max_output_tokens: worker_trace[i].output_length as usize,
uuid: Some(worker_trace[i].uuid), uuid: Some(worker_trace[i].uuid),
dp_rank: 0, dp_rank: 0,
arrival_timestamp_ms: None,
}); });
i += 1; i += 1;
} }
......
...@@ -149,6 +149,10 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { ...@@ -149,6 +149,10 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(fetch_model, m)?)?; m.add_function(wrap_pyfunction!(fetch_model, m)?)?;
m.add_function(wrap_pyfunction!(run_kv_indexer, m)?)?; m.add_function(wrap_pyfunction!(run_kv_indexer, m)?)?;
m.add_function(wrap_pyfunction!(llm::entrypoint::make_engine, m)?)?; m.add_function(wrap_pyfunction!(llm::entrypoint::make_engine, m)?)?;
m.add_function(wrap_pyfunction!(
llm::entrypoint::run_mocker_trace_replay,
m
)?)?;
m.add_function(wrap_pyfunction!(llm::entrypoint::run_input, m)?)?; m.add_function(wrap_pyfunction!(llm::entrypoint::run_input, m)?)?;
m.add_class::<DistributedRuntime>()?; m.add_class::<DistributedRuntime>()?;
......
...@@ -9,6 +9,7 @@ use std::sync::Arc; ...@@ -9,6 +9,7 @@ use std::sync::Arc;
use pyo3::{exceptions::PyException, prelude::*}; use pyo3::{exceptions::PyException, prelude::*};
use pyo3_async_runtimes::TaskLocals; use pyo3_async_runtimes::TaskLocals;
use pythonize::pythonize;
use dynamo_kv_router::config::KvRouterConfig as RsKvRouterConfig; use dynamo_kv_router::config::KvRouterConfig as RsKvRouterConfig;
use dynamo_llm::discovery::LoadThresholdConfig as RsLoadThresholdConfig; use dynamo_llm::discovery::LoadThresholdConfig as RsLoadThresholdConfig;
...@@ -467,6 +468,50 @@ pub fn run_input<'p>( ...@@ -467,6 +468,50 @@ pub fn run_input<'p>(
}) })
} }
/// Replay a mocker trace file offline and return the simulation report.
///
/// Loads `MockEngineArgs` from `extra_engine_args` when a path is given and
/// falls back to `MockEngineArgs::default()` otherwise. The simulation runs
/// with the GIL released. When `replay_concurrency` is set the trace is run
/// through `simulate_concurrency_file` with that value as the in-flight limit;
/// otherwise `simulate_trace_file` is used. The resulting report is converted
/// to a Python object with `pythonize`.
///
/// # Errors
///
/// Returns a Python exception (built by `to_pyerr`) when the engine-args file
/// cannot be loaded, when `replay_concurrency` is zero or negative, when the
/// simulation itself fails, or when the report cannot be converted.
#[pyfunction]
#[pyo3(signature = (trace_file, extra_engine_args=None, num_workers=1, replay_concurrency=None))]
pub fn run_mocker_trace_replay(
    py: Python<'_>,
    trace_file: PathBuf,
    extra_engine_args: Option<PathBuf>,
    num_workers: usize,
    replay_concurrency: Option<isize>,
) -> PyResult<PyObject> {
    let report = py.allow_threads(move || {
        let args = if let Some(extra_args_path) = extra_engine_args {
            MockEngineArgs::from_json_file(&extra_args_path).map_err(|e| {
                anyhow::anyhow!(
                    "Failed to load mocker args from {:?}: {}",
                    extra_args_path,
                    e
                )
            })?
        } else {
            MockEngineArgs::default()
        };

        // `usize::try_from` alone only rejects negatives; a limit of zero
        // requests in flight is rejected here too so the check matches the
        // "at least 1" contract stated in the error message.
        let replay_concurrency = replay_concurrency
            .map(|requested| match usize::try_from(requested) {
                Ok(in_flight) if in_flight >= 1 => Ok(in_flight),
                _ => Err(anyhow::anyhow!(
                    "replay_concurrency must be at least 1, got {}",
                    requested
                )),
            })
            .transpose()?;

        if let Some(max_in_flight) = replay_concurrency {
            dynamo_mocker::simulation::simulate_concurrency_file(
                args,
                &trace_file,
                max_in_flight,
                num_workers,
            )
        } else {
            dynamo_mocker::simulation::simulate_trace_file(args, &trace_file, num_workers)
        }
    });

    let report = report.map_err(to_pyerr)?;
    pythonize(py, &report)
        .map_err(to_pyerr)
        .map(|obj| obj.unbind())
}
pub fn to_pyerr<E>(err: E) -> PyErr pub fn to_pyerr<E>(err: E) -> PyErr
where where
E: Display, E: Display,
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
import os
from typing import Any, AsyncIterator, Awaitable, Callable, Dict, List, Optional, Tuple from typing import Any, AsyncIterator, Awaitable, Callable, Dict, List, Optional, Tuple
# Import from specialized modules # Import from specialized modules
...@@ -1246,6 +1247,15 @@ async def run_input(runtime: DistributedRuntime, input: str, engine_config: Engi ...@@ -1246,6 +1247,15 @@ async def run_input(runtime: DistributedRuntime, input: str, engine_config: Engi
"""Start an engine, connect it to an input, and run until stopped.""" """Start an engine, connect it to an input, and run until stopped."""
... ...
def run_mocker_trace_replay(
    trace_file: str | os.PathLike[str],
    extra_engine_args: Optional[str | os.PathLike[str]] = None,
    num_workers: int = 1,
    replay_concurrency: Optional[int] = None,
) -> Dict[str, Any]:
    """Replay a mocker trace file and return the simulation report.

    Args:
        trace_file: Path to the trace file to replay.
        extra_engine_args: Optional path to a JSON file holding the mocker
            engine arguments. When omitted, default engine arguments are used.
        num_workers: Worker count passed to the simulation.
        replay_concurrency: When set, replay runs in concurrency mode with
            this many requests in flight; when ``None``, the trace-file
            (fixed-schedule) replay is used. Negative values are rejected.

    Returns:
        The simulation report converted to a Python dictionary.
    """
    ...
class Layer: class Layer:
""" """
A KV cache block layer A KV cache block layer
......
...@@ -35,6 +35,7 @@ from dynamo._core import make_engine ...@@ -35,6 +35,7 @@ from dynamo._core import make_engine
from dynamo._core import register_model as register_model from dynamo._core import register_model as register_model
from dynamo._core import run_input from dynamo._core import run_input
from dynamo._core import run_kv_indexer as run_kv_indexer from dynamo._core import run_kv_indexer as run_kv_indexer
from dynamo._core import run_mocker_trace_replay
from dynamo._core import unregister_model as unregister_model from dynamo._core import unregister_model as unregister_model
from .exceptions import HttpError from .exceptions import HttpError
......
...@@ -11,7 +11,7 @@ use tokio::sync::watch; ...@@ -11,7 +11,7 @@ use tokio::sync::watch;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use zeromq::{DealerSocket, Socket, SocketRecv, SocketSend, SubSocket}; use zeromq::{DealerSocket, Socket, SocketRecv, SocketSend, SubSocket};
use crate::protocols::{RouterEvent, WorkerId, WorkerWithDpRank}; use crate::protocols::{WorkerId, WorkerWithDpRank};
use crate::zmq_wire::{KvEventBatch, convert_event}; use crate::zmq_wire::{KvEventBatch, convert_event};
use super::indexer::Indexer; use super::indexer::Indexer;
......
...@@ -359,11 +359,16 @@ impl WorkerRegistry { ...@@ -359,11 +359,16 @@ impl WorkerRegistry {
#[cfg(feature = "metrics")] #[cfg(feature = "metrics")]
pub fn refresh_metrics(&self) { pub fn refresh_metrics(&self) {
let models = self.indexers.len(); let models = self.indexers.len();
let mut workers = self.workers.len(); let workers = self.workers.len() + {
#[cfg(feature = "indexer-runtime")] #[cfg(feature = "indexer-runtime")]
{ {
workers += self.discovered_workers.len(); self.discovered_workers.len()
} }
#[cfg(not(feature = "indexer-runtime"))]
{
0
}
};
let mut listener_counts = [0_i64; 4]; let mut listener_counts = [0_i64; 4];
for entry in self.workers.iter() { for entry in self.workers.iter() {
......
...@@ -637,6 +637,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error> ...@@ -637,6 +637,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
max_output_tokens, max_output_tokens,
uuid: Some(request_uuid), uuid: Some(request_uuid),
dp_rank, dp_rank,
arrival_timestamp_ms: request.request_timestamp_ms,
}; };
let (request_tx, mut request_rx) = mpsc::unbounded_channel::<OutputSignal>(); let (request_tx, mut request_rx) = mpsc::unbounded_channel::<OutputSignal>();
......
...@@ -297,6 +297,7 @@ impl OpenAIPreprocessor { ...@@ -297,6 +297,7 @@ impl OpenAIPreprocessor {
if let Some(nvext) = request.nvext() { if let Some(nvext) = request.nvext() {
// Build routing hints from nvext fields // Build routing hints from nvext fields
let hints = nvext.agent_hints.as_ref(); let hints = nvext.agent_hints.as_ref();
builder.request_timestamp_ms(nvext.request_timestamp_ms);
let routing = RoutingHints { let routing = RoutingHints {
backend_instance_id: nvext.backend_instance_id, backend_instance_id: nvext.backend_instance_id,
prefill_worker_id: nvext.prefill_worker_id, prefill_worker_id: nvext.prefill_worker_id,
......
...@@ -190,6 +190,11 @@ pub struct PreprocessedRequest { ...@@ -190,6 +190,11 @@ pub struct PreprocessedRequest {
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub extra_args: Option<serde_json::Value>, pub extra_args: Option<serde_json::Value>,
/// Optional request timestamp in milliseconds forwarded from nvext.
#[builder(default)]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub request_timestamp_ms: Option<f64>,
/// Optional request tracker for per-request metrics (shared with DeltaGenerator) /// Optional request tracker for per-request metrics (shared with DeltaGenerator)
#[builder(default)] #[builder(default)]
#[serde(skip)] #[serde(skip)]
......
...@@ -174,6 +174,11 @@ pub struct NvExt { ...@@ -174,6 +174,11 @@ pub struct NvExt {
#[builder(default, setter(strip_option))] #[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_control: Option<CacheControl>, pub cache_control: Option<CacheControl>,
/// Optional request timestamp in milliseconds for trace replay / virtual-time simulation.
#[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub request_timestamp_ms: Option<f64>,
} }
/// Hints from the agent/caller about request characteristics. /// Hints from the agent/caller about request characteristics.
......
...@@ -227,8 +227,8 @@ impl PerfModel { ...@@ -227,8 +227,8 @@ impl PerfModel {
decode_interp.interp(query_x, query_y).unwrap_or(0.0) decode_interp.interp(query_x, query_y).unwrap_or(0.0)
} }
}; };
// Ensure non-negative timing // Token-emitting decode steps should not collapse onto the same timestamp.
let result = time.max(0.0); let result = time.max(1.0);
tracing::trace!( tracing::trace!(
"Decode time prediction: active_kv_tokens={active_kv_tokens}, context_length={context_length}, time={result:.2}ms" "Decode time prediction: active_kv_tokens={active_kv_tokens}, context_length={context_length}, time={result:.2}ms"
); );
......
...@@ -53,6 +53,7 @@ pub struct DirectRequest { ...@@ -53,6 +53,7 @@ pub struct DirectRequest {
pub max_output_tokens: usize, pub max_output_tokens: usize,
pub uuid: Option<Uuid>, pub uuid: Option<Uuid>,
pub dp_rank: u32, pub dp_rank: u32,
pub arrival_timestamp_ms: Option<f64>,
} }
/// Represents the cost of prefilling content in the cache /// Represents the cost of prefilling content in the cache
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment