Unverified Commit c3908a36 authored by Yan Ru Pei's avatar Yan Ru Pei Committed by GitHub
Browse files

feat(mocker): add offline trace replay mode [DYN-2502] (#7543)


Signed-off-by: default avatarPeaBrane <yanrpei@gmail.com>
parent 6e56bad6
......@@ -11,12 +11,6 @@ from pathlib import Path
from dynamo.common.utils.namespace import get_worker_namespace
from . import __version__
from .utils.kv_cache import DEFAULT_KV_TRANSFER_BANDWIDTH_GBPS
from .utils.planner_profiler_perf_data_converter import (
convert_profile_results_to_npz,
is_mocker_format_npz,
is_profile_results_dir,
)
DYN_NAMESPACE = get_worker_namespace()
DEFAULT_ENDPOINT = f"dyn://{DYN_NAMESPACE}.backend.generate"
......@@ -62,6 +56,12 @@ def resolve_planner_profile_data(
Raises:
FileNotFoundError: If path doesn't contain valid profile data in any supported format.
"""
from .utils.planner_profiler_perf_data_converter import (
convert_profile_results_to_npz,
is_mocker_format_npz,
is_profile_results_dir,
)
if planner_profile_data is None:
return ProfileDataResult(npz_path=None, tmpdir=None)
......@@ -216,7 +216,7 @@ def parse_bootstrap_ports(ports_str: str | None) -> list[int]:
return [int(p.strip()) for p in ports_str.split(",")]
def parse_args() -> argparse.Namespace:
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
"""Parse command-line arguments for the Dynamo mocker engine.
Returns:
......@@ -248,6 +248,24 @@ def parse_args() -> argparse.Namespace:
default=None,
help="Model name for API responses (default: derived from model-path)",
)
parser.add_argument(
"--trace-file",
type=Path,
default=None,
help="Run offline trace replay from a Mooncake-style JSONL trace file.",
)
parser.add_argument(
"--output-file",
type=Path,
default=None,
help="Write replay metrics JSON to this path. Defaults to a replay JSON next to the trace file.",
)
parser.add_argument(
"--replay-concurrency",
type=int,
default=None,
help="Run offline replay in closed-loop concurrency mode with this many in-flight requests.",
)
# MockEngineArgs parameters (similar to vLLM style)
parser.add_argument(
......@@ -481,7 +499,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--kv-transfer-bandwidth",
type=float,
default=DEFAULT_KV_TRANSFER_BANDWIDTH_GBPS,
default=_default_kv_transfer_bandwidth_gbps(),
help="KV cache transfer bandwidth in GB/s for disaggregated serving latency simulation. "
"Default: 64.0 (inter-node InfiniBand). Set to 0 to disable KV transfer delay. "
"For intra-node NVLink, typical value is ~450.",
......@@ -543,9 +561,12 @@ def parse_args() -> argparse.Namespace:
help="Determines how events are published [nats|zmq]",
)
args = parser.parse_args()
args = parser.parse_args(argv)
validate_worker_type_args(args)
if args.replay_concurrency is not None and args.trace_file is None:
raise ValueError("--replay-concurrency requires --trace-file")
# Validate num_workers
if args.num_workers < 1:
raise ValueError(f"--num-workers must be at least 1, got {args.num_workers}")
......@@ -587,5 +608,10 @@ def parse_args() -> argparse.Namespace:
else:
args.endpoint = DEFAULT_ENDPOINT
logger.debug(f"Using default endpoint: {args.endpoint}")
return args
def _default_kv_transfer_bandwidth_gbps() -> float:
    """Return the default KV transfer bandwidth used for ``--kv-transfer-bandwidth``.

    The constant is imported inside the function body, so ``.utils.kv_cache``
    is only loaded when this default is actually looked up rather than when
    this module is imported.
    """
    from .utils.kv_cache import DEFAULT_KV_TRANSFER_BANDWIDTH_GBPS as default_gbps

    return default_gbps
......@@ -30,6 +30,7 @@ from dynamo.llm import (
from dynamo.runtime.logging import configure_dynamo_logging
from .args import create_temp_engine_args_file, parse_args, resolve_planner_profile_data
from .replay import run_trace_replay
from .utils.kv_cache import compute_kv_bytes_per_token
configure_dynamo_logging()
......@@ -72,6 +73,33 @@ async def worker():
while still sharing the same event loop and tokio runtime.
"""
args = parse_args()
profile_data_result = None
# Offline replay does not need planner profile conversion or runtime setup.
if args.trace_file is not None:
if args.extra_engine_args:
extra_engine_args_path = args.extra_engine_args
logger.info(f"Using provided MockEngineArgs from {extra_engine_args_path}")
else:
extra_engine_args_path = create_temp_engine_args_file(args)
logger.info("Created MockEngineArgs from CLI arguments")
try:
run_trace_replay(
trace_file=args.trace_file,
output_file=args.output_file,
extra_engine_args=extra_engine_args_path,
num_workers=args.num_workers,
replay_concurrency=args.replay_concurrency,
)
return
finally:
if not args.extra_engine_args and extra_engine_args_path.exists():
try:
extra_engine_args_path.unlink()
logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
except Exception as e:
logger.warning(f"Failed to clean up temporary file: {e}")
# Resolve planner-profile-data: convert profile results dir to NPZ if needed
profile_data_result = resolve_planner_profile_data(args.planner_profile_data)
......@@ -87,6 +115,7 @@ async def worker():
extra_engine_args_path = create_temp_engine_args_file(args)
logger.info("Created MockEngineArgs from CLI arguments")
try:
# Pre-fetch model once to avoid HuggingFace rate limiting when launching many workers
if args.num_workers > 1 and args.model_path:
await prefetch_model(args.model_path)
......@@ -105,7 +134,6 @@ async def worker():
with open(extra_engine_args_path, "w") as f:
json.dump(engine_args, f, indent=2)
try:
logger.info(
f"Launching {args.num_workers} mocker worker(s) with isolated DistributedRuntime instances"
)
......@@ -118,7 +146,7 @@ async def worker():
logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
except Exception as e:
logger.warning(f"Failed to clean up temporary file: {e}")
if profile_data_result is not None:
del profile_data_result # Triggers tmpdir cleanup via __del__
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import json
from pathlib import Path
from typing import Any
from dynamo.llm import run_mocker_trace_replay
def default_replay_output_path(trace_file: Path) -> Path:
    """Derive the default report path for a trace file.

    The report is placed in the same directory as the trace and is named
    ``<stem>.replay.json``, where ``<stem>`` is the trace file name without
    its final suffix.
    """
    report_name = f"{trace_file.stem}.replay.json"
    return trace_file.with_name(report_name)
def format_table(headers: list[str], rows: list[list[str]]) -> str:
    """Render ``headers`` and ``rows`` as a plain-text table.

    Every column is left-justified and padded to the width of its widest
    cell (header included). Cells are joined with ``" | "``, and a dashed
    rule joined with ``"-+-"`` separates the header line from the body.

    Args:
        headers: Column titles; their count fixes the number of columns.
        rows: Table body, one list of already-formatted cell strings per row.

    Returns:
        The table as a single newline-joined string (no trailing newline).
    """
    column_widths = [len(title) for title in headers]
    for cells in rows:
        for col, text in enumerate(cells):
            if len(text) > column_widths[col]:
                column_widths[col] = len(text)

    def render(cells: list[str]) -> str:
        padded = [text.ljust(column_widths[col]) for col, text in enumerate(cells)]
        return " | ".join(padded)

    rule = "-+-".join("-" * width for width in column_widths)
    body = [render(cells) for cells in rows]
    return "\n".join([render(headers), rule] + body)
def format_ms(value: float | None) -> str:
if value is None:
return "-"
return f"{value:.3f}"
def format_number(value: float | None) -> str:
if value is None:
return "-"
return f"{value:.3f}"
def print_replay_summary(report: dict[str, Any], output_file: Path) -> None:
    """Print a human-readable summary of a replay report to stdout.

    Two tables are printed under a ``Replay Summary`` heading: scalar totals
    and throughputs, followed by per-metric distribution statistics. The
    final line names the JSON report path.

    Args:
        report: Replay report mapping. Every key referenced below must be
            present (a missing key raises ``KeyError``). Scalar values are
            formatted directly, while distribution values may be ``None``
            and are then shown as ``"-"``.
        output_file: Path of the JSON report, echoed in the last line.
    """
    # (label, report key, format spec); a spec of None means plain str().
    scalar_fields = [
        ("Request count", "num_requests", None),
        ("Completed requests", "completed_requests", None),
        ("Virtual duration (ms)", "duration_ms", ".3f"),
        ("Wall time (ms)", "wall_time_ms", ".3f"),
        ("Input tokens", "total_input_tokens", None),
        ("Output tokens", "total_output_tokens", None),
        ("Request throughput (req/s)", "request_throughput_rps", ".3f"),
        ("Input throughput (tok/s)", "input_throughput_tok_s", ".3f"),
        ("Output throughput (tok/s)", "output_throughput_tok_s", ".3f"),
        ("Total throughput (tok/s)", "total_throughput_tok_s", ".3f"),
        ("Prefix cache reused ratio", "prefix_cache_reused_ratio", ".6f"),
    ]
    scalar_rows = []
    for label, key, spec in scalar_fields:
        value = report[key]
        scalar_rows.append([label, str(value) if spec is None else format(value, spec)])

    # Report keys are "<stat>_<suffix>"; the stat order matches the column
    # headers of the second table.
    stat_prefixes = ("mean", "min", "max", "p99", "p90", "median", "p75", "std")
    distributions = [
        ("TTFT", "ttft_ms", format_ms),
        ("TTST", "ttst_ms", format_ms),
        ("TPOT", "tpot_ms", format_ms),
        ("ITL", "itl_ms", format_ms),
        ("E2E latency", "e2e_latency_ms", format_ms),
        ("Output TPS/User", "output_token_throughput_per_user", format_number),
    ]
    latency_rows = [
        [label, *(render(report[f"{stat}_{suffix}"]) for stat in stat_prefixes)]
        for label, suffix, render in distributions
    ]

    scalar_table = format_table(["Metric", "Value"], scalar_rows)
    latency_table = format_table(
        ["Metric", "avg", "min", "max", "p99", "p90", "p50", "p75", "std"],
        latency_rows,
    )
    summary = [
        "Replay Summary",
        scalar_table,
        "",
        latency_table,
        f"JSON report: {output_file}",
    ]
    print("\n".join(summary))
def write_replay_report(report: dict[str, Any], output_file: Path) -> None:
    """Write the replay report to ``output_file`` as indented, key-sorted JSON.

    Missing parent directories are created. The report is serialized before
    the file is opened, so a report that cannot be serialized leaves any
    existing file at ``output_file`` untouched instead of truncated.

    Args:
        report: JSON-serializable replay report.
        output_file: Destination path for the JSON document.

    Raises:
        TypeError: If ``report`` contains a value that is not JSON-serializable.
        OSError: If the directory or file cannot be created or written.
    """
    # Serialize up front: opening in "w" mode truncates immediately, so a
    # failure part-way through encoding would otherwise destroy a previous report.
    payload = json.dumps(report, indent=2, sort_keys=True)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(payload, encoding="utf-8")
def run_trace_replay(
    trace_file: Path,
    output_file: Path | None,
    extra_engine_args: Path,
    num_workers: int,
    replay_concurrency: int | None,
) -> None:
    """Replay a trace offline, persist the report, and print a summary.

    Args:
        trace_file: Trace file handed to ``run_mocker_trace_replay``.
        output_file: Where to write the JSON report; when not provided the
            path from ``default_replay_output_path(trace_file)`` is used.
        extra_engine_args: Path to the mocker engine-args JSON file.
        num_workers: Worker count forwarded to the replay.
        replay_concurrency: In-flight request count forwarded to the replay,
            or ``None`` to leave it unset.
    """
    if output_file:
        report_path = output_file
    else:
        report_path = default_replay_output_path(trace_file)

    replay_report = run_mocker_trace_replay(
        trace_file=trace_file,
        extra_engine_args=extra_engine_args,
        num_workers=num_workers,
        replay_concurrency=replay_concurrency,
    )

    # Write the report before printing: the summary's last line names this file.
    write_replay_report(replay_report, report_path)
    print_replay_summary(replay_report, report_path)
---
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
title: Mocker Offline Trace Replay
subtitle: Replay Mooncake-style traces offline without launching a runtime or router
---
This guide covers the mocker's offline trace replay mode, which replays a Mooncake-style JSONL trace directly through the mock scheduler and writes a metrics report. Unlike normal `dynamo.mocker` usage, this mode does not launch workers, register endpoints, or require NATS, etcd, or a frontend.
Use this when you want to:
- benchmark scheduler behavior from a saved trace
- compare timing and cache behavior across mocker configurations
- validate replay logic in CI without bringing up a distributed stack
## Quick Start
Run offline replay by passing `--trace-file`:
```bash
python -m dynamo.mocker \
--trace-file /path/to/mooncake_trace.jsonl \
--model-path Qwen/Qwen3-0.6B
```
This writes a JSON report next to the trace file by default:
```text
/path/to/mooncake_trace.replay.json
```
The CLI also prints a `Replay Summary` table to stdout with request counts, throughput, and latency statistics.
## Input Format
The trace file must be Mooncake-style JSONL. Each line should contain:
- `timestamp` or `created_time`
- `input_length` or `input_tokens`
- `output_length` or `output_tokens`
- `hash_ids`
Example:
```json
{"timestamp": 0, "input_length": 6755, "output_length": 500, "hash_ids": [0, 1, 2, 3]}
```
The mocker synthesizes token blocks from `hash_ids` using the configured `--block-size`, so the replay block size should match the block size used when the trace was generated.
## Modes
### Fixed-Schedule Replay
Default replay mode preserves the timestamps from the trace and simulates arrivals in virtual time:
```bash
python -m dynamo.mocker \
--trace-file /path/to/mooncake_trace.jsonl \
--model-path Qwen/Qwen3-0.6B \
--block-size 512
```
This is the right mode when you want deterministic replay of the original arrival pattern.
### Closed-Loop Concurrency Replay
Use `--replay-concurrency` to ignore trace arrival timing and keep a fixed number of requests in flight:
```bash
python -m dynamo.mocker \
--trace-file /path/to/mooncake_trace.jsonl \
--model-path Qwen/Qwen3-0.6B \
--block-size 512 \
--replay-concurrency 16
```
This mode is useful when you want to compare scheduler behavior under a fixed offered concurrency rather than the original trace schedule.
## Output
Use `--output-file` to override the default report location:
```bash
python -m dynamo.mocker \
--trace-file /path/to/mooncake_trace.jsonl \
--model-path Qwen/Qwen3-0.6B \
--output-file /tmp/replay-report.json
```
If `--output-file` is not set, the report path defaults to `<trace stem>.replay.json` in the same directory as the input trace.
The report contains:
- request counts
- input and output token totals
- virtual duration and wall-clock runtime
- request and token throughput
- prefix cache reuse ratio
- TTFT, TTST, TPOT, ITL, and end-to-end latency summaries
- output-token-throughput-per-user summaries
## Replay Constraints
Offline replay currently supports only this configuration:
- `--num-workers 1`
- aggregated mode
- `--engine-type vllm`
- `--data-parallel-size 1`
If you violate those constraints, replay fails immediately with a validation error.
## Practical Notes
- `--replay-concurrency` requires `--trace-file`
- `--speedup-ratio` still affects simulated timing
- `--extra-engine-args` can be used to provide a full mocker config JSON instead of individual CLI flags
- offline replay does not need planner runtime setup, router registration, or event transport
## When To Use This vs AIPerf
Use offline replay when:
- you want a fast scheduler-only simulation
- you want deterministic CI coverage of replay behavior
- you do not need HTTP serving, frontend behavior, or network effects
Use [Dynamo Benchmarking](benchmarking.md) when:
- you want end-to-end benchmarking against a live endpoint
- you need frontend, transport, or cluster-level behavior
- you want AIPerf dashboards and endpoint-facing metrics
......@@ -101,6 +101,8 @@ navigation:
path: components/kvbm/kvbm-guide.md
- page: Dynamo Benchmarking
path: benchmarks/benchmarking.md
- page: Mocker Offline Trace Replay
path: benchmarks/mocker-trace-replay.md
- section: Testing
contents:
- page: Mocker
......
......@@ -73,6 +73,9 @@ python -m dynamo.mocker \
| `--model-path` | Required | HuggingFace model ID or local path for tokenizer |
| `--endpoint` | Auto-derived | Dynamo endpoint string. Defaults are namespace-dependent, and prefill workers use a different default endpoint than aggregated/decode workers |
| `--model-name` | Derived from model-path | Model name for API responses |
| `--trace-file` | None | Run offline trace replay from a Mooncake-style JSONL trace file |
| `--output-file` | `<trace stem>.replay.json` | Write replay metrics JSON to this path |
| `--replay-concurrency` | None | Run offline replay in closed-loop concurrency mode with this many in-flight requests |
| `--num-gpu-blocks-override` | 16384 | Number of KV cache blocks |
| `--block-size` | 64 | Tokens per KV cache block |
| `--max-num-seqs` | 256 | Maximum concurrent sequences |
......@@ -119,6 +122,20 @@ python -m dynamo.mocker \
> **Note:** For local scale tests and router benchmarks, prefer `--num-workers` over launching many separate mocker processes. All workers share one tokio runtime and thread pool, which is both lighter weight and closer to how the test harnesses exercise the mocker.
## Offline Trace Replay
The mocker also supports an offline replay mode for Mooncake-style traces:
```bash
python -m dynamo.mocker \
--trace-file /path/to/mooncake_trace.jsonl \
--model-path Qwen/Qwen3-0.6B
```
This mode writes a replay report JSON and prints a `Replay Summary` table without launching a runtime or router.
For full usage, constraints, and benchmarking guidance, see [Mocker Offline Trace Replay](../benchmarks/mocker-trace-replay.md).
## Performance Modeling Setup
By default, the mocker uses hardcoded polynomial formulas to estimate prefill and decode timing. For more realistic simulations, pass `--planner-profile-data` with either:
......
......@@ -176,6 +176,7 @@ async fn generate_sequence_events(
max_output_tokens: worker_trace[i].output_length as usize,
uuid: Some(worker_trace[i].uuid),
dp_rank: 0,
arrival_timestamp_ms: None,
});
i += 1;
......@@ -187,6 +188,7 @@ async fn generate_sequence_events(
max_output_tokens: worker_trace[i].output_length as usize,
uuid: Some(worker_trace[i].uuid),
dp_rank: 0,
arrival_timestamp_ms: None,
});
i += 1;
}
......
......@@ -373,6 +373,7 @@ pub async fn generate_kv_events(
max_output_tokens: worker_trace[i].output_length as usize,
uuid: Some(worker_trace[i].uuid),
dp_rank: 0,
arrival_timestamp_ms: None,
});
i += 1;
......@@ -384,6 +385,7 @@ pub async fn generate_kv_events(
max_output_tokens: worker_trace[i].output_length as usize,
uuid: Some(worker_trace[i].uuid),
dp_rank: 0,
arrival_timestamp_ms: None,
});
i += 1;
}
......
......@@ -149,6 +149,10 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(fetch_model, m)?)?;
m.add_function(wrap_pyfunction!(run_kv_indexer, m)?)?;
m.add_function(wrap_pyfunction!(llm::entrypoint::make_engine, m)?)?;
m.add_function(wrap_pyfunction!(
llm::entrypoint::run_mocker_trace_replay,
m
)?)?;
m.add_function(wrap_pyfunction!(llm::entrypoint::run_input, m)?)?;
m.add_class::<DistributedRuntime>()?;
......
......@@ -9,6 +9,7 @@ use std::sync::Arc;
use pyo3::{exceptions::PyException, prelude::*};
use pyo3_async_runtimes::TaskLocals;
use pythonize::pythonize;
use dynamo_kv_router::config::KvRouterConfig as RsKvRouterConfig;
use dynamo_llm::discovery::LoadThresholdConfig as RsLoadThresholdConfig;
......@@ -467,6 +468,50 @@ pub fn run_input<'p>(
})
}
/// Replay a mocker trace file offline and return the simulation report as a Python object.
///
/// Engine arguments are loaded from the JSON file at `extra_engine_args` when one is given;
/// otherwise `MockEngineArgs::default()` is used. When `replay_concurrency` is set the trace is
/// replayed through `simulate_concurrency_file` with that many requests in flight; otherwise
/// `simulate_trace_file` is used. The simulation runs inside `Python::allow_threads`, and the
/// resulting report is converted to a Python object with `pythonize`.
///
/// # Errors
///
/// Returns a Python exception if the engine-args file cannot be loaded, if `replay_concurrency`
/// is zero or negative, if the simulation itself fails, or if the report cannot be converted to
/// a Python object.
#[pyfunction]
#[pyo3(signature = (trace_file, extra_engine_args=None, num_workers=1, replay_concurrency=None))]
pub fn run_mocker_trace_replay(
    py: Python<'_>,
    trace_file: PathBuf,
    extra_engine_args: Option<PathBuf>,
    num_workers: usize,
    replay_concurrency: Option<isize>,
) -> PyResult<PyObject> {
    let report = py.allow_threads(move || {
        let args = if let Some(extra_args_path) = extra_engine_args {
            MockEngineArgs::from_json_file(&extra_args_path).map_err(|e| {
                anyhow::anyhow!(
                    "Failed to load mocker args from {:?}: {}",
                    extra_args_path,
                    e
                )
            })?
        } else {
            MockEngineArgs::default()
        };

        // `usize::try_from` only rejects negative values, so zero has to be filtered out
        // explicitly for the "at least 1" contract in the error message to hold.
        let replay_concurrency = replay_concurrency
            .map(|requested| {
                usize::try_from(requested)
                    .ok()
                    .filter(|&in_flight| in_flight >= 1)
                    .ok_or_else(|| anyhow::anyhow!("replay_concurrency must be at least 1"))
            })
            .transpose()?;

        if let Some(max_in_flight) = replay_concurrency {
            dynamo_mocker::simulation::simulate_concurrency_file(
                args,
                &trace_file,
                max_in_flight,
                num_workers,
            )
        } else {
            dynamo_mocker::simulation::simulate_trace_file(args, &trace_file, num_workers)
        }
    });

    let report = report.map_err(to_pyerr)?;
    pythonize(py, &report)
        .map_err(to_pyerr)
        .map(|obj| obj.unbind())
}
pub fn to_pyerr<E>(err: E) -> PyErr
where
E: Display,
......
......@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
import asyncio
import os
from typing import Any, AsyncIterator, Awaitable, Callable, Dict, List, Optional, Tuple
# Import from specialized modules
......@@ -1246,6 +1247,15 @@ async def run_input(runtime: DistributedRuntime, input: str, engine_config: Engi
"""Start an engine, connect it to an input, and run until stopped."""
...
def run_mocker_trace_replay(
    trace_file: str | os.PathLike[str],
    extra_engine_args: Optional[str | os.PathLike[str]] = None,
    num_workers: int = 1,
    replay_concurrency: Optional[int] = None,
) -> Dict[str, Any]:
    """Replay a mocker trace file and return the simulation report.

    Args:
        trace_file: Path to the trace file to replay.
        extra_engine_args: Optional path to a JSON file with mocker engine
            args. When omitted, the default mocker engine args are used.
        num_workers: Number of workers passed to the simulation.
        replay_concurrency: When set, replay in concurrency mode with this
            many requests in flight (expected to be at least 1). When
            ``None``, the trace-file replay mode is used instead.

    Returns:
        The simulation report converted to plain Python objects.

    Raises:
        Exception: If the engine args cannot be loaded, the concurrency
            value is rejected, or the simulation fails.
    """
    ...
class Layer:
"""
A KV cache block layer
......
......@@ -35,6 +35,7 @@ from dynamo._core import make_engine
from dynamo._core import register_model as register_model
from dynamo._core import run_input
from dynamo._core import run_kv_indexer as run_kv_indexer
from dynamo._core import run_mocker_trace_replay
from dynamo._core import unregister_model as unregister_model
from .exceptions import HttpError
......
......@@ -11,7 +11,7 @@ use tokio::sync::watch;
use tokio_util::sync::CancellationToken;
use zeromq::{DealerSocket, Socket, SocketRecv, SocketSend, SubSocket};
use crate::protocols::{RouterEvent, WorkerId, WorkerWithDpRank};
use crate::protocols::{WorkerId, WorkerWithDpRank};
use crate::zmq_wire::{KvEventBatch, convert_event};
use super::indexer::Indexer;
......
......@@ -359,11 +359,16 @@ impl WorkerRegistry {
#[cfg(feature = "metrics")]
pub fn refresh_metrics(&self) {
let models = self.indexers.len();
let mut workers = self.workers.len();
let workers = self.workers.len() + {
#[cfg(feature = "indexer-runtime")]
{
workers += self.discovered_workers.len();
self.discovered_workers.len()
}
#[cfg(not(feature = "indexer-runtime"))]
{
0
}
};
let mut listener_counts = [0_i64; 4];
for entry in self.workers.iter() {
......
......@@ -637,6 +637,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
max_output_tokens,
uuid: Some(request_uuid),
dp_rank,
arrival_timestamp_ms: request.request_timestamp_ms,
};
let (request_tx, mut request_rx) = mpsc::unbounded_channel::<OutputSignal>();
......
......@@ -297,6 +297,7 @@ impl OpenAIPreprocessor {
if let Some(nvext) = request.nvext() {
// Build routing hints from nvext fields
let hints = nvext.agent_hints.as_ref();
builder.request_timestamp_ms(nvext.request_timestamp_ms);
let routing = RoutingHints {
backend_instance_id: nvext.backend_instance_id,
prefill_worker_id: nvext.prefill_worker_id,
......
......@@ -190,6 +190,11 @@ pub struct PreprocessedRequest {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub extra_args: Option<serde_json::Value>,
/// Optional request timestamp in milliseconds forwarded from nvext.
#[builder(default)]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub request_timestamp_ms: Option<f64>,
/// Optional request tracker for per-request metrics (shared with DeltaGenerator)
#[builder(default)]
#[serde(skip)]
......
......@@ -174,6 +174,11 @@ pub struct NvExt {
#[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_control: Option<CacheControl>,
/// Optional request timestamp in milliseconds for trace replay / virtual-time simulation.
#[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub request_timestamp_ms: Option<f64>,
}
/// Hints from the agent/caller about request characteristics.
......
......@@ -227,8 +227,8 @@ impl PerfModel {
decode_interp.interp(query_x, query_y).unwrap_or(0.0)
}
};
// Ensure non-negative timing
let result = time.max(0.0);
// Token-emitting decode steps should not collapse onto the same timestamp.
let result = time.max(1.0);
tracing::trace!(
"Decode time prediction: active_kv_tokens={active_kv_tokens}, context_length={context_length}, time={result:.2}ms"
);
......
......@@ -53,6 +53,7 @@ pub struct DirectRequest {
pub max_output_tokens: usize,
pub uuid: Option<Uuid>,
pub dp_rank: u32,
pub arrival_timestamp_ms: Option<f64>,
}
/// Represents the cost of prefilling content in the cache
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment