Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
88478b4b
Unverified
Commit
88478b4b
authored
Apr 23, 2026
by
Yan Ru Pei
Committed by
GitHub
Apr 23, 2026
Browse files
feat(replay): add agentic trace replay format (#8627)
Signed-off-by:
PeaBrane
<
yanrpei@gmail.com
>
parent
c8c99dd1
Changes
22
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
864 additions
and
51 deletions
+864
-51
components/src/dynamo/profiler/tests/test_replay_optimize.py
components/src/dynamo/profiler/tests/test_replay_optimize.py
+74
-0
components/src/dynamo/profiler/utils/replay_optimize/evaluate.py
...nts/src/dynamo/profiler/utils/replay_optimize/evaluate.py
+8
-0
components/src/dynamo/profiler/utils/replay_optimize/example.py
...ents/src/dynamo/profiler/utils/replay_optimize/example.py
+44
-1
components/src/dynamo/profiler/utils/replay_optimize/specs.py
...onents/src/dynamo/profiler/utils/replay_optimize/specs.py
+29
-0
lib/bench/README.md
lib/bench/README.md
+6
-1
lib/bench/offline_replay_bench.rs
lib/bench/offline_replay_bench.rs
+3
-9
lib/bindings/python/rust/llm/replay.rs
lib/bindings/python/rust/llm/replay.rs
+51
-7
lib/bindings/python/src/dynamo/_core.pyi
lib/bindings/python/src/dynamo/_core.pyi
+3
-0
lib/bindings/python/src/dynamo/llm/__init__.py
lib/bindings/python/src/dynamo/llm/__init__.py
+6
-0
lib/bindings/python/src/dynamo/replay/api.py
lib/bindings/python/src/dynamo/replay/api.py
+6
-0
lib/bindings/python/src/dynamo/replay/main.py
lib/bindings/python/src/dynamo/replay/main.py
+29
-0
lib/bindings/python/tests/replay/replay_utils.py
lib/bindings/python/tests/replay/replay_utils.py
+27
-0
lib/bindings/python/tests/replay/test_replay_smoke.py
lib/bindings/python/tests/replay/test_replay_smoke.py
+40
-0
lib/mocker/src/common/sequence.rs
lib/mocker/src/common/sequence.rs
+2
-6
lib/mocker/src/kv_manager/kvbm_backend.rs
lib/mocker/src/kv_manager/kvbm_backend.rs
+9
-13
lib/mocker/src/loadgen/mod.rs
lib/mocker/src/loadgen/mod.rs
+2
-1
lib/mocker/src/loadgen/tests.rs
lib/mocker/src/loadgen/tests.rs
+81
-0
lib/mocker/src/loadgen/trace.rs
lib/mocker/src/loadgen/trace.rs
+186
-0
lib/mocker/src/loadgen/types.rs
lib/mocker/src/loadgen/types.rs
+6
-0
lib/mocker/src/replay/entrypoints.rs
lib/mocker/src/replay/entrypoints.rs
+252
-13
No files found.
components/src/dynamo/profiler/tests/test_replay_optimize.py
View file @
88478b4b
...
...
@@ -18,6 +18,7 @@ except ImportError:
from
dynamo.profiler.utils
import
replay_optimize
from
dynamo.profiler.utils.replay_optimize
import
(
DenseAggReplayState
,
DenseReplayState
,
EngineSpec
,
HardwareSpec
,
ReplayObjective
,
...
...
@@ -124,6 +125,22 @@ def _trace_workload(trace_file: Path, speedup: float = 100.0) -> WorkloadSpec:
return
WorkloadSpec
(
traceFile
=
str
(
trace_file
),
arrivalSpeedupRatio
=
speedup
)
def
_applied_compute_agentic_trace_workload
(
trace_file
:
Path
,
*
,
concurrency
:
int
=
8
,
shared_prefix_ratio
:
float
=
0.5
,
num_prefix_groups
:
int
=
1
,
)
->
WorkloadSpec
:
return
WorkloadSpec
(
traceFile
=
str
(
trace_file
),
traceFormat
=
"applied_compute_agentic"
,
traceReplayConcurrency
=
concurrency
,
traceSharedPrefixRatio
=
shared_prefix_ratio
,
traceNumPrefixGroups
=
num_prefix_groups
,
)
def
_sla
(
**
bounds
:
float
)
->
SLASpec
:
"""Build an SLASpec from keyword bounds.
...
...
@@ -141,6 +158,63 @@ def _sla(**bounds: float) -> SLASpec:
return
SLASpec
(
**
{
translate
.
get
(
k
,
k
):
v
for
k
,
v
in
bounds
.
items
()})
def
test_applied_compute_agentic_trace_workload_requires_trace_replay_concurrency
(
tmp_path
,
)
->
None
:
trace_path
=
_write_trace
(
tmp_path
)
with
pytest
.
raises
(
ValueError
,
match
=
"traceReplayConcurrency"
):
WorkloadSpec
(
traceFile
=
str
(
trace_path
),
traceFormat
=
"applied_compute_agentic"
,
)
def
test_applied_compute_agentic_trace_workload_rejects_shared_prefix_ratio_without_groups
(
tmp_path
,
)
->
None
:
trace_path
=
_write_trace
(
tmp_path
)
with
pytest
.
raises
(
ValueError
,
match
=
"traceNumPrefixGroups"
):
WorkloadSpec
(
traceFile
=
str
(
trace_path
),
traceFormat
=
"applied_compute_agentic"
,
traceReplayConcurrency
=
8
,
traceSharedPrefixRatio
=
0.5
,
traceNumPrefixGroups
=
0
,
)
def
test_run_replay_for_state_passes_applied_compute_agentic_trace_knobs
(
tmp_path
,
monkeypatch
)
->
None
:
trace_path
=
_write_trace
(
tmp_path
)
workload
=
_applied_compute_agentic_trace_workload
(
trace_path
,
concurrency
=
9
)
captured
:
dict
[
str
,
Any
]
=
{}
def
fake_run_trace_replay
(
trace_file
,
**
kwargs
):
captured
[
"trace_file"
]
=
trace_file
captured
[
"kwargs"
]
=
kwargs
return
{
"output_throughput_tok_s"
:
1.0
}
monkeypatch
.
setattr
(
"dynamo.profiler.utils.replay_optimize.evaluate.run_trace_replay"
,
fake_run_trace_replay
,
)
replay_optimize
.
evaluate
.
_run_replay_for_state
(
state
=
DenseReplayState
(
1
,
1
,
1
,
1
,
0.5
),
workload
=
workload
,
prefill_engine_args
=
_base_prefill_args
(),
decode_engine_args
=
_base_decode_args
(),
router_config
=
KvRouterConfig
(),
)
assert
Path
(
captured
[
"trace_file"
])
==
trace_path
assert
captured
[
"kwargs"
][
"replay_concurrency"
]
==
9
assert
captured
[
"kwargs"
][
"trace_format"
]
==
"applied_compute_agentic"
assert
captured
[
"kwargs"
][
"trace_shared_prefix_ratio"
]
==
0.5
assert
captured
[
"kwargs"
][
"trace_num_prefix_groups"
]
==
1
def
_disagg_spec
(
*
,
workload
:
WorkloadSpec
|
None
=
None
,
...
...
components/src/dynamo/profiler/utils/replay_optimize/evaluate.py
View file @
88478b4b
...
...
@@ -53,9 +53,13 @@ def _run_replay_for_state(
router_config
=
router_config
,
num_prefill_workers
=
state
.
prefill_workers
,
num_decode_workers
=
state
.
decode_workers
,
replay_concurrency
=
workload
.
traceReplayConcurrency
,
replay_mode
=
"offline"
,
router_mode
=
state
.
router_mode
,
arrival_speedup_ratio
=
workload
.
arrivalSpeedupRatio
,
trace_format
=
workload
.
traceFormat
,
trace_shared_prefix_ratio
=
workload
.
traceSharedPrefixRatio
,
trace_num_prefix_groups
=
workload
.
traceNumPrefixGroups
,
)
return
run_synthetic_trace_replay
(
...
...
@@ -91,9 +95,13 @@ def _run_agg_replay_for_state(
extra_engine_args
=
engine_args
,
router_config
=
router_config
,
num_workers
=
state
.
workers
,
replay_concurrency
=
workload
.
traceReplayConcurrency
,
replay_mode
=
"offline"
,
router_mode
=
state
.
router_mode
,
arrival_speedup_ratio
=
workload
.
arrivalSpeedupRatio
,
trace_format
=
workload
.
traceFormat
,
trace_shared_prefix_ratio
=
workload
.
traceSharedPrefixRatio
,
trace_num_prefix_groups
=
workload
.
traceNumPrefixGroups
,
)
return
run_synthetic_trace_replay
(
...
...
components/src/dynamo/profiler/utils/replay_optimize/example.py
View file @
88478b4b
...
...
@@ -40,12 +40,20 @@ RESULT_COLUMNS: Sequence[str] = (
def
_build_workload
(
*
,
trace_file
:
str
|
None
,
trace_format
:
str
,
arrival_speedup_ratio
:
float
,
trace_replay_concurrency
:
int
|
None
,
trace_shared_prefix_ratio
:
float
,
trace_num_prefix_groups
:
int
,
)
->
WorkloadSpec
:
if
trace_file
is
not
None
:
return
WorkloadSpec
(
traceFile
=
trace_file
,
traceFormat
=
trace_format
,
arrivalSpeedupRatio
=
arrival_speedup_ratio
,
traceReplayConcurrency
=
trace_replay_concurrency
,
traceSharedPrefixRatio
=
trace_shared_prefix_ratio
,
traceNumPrefixGroups
=
trace_num_prefix_groups
,
)
return
WorkloadSpec
(
...
...
@@ -70,7 +78,11 @@ def _engine_args(worker_type: str) -> MockEngineArgs:
def
run_example
(
*
,
trace_file
:
str
|
None
=
None
,
trace_format
:
str
=
"mooncake"
,
arrival_speedup_ratio
:
float
=
1.0
,
trace_replay_concurrency
:
int
|
None
=
None
,
trace_shared_prefix_ratio
:
float
=
0.0
,
trace_num_prefix_groups
:
int
=
0
,
max_parallel_evals
:
int
=
1
,
)
->
None
:
spec
=
ReplayOptimizeSpec
(
...
...
@@ -83,7 +95,11 @@ def run_example(
hardware
=
HardwareSpec
(
gpuSku
=
GPU_SKU
,
totalGpus
=
TOTAL_GPUS
),
workload
=
_build_workload
(
trace_file
=
trace_file
,
trace_format
=
trace_format
,
arrival_speedup_ratio
=
arrival_speedup_ratio
,
trace_replay_concurrency
=
trace_replay_concurrency
,
trace_shared_prefix_ratio
=
trace_shared_prefix_ratio
,
trace_num_prefix_groups
=
trace_num_prefix_groups
,
),
sla
=
SLASpec
(
ttft
=
50000.0
,
itl
=
100.0
,
e2eLatency
=
60000.0
),
router
=
RouterSpec
(
...
...
@@ -109,7 +125,13 @@ def main() -> None:
)
parser
.
add_argument
(
"--trace-file"
,
help
=
"Optional Mooncake-style JSONL trace. If omitted, runs the synthetic workload."
,
help
=
"Optional replay trace JSONL file. If omitted, runs the synthetic workload."
,
)
parser
.
add_argument
(
"--trace-format"
,
choices
=
(
"mooncake"
,
"applied_compute_agentic"
),
default
=
"mooncake"
,
help
=
"Trace-file format to use with --trace-file."
,
)
parser
.
add_argument
(
"--arrival-speedup-ratio"
,
...
...
@@ -117,6 +139,23 @@ def main() -> None:
default
=
1.0
,
help
=
"Arrival speedup ratio to use with --trace-file."
,
)
parser
.
add_argument
(
"--trace-replay-concurrency"
,
type
=
int
,
help
=
"Optional replay concurrency cap for trace workloads; required for --trace-format=applied_compute_agentic."
,
)
parser
.
add_argument
(
"--trace-shared-prefix-ratio"
,
type
=
float
,
default
=
0.0
,
help
=
"Fraction of initial prompt blocks shared across sessions for applied_compute_agentic trace replay."
,
)
parser
.
add_argument
(
"--trace-num-prefix-groups"
,
type
=
int
,
default
=
0
,
help
=
"Number of shared-prefix groups for applied_compute_agentic trace replay."
,
)
parser
.
add_argument
(
"--max-parallel-evals"
,
type
=
int
,
...
...
@@ -126,7 +165,11 @@ def main() -> None:
args
=
parser
.
parse_args
()
run_example
(
trace_file
=
args
.
trace_file
,
trace_format
=
args
.
trace_format
,
arrival_speedup_ratio
=
args
.
arrival_speedup_ratio
,
trace_replay_concurrency
=
args
.
trace_replay_concurrency
,
trace_shared_prefix_ratio
=
args
.
trace_shared_prefix_ratio
,
trace_num_prefix_groups
=
args
.
trace_num_prefix_groups
,
max_parallel_evals
=
args
.
max_parallel_evals
,
)
...
...
components/src/dynamo/profiler/utils/replay_optimize/specs.py
View file @
88478b4b
...
...
@@ -166,7 +166,11 @@ class WorkloadSpec(BaseModel):
# Replay trace-source extensions (mutually exclusive with synthetic fields)
traceFile
:
str
|
None
=
None
traceFormat
:
str
=
"mooncake"
arrivalSpeedupRatio
:
float
=
1.0
traceReplayConcurrency
:
int
|
None
=
None
traceSharedPrefixRatio
:
float
=
0.0
traceNumPrefixGroups
:
int
=
0
@
model_validator
(
mode
=
"after"
)
def
_validate_source
(
self
)
->
"WorkloadSpec"
:
...
...
@@ -181,6 +185,31 @@ class WorkloadSpec(BaseModel):
"trace workload (traceFile set) must not also set synthetic "
f
"fields:
{
mixed
}
"
)
if
self
.
traceFormat
not
in
{
"mooncake"
,
"applied_compute_agentic"
}:
raise
ValueError
(
"traceFormat must be either 'mooncake' or 'applied_compute_agentic', got "
f
"
{
self
.
traceFormat
!
r
}
"
)
if
(
self
.
traceReplayConcurrency
is
not
None
and
self
.
traceReplayConcurrency
<
1
):
raise
ValueError
(
"traceReplayConcurrency must be at least 1"
)
if
not
0.0
<=
self
.
traceSharedPrefixRatio
<=
1.0
:
raise
ValueError
(
"traceSharedPrefixRatio must be between 0.0 and 1.0"
)
if
self
.
traceNumPrefixGroups
<
0
:
raise
ValueError
(
"traceNumPrefixGroups must be non-negative"
)
if
self
.
traceSharedPrefixRatio
>
0.0
and
self
.
traceNumPrefixGroups
==
0
:
raise
ValueError
(
"traceSharedPrefixRatio > 0 requires traceNumPrefixGroups >= 1"
)
if
(
self
.
traceFormat
==
"applied_compute_agentic"
and
self
.
traceReplayConcurrency
is
None
):
raise
ValueError
(
"traceFormat='applied_compute_agentic' requires traceReplayConcurrency"
)
return
self
missing
=
[
...
...
lib/bench/README.md
View file @
88478b4b
...
...
@@ -12,7 +12,9 @@ pre-warms the KV cache with the predicted next-turn prefix after each assistant
response, cutting TTFT on subsequent turns.
`offline_replay_bench`
runs the Rust-native replay loop directly for profiling
and throughput measurements without going through the Python wrapper.
and throughput measurements without going through the Python wrapper. It uses
the mocker's internal polynomial perf model so the results stay focused on
replay overhead instead of external timing backends.
## Quick start
...
...
@@ -114,3 +116,6 @@ cargo bench --package dynamo-bench --bench offline_replay_bench -- \
--trace-block-size
512
\
--block-size
64
```
Use
`--speedup-ratio`
and
`--decode-speedup-ratio`
if you want a simple scaling
knob while keeping the same internal polynomial model.
lib/bench/offline_replay_bench.rs
View file @
88478b4b
...
...
@@ -3,8 +3,9 @@
//! Rust-native offline replay benchmark entrypoint.
//!
//! Useful for profiling replay itself without the Python CLI wrapper. This keeps
//! the default mocker perf model unless CLI overrides are provided.
//! Useful for profiling replay itself without the Python CLI wrapper. This
//! bench intentionally uses the mocker's internal polynomial perf model so the
//! measurements stay focused on replay and router overhead.
//!
//! Run with: cargo bench --package dynamo-bench --bench offline_replay_bench -- --help
...
...
@@ -80,10 +81,6 @@ struct Args {
#[arg(long)]
decode_speedup_ratio
:
Option
<
f64
>
,
/// Explicit planner profile NPZ to use for perf-model timing
#[arg(long)]
planner_profile_data
:
Option
<
PathBuf
>
,
/// Optional path to write the full replay report as pretty JSON
#[arg(long)]
report_json
:
Option
<
PathBuf
>
,
...
...
@@ -112,9 +109,6 @@ fn build_engine_args(args: &Args) -> Result<MockEngineArgs> {
if
let
Some
(
decode_speedup_ratio
)
=
args
.decode_speedup_ratio
{
builder
=
builder
.decode_speedup_ratio
(
decode_speedup_ratio
);
}
if
let
Some
(
planner_profile_data
)
=
args
.planner_profile_data
.as_ref
()
{
builder
=
builder
.planner_profile_data
(
Some
(
planner_profile_data
.clone
()));
}
builder
.build
()
.context
(
"failed to build replay engine args"
)
?
...
...
lib/bindings/python/rust/llm/replay.rs
View file @
88478b4b
...
...
@@ -526,7 +526,7 @@ impl MockEngineArgs {
}
#[pyfunction]
#[pyo3(signature
=
(trace_file,
extra_engine_args=None,
prefill_engine_args=None,
decode_engine_args=None,
router_config=None,
aic_perf_config=None,
num_workers=
1
,
num_prefill_workers=
1
,
num_decode_workers=
1
,
replay_concurrency=None,
replay_mode=
"offline"
,
router_mode=
"round_robin"
,
arrival_speedup_ratio=
1.0
,
trace_block_size=
512
))]
#[pyo3(signature
=
(trace_file,
extra_engine_args=None,
prefill_engine_args=None,
decode_engine_args=None,
router_config=None,
aic_perf_config=None,
num_workers=
1
,
num_prefill_workers=
1
,
num_decode_workers=
1
,
replay_concurrency=None,
replay_mode=
"offline"
,
router_mode=
"round_robin"
,
arrival_speedup_ratio=
1.0
,
trace_block_size=
512
,
trace_format=
"mooncake"
,
trace_shared_prefix_ratio=
0.0
,
trace_num_prefix_groups=
0
))]
#[allow(clippy::too_many_arguments)]
pub
fn
run_mocker_trace_replay
(
py
:
Python
<
'_
>
,
...
...
@@ -544,6 +544,9 @@ pub fn run_mocker_trace_replay(
router_mode
:
&
str
,
arrival_speedup_ratio
:
f64
,
trace_block_size
:
usize
,
trace_format
:
&
str
,
trace_shared_prefix_ratio
:
f64
,
trace_num_prefix_groups
:
usize
,
)
->
PyResult
<
PyObject
>
{
let
args_selection
=
load_replay_args_selection
(
py
,
...
...
@@ -555,6 +558,7 @@ pub fn run_mocker_trace_replay(
num_decode_workers
,
)
?
;
let
router_mode
=
parse_replay_router_mode
(
router_mode
)
?
;
let
trace_format
=
parse_trace_file_format
(
trace_format
)
?
;
let
prefill_load_estimator
=
load_replay_prefill_load_estimator
(
py
,
router_mode
,
...
...
@@ -565,12 +569,19 @@ pub fn run_mocker_trace_replay(
let
replay_mode
=
replay_mode
.to_owned
();
let
report
=
py
.allow_threads
(
move
||
{
let
replay_concurrency
=
parse_replay_concurrency
(
replay_concurrency
)
?
;
if
trace_format
==
dynamo_mocker
::
loadgen
::
TraceFileFormat
::
AppliedComputeAgentic
&&
replay_concurrency
.is_none
()
{
anyhow
::
bail!
(
"trace_format='applied_compute_agentic' requires replay_concurrency because source traces do not contain first-turn timestamps"
);
}
match
args_selection
{
ReplayArgsSelection
::
Aggregated
(
args
)
=>
{
match
(
replay_mode
.as_str
(),
replay_concurrency
)
{
(
"offline"
,
Some
(
max_in_flight
))
=>
{
dynamo_mocker
::
replay
::
simulate_concurrency_file_with_router_mode
(
dynamo_mocker
::
replay
::
simulate_concurrency_file_with_router_mode
_and_format
(
*
args
,
router_config
.clone
(),
prefill_load_estimator
.clone
(),
...
...
@@ -579,10 +590,13 @@ pub fn run_mocker_trace_replay(
max_in_flight
,
num_workers
,
router_mode
,
trace_format
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
)
}
(
"offline"
,
None
)
=>
{
dynamo_mocker
::
replay
::
simulate_trace_file_with_router_mode
(
dynamo_mocker
::
replay
::
simulate_trace_file_with_router_mode
_and_format
(
*
args
,
router_config
.clone
(),
prefill_load_estimator
.clone
(),
...
...
@@ -591,10 +605,13 @@ pub fn run_mocker_trace_replay(
num_workers
,
arrival_speedup_ratio
,
router_mode
,
trace_format
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
)
}
(
"online"
,
Some
(
max_in_flight
))
=>
{
dynamo_mocker
::
replay
::
simulate_concurrency_live_file_with_router_mode
(
dynamo_mocker
::
replay
::
simulate_concurrency_live_file_with_router_mode
_and_format
(
*
args
,
router_config
.clone
(),
prefill_load_estimator
.clone
(),
...
...
@@ -603,10 +620,13 @@ pub fn run_mocker_trace_replay(
max_in_flight
,
num_workers
,
router_mode
,
trace_format
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
)
}
(
"online"
,
None
)
=>
{
dynamo_mocker
::
replay
::
simulate_trace_live_file_with_router_mode
(
dynamo_mocker
::
replay
::
simulate_trace_live_file_with_router_mode
_and_format
(
*
args
,
router_config
.clone
(),
prefill_load_estimator
.clone
(),
...
...
@@ -615,6 +635,9 @@ pub fn run_mocker_trace_replay(
num_workers
,
arrival_speedup_ratio
,
router_mode
,
trace_format
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
)
}
(
other
,
_
)
=>
anyhow
::
bail!
(
...
...
@@ -626,7 +649,7 @@ pub fn run_mocker_trace_replay(
ReplayArgsSelection
::
Disagg
(
config
)
=>
match
(
replay_mode
.as_str
(),
replay_concurrency
)
{
(
"offline"
,
Some
(
max_in_flight
))
=>
{
dynamo_mocker
::
replay
::
simulate_concurrency_file_disagg_with_router_mode
(
dynamo_mocker
::
replay
::
simulate_concurrency_file_disagg_with_router_mode
_and_format
(
*
config
,
router_config
.clone
(),
prefill_load_estimator
.clone
(),
...
...
@@ -634,10 +657,13 @@ pub fn run_mocker_trace_replay(
trace_block_size
,
max_in_flight
,
router_mode
,
trace_format
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
)
}
(
"offline"
,
None
)
=>
{
dynamo_mocker
::
replay
::
simulate_trace_file_disagg_with_router_mode
(
dynamo_mocker
::
replay
::
simulate_trace_file_disagg_with_router_mode
_and_format
(
*
config
,
router_config
.clone
(),
prefill_load_estimator
.clone
(),
...
...
@@ -645,6 +671,9 @@ pub fn run_mocker_trace_replay(
trace_block_size
,
arrival_speedup_ratio
,
router_mode
,
trace_format
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
)
}
(
"online"
,
_
)
=>
anyhow
::
bail!
(
"disagg replay only supports replay_mode='offline'"
),
...
...
@@ -1073,6 +1102,21 @@ fn parse_replay_router_mode(
}
}
fn
parse_trace_file_format
(
trace_format
:
&
str
,
)
->
PyResult
<
dynamo_mocker
::
loadgen
::
TraceFileFormat
>
{
match
trace_format
{
"mooncake"
=>
Ok
(
dynamo_mocker
::
loadgen
::
TraceFileFormat
::
Mooncake
),
"applied_compute_agentic"
=>
{
Ok
(
dynamo_mocker
::
loadgen
::
TraceFileFormat
::
AppliedComputeAgentic
)
}
other
=>
Err
(
PyException
::
new_err
(
format!
(
"trace_format must be either 'mooncake' or 'applied_compute_agentic', got '{}'"
,
other
))),
}
}
fn
parse_replay_concurrency
(
replay_concurrency
:
Option
<
isize
>
)
->
anyhow
::
Result
<
Option
<
usize
>>
{
match
replay_concurrency
{
Some
(
value
)
if
value
<
1
=>
anyhow
::
bail!
(
"replay_concurrency must be at least 1"
),
...
...
lib/bindings/python/src/dynamo/_core.pyi
View file @
88478b4b
...
...
@@ -1558,6 +1558,9 @@ def run_mocker_trace_replay(
router_mode: Literal["round_robin", "kv_router"] = "round_robin",
arrival_speedup_ratio: float = 1.0,
trace_block_size: int = 512,
trace_format: Literal["mooncake", "applied_compute_agentic"] = "mooncake",
trace_shared_prefix_ratio: float = 0.0,
trace_num_prefix_groups: int = 0,
) -> Dict[str, Any]:
"""Replay a mocker trace file and return the simulation report for aggregated vLLM or SGLang configs."""
...
...
...
lib/bindings/python/src/dynamo/llm/__init__.py
View file @
88478b4b
...
...
@@ -60,6 +60,9 @@ def run_mocker_trace_replay(
router_mode
=
"round_robin"
,
arrival_speedup_ratio
=
1.0
,
trace_block_size
=
512
,
trace_format
=
"mooncake"
,
trace_shared_prefix_ratio
=
0.0
,
trace_num_prefix_groups
=
0
,
):
return
_run_mocker_trace_replay
(
trace_file
,
...
...
@@ -71,4 +74,7 @@ def run_mocker_trace_replay(
router_mode
=
router_mode
,
arrival_speedup_ratio
=
arrival_speedup_ratio
,
trace_block_size
=
trace_block_size
,
trace_format
=
trace_format
,
trace_shared_prefix_ratio
=
trace_shared_prefix_ratio
,
trace_num_prefix_groups
=
trace_num_prefix_groups
,
)
lib/bindings/python/src/dynamo/replay/api.py
View file @
88478b4b
...
...
@@ -23,6 +23,9 @@ def run_trace_replay(
router_mode
=
"round_robin"
,
arrival_speedup_ratio
=
1.0
,
trace_block_size
=
512
,
trace_format
=
"mooncake"
,
trace_shared_prefix_ratio
=
0.0
,
trace_num_prefix_groups
=
0
,
):
return
_run_mocker_trace_replay
(
trace_file
,
...
...
@@ -39,6 +42,9 @@ def run_trace_replay(
router_mode
=
router_mode
,
arrival_speedup_ratio
=
arrival_speedup_ratio
,
trace_block_size
=
trace_block_size
,
trace_format
=
trace_format
,
trace_shared_prefix_ratio
=
trace_shared_prefix_ratio
,
trace_num_prefix_groups
=
trace_num_prefix_groups
,
)
...
...
lib/bindings/python/src/dynamo/replay/main.py
View file @
88478b4b
...
...
@@ -418,12 +418,30 @@ def main(argv: Sequence[str] | None = None) -> int:
default
=
"round_robin"
,
)
parser
.
add_argument
(
"--arrival-speedup-ratio"
,
type
=
float
,
default
=
1.0
)
parser
.
add_argument
(
"--trace-format"
,
choices
=
(
"mooncake"
,
"applied_compute_agentic"
),
default
=
"mooncake"
,
help
=
"format of trace_file when replaying from a file"
,
)
parser
.
add_argument
(
"--trace-block-size"
,
type
=
int
,
default
=
512
,
help
=
"tokens represented by each hash_id in the trace file; only used for file replay"
,
)
parser
.
add_argument
(
"--trace-shared-prefix-ratio"
,
type
=
float
,
default
=
0.0
,
help
=
"fraction of the initial prompt blocks to share across sessions for applied_compute_agentic trace replay"
,
)
parser
.
add_argument
(
"--trace-num-prefix-groups"
,
type
=
int
,
default
=
0
,
help
=
"number of cross-session shared-prefix groups for applied_compute_agentic trace replay"
,
)
parser
.
add_argument
(
"--report-json"
,
help
=
"path to save the full replay report JSON; defaults to a timestamped file in the current directory"
,
...
...
@@ -459,6 +477,14 @@ def main(argv: Sequence[str] | None = None) -> int:
parser
.
error
(
"synthetic replay requires --input-tokens, --output-tokens, and --request-count"
)
if
(
using_trace_file
and
args
.
trace_format
==
"applied_compute_agentic"
and
args
.
replay_concurrency
is
None
):
parser
.
error
(
"--trace-format=applied_compute_agentic requires --replay-concurrency because the source traces do not include first-turn timestamps"
)
extra_engine_args
=
_load_engine_args
(
args
.
extra_engine_args
)
prefill_engine_args
=
_load_engine_args
(
args
.
prefill_engine_args
)
...
...
@@ -531,6 +557,9 @@ def main(argv: Sequence[str] | None = None) -> int:
router_mode
=
args
.
router_mode
,
arrival_speedup_ratio
=
args
.
arrival_speedup_ratio
,
trace_block_size
=
args
.
trace_block_size
,
trace_format
=
args
.
trace_format
,
trace_shared_prefix_ratio
=
args
.
trace_shared_prefix_ratio
,
trace_num_prefix_groups
=
args
.
trace_num_prefix_groups
,
)
else
:
report
=
run_synthetic_trace_replay
(
...
...
lib/bindings/python/tests/replay/replay_utils.py
View file @
88478b4b
...
...
@@ -152,6 +152,33 @@ def _write_multiturn_trace(tmp_path):
return
trace_path
def
_write_applied_compute_agentic_trace
(
tmp_path
):
trace_path
=
tmp_path
/
"applied_compute_agentic_trace.jsonl"
records
=
[
{
"num_turns"
:
2
,
"input_prompt_length"
:
64
,
"assistant_response_length"
:
[
2
,
2
],
"tool_call_output_length"
:
[
2
,
2
],
"tool_call_latency"
:
[
0.001
,
0.001
],
"final_assistant_response_length"
:
2
,
},
{
"num_turns"
:
1
,
"input_prompt_length"
:
64
,
"assistant_response_length"
:
[
2
],
"tool_call_output_length"
:
[
2
],
"tool_call_latency"
:
[
0.001
],
"final_assistant_response_length"
:
2
,
},
]
trace_path
.
write_text
(
"
\n
"
.
join
(
json
.
dumps
(
record
)
for
record
in
records
)
+
"
\n
"
,
encoding
=
"utf-8"
,
)
return
trace_path
def
_write_cli_smoke_trace
(
tmp_path
):
trace_path
=
tmp_path
/
"cli_smoke_trace.jsonl"
records
=
[]
...
...
lib/bindings/python/tests/replay/test_replay_smoke.py
View file @
88478b4b
...
...
@@ -16,6 +16,7 @@ from .replay_utils import (
_router_config
,
_sglang_args
,
_vllm_args
,
_write_applied_compute_agentic_trace
,
_write_multiturn_trace
,
_write_trace_and_args
,
)
...
...
@@ -125,6 +126,45 @@ def test_run_trace_replay_supports_multiturn_sessions(tmp_path, replay_mode):
)
@
pytest
.
mark
.
parametrize
(
"replay_mode"
,
[
"offline"
,
"online"
])
def
test_run_trace_replay_supports_applied_compute_agentic_format_with_concurrency
(
tmp_path
,
replay_mode
):
trace_path
=
_write_applied_compute_agentic_trace
(
tmp_path
)
report
=
run_trace_replay
(
trace_path
,
extra_engine_args
=
_vllm_args
(),
num_workers
=
2
,
replay_concurrency
=
2
,
replay_mode
=
replay_mode
,
router_mode
=
"kv_router"
,
trace_format
=
"applied_compute_agentic"
,
trace_shared_prefix_ratio
=
0.5
,
trace_num_prefix_groups
=
1
,
)
assert
report
[
"num_requests"
]
==
5
assert
report
[
"completed_requests"
]
==
5
assert
report
[
"total_input_tokens"
]
==
64
+
68
+
72
+
64
+
68
assert
report
[
"total_output_tokens"
]
==
10
def
test_run_trace_replay_rejects_applied_compute_agentic_format_without_concurrency
(
tmp_path
,
):
trace_path
=
_write_applied_compute_agentic_trace
(
tmp_path
)
with
pytest
.
raises
(
Exception
,
match
=
"replay_concurrency"
):
run_trace_replay
(
trace_path
,
extra_engine_args
=
_vllm_args
(),
num_workers
=
2
,
replay_mode
=
"offline"
,
trace_format
=
"applied_compute_agentic"
,
)
@
pytest
.
mark
.
parametrize
(
"replay_mode"
,
[
"offline"
,
"online"
])
def
test_run_trace_replay_supports_distinct_trace_and_engine_block_sizes
(
tmp_path
,
replay_mode
...
...
lib/mocker/src/common/sequence.rs
View file @
88478b4b
...
...
@@ -164,12 +164,8 @@ impl ActiveSequence {
/// Positional lineage hashes for all fully-tokenised blocks in the sequence.
/// Mirrors `block_hashes()` but returns the PLH identity used by kvbm-logical.
pub
fn
positional_lineage_hashes
(
&
self
)
->
Vec
<
PositionalLineageHash
>
{
self
.tokens
.blocks
()
.iter
()
.map
(|
block
|
block
.positional_lineage_hash
())
.collect
()
pub
fn
positional_lineage_hashes
(
&
self
)
->
&
[
PositionalLineageHash
]
{
&
self
.plhs
}
/// Commit a successful allocation by advancing `num_allocated_tokens`.
...
...
lib/mocker/src/kv_manager/kvbm_backend.rs
View file @
88478b4b
...
...
@@ -30,7 +30,6 @@
//! - `Lru` — simple recency-based LRU.
//! - `MultiLru` — 4-tier frequency-aware LRU (requires TinyLFU tracker).
use
std
::
collections
::
HashMap
;
use
std
::
sync
::
Arc
;
use
dynamo_kv_router
::
protocols
::{
...
...
@@ -42,6 +41,7 @@ use dynamo_tokens::{BlockHash, PositionalLineageHash, SequenceHash};
use
kvbm_logical
::
registry
::
BlockRegistry
;
use
kvbm_logical
::
tinylfu
::
TinyLFUTracker
;
use
kvbm_logical
::{
BlockManager
,
ImmutableBlock
,
MutableBlock
};
use
rustc_hash
::
FxHashMap
;
use
uuid
::
Uuid
;
use
crate
::
common
::
kv_cache_trace
;
...
...
@@ -79,13 +79,13 @@ pub struct KvManager {
/// PartialBlocks (still filling tokens) held as `MutableBlock`.
/// Dropped blocks return to kvbm-logical's reset pool.
active_partial
:
HashMap
<
Uuid
,
MutableBlock
<
G1
>>
,
active_partial
:
Fx
HashMap
<
Uuid
,
MutableBlock
<
G1
>>
,
/// FullBlocks held as `ImmutableBlock`, keyed by `SequenceHash`. The vec
/// length is the mocker's reference count — each `Use` pushes a clone,
/// each `Deref` pops one. When the vec empties, the block transitions to
/// kvbm-logical's inactive pool (RAII return on drop of the last clone).
active_full
:
HashMap
<
SequenceHash
,
Vec
<
ImmutableBlock
<
G1
>>>
,
active_full
:
Fx
HashMap
<
SequenceHash
,
Vec
<
ImmutableBlock
<
G1
>>>
,
/// Shadow registry of (PLH → mocker u64 seq_hash) for every block that has
/// been registered in kvbm-logical. kvbm-logical's registry is keyed by
...
...
@@ -93,7 +93,7 @@ pub struct KvManager {
/// mocker's u64 `SequenceHash` on `UniqueBlock::FullBlock`. We keep this
/// map so we can emit router-compatible `Removed` events when kvbm-logical
/// evicts inactive blocks as a side effect of `allocate_blocks_with_evictions`.
registered_plhs
:
HashMap
<
PositionalLineageHash
,
SequenceHash
>
,
registered_plhs
:
Fx
HashMap
<
PositionalLineageHash
,
SequenceHash
>
,
}
impl
KvManager
{
...
...
@@ -152,9 +152,9 @@ impl KvManager {
kv_event_publishers
,
dp_rank
,
next_event_id
:
0
,
active_partial
:
HashMap
::
new
(),
active_full
:
HashMap
::
new
(),
registered_plhs
:
HashMap
::
new
(),
active_partial
:
Fx
HashMap
::
default
(),
active_full
:
Fx
HashMap
::
default
(),
registered_plhs
:
Fx
HashMap
::
default
(),
}
}
...
...
@@ -554,14 +554,10 @@ impl KvManager {
overlap
+=
1
;
continue
;
}
let
Some
(
plh
)
=
plhs
.get
(
i
)
.copied
()
else
{
let
Some
(
plh
)
=
plhs
.get
(
i
)
else
{
break
;
};
let
presence
=
self
.block_manager
.block_registry
()
.check_presence
::
<
G1
>
(
&
[
plh
]);
if
presence
.first
()
.is_some_and
(|(
_
,
present
)|
*
present
)
{
if
self
.registered_plhs
.contains_key
(
plh
)
{
overlap
+=
1
;
}
else
{
break
;
...
...
lib/mocker/src/loadgen/mod.rs
View file @
88478b4b
...
...
@@ -8,7 +8,8 @@ mod types;
pub
use
driver
::
WorkloadDriver
;
pub
use
types
::{
ArrivalSpec
,
DelaySpec
,
LengthSpec
,
ReadyTurn
,
ReplayRequestHashes
,
RouterSequence
,
SequenceHashMode
,
SessionPartitionSpec
,
SessionTrace
,
SyntheticTraceSpec
,
Trace
,
TurnTrace
,
SequenceHashMode
,
SessionPartitionSpec
,
SessionTrace
,
SyntheticTraceSpec
,
Trace
,
TraceFileFormat
,
TurnTrace
,
};
#[cfg(test)]
...
...
lib/mocker/src/loadgen/tests.rs
View file @
88478b4b
...
...
@@ -84,6 +84,87 @@ fn test_from_mooncake_defaults_missing_input_length_from_hash_capacity() {
assert_eq!
(
trace
.sessions
[
0
]
.turns
[
0
]
.input_length
,
8
);
}
#[test]
fn
test_from_applied_compute_agentic_expands_rows_into_num_turns_plus_final_request
()
{
let
file
=
write_trace
(
&
[
serde_json
::
json!
({
"num_turns"
:
2
,
"input_prompt_length"
:
100
,
"assistant_response_length"
:
[
10
,
20
],
"tool_call_output_length"
:
[
30
,
40
],
"tool_call_latency"
:
[
0.5
,
1.25
],
"final_assistant_response_length"
:
50
,
})]);
let
trace
=
Trace
::
from_applied_compute_agentic
(
file
.path
(),
64
,
0.0
,
0
)
.unwrap
();
assert_eq!
(
trace
.sessions
.len
(),
1
);
let
session
=
&
trace
.sessions
[
0
];
assert_eq!
(
session
.first_arrival_timestamp_ms
,
None
);
assert_eq!
(
session
.turns
.len
(),
3
);
assert_eq!
(
session
.turns
[
0
]
.input_length
,
100
);
assert_eq!
(
session
.turns
[
0
]
.max_output_tokens
,
10
);
assert_eq!
(
session
.turns
[
0
]
.delay_after_previous_ms
,
0.0
);
assert_eq!
(
session
.turns
[
1
]
.input_length
,
140
);
assert_eq!
(
session
.turns
[
1
]
.max_output_tokens
,
20
);
assert_eq!
(
session
.turns
[
1
]
.delay_after_previous_ms
,
500.0
);
assert_eq!
(
session
.turns
[
2
]
.input_length
,
200
);
assert_eq!
(
session
.turns
[
2
]
.max_output_tokens
,
50
);
assert_eq!
(
session
.turns
[
2
]
.delay_after_previous_ms
,
1250.0
);
}
#[test]
fn
test_from_applied_compute_agentic_prefix_extends_hashes_across_turns
()
{
let
file
=
write_trace
(
&
[
serde_json
::
json!
({
"num_turns"
:
2
,
"input_prompt_length"
:
600
,
"assistant_response_length"
:
[
40
,
50
],
"tool_call_output_length"
:
[
40
,
50
],
"tool_call_latency"
:
[
0.1
,
0.2
],
"final_assistant_response_length"
:
60
,
})]);
let
trace
=
Trace
::
from_applied_compute_agentic
(
file
.path
(),
256
,
0.0
,
0
)
.unwrap
();
let
turns
=
&
trace
.sessions
[
0
]
.turns
;
assert_eq!
(
turns
[
0
]
.hash_ids
,
vec!
[
1
,
2
,
3
]);
assert_eq!
(
turns
[
1
]
.hash_ids
,
vec!
[
1
,
2
,
3
]);
assert_eq!
(
turns
[
2
]
.hash_ids
,
vec!
[
1
,
2
,
3
,
4
]);
}
#[test]
fn
test_from_applied_compute_agentic_can_share_initial_prefix_blocks_across_sessions
()
{
let
file
=
write_trace
(
&
[
serde_json
::
json!
({
"num_turns"
:
1
,
"input_prompt_length"
:
600
,
"assistant_response_length"
:
[
10
],
"tool_call_output_length"
:
[
10
],
"tool_call_latency"
:
[
0.1
],
"final_assistant_response_length"
:
10
,
}),
serde_json
::
json!
({
"num_turns"
:
1
,
"input_prompt_length"
:
600
,
"assistant_response_length"
:
[
20
],
"tool_call_output_length"
:
[
20
],
"tool_call_latency"
:
[
0.2
],
"final_assistant_response_length"
:
20
,
}),
]);
let
trace
=
Trace
::
from_applied_compute_agentic
(
file
.path
(),
256
,
0.5
,
1
)
.unwrap
();
assert_eq!
(
trace
.sessions
[
0
]
.turns
[
0
]
.hash_ids
[
0
],
trace
.sessions
[
1
]
.turns
[
0
]
.hash_ids
[
0
]
);
assert_eq!
(
trace
.sessions
[
0
]
.turns
[
0
]
.hash_ids
[
1
],
trace
.sessions
[
1
]
.turns
[
0
]
.hash_ids
[
1
]
);
assert_ne!
(
trace
.sessions
[
0
]
.turns
[
0
]
.hash_ids
[
2
],
trace
.sessions
[
1
]
.turns
[
0
]
.hash_ids
[
2
]
);
}
#[test]
fn
test_turn_to_direct_request_repeats_hash_ids_by_block_size
()
{
let
turn
=
TurnTrace
{
...
...
lib/mocker/src/loadgen/trace.rs
View file @
88478b4b
...
...
@@ -45,6 +45,16 @@ struct RawMooncakeRecord {
delay_ms
:
Option
<
f64
>
,
}
#[derive(Debug,
Deserialize)]
struct
RawAppliedComputeAgenticRecord
{
num_turns
:
usize
,
input_prompt_length
:
usize
,
assistant_response_length
:
Vec
<
usize
>
,
tool_call_output_length
:
Vec
<
usize
>
,
tool_call_latency
:
Vec
<
f64
>
,
final_assistant_response_length
:
usize
,
}
impl
TurnTrace
{
fn
validate_block_size_and_capacity
(
&
self
,
trace_block_size
:
usize
)
->
Result
<
()
>
{
if
trace_block_size
==
0
{
...
...
@@ -262,6 +272,157 @@ impl Trace {
})
}
pub
fn
from_applied_compute_agentic
(
path
:
&
Path
,
trace_block_size
:
usize
,
shared_prefix_ratio
:
f64
,
num_prefix_groups
:
usize
,
)
->
Result
<
Self
>
{
if
trace_block_size
==
0
{
bail!
(
"trace_block_size must be greater than 0"
);
}
if
!
(
0.0
..=
1.0
)
.contains
(
&
shared_prefix_ratio
)
{
bail!
(
"shared_prefix_ratio must be between 0.0 and 1.0, got {}"
,
shared_prefix_ratio
);
}
let
file
=
File
::
open
(
path
)
.with_context
(||
format!
(
"failed to open trace file {}"
,
path
.display
()))
?
;
let
reader
=
BufReader
::
new
(
file
);
let
mut
sessions
=
Vec
::
new
();
let
mut
next_unique_hash
=
1_u64
;
for
(
line_idx
,
line
)
in
reader
.lines
()
.enumerate
()
{
let
line
=
line
.with_context
(||
{
format!
(
"failed to read line {} from {}"
,
line_idx
+
1
,
path
.display
()
)
})
?
;
if
line
.trim
()
.is_empty
()
{
continue
;
}
let
raw
:
RawAppliedComputeAgenticRecord
=
serde_json
::
from_str
(
&
line
)
.with_context
(||
{
format!
(
"failed to parse line {} from {} as JSON"
,
line_idx
+
1
,
path
.display
()
)
})
?
;
for
(
name
,
values
)
in
[
(
"assistant_response_length"
,
raw
.assistant_response_length
.len
(),
),
(
"tool_call_output_length"
,
raw
.tool_call_output_length
.len
()),
(
"tool_call_latency"
,
raw
.tool_call_latency
.len
()),
]
{
if
values
!=
raw
.num_turns
{
bail!
(
"trace line {} field {} length {} does not match num_turns {}"
,
line_idx
+
1
,
name
,
values
,
raw
.num_turns
);
}
}
if
raw
.input_prompt_length
==
0
{
bail!
(
"trace line {} input_prompt_length must be positive"
,
line_idx
+
1
);
}
let
group_id
=
if
shared_prefix_ratio
>
0.0
&&
num_prefix_groups
>
0
{
Some
(
line_idx
%
num_prefix_groups
)
}
else
{
None
};
let
mut
current_input_length
=
raw
.input_prompt_length
;
let
mut
hash_ids
=
Vec
::
new
();
let
shared_initial_blocks
=
((
current_input_length
.div_ceil
(
trace_block_size
)
as
f64
)
*
shared_prefix_ratio
)
.round
()
as
usize
;
extend_applied_compute_agentic_hash_ids
(
&
mut
hash_ids
,
current_input_length
,
trace_block_size
,
shared_initial_blocks
,
group_id
,
&
mut
next_unique_hash
,
)
?
;
let
mut
turns
=
Vec
::
with_capacity
(
raw
.num_turns
+
1
);
let
mut
next_turn_delay_ms
=
0.0
;
for
turn_idx
in
0
..
raw
.num_turns
{
let
tool_call_latency
=
raw
.tool_call_latency
[
turn_idx
];
if
!
tool_call_latency
.is_finite
()
||
tool_call_latency
<
0.0
{
bail!
(
"trace line {} tool_call_latency[{}] must be a finite non-negative number"
,
line_idx
+
1
,
turn_idx
);
}
turns
.push
(
TurnTrace
{
input_length
:
current_input_length
,
max_output_tokens
:
raw
.assistant_response_length
[
turn_idx
],
hash_ids
:
hash_ids
.clone
(),
delay_after_previous_ms
:
next_turn_delay_ms
,
});
current_input_length
=
current_input_length
.checked_add
(
raw
.assistant_response_length
[
turn_idx
])
.and_then
(|
value
|
value
.checked_add
(
raw
.tool_call_output_length
[
turn_idx
]))
.ok_or_else
(||
{
anyhow!
(
"trace line {} cumulative input length overflow"
,
line_idx
+
1
)
})
?
;
extend_applied_compute_agentic_hash_ids
(
&
mut
hash_ids
,
current_input_length
,
trace_block_size
,
shared_initial_blocks
,
group_id
,
&
mut
next_unique_hash
,
)
?
;
next_turn_delay_ms
=
tool_call_latency
*
1000.0
;
}
turns
.push
(
TurnTrace
{
input_length
:
current_input_length
,
max_output_tokens
:
raw
.final_assistant_response_length
,
hash_ids
,
delay_after_previous_ms
:
next_turn_delay_ms
,
});
sessions
.push
(
SessionTrace
{
session_id
:
format!
(
"applied_compute_agentic_session_{}"
,
line_idx
+
1
),
first_arrival_timestamp_ms
:
None
,
turns
,
});
}
if
sessions
.is_empty
()
{
bail!
(
"trace file {} did not contain any requests"
,
path
.display
());
}
Ok
(
Self
{
block_size
:
trace_block_size
,
sessions
,
})
}
pub
fn
synthetic
(
spec
:
SyntheticTraceSpec
)
->
Result
<
Self
>
{
if
spec
.block_size
==
0
{
bail!
(
"block_size must be greater than 0"
);
...
...
@@ -726,6 +887,31 @@ impl Trace {
}
}
fn
extend_applied_compute_agentic_hash_ids
(
hash_ids
:
&
mut
Vec
<
u64
>
,
input_length
:
usize
,
trace_block_size
:
usize
,
shared_initial_blocks
:
usize
,
group_id
:
Option
<
usize
>
,
next_unique_hash
:
&
mut
u64
,
)
->
Result
<
()
>
{
let
target_blocks
=
input_length
.div_ceil
(
trace_block_size
);
while
hash_ids
.len
()
<
target_blocks
{
let
block_idx
=
hash_ids
.len
();
if
block_idx
<
shared_initial_blocks
&&
let
Some
(
group_id
)
=
group_id
{
hash_ids
.push
(
0xA63E_0000_0000_0000
|
((
group_id
as
u64
)
<<
32
)
|
block_idx
as
u64
);
continue
;
}
hash_ids
.push
(
*
next_unique_hash
);
*
next_unique_hash
=
next_unique_hash
.checked_add
(
1
)
.ok_or_else
(||
anyhow!
(
"synthetic hash id overflow"
))
?
;
}
Ok
(())
}
fn
arrival_spec_mean_gap_ms
(
spec
:
&
ArrivalSpec
)
->
Result
<
f64
>
{
match
spec
{
ArrivalSpec
::
Burst
=>
Ok
(
0.0
),
...
...
lib/mocker/src/loadgen/types.rs
View file @
88478b4b
...
...
@@ -14,6 +14,12 @@ pub struct Trace {
pub
sessions
:
Vec
<
SessionTrace
>
,
}
#[derive(Debug,
Clone,
Copy,
PartialEq,
Eq)]
pub
enum
TraceFileFormat
{
Mooncake
,
AppliedComputeAgentic
,
}
#[derive(Debug,
Clone)]
pub
struct
SessionTrace
{
pub
session_id
:
String
,
...
...
lib/mocker/src/replay/entrypoints.rs
View file @
88478b4b
...
...
@@ -18,7 +18,25 @@ use super::{
TraceSimulationReport
,
};
use
crate
::
common
::
protocols
::{
DirectRequest
,
MockEngineArgs
};
use
crate
::
loadgen
::
Trace
;
use
crate
::
loadgen
::{
Trace
,
TraceFileFormat
};
fn
load_trace_from_file
(
trace_path
:
&
Path
,
trace_block_size
:
usize
,
trace_format
:
TraceFileFormat
,
trace_shared_prefix_ratio
:
f64
,
trace_num_prefix_groups
:
usize
,
)
->
Result
<
Trace
>
{
match
trace_format
{
TraceFileFormat
::
Mooncake
=>
Trace
::
from_mooncake
(
trace_path
,
trace_block_size
),
TraceFileFormat
::
AppliedComputeAgentic
=>
Trace
::
from_applied_compute_agentic
(
trace_path
,
trace_block_size
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
),
}
}
pub
fn
generate_trace_worker_artifacts_offline
(
args
:
MockEngineArgs
,
...
...
@@ -57,12 +75,52 @@ pub fn simulate_trace_file_with_router_mode(
num_workers
:
usize
,
arrival_speedup_ratio
:
f64
,
router_mode
:
ReplayRouterMode
,
)
->
Result
<
TraceSimulationReport
>
{
simulate_trace_file_with_router_mode_and_format
(
args
,
router_config
,
prefill_load_estimator
,
trace_path
,
trace_block_size
,
num_workers
,
arrival_speedup_ratio
,
router_mode
,
TraceFileFormat
::
Mooncake
,
0.0
,
0
,
)
}
#[allow(clippy::too_many_arguments)]
pub
fn
simulate_trace_file_with_router_mode_and_format
(
args
:
MockEngineArgs
,
router_config
:
Option
<
KvRouterConfig
>
,
prefill_load_estimator
:
Option
<
ReplayPrefillLoadEstimator
>
,
trace_path
:
&
Path
,
trace_block_size
:
usize
,
num_workers
:
usize
,
arrival_speedup_ratio
:
f64
,
router_mode
:
ReplayRouterMode
,
trace_format
:
TraceFileFormat
,
trace_shared_prefix_ratio
:
f64
,
trace_num_prefix_groups
:
usize
,
)
->
Result
<
TraceSimulationReport
>
{
let
args
=
args
.normalized
()
?
;
validate_offline_replay_args
(
&
args
,
num_workers
,
router_mode
)
?
;
let
trace
=
Trace
::
from_mooncake
(
trace_path
,
trace_block_size
)
?
.normalize_session_starts
()
?
.speed_up_timing
(
arrival_speedup_ratio
)
?
;
if
trace_format
==
TraceFileFormat
::
AppliedComputeAgentic
{
bail!
(
"applied_compute_agentic trace format requires replay_concurrency because source traces do not contain first-turn timestamps"
);
}
let
trace
=
load_trace_from_file
(
trace_path
,
trace_block_size
,
trace_format
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
)
?
.normalize_session_starts
()
?
.speed_up_timing
(
arrival_speedup_ratio
)
?
;
let
started_at
=
Instant
::
now
();
let
report
=
crate
::
replay
::
offline
::
simulate_trace_workload
(
args
,
...
...
@@ -83,12 +141,50 @@ pub fn simulate_trace_file_disagg_with_router_mode(
trace_block_size
:
usize
,
arrival_speedup_ratio
:
f64
,
router_mode
:
ReplayRouterMode
,
)
->
Result
<
TraceSimulationReport
>
{
simulate_trace_file_disagg_with_router_mode_and_format
(
config
,
router_config
,
prefill_load_estimator
,
trace_path
,
trace_block_size
,
arrival_speedup_ratio
,
router_mode
,
TraceFileFormat
::
Mooncake
,
0.0
,
0
,
)
}
#[allow(clippy::too_many_arguments)]
pub
fn
simulate_trace_file_disagg_with_router_mode_and_format
(
config
:
OfflineDisaggReplayConfig
,
router_config
:
Option
<
KvRouterConfig
>
,
prefill_load_estimator
:
Option
<
ReplayPrefillLoadEstimator
>
,
trace_path
:
&
Path
,
trace_block_size
:
usize
,
arrival_speedup_ratio
:
f64
,
router_mode
:
ReplayRouterMode
,
trace_format
:
TraceFileFormat
,
trace_shared_prefix_ratio
:
f64
,
trace_num_prefix_groups
:
usize
,
)
->
Result
<
TraceSimulationReport
>
{
let
config
=
config
.normalized
()
?
;
validate_offline_disagg_replay_args
(
&
config
,
router_mode
)
?
;
let
trace
=
Trace
::
from_mooncake
(
trace_path
,
trace_block_size
)
?
.normalize_session_starts
()
?
.speed_up_timing
(
arrival_speedup_ratio
)
?
;
if
trace_format
==
TraceFileFormat
::
AppliedComputeAgentic
{
bail!
(
"applied_compute_agentic trace format requires replay_concurrency because source traces do not contain first-turn timestamps"
);
}
let
trace
=
load_trace_from_file
(
trace_path
,
trace_block_size
,
trace_format
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
)
?
.normalize_session_starts
()
?
.speed_up_timing
(
arrival_speedup_ratio
)
?
;
let
started_at
=
Instant
::
now
();
let
report
=
crate
::
replay
::
offline
::
simulate_trace_workload_disagg
(
config
,
...
...
@@ -129,12 +225,52 @@ pub fn simulate_trace_live_file_with_router_mode(
num_workers
:
usize
,
arrival_speedup_ratio
:
f64
,
router_mode
:
ReplayRouterMode
,
)
->
Result
<
TraceSimulationReport
>
{
simulate_trace_live_file_with_router_mode_and_format
(
args
,
router_config
,
prefill_load_estimator
,
trace_path
,
trace_block_size
,
num_workers
,
arrival_speedup_ratio
,
router_mode
,
TraceFileFormat
::
Mooncake
,
0.0
,
0
,
)
}
#[allow(clippy::too_many_arguments)]
pub
fn
simulate_trace_live_file_with_router_mode_and_format
(
args
:
MockEngineArgs
,
router_config
:
Option
<
KvRouterConfig
>
,
prefill_load_estimator
:
Option
<
ReplayPrefillLoadEstimator
>
,
trace_path
:
&
Path
,
trace_block_size
:
usize
,
num_workers
:
usize
,
arrival_speedup_ratio
:
f64
,
router_mode
:
ReplayRouterMode
,
trace_format
:
TraceFileFormat
,
trace_shared_prefix_ratio
:
f64
,
trace_num_prefix_groups
:
usize
,
)
->
Result
<
TraceSimulationReport
>
{
let
args
=
args
.normalized
()
?
;
validate_online_replay_args
(
&
args
,
num_workers
)
?
;
let
trace
=
Trace
::
from_mooncake
(
trace_path
,
trace_block_size
)
?
.normalize_session_starts
()
?
.speed_up_timing
(
arrival_speedup_ratio
)
?
;
if
trace_format
==
TraceFileFormat
::
AppliedComputeAgentic
{
bail!
(
"applied_compute_agentic trace format requires replay_concurrency because source traces do not contain first-turn timestamps"
);
}
let
trace
=
load_trace_from_file
(
trace_path
,
trace_block_size
,
trace_format
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
)
?
.normalize_session_starts
()
?
.speed_up_timing
(
arrival_speedup_ratio
)
?
;
online
::
simulate_trace_workload
(
args
,
router_config
,
...
...
@@ -288,10 +424,45 @@ pub fn simulate_concurrency_file_with_router_mode(
max_in_flight
:
usize
,
num_workers
:
usize
,
router_mode
:
ReplayRouterMode
,
)
->
Result
<
TraceSimulationReport
>
{
simulate_concurrency_file_with_router_mode_and_format
(
args
,
router_config
,
prefill_load_estimator
,
trace_path
,
trace_block_size
,
max_in_flight
,
num_workers
,
router_mode
,
TraceFileFormat
::
Mooncake
,
0.0
,
0
,
)
}
#[allow(clippy::too_many_arguments)]
pub
fn
simulate_concurrency_file_with_router_mode_and_format
(
args
:
MockEngineArgs
,
router_config
:
Option
<
KvRouterConfig
>
,
prefill_load_estimator
:
Option
<
ReplayPrefillLoadEstimator
>
,
trace_path
:
&
Path
,
trace_block_size
:
usize
,
max_in_flight
:
usize
,
num_workers
:
usize
,
router_mode
:
ReplayRouterMode
,
trace_format
:
TraceFileFormat
,
trace_shared_prefix_ratio
:
f64
,
trace_num_prefix_groups
:
usize
,
)
->
Result
<
TraceSimulationReport
>
{
let
args
=
args
.normalized
()
?
;
validate_offline_concurrency_args
(
&
args
,
num_workers
,
max_in_flight
,
router_mode
)
?
;
let
trace
=
Trace
::
from_mooncake
(
trace_path
,
trace_block_size
)
?
;
let
trace
=
load_trace_from_file
(
trace_path
,
trace_block_size
,
trace_format
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
)
?
;
let
started_at
=
Instant
::
now
();
let
report
=
simulate_concurrency_workload_with_router_mode
(
args
,
...
...
@@ -313,10 +484,43 @@ pub fn simulate_concurrency_file_disagg_with_router_mode(
trace_block_size
:
usize
,
max_in_flight
:
usize
,
router_mode
:
ReplayRouterMode
,
)
->
Result
<
TraceSimulationReport
>
{
simulate_concurrency_file_disagg_with_router_mode_and_format
(
config
,
router_config
,
prefill_load_estimator
,
trace_path
,
trace_block_size
,
max_in_flight
,
router_mode
,
TraceFileFormat
::
Mooncake
,
0.0
,
0
,
)
}
#[allow(clippy::too_many_arguments)]
pub
fn
simulate_concurrency_file_disagg_with_router_mode_and_format
(
config
:
OfflineDisaggReplayConfig
,
router_config
:
Option
<
KvRouterConfig
>
,
prefill_load_estimator
:
Option
<
ReplayPrefillLoadEstimator
>
,
trace_path
:
&
Path
,
trace_block_size
:
usize
,
max_in_flight
:
usize
,
router_mode
:
ReplayRouterMode
,
trace_format
:
TraceFileFormat
,
trace_shared_prefix_ratio
:
f64
,
trace_num_prefix_groups
:
usize
,
)
->
Result
<
TraceSimulationReport
>
{
let
config
=
config
.normalized
()
?
;
validate_offline_disagg_concurrency_args
(
&
config
,
max_in_flight
,
router_mode
)
?
;
let
trace
=
Trace
::
from_mooncake
(
trace_path
,
trace_block_size
)
?
;
let
trace
=
load_trace_from_file
(
trace_path
,
trace_block_size
,
trace_format
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
)
?
;
let
started_at
=
Instant
::
now
();
let
report
=
simulate_concurrency_workload_disagg_with_router_mode
(
config
,
...
...
@@ -358,10 +562,45 @@ pub fn simulate_concurrency_live_file_with_router_mode(
max_in_flight
:
usize
,
num_workers
:
usize
,
router_mode
:
ReplayRouterMode
,
)
->
Result
<
TraceSimulationReport
>
{
simulate_concurrency_live_file_with_router_mode_and_format
(
args
,
router_config
,
prefill_load_estimator
,
trace_path
,
trace_block_size
,
max_in_flight
,
num_workers
,
router_mode
,
TraceFileFormat
::
Mooncake
,
0.0
,
0
,
)
}
#[allow(clippy::too_many_arguments)]
pub
fn
simulate_concurrency_live_file_with_router_mode_and_format
(
args
:
MockEngineArgs
,
router_config
:
Option
<
KvRouterConfig
>
,
prefill_load_estimator
:
Option
<
ReplayPrefillLoadEstimator
>
,
trace_path
:
&
Path
,
trace_block_size
:
usize
,
max_in_flight
:
usize
,
num_workers
:
usize
,
router_mode
:
ReplayRouterMode
,
trace_format
:
TraceFileFormat
,
trace_shared_prefix_ratio
:
f64
,
trace_num_prefix_groups
:
usize
,
)
->
Result
<
TraceSimulationReport
>
{
let
args
=
args
.normalized
()
?
;
validate_online_concurrency_args
(
&
args
,
num_workers
,
max_in_flight
)
?
;
let
trace
=
Trace
::
from_mooncake
(
trace_path
,
trace_block_size
)
?
;
let
trace
=
load_trace_from_file
(
trace_path
,
trace_block_size
,
trace_format
,
trace_shared_prefix_ratio
,
trace_num_prefix_groups
,
)
?
;
online
::
simulate_concurrency_workload
(
args
,
router_config
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment