Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
02b1c58a
Unverified
Commit
02b1c58a
authored
Mar 25, 2026
by
Yan Ru Pei
Committed by
GitHub
Mar 25, 2026
Browse files
feat(mocker): add offline disagg replay (#7617)
Signed-off-by:
PeaBrane
<
yanrpei@gmail.com
>
parent
4b8826b3
Changes
68
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
248 additions
and
14 deletions
+248
-14
lib/mocker/src/scheduler/sglang/config.rs
lib/mocker/src/scheduler/sglang/config.rs
+4
-0
lib/mocker/src/scheduler/sglang/core.rs
lib/mocker/src/scheduler/sglang/core.rs
+4
-0
lib/mocker/src/scheduler/sglang/decode.rs
lib/mocker/src/scheduler/sglang/decode.rs
+8
-0
lib/mocker/src/scheduler/sglang/tests.rs
lib/mocker/src/scheduler/sglang/tests.rs
+37
-1
lib/mocker/src/scheduler/vllm/core.rs
lib/mocker/src/scheduler/vllm/core.rs
+23
-1
lib/mocker/src/scheduler/vllm/tests.rs
lib/mocker/src/scheduler/vllm/tests.rs
+34
-0
tests/mocker/test_config.py
tests/mocker/test_config.py
+99
-0
tests/router/test_router_e2e_with_mockers.py
tests/router/test_router_e2e_with_mockers.py
+39
-12
No files found.
lib/mocker/src/scheduler/sglang/config.rs
View file @
02b1c58a
...
@@ -36,6 +36,8 @@ pub(super) struct SglangConfig {
...
@@ -36,6 +36,8 @@ pub(super) struct SglangConfig {
pub
(
super
)
decode_speedup_ratio
:
f64
,
pub
(
super
)
decode_speedup_ratio
:
f64
,
pub
(
super
)
worker_type
:
WorkerType
,
pub
(
super
)
worker_type
:
WorkerType
,
pub
(
super
)
block_size
:
usize
,
pub
(
super
)
block_size
:
usize
,
pub
(
super
)
kv_bytes_per_token
:
Option
<
usize
>
,
pub
(
super
)
kv_transfer_bandwidth
:
Option
<
f64
>
,
}
}
impl
SglangConfig
{
impl
SglangConfig
{
...
@@ -81,6 +83,8 @@ impl SglangConfig {
...
@@ -81,6 +83,8 @@ impl SglangConfig {
decode_speedup_ratio
:
args
.decode_speedup_ratio
,
decode_speedup_ratio
:
args
.decode_speedup_ratio
,
worker_type
:
args
.worker_type
,
worker_type
:
args
.worker_type
,
block_size
:
args
.block_size
,
block_size
:
args
.block_size
,
kv_bytes_per_token
:
args
.kv_bytes_per_token
,
kv_transfer_bandwidth
:
args
.kv_transfer_bandwidth
,
}
}
}
}
}
}
...
...
lib/mocker/src/scheduler/sglang/core.rs
View file @
02b1c58a
...
@@ -101,6 +101,10 @@ impl SglangCore {
...
@@ -101,6 +101,10 @@ impl SglangCore {
self
.execute_pass_internal
(
Some
(
collector
),
now_ms
)
self
.execute_pass_internal
(
Some
(
collector
),
now_ms
)
}
}
pub
(
crate
)
fn
execute_hidden_pass
(
&
mut
self
,
now_ms
:
f64
)
->
EnginePassResult
{
self
.execute_pass_internal
(
None
,
now_ms
)
}
pub
(
super
)
fn
execute_pass_internal
(
pub
(
super
)
fn
execute_pass_internal
(
&
mut
self
,
&
mut
self
,
mut
collector
:
Option
<&
mut
TraceCollector
>
,
mut
collector
:
Option
<&
mut
TraceCollector
>
,
...
...
lib/mocker/src/scheduler/sglang/decode.rs
View file @
02b1c58a
...
@@ -4,6 +4,7 @@
...
@@ -4,6 +4,7 @@
use
std
::
time
::
Duration
;
use
std
::
time
::
Duration
;
use
crate
::
common
::
protocols
::
OutputSignal
;
use
crate
::
common
::
protocols
::
OutputSignal
;
use
crate
::
common
::
utils
::
compute_prefill_handoff_delay_ms
;
use
crate
::
kv_manager
::
SglangKvManager
;
use
crate
::
kv_manager
::
SglangKvManager
;
use
super
::
config
::{
SglangConfig
,
floor_to_block
};
use
super
::
config
::{
SglangConfig
,
floor_to_block
};
...
@@ -179,6 +180,13 @@ pub(super) fn simulate_decode_step(
...
@@ -179,6 +180,13 @@ pub(super) fn simulate_decode_step(
output_signals
.push
(
OutputSignal
{
output_signals
.push
(
OutputSignal
{
uuid
:
req
.uuid
,
uuid
:
req
.uuid
,
completed
:
is_complete
,
completed
:
is_complete
,
handoff_delay_ms
:
compute_prefill_handoff_delay_ms
(
config
.worker_type
,
is_complete
,
req
.prompt_len
(),
config
.kv_transfer_bandwidth
,
config
.kv_bytes_per_token
,
),
});
});
if
is_complete
{
if
is_complete
{
...
...
lib/mocker/src/scheduler/sglang/tests.rs
View file @
02b1c58a
...
@@ -860,7 +860,6 @@ mod router_events {
...
@@ -860,7 +860,6 @@ mod router_events {
}
}
}
}
}
}
assert_eq!
(
seen
,
expected
);
assert_eq!
(
seen
,
expected
);
drop
(
scheduler
);
drop
(
scheduler
);
drop
(
sink
);
drop
(
sink
);
...
@@ -871,4 +870,41 @@ mod router_events {
...
@@ -871,4 +870,41 @@ mod router_events {
assert
!
(
harness
.ok_count
(
METRIC_EVENT_REMOVED
)
>
0
);
assert
!
(
harness
.ok_count
(
METRIC_EVENT_REMOVED
)
>
0
);
harness
.shutdown
();
harness
.shutdown
();
}
}
#[test]
fn
test_prefill_completion_emits_handoff_delay
()
{
let
args
=
MockEngineArgs
::
builder
()
.engine_type
(
EngineType
::
Sglang
)
.num_gpu_blocks
(
64
)
.block_size
(
4
)
.worker_type
(
crate
::
common
::
protocols
::
WorkerType
::
Prefill
)
.kv_transfer_bandwidth
(
Some
(
1.0
))
.kv_bytes_per_token
(
Some
(
1_000_000
))
.speedup_ratio
(
0.0
)
.sglang
(
Some
(
SglangArgs
{
page_size
:
Some
(
4
),
chunked_prefill_size
:
Some
(
16
),
..
Default
::
default
()
}))
.build
()
.unwrap
();
let
mut
core
=
SglangCore
::
new
(
args
);
core
.receive
(
DirectRequest
{
tokens
:
vec!
[
1
;
8
],
max_output_tokens
:
1
,
uuid
:
Some
(
Uuid
::
from_u128
(
91
)),
dp_rank
:
0
,
arrival_timestamp_ms
:
None
,
});
let
mut
collector
=
crate
::
replay
::
TraceCollector
::
default
();
let
pass
=
core
.execute_pass
(
&
mut
collector
,
0.0
);
let
signal
=
pass
.output_signals
.first
()
.expect
(
"prefill pass should emit one completed signal"
);
assert
!
(
signal
.completed
);
assert_eq!
(
signal
.handoff_delay_ms
,
Some
(
8.0
));
}
}
}
lib/mocker/src/scheduler/vllm/core.rs
View file @
02b1c58a
...
@@ -14,6 +14,7 @@ use crate::common::protocols::{
...
@@ -14,6 +14,7 @@ use crate::common::protocols::{
WorkerType
,
WorkerType
,
};
};
use
crate
::
common
::
sequence
::
ActiveSequence
;
use
crate
::
common
::
sequence
::
ActiveSequence
;
use
crate
::
common
::
utils
::
compute_prefill_handoff_delay_ms
;
use
crate
::
kv_manager
::
KvManager
;
use
crate
::
kv_manager
::
KvManager
;
use
crate
::
replay
::
TraceCollector
;
use
crate
::
replay
::
TraceCollector
;
use
crate
::
scheduler
::{
use
crate
::
scheduler
::{
...
@@ -278,6 +279,10 @@ impl VllmCore {
...
@@ -278,6 +279,10 @@ impl VllmCore {
self
.execute_pass_internal
(
Some
(
collector
),
now_ms
,
None
)
self
.execute_pass_internal
(
Some
(
collector
),
now_ms
,
None
)
}
}
pub
(
crate
)
fn
execute_hidden_pass
(
&
mut
self
,
now_ms
:
f64
)
->
EnginePassResult
{
self
.execute_pass_internal
(
None
,
now_ms
,
None
)
}
pub
(
super
)
fn
execute_pass_internal
(
pub
(
super
)
fn
execute_pass_internal
(
&
mut
self
,
&
mut
self
,
mut
collector
:
Option
<&
mut
TraceCollector
>
,
mut
collector
:
Option
<&
mut
TraceCollector
>
,
...
@@ -641,8 +646,25 @@ impl VllmCore {
...
@@ -641,8 +646,25 @@ impl VllmCore {
}
}
if
let
Some
(
request
)
=
self
.state.requests
.get
(
&
uuid
)
{
if
let
Some
(
request
)
=
self
.state.requests
.get
(
&
uuid
)
{
debug_assert_vllm_request_progress
(
uuid
,
request
);
debug_assert_vllm_request_progress
(
uuid
,
request
);
let
handoff_delay_ms
=
compute_prefill_handoff_delay_ms
(
self
.args.worker_type
,
completed
,
request
.sequence
.num_input_tokens
(),
self
.args.kv_transfer_bandwidth
,
self
.args.kv_bytes_per_token
,
);
output_signals
.push
(
OutputSignal
{
uuid
,
completed
,
handoff_delay_ms
,
});
}
else
{
output_signals
.push
(
OutputSignal
{
uuid
,
completed
,
handoff_delay_ms
:
None
,
});
}
}
output_signals
.push
(
OutputSignal
{
uuid
,
completed
});
if
completed
{
if
completed
{
self
.state
.complete
(
&
uuid
);
self
.state
.complete
(
&
uuid
);
}
}
...
...
lib/mocker/src/scheduler/vllm/tests.rs
View file @
02b1c58a
...
@@ -175,6 +175,40 @@ mod core_behavior {
...
@@ -175,6 +175,40 @@ mod core_behavior {
);
);
}
}
#[test]
fn
test_prefill_completion_emits_handoff_delay
()
{
let
args
=
MockEngineArgs
::
builder
()
.block_size
(
4
)
.num_gpu_blocks
(
8
)
.max_num_batched_tokens
(
Some
(
8
))
.max_num_seqs
(
Some
(
1
))
.enable_chunked_prefill
(
true
)
.worker_type
(
crate
::
common
::
protocols
::
WorkerType
::
Prefill
)
.kv_transfer_bandwidth
(
Some
(
1.0
))
.kv_bytes_per_token
(
Some
(
1_000_000
))
.speedup_ratio
(
0.0
)
.build
()
.unwrap
();
let
mut
core
=
VllmCore
::
new
(
args
);
core
.receive
(
DirectRequest
{
tokens
:
vec!
[
1
;
8
],
max_output_tokens
:
1
,
uuid
:
Some
(
Uuid
::
from_u128
(
81
)),
dp_rank
:
0
,
arrival_timestamp_ms
:
None
,
});
let
mut
collector
=
crate
::
replay
::
TraceCollector
::
default
();
let
pass
=
core
.execute_pass
(
&
mut
collector
,
0.0
);
let
signal
=
pass
.output_signals
.first
()
.expect
(
"prefill pass should emit one completed signal"
);
assert
!
(
signal
.completed
);
assert_eq!
(
signal
.handoff_delay_ms
,
Some
(
8.0
));
}
#[test]
#[test]
fn
test_first_token_can_arrive_on_prompt_completion_pass
()
{
fn
test_first_token_can_arrive_on_prompt_completion_pass
()
{
let
mut
core
=
VllmCore
::
new
(
make_args
());
let
mut
core
=
VllmCore
::
new
(
make_args
());
...
...
tests/mocker/test_config.py
View file @
02b1c58a
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
importlib.util
import
importlib.util
import
json
from
pathlib
import
Path
from
pathlib
import
Path
from
types
import
SimpleNamespace
from
types
import
SimpleNamespace
import
numpy
as
np
import
pytest
import
pytest
from
dynamo.llm
import
EngineType
,
EntrypointArgs
from
dynamo.llm
import
EngineType
,
EntrypointArgs
...
@@ -133,3 +136,99 @@ def test_entrypoint_args_accept_typed_mocker_engine_args():
...
@@ -133,3 +136,99 @@ def test_entrypoint_args_accept_typed_mocker_engine_args():
)
)
assert
entrypoint_args
is
not
None
assert
entrypoint_args
is
not
None
def
test_build_mocker_engine_args_preserves_cli_mapped_fields
(
tmp_path
):
planner_profile_data
=
tmp_path
/
"planner_profile_data.npz"
np
.
savez
(
planner_profile_data
,
prefill_isl
=
np
.
array
([
128.0
,
256.0
]),
prefill_ttft_ms
=
np
.
array
([
4.0
,
8.0
]),
decode_active_kv_tokens
=
np
.
array
([
1024.0
,
2048.0
]),
decode_context_length
=
np
.
array
([
128.0
,
256.0
]),
decode_itl
=
np
.
array
([[
1.0
,
1.5
],
[
2.0
,
2.5
]]),
)
args
=
argparse
.
Namespace
(
engine_type
=
"sglang"
,
num_gpu_blocks
=
2048
,
block_size
=
128
,
max_num_seqs
=
64
,
max_num_batched_tokens
=
4096
,
enable_prefix_caching
=
False
,
enable_chunked_prefill
=
False
,
preemption_mode
=
"fifo"
,
speedup_ratio
=
2.0
,
decode_speedup_ratio
=
3.0
,
dp_size
=
4
,
startup_time
=
1.5
,
planner_profile_data
=
planner_profile_data
,
is_prefill_worker
=
True
,
is_decode_worker
=
False
,
durable_kv_events
=
False
,
kv_transfer_bandwidth
=
123.0
,
reasoning
=
json
.
dumps
(
{
"start_thinking_token_id"
:
11
,
"end_thinking_token_id"
:
12
,
"thinking_ratio"
:
0.25
,
}
),
sglang_schedule_policy
=
"lpm"
,
sglang_page_size
=
128
,
sglang_max_prefill_tokens
=
8192
,
sglang_chunked_prefill_size
=
2048
,
sglang_clip_max_new_tokens
=
1024
,
sglang_schedule_conservativeness
=
0.8
,
aic_perf_model
=
True
,
aic_system
=
"h200_sxm"
,
aic_backend_version
=
"0.5.6.post2"
,
aic_tp_size
=
8
,
model_path
=
"/models/mock"
,
)
engine_args
=
CONFIG
.
build_mocker_engine_args
(
args
)
payload
=
json
.
loads
(
engine_args
.
dump_json
())
assert
payload
==
{
"engine_type"
:
"sglang"
,
"num_gpu_blocks"
:
2048
,
"block_size"
:
128
,
"max_num_seqs"
:
64
,
"max_num_batched_tokens"
:
4096
,
"enable_prefix_caching"
:
False
,
"enable_chunked_prefill"
:
False
,
"speedup_ratio"
:
2.0
,
"decode_speedup_ratio"
:
3.0
,
"dp_size"
:
4
,
"startup_time"
:
1.5
,
"worker_type"
:
"prefill"
,
"planner_profile_data"
:
str
(
planner_profile_data
),
"aic_backend"
:
"sglang"
,
"aic_system"
:
"h200_sxm"
,
"aic_backend_version"
:
"0.5.6.post2"
,
"aic_tp_size"
:
8
,
"aic_model_path"
:
"/models/mock"
,
"enable_local_indexer"
:
True
,
"bootstrap_port"
:
None
,
"kv_bytes_per_token"
:
None
,
"kv_transfer_bandwidth"
:
123.0
,
"reasoning"
:
{
"start_thinking_token_id"
:
11
,
"end_thinking_token_id"
:
12
,
"thinking_ratio"
:
0.25
,
},
"zmq_kv_events_port"
:
None
,
"zmq_replay_port"
:
None
,
"preemption_mode"
:
"fifo"
,
"router_queue_policy"
:
None
,
"sglang"
:
{
"schedule_policy"
:
"lpm"
,
"page_size"
:
128
,
"max_prefill_tokens"
:
8192
,
"chunked_prefill_size"
:
2048
,
"clip_max_new_tokens"
:
1024
,
"schedule_conservativeness"
:
0.8
,
},
"has_perf_model"
:
True
,
}
tests/router/test_router_e2e_with_mockers.py
View file @
02b1c58a
...
@@ -8,14 +8,10 @@
...
@@ -8,14 +8,10 @@
# endpoint tables) that races under concurrent xdist workers. Do not add
# endpoint tables) that races under concurrent xdist workers. Do not add
# @pytest.mark.parallel until DRT endpoint registration is confirmed thread-safe.
# @pytest.mark.parallel until DRT endpoint registration is confirmed thread-safe.
#
#
# NOTE: TCP request plane is NOT tested here. These tests use --num-workers > 1 which spawns
# multiple workers in a single process sharing one TCP server. The shared TCP server uses
# endpoint_path (e.g., "generate") as the routing key, causing handler collisions when multiple
# workers register the same endpoint. This is a test-only limitation; production deployments
# with separate processes per worker work correctly with TCP.
import
asyncio
import
asyncio
import
logging
import
logging
import
os
import
os
from
pathlib
import
Path
from
typing
import
Any
,
Dict
,
Optional
from
typing
import
Any
,
Dict
,
Optional
import
aiohttp
import
aiohttp
...
@@ -64,6 +60,9 @@ BASE_PORT_BOOTSTRAP = 10100 # Base port for disagg bootstrap rendezvous
...
@@ -64,6 +60,9 @@ BASE_PORT_BOOTSTRAP = 10100 # Base port for disagg bootstrap rendezvous
BASE_PORT_ZMQ
=
11100
# Base port for ZMQ KV event publishing
BASE_PORT_ZMQ
=
11100
# Base port for ZMQ KV event publishing
NUM_REQUESTS
=
100
NUM_REQUESTS
=
100
BLOCK_SIZE
=
16
BLOCK_SIZE
=
16
PLANNER_PROFILE_DATA_DIR
=
(
Path
(
__file__
).
resolve
().
parents
[
1
]
/
"planner/profiling_results/H200_TP1P_TP1D"
)
def
get_unique_ports
(
def
get_unique_ports
(
...
@@ -172,6 +171,20 @@ def _build_mocker_command(
...
@@ -172,6 +171,20 @@ def _build_mocker_command(
command
.
extend
([
"--preemption-mode"
,
str
(
mocker_args
[
"preemption_mode"
])])
command
.
extend
([
"--preemption-mode"
,
str
(
mocker_args
[
"preemption_mode"
])])
if
"dp_size"
in
mocker_args
:
if
"dp_size"
in
mocker_args
:
command
.
extend
([
"--data-parallel-size"
,
str
(
mocker_args
[
"dp_size"
])])
command
.
extend
([
"--data-parallel-size"
,
str
(
mocker_args
[
"dp_size"
])])
if
"planner_profile_data"
in
mocker_args
:
command
.
extend
(
[
"--planner-profile-data"
,
str
(
mocker_args
[
"planner_profile_data"
])]
)
if
mocker_args
.
get
(
"aic_perf_model"
)
is
True
:
command
.
append
(
"--aic-perf-model"
)
if
"aic_system"
in
mocker_args
:
command
.
extend
([
"--aic-system"
,
str
(
mocker_args
[
"aic_system"
])])
if
"aic_backend_version"
in
mocker_args
:
command
.
extend
(
[
"--aic-backend-version"
,
str
(
mocker_args
[
"aic_backend_version"
])]
)
if
"aic_tp_size"
in
mocker_args
:
command
.
extend
([
"--aic-tp-size"
,
str
(
mocker_args
[
"aic_tp_size"
])])
# Use --durable-kv-events to enable JetStream mode (local indexer disabled)
# Use --durable-kv-events to enable JetStream mode (local indexer disabled)
if
mocker_args
.
get
(
"durable_kv_events"
)
is
True
:
if
mocker_args
.
get
(
"durable_kv_events"
)
is
True
:
command
.
append
(
"--durable-kv-events"
)
command
.
append
(
"--durable-kv-events"
)
...
@@ -641,17 +654,29 @@ class DisaggMockerProcess:
...
@@ -641,17 +654,29 @@ class DisaggMockerProcess:
@
pytest
.
mark
.
timeout
(
120
)
# bumped for xdist contention (was 42s; ~13.80s serial avg)
@
pytest
.
mark
.
timeout
(
120
)
# bumped for xdist contention (was 42s; ~13.80s serial avg)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"router_mode,durable_kv_events"
,
"router_mode,durable_kv_events
,mocker_args_override
"
,
[
[
pytest
.
param
(
"kv"
,
False
,
id
=
"kv-nondurable"
),
pytest
.
param
(
"kv"
,
False
,
{},
id
=
"kv-nondurable"
),
pytest
.
param
(
"kv"
,
True
,
id
=
"kv-durable"
),
pytest
.
param
(
pytest
.
param
(
"round-robin"
,
False
,
id
=
"roundrobin"
),
"kv"
,
pytest
.
param
(
"random"
,
False
,
id
=
"random"
),
False
,
pytest
.
param
(
"power-of-two"
,
False
,
id
=
"power-of-two"
),
{
"planner_profile_data"
:
PLANNER_PROFILE_DATA_DIR
},
id
=
"kv-planner"
,
),
pytest
.
param
(
"kv"
,
False
,
{
"aic_perf_model"
:
True
,
"aic_system"
:
"h200_sxm"
},
id
=
"kv-aic"
,
),
pytest
.
param
(
"kv"
,
True
,
{},
id
=
"kv-durable"
),
pytest
.
param
(
"round-robin"
,
False
,
{},
id
=
"roundrobin"
),
pytest
.
param
(
"random"
,
False
,
{},
id
=
"random"
),
pytest
.
param
(
"power-of-two"
,
False
,
{},
id
=
"power-of-two"
),
],
],
indirect
=
[
"durable_kv_events"
],
indirect
=
[
"durable_kv_events"
],
)
)
@
pytest
.
mark
.
parametrize
(
"request_plane"
,
[
"nats"
,
"tcp"
],
indirect
=
True
)
@
pytest
.
mark
.
parametrize
(
"request_plane"
,
[
"tcp"
],
indirect
=
True
)
def
test_mocker_router
(
def
test_mocker_router
(
request
,
request
,
runtime_services_dynamic_ports
,
runtime_services_dynamic_ports
,
...
@@ -659,6 +684,7 @@ def test_mocker_router(
...
@@ -659,6 +684,7 @@ def test_mocker_router(
router_mode
,
router_mode
,
request_plane
,
request_plane
,
durable_kv_events
,
durable_kv_events
,
mocker_args_override
,
):
):
"""Test router with multiple mocker engine instances across all router modes.
"""Test router with multiple mocker engine instances across all router modes.
...
@@ -675,6 +701,7 @@ def test_mocker_router(
...
@@ -675,6 +701,7 @@ def test_mocker_router(
"block_size"
:
BLOCK_SIZE
,
"block_size"
:
BLOCK_SIZE
,
"durable_kv_events"
:
durable_kv_events
,
"durable_kv_events"
:
durable_kv_events
,
}
}
mocker_args
.
update
(
mocker_args_override
)
with
MockerProcess
(
with
MockerProcess
(
request
,
request
,
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment