Commit 02b1c58a (unverified), authored by Yan Ru Pei, committed by GitHub
Browse files

feat(mocker): add offline disagg replay (#7617)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 4b8826b3
...@@ -36,6 +36,8 @@ pub(super) struct SglangConfig { ...@@ -36,6 +36,8 @@ pub(super) struct SglangConfig {
pub(super) decode_speedup_ratio: f64, pub(super) decode_speedup_ratio: f64,
pub(super) worker_type: WorkerType, pub(super) worker_type: WorkerType,
pub(super) block_size: usize, pub(super) block_size: usize,
pub(super) kv_bytes_per_token: Option<usize>,
pub(super) kv_transfer_bandwidth: Option<f64>,
} }
impl SglangConfig { impl SglangConfig {
...@@ -81,6 +83,8 @@ impl SglangConfig { ...@@ -81,6 +83,8 @@ impl SglangConfig {
decode_speedup_ratio: args.decode_speedup_ratio, decode_speedup_ratio: args.decode_speedup_ratio,
worker_type: args.worker_type, worker_type: args.worker_type,
block_size: args.block_size, block_size: args.block_size,
kv_bytes_per_token: args.kv_bytes_per_token,
kv_transfer_bandwidth: args.kv_transfer_bandwidth,
} }
} }
} }
......
...@@ -101,6 +101,10 @@ impl SglangCore { ...@@ -101,6 +101,10 @@ impl SglangCore {
self.execute_pass_internal(Some(collector), now_ms) self.execute_pass_internal(Some(collector), now_ms)
} }
/// Runs a single engine pass at `now_ms` without a trace collector.
///
/// Delegates to `execute_pass_internal`, supplying `None` where
/// `execute_pass` would supply `Some(collector)`.
pub(crate) fn execute_hidden_pass(&mut self, now_ms: f64) -> EnginePassResult {
    let collector = None;
    self.execute_pass_internal(collector, now_ms)
}
pub(super) fn execute_pass_internal( pub(super) fn execute_pass_internal(
&mut self, &mut self,
mut collector: Option<&mut TraceCollector>, mut collector: Option<&mut TraceCollector>,
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
use std::time::Duration; use std::time::Duration;
use crate::common::protocols::OutputSignal; use crate::common::protocols::OutputSignal;
use crate::common::utils::compute_prefill_handoff_delay_ms;
use crate::kv_manager::SglangKvManager; use crate::kv_manager::SglangKvManager;
use super::config::{SglangConfig, floor_to_block}; use super::config::{SglangConfig, floor_to_block};
...@@ -179,6 +180,13 @@ pub(super) fn simulate_decode_step( ...@@ -179,6 +180,13 @@ pub(super) fn simulate_decode_step(
output_signals.push(OutputSignal { output_signals.push(OutputSignal {
uuid: req.uuid, uuid: req.uuid,
completed: is_complete, completed: is_complete,
handoff_delay_ms: compute_prefill_handoff_delay_ms(
config.worker_type,
is_complete,
req.prompt_len(),
config.kv_transfer_bandwidth,
config.kv_bytes_per_token,
),
}); });
if is_complete { if is_complete {
......
...@@ -860,7 +860,6 @@ mod router_events { ...@@ -860,7 +860,6 @@ mod router_events {
} }
} }
} }
assert_eq!(seen, expected); assert_eq!(seen, expected);
drop(scheduler); drop(scheduler);
drop(sink); drop(sink);
...@@ -871,4 +870,41 @@ mod router_events { ...@@ -871,4 +870,41 @@ mod router_events {
assert!(harness.ok_count(METRIC_EVENT_REMOVED) > 0); assert!(harness.ok_count(METRIC_EVENT_REMOVED) > 0);
harness.shutdown(); harness.shutdown();
} }
/// A prefill-type SGLang worker must attach a handoff delay to the completed
/// output signal of its first pass.
///
/// The worker is configured with `kv_transfer_bandwidth = Some(1.0)` and
/// `kv_bytes_per_token = Some(1_000_000)` and is fed one 8-token request.
/// NOTE(review): the expected `Some(8.0)` presumably follows from
/// 8 tokens x 1_000_000 bytes at bandwidth 1.0; the formula lives in
/// `compute_prefill_handoff_delay_ms`, which is not visible here — confirm.
#[test]
fn test_prefill_completion_emits_handoff_delay() {
    let sglang_args = SglangArgs {
        page_size: Some(4),
        chunked_prefill_size: Some(16),
        ..Default::default()
    };
    let engine_args = MockEngineArgs::builder()
        .engine_type(EngineType::Sglang)
        .num_gpu_blocks(64)
        .block_size(4)
        .worker_type(crate::common::protocols::WorkerType::Prefill)
        .kv_transfer_bandwidth(Some(1.0))
        .kv_bytes_per_token(Some(1_000_000))
        .speedup_ratio(0.0)
        .sglang(Some(sglang_args))
        .build()
        .unwrap();

    // One request whose whole prompt (8 tokens) fits in a single prefill chunk (16).
    let request = DirectRequest {
        tokens: vec![1; 8],
        max_output_tokens: 1,
        uuid: Some(Uuid::from_u128(91)),
        dp_rank: 0,
        arrival_timestamp_ms: None,
    };
    let mut core = SglangCore::new(engine_args);
    core.receive(request);

    let mut trace = crate::replay::TraceCollector::default();
    let result = core.execute_pass(&mut trace, 0.0);

    let first_signal = result
        .output_signals
        .first()
        .expect("prefill pass should emit one completed signal");
    assert!(first_signal.completed);
    assert_eq!(first_signal.handoff_delay_ms, Some(8.0));
}
} }
...@@ -14,6 +14,7 @@ use crate::common::protocols::{ ...@@ -14,6 +14,7 @@ use crate::common::protocols::{
WorkerType, WorkerType,
}; };
use crate::common::sequence::ActiveSequence; use crate::common::sequence::ActiveSequence;
use crate::common::utils::compute_prefill_handoff_delay_ms;
use crate::kv_manager::KvManager; use crate::kv_manager::KvManager;
use crate::replay::TraceCollector; use crate::replay::TraceCollector;
use crate::scheduler::{ use crate::scheduler::{
...@@ -278,6 +279,10 @@ impl VllmCore { ...@@ -278,6 +279,10 @@ impl VllmCore {
self.execute_pass_internal(Some(collector), now_ms, None) self.execute_pass_internal(Some(collector), now_ms, None)
} }
/// Runs a single engine pass at `now_ms` without a trace collector.
///
/// Delegates to `execute_pass_internal`, supplying `None` where
/// `execute_pass` would supply `Some(collector)`; the trailing argument is
/// `None` as well, matching what `execute_pass` passes.
pub(crate) fn execute_hidden_pass(&mut self, now_ms: f64) -> EnginePassResult {
    let collector = None;
    self.execute_pass_internal(collector, now_ms, None)
}
pub(super) fn execute_pass_internal( pub(super) fn execute_pass_internal(
&mut self, &mut self,
mut collector: Option<&mut TraceCollector>, mut collector: Option<&mut TraceCollector>,
...@@ -641,8 +646,25 @@ impl VllmCore { ...@@ -641,8 +646,25 @@ impl VllmCore {
} }
if let Some(request) = self.state.requests.get(&uuid) { if let Some(request) = self.state.requests.get(&uuid) {
debug_assert_vllm_request_progress(uuid, request); debug_assert_vllm_request_progress(uuid, request);
let handoff_delay_ms = compute_prefill_handoff_delay_ms(
self.args.worker_type,
completed,
request.sequence.num_input_tokens(),
self.args.kv_transfer_bandwidth,
self.args.kv_bytes_per_token,
);
output_signals.push(OutputSignal {
uuid,
completed,
handoff_delay_ms,
});
} else {
output_signals.push(OutputSignal {
uuid,
completed,
handoff_delay_ms: None,
});
} }
output_signals.push(OutputSignal { uuid, completed });
if completed { if completed {
self.state.complete(&uuid); self.state.complete(&uuid);
} }
......
...@@ -175,6 +175,40 @@ mod core_behavior { ...@@ -175,6 +175,40 @@ mod core_behavior {
); );
} }
/// A prefill-type vLLM worker must attach a handoff delay to the completed
/// output signal of its first pass.
///
/// The worker is configured with `kv_transfer_bandwidth = Some(1.0)` and
/// `kv_bytes_per_token = Some(1_000_000)` and is fed one 8-token request,
/// which matches the `max_num_batched_tokens` budget of 8.
/// NOTE(review): the expected `Some(8.0)` presumably follows from
/// 8 tokens x 1_000_000 bytes at bandwidth 1.0; the formula lives in
/// `compute_prefill_handoff_delay_ms`, which is not visible here — confirm.
#[test]
fn test_prefill_completion_emits_handoff_delay() {
    let engine_args = MockEngineArgs::builder()
        .block_size(4)
        .num_gpu_blocks(8)
        .max_num_batched_tokens(Some(8))
        .max_num_seqs(Some(1))
        .enable_chunked_prefill(true)
        .worker_type(crate::common::protocols::WorkerType::Prefill)
        .kv_transfer_bandwidth(Some(1.0))
        .kv_bytes_per_token(Some(1_000_000))
        .speedup_ratio(0.0)
        .build()
        .unwrap();

    let request = DirectRequest {
        tokens: vec![1; 8],
        max_output_tokens: 1,
        uuid: Some(Uuid::from_u128(81)),
        dp_rank: 0,
        arrival_timestamp_ms: None,
    };
    let mut core = VllmCore::new(engine_args);
    core.receive(request);

    let mut trace = crate::replay::TraceCollector::default();
    let result = core.execute_pass(&mut trace, 0.0);

    let first_signal = result
        .output_signals
        .first()
        .expect("prefill pass should emit one completed signal");
    assert!(first_signal.completed);
    assert_eq!(first_signal.handoff_delay_ms, Some(8.0));
}
#[test] #[test]
fn test_first_token_can_arrive_on_prompt_completion_pass() { fn test_first_token_can_arrive_on_prompt_completion_pass() {
let mut core = VllmCore::new(make_args()); let mut core = VllmCore::new(make_args());
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import argparse
import importlib.util import importlib.util
import json
from pathlib import Path from pathlib import Path
from types import SimpleNamespace from types import SimpleNamespace
import numpy as np
import pytest import pytest
from dynamo.llm import EngineType, EntrypointArgs from dynamo.llm import EngineType, EntrypointArgs
...@@ -133,3 +136,99 @@ def test_entrypoint_args_accept_typed_mocker_engine_args(): ...@@ -133,3 +136,99 @@ def test_entrypoint_args_accept_typed_mocker_engine_args():
) )
assert entrypoint_args is not None assert entrypoint_args is not None
def test_build_mocker_engine_args_preserves_cli_mapped_fields(tmp_path):
    """Check that CLI-style arguments survive conversion into mocker engine args.

    Writes a small planner-profile ``.npz`` file under ``tmp_path``, builds an
    ``argparse.Namespace`` resembling parsed CLI flags, converts it with
    ``CONFIG.build_mocker_engine_args`` and compares the JSON dump of the
    result against the full expected payload.
    """
    profile_path = tmp_path / "planner_profile_data.npz"
    np.savez(
        profile_path,
        prefill_isl=np.array([128.0, 256.0]),
        prefill_ttft_ms=np.array([4.0, 8.0]),
        decode_active_kv_tokens=np.array([1024.0, 2048.0]),
        decode_context_length=np.array([128.0, 256.0]),
        decode_itl=np.array([[1.0, 1.5], [2.0, 2.5]]),
    )

    # The reasoning settings arrive on the CLI as a JSON string and are
    # expected back as a nested object in the dumped payload.
    reasoning_config = {
        "start_thinking_token_id": 11,
        "end_thinking_token_id": 12,
        "thinking_ratio": 0.25,
    }

    cli_args = argparse.Namespace(
        engine_type="sglang",
        num_gpu_blocks=2048,
        block_size=128,
        max_num_seqs=64,
        max_num_batched_tokens=4096,
        enable_prefix_caching=False,
        enable_chunked_prefill=False,
        preemption_mode="fifo",
        speedup_ratio=2.0,
        decode_speedup_ratio=3.0,
        dp_size=4,
        startup_time=1.5,
        planner_profile_data=profile_path,
        is_prefill_worker=True,
        is_decode_worker=False,
        durable_kv_events=False,
        kv_transfer_bandwidth=123.0,
        reasoning=json.dumps(reasoning_config),
        sglang_schedule_policy="lpm",
        sglang_page_size=128,
        sglang_max_prefill_tokens=8192,
        sglang_chunked_prefill_size=2048,
        sglang_clip_max_new_tokens=1024,
        sglang_schedule_conservativeness=0.8,
        aic_perf_model=True,
        aic_system="h200_sxm",
        aic_backend_version="0.5.6.post2",
        aic_tp_size=8,
        model_path="/models/mock",
    )

    expected_sglang = {
        "schedule_policy": "lpm",
        "page_size": 128,
        "max_prefill_tokens": 8192,
        "chunked_prefill_size": 2048,
        "clip_max_new_tokens": 1024,
        "schedule_conservativeness": 0.8,
    }
    expected_payload = {
        "engine_type": "sglang",
        "num_gpu_blocks": 2048,
        "block_size": 128,
        "max_num_seqs": 64,
        "max_num_batched_tokens": 4096,
        "enable_prefix_caching": False,
        "enable_chunked_prefill": False,
        "speedup_ratio": 2.0,
        "decode_speedup_ratio": 3.0,
        "dp_size": 4,
        "startup_time": 1.5,
        "worker_type": "prefill",
        "planner_profile_data": str(profile_path),
        "aic_backend": "sglang",
        "aic_system": "h200_sxm",
        "aic_backend_version": "0.5.6.post2",
        "aic_tp_size": 8,
        "aic_model_path": "/models/mock",
        "enable_local_indexer": True,
        "bootstrap_port": None,
        "kv_bytes_per_token": None,
        "kv_transfer_bandwidth": 123.0,
        "reasoning": reasoning_config,
        "zmq_kv_events_port": None,
        "zmq_replay_port": None,
        "preemption_mode": "fifo",
        "router_queue_policy": None,
        "sglang": expected_sglang,
        "has_perf_model": True,
    }

    dumped = CONFIG.build_mocker_engine_args(cli_args).dump_json()
    payload = json.loads(dumped)

    assert payload == expected_payload
...@@ -8,14 +8,10 @@ ...@@ -8,14 +8,10 @@
# endpoint tables) that races under concurrent xdist workers. Do not add # endpoint tables) that races under concurrent xdist workers. Do not add
# @pytest.mark.parallel until DRT endpoint registration is confirmed thread-safe. # @pytest.mark.parallel until DRT endpoint registration is confirmed thread-safe.
# #
# NOTE: TCP request plane is NOT tested here. These tests use --num-workers > 1 which spawns
# multiple workers in a single process sharing one TCP server. The shared TCP server uses
# endpoint_path (e.g., "generate") as the routing key, causing handler collisions when multiple
# workers register the same endpoint. This is a test-only limitation; production deployments
# with separate processes per worker work correctly with TCP.
import asyncio import asyncio
import logging import logging
import os import os
from pathlib import Path
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
import aiohttp import aiohttp
...@@ -64,6 +60,9 @@ BASE_PORT_BOOTSTRAP = 10100 # Base port for disagg bootstrap rendezvous ...@@ -64,6 +60,9 @@ BASE_PORT_BOOTSTRAP = 10100 # Base port for disagg bootstrap rendezvous
BASE_PORT_ZMQ = 11100 # Base port for ZMQ KV event publishing BASE_PORT_ZMQ = 11100 # Base port for ZMQ KV event publishing
NUM_REQUESTS = 100 NUM_REQUESTS = 100
BLOCK_SIZE = 16 BLOCK_SIZE = 16
# Planner profiling results passed to the mocker via "planner_profile_data":
# resolved relative to the parent of this file's directory.
PLANNER_PROFILE_DATA_DIR = Path(__file__).resolve().parents[1].joinpath(
    "planner", "profiling_results", "H200_TP1P_TP1D"
)
def get_unique_ports( def get_unique_ports(
...@@ -172,6 +171,20 @@ def _build_mocker_command( ...@@ -172,6 +171,20 @@ def _build_mocker_command(
command.extend(["--preemption-mode", str(mocker_args["preemption_mode"])]) command.extend(["--preemption-mode", str(mocker_args["preemption_mode"])])
if "dp_size" in mocker_args: if "dp_size" in mocker_args:
command.extend(["--data-parallel-size", str(mocker_args["dp_size"])]) command.extend(["--data-parallel-size", str(mocker_args["dp_size"])])
if "planner_profile_data" in mocker_args:
command.extend(
["--planner-profile-data", str(mocker_args["planner_profile_data"])]
)
if mocker_args.get("aic_perf_model") is True:
command.append("--aic-perf-model")
if "aic_system" in mocker_args:
command.extend(["--aic-system", str(mocker_args["aic_system"])])
if "aic_backend_version" in mocker_args:
command.extend(
["--aic-backend-version", str(mocker_args["aic_backend_version"])]
)
if "aic_tp_size" in mocker_args:
command.extend(["--aic-tp-size", str(mocker_args["aic_tp_size"])])
# Use --durable-kv-events to enable JetStream mode (local indexer disabled) # Use --durable-kv-events to enable JetStream mode (local indexer disabled)
if mocker_args.get("durable_kv_events") is True: if mocker_args.get("durable_kv_events") is True:
command.append("--durable-kv-events") command.append("--durable-kv-events")
...@@ -641,17 +654,29 @@ class DisaggMockerProcess: ...@@ -641,17 +654,29 @@ class DisaggMockerProcess:
@pytest.mark.timeout(120) # bumped for xdist contention (was 42s; ~13.80s serial avg) @pytest.mark.timeout(120) # bumped for xdist contention (was 42s; ~13.80s serial avg)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"router_mode,durable_kv_events", "router_mode,durable_kv_events,mocker_args_override",
[ [
pytest.param("kv", False, id="kv-nondurable"), pytest.param("kv", False, {}, id="kv-nondurable"),
pytest.param("kv", True, id="kv-durable"), pytest.param(
pytest.param("round-robin", False, id="roundrobin"), "kv",
pytest.param("random", False, id="random"), False,
pytest.param("power-of-two", False, id="power-of-two"), {"planner_profile_data": PLANNER_PROFILE_DATA_DIR},
id="kv-planner",
),
pytest.param(
"kv",
False,
{"aic_perf_model": True, "aic_system": "h200_sxm"},
id="kv-aic",
),
pytest.param("kv", True, {}, id="kv-durable"),
pytest.param("round-robin", False, {}, id="roundrobin"),
pytest.param("random", False, {}, id="random"),
pytest.param("power-of-two", False, {}, id="power-of-two"),
], ],
indirect=["durable_kv_events"], indirect=["durable_kv_events"],
) )
@pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True) @pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
def test_mocker_router( def test_mocker_router(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
...@@ -659,6 +684,7 @@ def test_mocker_router( ...@@ -659,6 +684,7 @@ def test_mocker_router(
router_mode, router_mode,
request_plane, request_plane,
durable_kv_events, durable_kv_events,
mocker_args_override,
): ):
"""Test router with multiple mocker engine instances across all router modes. """Test router with multiple mocker engine instances across all router modes.
...@@ -675,6 +701,7 @@ def test_mocker_router( ...@@ -675,6 +701,7 @@ def test_mocker_router(
"block_size": BLOCK_SIZE, "block_size": BLOCK_SIZE,
"durable_kv_events": durable_kv_events, "durable_kv_events": durable_kv_events,
} }
mocker_args.update(mocker_args_override)
with MockerProcess( with MockerProcess(
request, request,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment