Unverified commit 02b1c58a, authored by Yan Ru Pei, committed by GitHub
Browse files

feat(mocker): add offline disagg replay (#7617)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 4b8826b3
......@@ -36,6 +36,8 @@ pub(super) struct SglangConfig {
pub(super) decode_speedup_ratio: f64,
pub(super) worker_type: WorkerType,
pub(super) block_size: usize,
pub(super) kv_bytes_per_token: Option<usize>,
pub(super) kv_transfer_bandwidth: Option<f64>,
}
impl SglangConfig {
......@@ -81,6 +83,8 @@ impl SglangConfig {
decode_speedup_ratio: args.decode_speedup_ratio,
worker_type: args.worker_type,
block_size: args.block_size,
kv_bytes_per_token: args.kv_bytes_per_token,
kv_transfer_bandwidth: args.kv_transfer_bandwidth,
}
}
}
......
......@@ -101,6 +101,10 @@ impl SglangCore {
self.execute_pass_internal(Some(collector), now_ms)
}
/// Runs a single engine pass without a trace collector.
///
/// Forwards to `execute_pass_internal` with `None` in the collector slot and
/// passes `now_ms` through unchanged, returning that call's
/// `EnginePassResult` as-is.
pub(crate) fn execute_hidden_pass(&mut self, now_ms: f64) -> EnginePassResult {
self.execute_pass_internal(None, now_ms)
}
pub(super) fn execute_pass_internal(
&mut self,
mut collector: Option<&mut TraceCollector>,
......
......@@ -4,6 +4,7 @@
use std::time::Duration;
use crate::common::protocols::OutputSignal;
use crate::common::utils::compute_prefill_handoff_delay_ms;
use crate::kv_manager::SglangKvManager;
use super::config::{SglangConfig, floor_to_block};
......@@ -179,6 +180,13 @@ pub(super) fn simulate_decode_step(
output_signals.push(OutputSignal {
uuid: req.uuid,
completed: is_complete,
handoff_delay_ms: compute_prefill_handoff_delay_ms(
config.worker_type,
is_complete,
req.prompt_len(),
config.kv_transfer_bandwidth,
config.kv_bytes_per_token,
),
});
if is_complete {
......
......@@ -860,7 +860,6 @@ mod router_events {
}
}
}
assert_eq!(seen, expected);
drop(scheduler);
drop(sink);
......@@ -871,4 +870,41 @@ mod router_events {
assert!(harness.ok_count(METRIC_EVENT_REMOVED) > 0);
harness.shutdown();
}
/// A prefill-type SGLang worker that finishes a request in one pass must
/// attach a handoff delay to the completion signal it emits.
#[test]
fn test_prefill_completion_emits_handoff_delay() {
    // SGLang-specific knobs: page size matches the block size, and the
    // chunked-prefill budget (16) exceeds the 8-token prompt sent below.
    let sglang_args = SglangArgs {
        page_size: Some(4),
        chunked_prefill_size: Some(16),
        ..Default::default()
    };
    let engine_args = MockEngineArgs::builder()
        .engine_type(EngineType::Sglang)
        .num_gpu_blocks(64)
        .block_size(4)
        .worker_type(crate::common::protocols::WorkerType::Prefill)
        .kv_transfer_bandwidth(Some(1.0))
        .kv_bytes_per_token(Some(1_000_000))
        .speedup_ratio(0.0)
        .sglang(Some(sglang_args))
        .build()
        .unwrap();

    // One request: 8 prompt tokens, a single output token.
    let request = DirectRequest {
        tokens: vec![1; 8],
        max_output_tokens: 1,
        uuid: Some(Uuid::from_u128(91)),
        dp_rank: 0,
        arrival_timestamp_ms: None,
    };

    let mut engine = SglangCore::new(engine_args);
    engine.receive(request);

    let mut trace = crate::replay::TraceCollector::default();
    let result = engine.execute_pass(&mut trace, 0.0);

    let first_signal = result
        .output_signals
        .first()
        .expect("prefill pass should emit one completed signal");
    assert!(first_signal.completed);
    // 8 prompt tokens at 1_000_000 KV bytes per token over a bandwidth of 1.0
    // is expected to yield 8.0. NOTE(review): the exact formula and units
    // live in `compute_prefill_handoff_delay_ms`, which is not shown here.
    assert_eq!(first_signal.handoff_delay_ms, Some(8.0));
}
}
......@@ -14,6 +14,7 @@ use crate::common::protocols::{
WorkerType,
};
use crate::common::sequence::ActiveSequence;
use crate::common::utils::compute_prefill_handoff_delay_ms;
use crate::kv_manager::KvManager;
use crate::replay::TraceCollector;
use crate::scheduler::{
......@@ -278,6 +279,10 @@ impl VllmCore {
self.execute_pass_internal(Some(collector), now_ms, None)
}
/// Runs a single engine pass without a trace collector.
///
/// Forwards to `execute_pass_internal` with `None` for the collector,
/// `now_ms` unchanged, and `None` for the third argument, returning that
/// call's `EnginePassResult` as-is.
pub(crate) fn execute_hidden_pass(&mut self, now_ms: f64) -> EnginePassResult {
self.execute_pass_internal(None, now_ms, None)
}
pub(super) fn execute_pass_internal(
&mut self,
mut collector: Option<&mut TraceCollector>,
......@@ -641,8 +646,25 @@ impl VllmCore {
}
if let Some(request) = self.state.requests.get(&uuid) {
debug_assert_vllm_request_progress(uuid, request);
let handoff_delay_ms = compute_prefill_handoff_delay_ms(
self.args.worker_type,
completed,
request.sequence.num_input_tokens(),
self.args.kv_transfer_bandwidth,
self.args.kv_bytes_per_token,
);
output_signals.push(OutputSignal {
uuid,
completed,
handoff_delay_ms,
});
} else {
output_signals.push(OutputSignal {
uuid,
completed,
handoff_delay_ms: None,
});
}
output_signals.push(OutputSignal { uuid, completed });
if completed {
self.state.complete(&uuid);
}
......
......@@ -175,6 +175,40 @@ mod core_behavior {
);
}
/// A prefill-type vLLM worker that finishes a request in one pass must
/// attach a handoff delay to the completion signal it emits.
#[test]
fn test_prefill_completion_emits_handoff_delay() {
    // Single-sequence engine whose per-pass token budget (8) equals the
    // prompt length used below.
    let engine_args = MockEngineArgs::builder()
        .block_size(4)
        .num_gpu_blocks(8)
        .max_num_batched_tokens(Some(8))
        .max_num_seqs(Some(1))
        .enable_chunked_prefill(true)
        .worker_type(crate::common::protocols::WorkerType::Prefill)
        .kv_transfer_bandwidth(Some(1.0))
        .kv_bytes_per_token(Some(1_000_000))
        .speedup_ratio(0.0)
        .build()
        .unwrap();

    // One request: 8 prompt tokens, a single output token.
    let request = DirectRequest {
        tokens: vec![1; 8],
        max_output_tokens: 1,
        uuid: Some(Uuid::from_u128(81)),
        dp_rank: 0,
        arrival_timestamp_ms: None,
    };

    let mut engine = VllmCore::new(engine_args);
    engine.receive(request);

    let mut trace = crate::replay::TraceCollector::default();
    let result = engine.execute_pass(&mut trace, 0.0);

    let first_signal = result
        .output_signals
        .first()
        .expect("prefill pass should emit one completed signal");
    assert!(first_signal.completed);
    // 8 prompt tokens at 1_000_000 KV bytes per token over a bandwidth of 1.0
    // is expected to yield 8.0. NOTE(review): the exact formula and units
    // live in `compute_prefill_handoff_delay_ms`, which is not shown here.
    assert_eq!(first_signal.handoff_delay_ms, Some(8.0));
}
#[test]
fn test_first_token_can_arrive_on_prompt_completion_pass() {
let mut core = VllmCore::new(make_args());
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import argparse
import importlib.util
import json
from pathlib import Path
from types import SimpleNamespace
import numpy as np
import pytest
from dynamo.llm import EngineType, EntrypointArgs
......@@ -133,3 +136,99 @@ def test_entrypoint_args_accept_typed_mocker_engine_args():
)
assert entrypoint_args is not None
def test_build_mocker_engine_args_preserves_cli_mapped_fields(tmp_path):
    """Check that CLI-style arguments survive the trip into mocker engine args.

    Writes a small planner-profile ``.npz`` under ``tmp_path``, builds an
    ``argparse.Namespace`` carrying the CLI-level fields, passes it to
    ``CONFIG.build_mocker_engine_args`` and compares the JSON dump of the
    result against the full expected payload.
    """
    # Planner profile fixture stored as named arrays in an .npz archive.
    profile_path = tmp_path / "planner_profile_data.npz"
    profile_arrays = {
        "prefill_isl": np.array([128.0, 256.0]),
        "prefill_ttft_ms": np.array([4.0, 8.0]),
        "decode_active_kv_tokens": np.array([1024.0, 2048.0]),
        "decode_context_length": np.array([128.0, 256.0]),
        "decode_itl": np.array([[1.0, 1.5], [2.0, 2.5]]),
    }
    np.savez(profile_path, **profile_arrays)

    # The reasoning option is handed over as a JSON string on the CLI side.
    reasoning_json = json.dumps(
        {
            "start_thinking_token_id": 11,
            "end_thinking_token_id": 12,
            "thinking_ratio": 0.25,
        }
    )

    cli_fields = {
        "engine_type": "sglang",
        "num_gpu_blocks": 2048,
        "block_size": 128,
        "max_num_seqs": 64,
        "max_num_batched_tokens": 4096,
        "enable_prefix_caching": False,
        "enable_chunked_prefill": False,
        "preemption_mode": "fifo",
        "speedup_ratio": 2.0,
        "decode_speedup_ratio": 3.0,
        "dp_size": 4,
        "startup_time": 1.5,
        "planner_profile_data": profile_path,
        "is_prefill_worker": True,
        "is_decode_worker": False,
        "durable_kv_events": False,
        "kv_transfer_bandwidth": 123.0,
        "reasoning": reasoning_json,
        "sglang_schedule_policy": "lpm",
        "sglang_page_size": 128,
        "sglang_max_prefill_tokens": 8192,
        "sglang_chunked_prefill_size": 2048,
        "sglang_clip_max_new_tokens": 1024,
        "sglang_schedule_conservativeness": 0.8,
        "aic_perf_model": True,
        "aic_system": "h200_sxm",
        "aic_backend_version": "0.5.6.post2",
        "aic_tp_size": 8,
        "model_path": "/models/mock",
    }
    cli_args = argparse.Namespace(**cli_fields)

    # Expected JSON dump: CLI names mapped onto engine-arg names, with the
    # sglang_* options grouped under a nested "sglang" object.
    expected_payload = {
        "engine_type": "sglang",
        "num_gpu_blocks": 2048,
        "block_size": 128,
        "max_num_seqs": 64,
        "max_num_batched_tokens": 4096,
        "enable_prefix_caching": False,
        "enable_chunked_prefill": False,
        "speedup_ratio": 2.0,
        "decode_speedup_ratio": 3.0,
        "dp_size": 4,
        "startup_time": 1.5,
        "worker_type": "prefill",
        "planner_profile_data": str(profile_path),
        "aic_backend": "sglang",
        "aic_system": "h200_sxm",
        "aic_backend_version": "0.5.6.post2",
        "aic_tp_size": 8,
        "aic_model_path": "/models/mock",
        "enable_local_indexer": True,
        "bootstrap_port": None,
        "kv_bytes_per_token": None,
        "kv_transfer_bandwidth": 123.0,
        "reasoning": {
            "start_thinking_token_id": 11,
            "end_thinking_token_id": 12,
            "thinking_ratio": 0.25,
        },
        "zmq_kv_events_port": None,
        "zmq_replay_port": None,
        "preemption_mode": "fifo",
        "router_queue_policy": None,
        "sglang": {
            "schedule_policy": "lpm",
            "page_size": 128,
            "max_prefill_tokens": 8192,
            "chunked_prefill_size": 2048,
            "clip_max_new_tokens": 1024,
            "schedule_conservativeness": 0.8,
        },
        "has_perf_model": True,
    }

    built_args = CONFIG.build_mocker_engine_args(cli_args)
    actual_payload = json.loads(built_args.dump_json())
    assert actual_payload == expected_payload
......@@ -8,14 +8,10 @@
# endpoint tables) that races under concurrent xdist workers. Do not add
# @pytest.mark.parallel until DRT endpoint registration is confirmed thread-safe.
#
# NOTE: TCP request plane is NOT tested here. These tests use --num-workers > 1 which spawns
# multiple workers in a single process sharing one TCP server. The shared TCP server uses
# endpoint_path (e.g., "generate") as the routing key, causing handler collisions when multiple
# workers register the same endpoint. This is a test-only limitation; production deployments
# with separate processes per worker work correctly with TCP.
import asyncio
import logging
import os
from pathlib import Path
from typing import Any, Dict, Optional
import aiohttp
......@@ -64,6 +60,9 @@ BASE_PORT_BOOTSTRAP = 10100 # Base port for disagg bootstrap rendezvous
BASE_PORT_ZMQ = 11100 # Base port for ZMQ KV event publishing
NUM_REQUESTS = 100
BLOCK_SIZE = 16
PLANNER_PROFILE_DATA_DIR = (
Path(__file__).resolve().parents[1] / "planner/profiling_results/H200_TP1P_TP1D"
)
def get_unique_ports(
......@@ -172,6 +171,20 @@ def _build_mocker_command(
command.extend(["--preemption-mode", str(mocker_args["preemption_mode"])])
if "dp_size" in mocker_args:
command.extend(["--data-parallel-size", str(mocker_args["dp_size"])])
if "planner_profile_data" in mocker_args:
command.extend(
["--planner-profile-data", str(mocker_args["planner_profile_data"])]
)
if mocker_args.get("aic_perf_model") is True:
command.append("--aic-perf-model")
if "aic_system" in mocker_args:
command.extend(["--aic-system", str(mocker_args["aic_system"])])
if "aic_backend_version" in mocker_args:
command.extend(
["--aic-backend-version", str(mocker_args["aic_backend_version"])]
)
if "aic_tp_size" in mocker_args:
command.extend(["--aic-tp-size", str(mocker_args["aic_tp_size"])])
# Use --durable-kv-events to enable JetStream mode (local indexer disabled)
if mocker_args.get("durable_kv_events") is True:
command.append("--durable-kv-events")
......@@ -641,17 +654,29 @@ class DisaggMockerProcess:
@pytest.mark.timeout(120) # bumped for xdist contention (was 42s; ~13.80s serial avg)
@pytest.mark.parametrize(
"router_mode,durable_kv_events",
"router_mode,durable_kv_events,mocker_args_override",
[
pytest.param("kv", False, id="kv-nondurable"),
pytest.param("kv", True, id="kv-durable"),
pytest.param("round-robin", False, id="roundrobin"),
pytest.param("random", False, id="random"),
pytest.param("power-of-two", False, id="power-of-two"),
pytest.param("kv", False, {}, id="kv-nondurable"),
pytest.param(
"kv",
False,
{"planner_profile_data": PLANNER_PROFILE_DATA_DIR},
id="kv-planner",
),
pytest.param(
"kv",
False,
{"aic_perf_model": True, "aic_system": "h200_sxm"},
id="kv-aic",
),
pytest.param("kv", True, {}, id="kv-durable"),
pytest.param("round-robin", False, {}, id="roundrobin"),
pytest.param("random", False, {}, id="random"),
pytest.param("power-of-two", False, {}, id="power-of-two"),
],
indirect=["durable_kv_events"],
)
@pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True)
@pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
def test_mocker_router(
request,
runtime_services_dynamic_ports,
......@@ -659,6 +684,7 @@ def test_mocker_router(
router_mode,
request_plane,
durable_kv_events,
mocker_args_override,
):
"""Test router with multiple mocker engine instances across all router modes.
......@@ -675,6 +701,7 @@ def test_mocker_router(
"block_size": BLOCK_SIZE,
"durable_kv_events": durable_kv_events,
}
mocker_args.update(mocker_args_override)
with MockerProcess(
request,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment