Unverified Commit b2c59aa4 authored by Yan Ru Pei's avatar Yan Ru Pei Committed by GitHub
Browse files

feat(replay): add shared loadgen workload paths [DYN-2510] (#7593)


Signed-off-by: default avatarPeaBrane <yanrpei@gmail.com>
parent 2b36b175
......@@ -360,6 +360,22 @@ impl TraceCollector {
reused_input_tokens: stats.reused_input_tokens,
})
}
/// Test-only helper that builds one [`TraceRequestStatsSnapshot`] per entry in `self.requests`.
///
/// Snapshots come back in whatever order `self.requests.values()` yields them.
#[cfg(test)]
pub(crate) fn snapshots(&self) -> Vec<TraceRequestStatsSnapshot> {
    let mut collected = Vec::new();
    for stats in self.requests.values() {
        collected.push(TraceRequestStatsSnapshot {
            arrival_time_ms: stats.arrival_time_ms,
            first_admit_ms: stats.first_admit_ms,
            first_token_ms: stats.first_token_ms(),
            last_token_ms: stats.last_token_ms(),
            input_length: stats.input_length,
            output_length: stats.output_length,
            reused_input_tokens: stats.reused_input_tokens,
        });
    }
    collected
}
}
fn mean(values: &[f64]) -> f64 {
......
......@@ -7,7 +7,6 @@ use std::time::Instant;
use anyhow::{Result, bail};
use dynamo_kv_router::config::KvRouterConfig;
use super::loader::load_trace_requests;
use super::online;
use super::validate::{
validate_offline_concurrency_args, validate_offline_replay_args,
......@@ -15,6 +14,7 @@ use super::validate::{
};
use super::{ReplayRouterMode, TraceSimulationReport};
use crate::common::protocols::{DirectRequest, MockEngineArgs};
use crate::loadgen::Trace;
pub fn simulate_trace_file(
args: MockEngineArgs,
......@@ -42,14 +42,15 @@ pub fn simulate_trace_file_with_router_mode(
) -> Result<TraceSimulationReport> {
let args = args.normalized()?;
validate_offline_replay_args(&args, num_workers, router_mode)?;
let requests = load_trace_requests(trace_path, args.block_size, true)?;
let trace = Trace::from_mooncake(trace_path, args.block_size)?
.normalize_session_starts()?
.speed_up_timing(arrival_speedup_ratio)?;
let started_at = Instant::now();
let report = crate::replay::offline::simulate_trace(
let report = crate::replay::offline::simulate_trace_workload(
args,
router_config,
requests,
trace,
num_workers,
arrival_speedup_ratio,
router_mode,
)?;
Ok(report.with_wall_time_ms(started_at.elapsed().as_secs_f64() * 1000.0))
......@@ -81,15 +82,10 @@ pub fn simulate_trace_live_file_with_router_mode(
) -> Result<TraceSimulationReport> {
let args = args.normalized()?;
validate_online_replay_args(&args, num_workers)?;
let requests = load_trace_requests(trace_path, args.block_size, true)?;
online::simulate_trace_requests(
args,
router_config,
requests,
num_workers,
arrival_speedup_ratio,
router_mode,
)
let trace = Trace::from_mooncake(trace_path, args.block_size)?
.normalize_session_starts()?
.speed_up_timing(arrival_speedup_ratio)?;
online::simulate_trace_workload(args, router_config, trace, num_workers, router_mode)
}
pub fn simulate_trace_requests(
......@@ -199,12 +195,13 @@ pub fn simulate_concurrency_file_with_router_mode(
router_mode: ReplayRouterMode,
) -> Result<TraceSimulationReport> {
let args = args.normalized()?;
let requests = load_trace_requests(trace_path, args.block_size, false)?;
validate_offline_concurrency_args(&args, num_workers, max_in_flight, router_mode)?;
let trace = Trace::from_mooncake(trace_path, args.block_size)?;
let started_at = Instant::now();
let report = simulate_concurrency_requests_with_router_mode(
let report = simulate_concurrency_workload_with_router_mode(
args,
router_config,
requests,
trace,
max_in_flight,
num_workers,
router_mode,
......@@ -238,11 +235,11 @@ pub fn simulate_concurrency_live_file_with_router_mode(
) -> Result<TraceSimulationReport> {
let args = args.normalized()?;
validate_online_concurrency_args(&args, num_workers, max_in_flight)?;
let requests = load_trace_requests(trace_path, args.block_size, false)?;
online::simulate_concurrency_requests(
let trace = Trace::from_mooncake(trace_path, args.block_size)?;
online::simulate_concurrency_workload(
args,
router_config,
requests,
trace,
max_in_flight,
num_workers,
router_mode,
......@@ -328,3 +325,135 @@ pub fn simulate_concurrency_requests_with_router_mode(
router_mode,
)
}
/// Replays a [`Trace`] workload in trace mode with the default routing setup.
///
/// Delegates to [`simulate_trace_workload_with_router_mode`] with no router
/// configuration and [`ReplayRouterMode::RoundRobin`].
pub fn simulate_trace_workload(
    args: MockEngineArgs,
    trace: Trace,
    num_workers: usize,
) -> Result<TraceSimulationReport> {
    let router_config = None;
    let router_mode = ReplayRouterMode::RoundRobin;
    simulate_trace_workload_with_router_mode(args, router_config, trace, num_workers, router_mode)
}
/// Replays a [`Trace`] workload through the offline trace-mode simulator.
///
/// Steps, in order:
/// 1. normalize `args`;
/// 2. validate the offline replay arguments against `num_workers` and `router_mode`;
/// 3. run `crate::replay::offline::simulate_trace_workload`;
/// 4. stamp the report with the wall-clock time (in milliseconds) spent in step 3.
///
/// # Errors
/// Propagates any error from normalization, validation, or the simulation itself.
pub fn simulate_trace_workload_with_router_mode(
    args: MockEngineArgs,
    router_config: Option<KvRouterConfig>,
    trace: Trace,
    num_workers: usize,
    router_mode: ReplayRouterMode,
) -> Result<TraceSimulationReport> {
    let normalized = args.normalized()?;
    validate_offline_replay_args(&normalized, num_workers, router_mode)?;

    // The timer only covers the simulation, not normalization/validation.
    let timer = Instant::now();
    let report = crate::replay::offline::simulate_trace_workload(
        normalized,
        router_config,
        trace,
        num_workers,
        router_mode,
    )?;
    let wall_time_ms = timer.elapsed().as_secs_f64() * 1000.0;

    Ok(report.with_wall_time_ms(wall_time_ms))
}
/// Replays a [`Trace`] workload through the live (online) trace-mode path with the
/// default routing setup.
///
/// Delegates to [`simulate_trace_live_workload_with_router_mode`] with no router
/// configuration and [`ReplayRouterMode::RoundRobin`].
pub fn simulate_trace_live_workload(
    args: MockEngineArgs,
    trace: Trace,
    num_workers: usize,
) -> Result<TraceSimulationReport> {
    let router_config = None;
    let router_mode = ReplayRouterMode::RoundRobin;
    simulate_trace_live_workload_with_router_mode(
        args,
        router_config,
        trace,
        num_workers,
        router_mode,
    )
}
/// Replays a [`Trace`] workload through the online trace-mode simulator.
///
/// Normalizes `args`, validates the online replay arguments against `num_workers`,
/// then hands everything to `online::simulate_trace_workload`.
///
/// # Errors
/// Propagates any error from normalization, validation, or the simulation itself.
pub fn simulate_trace_live_workload_with_router_mode(
    args: MockEngineArgs,
    router_config: Option<KvRouterConfig>,
    trace: Trace,
    num_workers: usize,
    router_mode: ReplayRouterMode,
) -> Result<TraceSimulationReport> {
    let normalized = args.normalized()?;
    validate_online_replay_args(&normalized, num_workers)?;

    online::simulate_trace_workload(
        normalized,
        router_config,
        trace,
        num_workers,
        router_mode,
    )
}
/// Replays a [`Trace`] workload in concurrency mode with the default routing setup.
///
/// Delegates to [`simulate_concurrency_workload_with_router_mode`] with no router
/// configuration and [`ReplayRouterMode::RoundRobin`].
pub fn simulate_concurrency_workload(
    args: MockEngineArgs,
    trace: Trace,
    max_in_flight: usize,
    num_workers: usize,
) -> Result<TraceSimulationReport> {
    let router_config = None;
    let router_mode = ReplayRouterMode::RoundRobin;
    simulate_concurrency_workload_with_router_mode(
        args,
        router_config,
        trace,
        max_in_flight,
        num_workers,
        router_mode,
    )
}
/// Replays a [`Trace`] workload through the offline concurrency-mode simulator.
///
/// Normalizes `args`, validates the offline concurrency arguments against
/// `num_workers`, `max_in_flight`, and `router_mode`, then hands everything to
/// `crate::replay::offline::simulate_concurrency_workload`.
///
/// The report is returned exactly as the offline simulator produced it; this
/// function does not stamp a wall-clock time on it.
///
/// # Errors
/// Propagates any error from normalization, validation, or the simulation itself.
pub fn simulate_concurrency_workload_with_router_mode(
    args: MockEngineArgs,
    router_config: Option<KvRouterConfig>,
    trace: Trace,
    max_in_flight: usize,
    num_workers: usize,
    router_mode: ReplayRouterMode,
) -> Result<TraceSimulationReport> {
    let normalized = args.normalized()?;
    validate_offline_concurrency_args(&normalized, num_workers, max_in_flight, router_mode)?;

    let outcome = crate::replay::offline::simulate_concurrency_workload(
        normalized,
        router_config,
        trace,
        max_in_flight,
        num_workers,
        router_mode,
    );
    outcome
}
/// Replays a [`Trace`] workload through the live (online) concurrency-mode path with
/// the default routing setup.
///
/// Delegates to [`simulate_concurrency_live_workload_with_router_mode`] with no
/// router configuration and [`ReplayRouterMode::RoundRobin`].
pub fn simulate_concurrency_live_workload(
    args: MockEngineArgs,
    trace: Trace,
    max_in_flight: usize,
    num_workers: usize,
) -> Result<TraceSimulationReport> {
    let router_config = None;
    let router_mode = ReplayRouterMode::RoundRobin;
    simulate_concurrency_live_workload_with_router_mode(
        args,
        router_config,
        trace,
        max_in_flight,
        num_workers,
        router_mode,
    )
}
/// Replays a [`Trace`] workload through the online concurrency-mode simulator.
///
/// Normalizes `args`, validates the online concurrency arguments against
/// `num_workers` and `max_in_flight`, then hands everything to
/// `online::simulate_concurrency_workload`.
///
/// # Errors
/// Propagates any error from normalization, validation, or the simulation itself.
pub fn simulate_concurrency_live_workload_with_router_mode(
    args: MockEngineArgs,
    router_config: Option<KvRouterConfig>,
    trace: Trace,
    max_in_flight: usize,
    num_workers: usize,
    router_mode: ReplayRouterMode,
) -> Result<TraceSimulationReport> {
    let normalized = args.normalized()?;
    validate_online_concurrency_args(&normalized, num_workers, max_in_flight)?;

    let outcome = online::simulate_concurrency_workload(
        normalized,
        router_config,
        trace,
        max_in_flight,
        num_workers,
        router_mode,
    );
    outcome
}
......@@ -3,7 +3,6 @@
mod collector;
mod entrypoints;
mod loader;
pub(crate) mod offline;
mod online;
mod router;
......@@ -30,11 +29,15 @@ pub use entrypoints::{
simulate_concurrency_file, simulate_concurrency_file_with_router_mode,
simulate_concurrency_live_file, simulate_concurrency_live_file_with_router_mode,
simulate_concurrency_live_requests, simulate_concurrency_live_requests_with_router_mode,
simulate_concurrency_live_workload, simulate_concurrency_live_workload_with_router_mode,
simulate_concurrency_requests, simulate_concurrency_requests_with_router_mode,
simulate_concurrency_workload, simulate_concurrency_workload_with_router_mode,
simulate_trace_file, simulate_trace_file_with_router_mode, simulate_trace_live_file,
simulate_trace_live_file_with_router_mode, simulate_trace_live_requests,
simulate_trace_live_requests_with_router_mode, simulate_trace_requests,
simulate_trace_requests_with_router_mode,
simulate_trace_live_requests_with_router_mode, simulate_trace_live_workload,
simulate_trace_live_workload_with_router_mode, simulate_trace_requests,
simulate_trace_requests_with_router_mode, simulate_trace_workload,
simulate_trace_workload_with_router_mode,
};
pub(crate) fn normalize_trace_requests(
......
......@@ -9,7 +9,7 @@ The goal is to simulate trace execution without spinning up async runtimes, netw
The public replay entrypoints live one level up in `lib/mocker/src/replay/entrypoints.rs`. They:
- normalize `MockEngineArgs`
- load or accept `DirectRequest`s
- load or accept `DirectRequest`s or `loadgen::Trace` workloads
- validate replay arguments
- dispatch to offline or online replay
......@@ -42,7 +42,10 @@ The single-worker path is intentionally simple and only used when:
- `num_workers == 1`
- engine type is `vllm`
That path avoids the cluster event queue and router machinery entirely.
That path avoids the cluster event queue and router machinery entirely, but it now supports both:
- flat request replay
- workload-driven replay through `WorkloadDriver` for multi-turn/session traces
```mermaid
flowchart TD
......@@ -63,6 +66,8 @@ Important details:
- Trace mode uses `normalize_trace_requests` in `lib/mocker/src/replay/mod.rs` so the first request starts at `0 ms`, then applies `arrival_speedup_ratio`.
- Concurrency mode ignores original arrival spacing and keeps the worker filled up to `max_in_flight`.
- Workload trace mode honors first-turn timestamps and inter-turn delays.
- Workload concurrency mode ignores first-turn timestamps but still enforces inter-turn delays after completion.
- The worker itself is still the real mocker engine core; only the scheduling loop is simplified.
## Multi-Worker Harness
......@@ -178,13 +183,15 @@ In round-robin mode, this capture is skipped because nothing consumes those even
Both single and multi harnesses support two admission modes:
- Trace mode
- respects input arrival timestamps
- timestamps are normalized so the first request starts at `0 ms`
- `arrival_speedup_ratio` compresses or stretches inter-arrival gaps
- for flat requests, respects input arrival timestamps
- for workloads, respects first-turn timestamps and inter-turn delays
- timestamps are normalized so the first request or first session starts at `0 ms`
- `arrival_speedup_ratio` compresses or stretches inter-arrival gaps and inter-turn delays
- Concurrency mode
- ignores original spacing
- ignores original first-turn spacing
- keeps up to `max_in_flight` requests resident in the cluster
- for workloads, still unlocks follow-up turns only after completion plus inter-turn delay
- stamps synthetic arrival times as requests are admitted
This split is why `lib/mocker/src/replay/offline/mod.rs` exposes both:
......
......@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
use crate::common::protocols::{DirectRequest, MockEngineArgs};
use crate::loadgen::Trace;
pub(crate) use crate::replay::normalize_trace_requests;
use crate::replay::{ReplayRouterMode, TraceSimulationReport};
use dynamo_kv_router::config::KvRouterConfig;
......@@ -55,3 +56,39 @@ pub(crate) fn simulate_concurrency(
)
}
}
/// Routes a trace-mode workload replay to the single- or multi-worker offline harness.
///
/// The single-worker harness is used only when `num_workers == 1` and the engine
/// type is `Vllm`; on that path `router_config` and `router_mode` are not forwarded.
/// Every other combination goes to the multi-worker harness.
pub(crate) fn simulate_trace_workload(
    args: MockEngineArgs,
    router_config: Option<KvRouterConfig>,
    trace: Trace,
    num_workers: usize,
    router_mode: ReplayRouterMode,
) -> anyhow::Result<TraceSimulationReport> {
    let single_worker_vllm =
        num_workers == 1 && args.engine_type == crate::common::protocols::EngineType::Vllm;

    if !single_worker_vllm {
        return multi::simulate_trace_workload_multi(
            args,
            router_config,
            trace,
            num_workers,
            router_mode,
        );
    }

    single::simulate_trace_workload_single(args, trace)
}
/// Routes a concurrency-mode workload replay to the single- or multi-worker offline harness.
///
/// The single-worker harness is used only when `num_workers == 1` and the engine
/// type is `Vllm`; on that path only `args`, `trace`, and `max_in_flight` are
/// forwarded. Every other combination goes to the multi-worker harness.
pub(crate) fn simulate_concurrency_workload(
    args: MockEngineArgs,
    router_config: Option<KvRouterConfig>,
    trace: Trace,
    max_in_flight: usize,
    num_workers: usize,
    router_mode: ReplayRouterMode,
) -> anyhow::Result<TraceSimulationReport> {
    let single_worker_vllm =
        num_workers == 1 && args.engine_type == crate::common::protocols::EngineType::Vllm;

    if !single_worker_vllm {
        return multi::simulate_concurrency_workload_multi(
            args,
            router_config,
            trace,
            max_in_flight,
            num_workers,
            router_mode,
        );
    }

    single::simulate_concurrency_workload_single(args, trace, max_in_flight)
}
This diff is collapsed.
This diff is collapsed.
......@@ -12,6 +12,15 @@ pub(crate) struct OfflineWorkerState {
in_flight: usize,
}
/// Test-only, comparable view of an `OfflineWorkerState`, as produced by
/// `OfflineWorkerState::debug_snapshot`.
#[cfg(test)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct OfflineWorkerSnapshot {
/// Copy of the worker's `busy` flag at snapshot time.
pub(crate) busy: bool,
/// Copy of the worker's `in_flight` counter at snapshot time.
pub(crate) in_flight: usize,
/// Result of the worker's `is_ready()` at snapshot time.
pub(crate) ready: bool,
/// Result of the worker's `is_drained()` at snapshot time.
pub(crate) drained: bool,
}
impl OfflineWorkerState {
pub(crate) fn new(worker_idx: usize, args: MockEngineArgs, capture_kv_events: bool) -> Self {
let core = match args.engine_type {
......@@ -81,4 +90,14 @@ impl OfflineWorkerState {
) -> EnginePassResult {
self.core.execute_pass(collector, now_ms)
}
/// Test-only helper that captures the worker's `busy` flag and `in_flight` count
/// together with the current results of `is_ready()` and `is_drained()`.
#[cfg(test)]
pub(crate) fn debug_snapshot(&self) -> OfflineWorkerSnapshot {
    let busy = self.busy;
    let in_flight = self.in_flight;
    let ready = self.is_ready();
    let drained = self.is_drained();

    OfflineWorkerSnapshot {
        busy,
        in_flight,
        ready,
        drained,
    }
}
}
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
mod runtime;
mod demux;
mod live_runtime;
mod state;
mod task;
pub(crate) use runtime::{simulate_concurrency_requests, simulate_trace_requests};
#[cfg(test)]
mod tests;
pub(crate) use live_runtime::{
simulate_concurrency_requests, simulate_concurrency_workload, simulate_trace_requests,
simulate_trace_workload,
};
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -6,4 +6,6 @@ mod online;
mod shared;
pub(crate) use offline::OfflineReplayRouter;
#[cfg(test)]
pub(crate) use offline::OfflineRouterSnapshot;
pub(crate) use online::ReplayRouter;
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment