Unverified Commit b2c59aa4 authored by Yan Ru Pei's avatar Yan Ru Pei Committed by GitHub
Browse files

feat(replay): add shared loadgen workload paths [DYN-2510] (#7593)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 2b36b175
...@@ -360,6 +360,22 @@ impl TraceCollector { ...@@ -360,6 +360,22 @@ impl TraceCollector {
reused_input_tokens: stats.reused_input_tokens, reused_input_tokens: stats.reused_input_tokens,
}) })
} }
/// Test-only helper that captures one [`TraceRequestStatsSnapshot`] per tracked request.
///
/// Snapshots are produced in the iteration order of `self.requests.values()`.
#[cfg(test)]
pub(crate) fn snapshots(&self) -> Vec<TraceRequestStatsSnapshot> {
    let mut collected = Vec::new();
    for stats in self.requests.values() {
        // Token timestamps come from accessor methods; the rest are plain field copies.
        let first_token_ms = stats.first_token_ms();
        let last_token_ms = stats.last_token_ms();
        collected.push(TraceRequestStatsSnapshot {
            arrival_time_ms: stats.arrival_time_ms,
            first_admit_ms: stats.first_admit_ms,
            first_token_ms,
            last_token_ms,
            input_length: stats.input_length,
            output_length: stats.output_length,
            reused_input_tokens: stats.reused_input_tokens,
        });
    }
    collected
}
} }
fn mean(values: &[f64]) -> f64 { fn mean(values: &[f64]) -> f64 {
......
...@@ -7,7 +7,6 @@ use std::time::Instant; ...@@ -7,7 +7,6 @@ use std::time::Instant;
use anyhow::{Result, bail}; use anyhow::{Result, bail};
use dynamo_kv_router::config::KvRouterConfig; use dynamo_kv_router::config::KvRouterConfig;
use super::loader::load_trace_requests;
use super::online; use super::online;
use super::validate::{ use super::validate::{
validate_offline_concurrency_args, validate_offline_replay_args, validate_offline_concurrency_args, validate_offline_replay_args,
...@@ -15,6 +14,7 @@ use super::validate::{ ...@@ -15,6 +14,7 @@ use super::validate::{
}; };
use super::{ReplayRouterMode, TraceSimulationReport}; use super::{ReplayRouterMode, TraceSimulationReport};
use crate::common::protocols::{DirectRequest, MockEngineArgs}; use crate::common::protocols::{DirectRequest, MockEngineArgs};
use crate::loadgen::Trace;
pub fn simulate_trace_file( pub fn simulate_trace_file(
args: MockEngineArgs, args: MockEngineArgs,
...@@ -42,14 +42,15 @@ pub fn simulate_trace_file_with_router_mode( ...@@ -42,14 +42,15 @@ pub fn simulate_trace_file_with_router_mode(
) -> Result<TraceSimulationReport> { ) -> Result<TraceSimulationReport> {
let args = args.normalized()?; let args = args.normalized()?;
validate_offline_replay_args(&args, num_workers, router_mode)?; validate_offline_replay_args(&args, num_workers, router_mode)?;
let requests = load_trace_requests(trace_path, args.block_size, true)?; let trace = Trace::from_mooncake(trace_path, args.block_size)?
.normalize_session_starts()?
.speed_up_timing(arrival_speedup_ratio)?;
let started_at = Instant::now(); let started_at = Instant::now();
let report = crate::replay::offline::simulate_trace( let report = crate::replay::offline::simulate_trace_workload(
args, args,
router_config, router_config,
requests, trace,
num_workers, num_workers,
arrival_speedup_ratio,
router_mode, router_mode,
)?; )?;
Ok(report.with_wall_time_ms(started_at.elapsed().as_secs_f64() * 1000.0)) Ok(report.with_wall_time_ms(started_at.elapsed().as_secs_f64() * 1000.0))
...@@ -81,15 +82,10 @@ pub fn simulate_trace_live_file_with_router_mode( ...@@ -81,15 +82,10 @@ pub fn simulate_trace_live_file_with_router_mode(
) -> Result<TraceSimulationReport> { ) -> Result<TraceSimulationReport> {
let args = args.normalized()?; let args = args.normalized()?;
validate_online_replay_args(&args, num_workers)?; validate_online_replay_args(&args, num_workers)?;
let requests = load_trace_requests(trace_path, args.block_size, true)?; let trace = Trace::from_mooncake(trace_path, args.block_size)?
online::simulate_trace_requests( .normalize_session_starts()?
args, .speed_up_timing(arrival_speedup_ratio)?;
router_config, online::simulate_trace_workload(args, router_config, trace, num_workers, router_mode)
requests,
num_workers,
arrival_speedup_ratio,
router_mode,
)
} }
pub fn simulate_trace_requests( pub fn simulate_trace_requests(
...@@ -199,12 +195,13 @@ pub fn simulate_concurrency_file_with_router_mode( ...@@ -199,12 +195,13 @@ pub fn simulate_concurrency_file_with_router_mode(
router_mode: ReplayRouterMode, router_mode: ReplayRouterMode,
) -> Result<TraceSimulationReport> { ) -> Result<TraceSimulationReport> {
let args = args.normalized()?; let args = args.normalized()?;
let requests = load_trace_requests(trace_path, args.block_size, false)?; validate_offline_concurrency_args(&args, num_workers, max_in_flight, router_mode)?;
let trace = Trace::from_mooncake(trace_path, args.block_size)?;
let started_at = Instant::now(); let started_at = Instant::now();
let report = simulate_concurrency_requests_with_router_mode( let report = simulate_concurrency_workload_with_router_mode(
args, args,
router_config, router_config,
requests, trace,
max_in_flight, max_in_flight,
num_workers, num_workers,
router_mode, router_mode,
...@@ -238,11 +235,11 @@ pub fn simulate_concurrency_live_file_with_router_mode( ...@@ -238,11 +235,11 @@ pub fn simulate_concurrency_live_file_with_router_mode(
) -> Result<TraceSimulationReport> { ) -> Result<TraceSimulationReport> {
let args = args.normalized()?; let args = args.normalized()?;
validate_online_concurrency_args(&args, num_workers, max_in_flight)?; validate_online_concurrency_args(&args, num_workers, max_in_flight)?;
let requests = load_trace_requests(trace_path, args.block_size, false)?; let trace = Trace::from_mooncake(trace_path, args.block_size)?;
online::simulate_concurrency_requests( online::simulate_concurrency_workload(
args, args,
router_config, router_config,
requests, trace,
max_in_flight, max_in_flight,
num_workers, num_workers,
router_mode, router_mode,
...@@ -328,3 +325,135 @@ pub fn simulate_concurrency_requests_with_router_mode( ...@@ -328,3 +325,135 @@ pub fn simulate_concurrency_requests_with_router_mode(
router_mode, router_mode,
) )
} }
pub fn simulate_trace_workload(
args: MockEngineArgs,
trace: Trace,
num_workers: usize,
) -> Result<TraceSimulationReport> {
simulate_trace_workload_with_router_mode(
args,
None,
trace,
num_workers,
ReplayRouterMode::RoundRobin,
)
}
/// Replays a `Trace` workload through the offline trace path with an explicit router mode.
///
/// The engine arguments are normalized and checked with
/// `validate_offline_replay_args` before the replay starts. The returned report
/// carries the wall-clock duration of the replay call itself, in milliseconds
/// (normalization and validation are not included in the measurement).
///
/// # Errors
/// Propagates failures from argument normalization, validation, or the offline replay.
pub fn simulate_trace_workload_with_router_mode(
    args: MockEngineArgs,
    router_config: Option<KvRouterConfig>,
    trace: Trace,
    num_workers: usize,
    router_mode: ReplayRouterMode,
) -> Result<TraceSimulationReport> {
    let normalized = args.normalized()?;
    validate_offline_replay_args(&normalized, num_workers, router_mode)?;

    // Start timing only once the inputs are known to be valid.
    let timer = Instant::now();
    let report = crate::replay::offline::simulate_trace_workload(
        normalized,
        router_config,
        trace,
        num_workers,
        router_mode,
    )?;
    let wall_time_ms = timer.elapsed().as_secs_f64() * 1000.0;
    Ok(report.with_wall_time_ms(wall_time_ms))
}
pub fn simulate_trace_live_workload(
args: MockEngineArgs,
trace: Trace,
num_workers: usize,
) -> Result<TraceSimulationReport> {
simulate_trace_live_workload_with_router_mode(
args,
None,
trace,
num_workers,
ReplayRouterMode::RoundRobin,
)
}
/// Replays a `Trace` workload through the online (live) trace path with an explicit router mode.
///
/// The engine arguments are normalized and checked with
/// `validate_online_replay_args` before being handed to
/// `online::simulate_trace_workload`.
///
/// # Errors
/// Propagates failures from argument normalization, validation, or the online replay.
pub fn simulate_trace_live_workload_with_router_mode(
    args: MockEngineArgs,
    router_config: Option<KvRouterConfig>,
    trace: Trace,
    num_workers: usize,
    router_mode: ReplayRouterMode,
) -> Result<TraceSimulationReport> {
    let normalized = args.normalized()?;
    validate_online_replay_args(&normalized, num_workers)?;
    online::simulate_trace_workload(
        normalized,
        router_config,
        trace,
        num_workers,
        router_mode,
    )
}
pub fn simulate_concurrency_workload(
args: MockEngineArgs,
trace: Trace,
max_in_flight: usize,
num_workers: usize,
) -> Result<TraceSimulationReport> {
simulate_concurrency_workload_with_router_mode(
args,
None,
trace,
max_in_flight,
num_workers,
ReplayRouterMode::RoundRobin,
)
}
/// Replays a `Trace` workload through the offline concurrency path with an explicit router mode.
///
/// The engine arguments are normalized and checked with
/// `validate_offline_concurrency_args` before being handed to
/// `crate::replay::offline::simulate_concurrency_workload`. The report is
/// returned exactly as the offline replay produced it; no wall-time is stamped here.
///
/// # Errors
/// Propagates failures from argument normalization, validation, or the offline replay.
pub fn simulate_concurrency_workload_with_router_mode(
    args: MockEngineArgs,
    router_config: Option<KvRouterConfig>,
    trace: Trace,
    max_in_flight: usize,
    num_workers: usize,
    router_mode: ReplayRouterMode,
) -> Result<TraceSimulationReport> {
    let normalized = args.normalized()?;
    validate_offline_concurrency_args(&normalized, num_workers, max_in_flight, router_mode)?;

    let outcome = crate::replay::offline::simulate_concurrency_workload(
        normalized,
        router_config,
        trace,
        max_in_flight,
        num_workers,
        router_mode,
    );
    outcome
}
pub fn simulate_concurrency_live_workload(
args: MockEngineArgs,
trace: Trace,
max_in_flight: usize,
num_workers: usize,
) -> Result<TraceSimulationReport> {
simulate_concurrency_live_workload_with_router_mode(
args,
None,
trace,
max_in_flight,
num_workers,
ReplayRouterMode::RoundRobin,
)
}
/// Replays a `Trace` workload through the online (live) concurrency path with an explicit
/// router mode.
///
/// The engine arguments are normalized and checked with
/// `validate_online_concurrency_args` before being handed to
/// `online::simulate_concurrency_workload`.
///
/// # Errors
/// Propagates failures from argument normalization, validation, or the online replay.
pub fn simulate_concurrency_live_workload_with_router_mode(
    args: MockEngineArgs,
    router_config: Option<KvRouterConfig>,
    trace: Trace,
    max_in_flight: usize,
    num_workers: usize,
    router_mode: ReplayRouterMode,
) -> Result<TraceSimulationReport> {
    let normalized = args.normalized()?;
    validate_online_concurrency_args(&normalized, num_workers, max_in_flight)?;

    let outcome = online::simulate_concurrency_workload(
        normalized,
        router_config,
        trace,
        max_in_flight,
        num_workers,
        router_mode,
    );
    outcome
}
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
mod collector; mod collector;
mod entrypoints; mod entrypoints;
mod loader;
pub(crate) mod offline; pub(crate) mod offline;
mod online; mod online;
mod router; mod router;
...@@ -30,11 +29,15 @@ pub use entrypoints::{ ...@@ -30,11 +29,15 @@ pub use entrypoints::{
simulate_concurrency_file, simulate_concurrency_file_with_router_mode, simulate_concurrency_file, simulate_concurrency_file_with_router_mode,
simulate_concurrency_live_file, simulate_concurrency_live_file_with_router_mode, simulate_concurrency_live_file, simulate_concurrency_live_file_with_router_mode,
simulate_concurrency_live_requests, simulate_concurrency_live_requests_with_router_mode, simulate_concurrency_live_requests, simulate_concurrency_live_requests_with_router_mode,
simulate_concurrency_live_workload, simulate_concurrency_live_workload_with_router_mode,
simulate_concurrency_requests, simulate_concurrency_requests_with_router_mode, simulate_concurrency_requests, simulate_concurrency_requests_with_router_mode,
simulate_concurrency_workload, simulate_concurrency_workload_with_router_mode,
simulate_trace_file, simulate_trace_file_with_router_mode, simulate_trace_live_file, simulate_trace_file, simulate_trace_file_with_router_mode, simulate_trace_live_file,
simulate_trace_live_file_with_router_mode, simulate_trace_live_requests, simulate_trace_live_file_with_router_mode, simulate_trace_live_requests,
simulate_trace_live_requests_with_router_mode, simulate_trace_requests, simulate_trace_live_requests_with_router_mode, simulate_trace_live_workload,
simulate_trace_requests_with_router_mode, simulate_trace_live_workload_with_router_mode, simulate_trace_requests,
simulate_trace_requests_with_router_mode, simulate_trace_workload,
simulate_trace_workload_with_router_mode,
}; };
pub(crate) fn normalize_trace_requests( pub(crate) fn normalize_trace_requests(
......
...@@ -9,7 +9,7 @@ The goal is to simulate trace execution without spinning up async runtimes, netw ...@@ -9,7 +9,7 @@ The goal is to simulate trace execution without spinning up async runtimes, netw
The public replay entrypoints live one level up in `lib/mocker/src/replay/entrypoints.rs`. They: The public replay entrypoints live one level up in `lib/mocker/src/replay/entrypoints.rs`. They:
- normalize `MockEngineArgs` - normalize `MockEngineArgs`
- load or accept `DirectRequest`s - load or accept `DirectRequest`s or `loadgen::Trace` workloads
- validate replay arguments - validate replay arguments
- dispatch to offline or online replay - dispatch to offline or online replay
...@@ -42,7 +42,10 @@ The single-worker path is intentionally simple and only used when: ...@@ -42,7 +42,10 @@ The single-worker path is intentionally simple and only used when:
- `num_workers == 1` - `num_workers == 1`
- engine type is `vllm` - engine type is `vllm`
That path avoids the cluster event queue and router machinery entirely. That path avoids the cluster event queue and router machinery entirely, but it now supports both:
- flat request replay
- workload-driven replay through `WorkloadDriver` for multi-turn/session traces
```mermaid ```mermaid
flowchart TD flowchart TD
...@@ -63,6 +66,8 @@ Important details: ...@@ -63,6 +66,8 @@ Important details:
- Trace mode uses `normalize_trace_requests` in `lib/mocker/src/replay/mod.rs` so the first request starts at `0 ms`, then applies `arrival_speedup_ratio`. - Trace mode uses `normalize_trace_requests` in `lib/mocker/src/replay/mod.rs` so the first request starts at `0 ms`, then applies `arrival_speedup_ratio`.
- Concurrency mode ignores original arrival spacing and keeps the worker filled up to `max_in_flight`. - Concurrency mode ignores original arrival spacing and keeps the worker filled up to `max_in_flight`.
- Workload trace mode honors first-turn timestamps and inter-turn delays.
- Workload concurrency mode ignores first-turn timestamps but still enforces inter-turn delays after completion.
- The worker itself is still the real mocker engine core; only the scheduling loop is simplified. - The worker itself is still the real mocker engine core; only the scheduling loop is simplified.
## Multi-Worker Harness ## Multi-Worker Harness
...@@ -178,13 +183,15 @@ In round-robin mode, this capture is skipped because nothing consumes those even ...@@ -178,13 +183,15 @@ In round-robin mode, this capture is skipped because nothing consumes those even
Both single and multi harnesses support two admission modes: Both single and multi harnesses support two admission modes:
- Trace mode - Trace mode
- respects input arrival timestamps - for flat requests, respects input arrival timestamps
- timestamps are normalized so the first request starts at `0 ms` - for workloads, respects first-turn timestamps and inter-turn delays
- `arrival_speedup_ratio` compresses or stretches inter-arrival gaps - timestamps are normalized so the first request or first session starts at `0 ms`
- `arrival_speedup_ratio` compresses or stretches inter-arrival gaps and inter-turn delays
- Concurrency mode - Concurrency mode
- ignores original spacing - ignores original first-turn spacing
- keeps up to `max_in_flight` requests resident in the cluster - keeps up to `max_in_flight` requests resident in the cluster
- for workloads, still unlocks follow-up turns only after completion plus inter-turn delay
- stamps synthetic arrival times as requests are admitted - stamps synthetic arrival times as requests are admitted
This split is why `lib/mocker/src/replay/offline/mod.rs` exposes both: This split is why `lib/mocker/src/replay/offline/mod.rs` exposes both:
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
use crate::common::protocols::{DirectRequest, MockEngineArgs}; use crate::common::protocols::{DirectRequest, MockEngineArgs};
use crate::loadgen::Trace;
pub(crate) use crate::replay::normalize_trace_requests; pub(crate) use crate::replay::normalize_trace_requests;
use crate::replay::{ReplayRouterMode, TraceSimulationReport}; use crate::replay::{ReplayRouterMode, TraceSimulationReport};
use dynamo_kv_router::config::KvRouterConfig; use dynamo_kv_router::config::KvRouterConfig;
...@@ -55,3 +56,39 @@ pub(crate) fn simulate_concurrency( ...@@ -55,3 +56,39 @@ pub(crate) fn simulate_concurrency(
) )
} }
} }
/// Dispatches an offline trace-mode workload replay to the single- or multi-worker harness.
///
/// The single-worker harness is chosen only when exactly one worker is requested
/// and the engine type is `Vllm`; every other combination goes to the multi-worker
/// harness. Note that the single-worker path does not receive `router_config` or
/// `router_mode`.
pub(crate) fn simulate_trace_workload(
    args: MockEngineArgs,
    router_config: Option<KvRouterConfig>,
    trace: Trace,
    num_workers: usize,
    router_mode: ReplayRouterMode,
) -> anyhow::Result<TraceSimulationReport> {
    let single_vllm_worker =
        num_workers == 1 && args.engine_type == crate::common::protocols::EngineType::Vllm;

    if !single_vllm_worker {
        return multi::simulate_trace_workload_multi(
            args,
            router_config,
            trace,
            num_workers,
            router_mode,
        );
    }

    single::simulate_trace_workload_single(args, trace)
}
/// Dispatches an offline concurrency-mode workload replay to the single- or multi-worker harness.
///
/// The single-worker harness is chosen only when exactly one worker is requested
/// and the engine type is `Vllm`; every other combination goes to the multi-worker
/// harness. Note that the single-worker path does not receive `router_config`,
/// `num_workers`, or `router_mode`.
pub(crate) fn simulate_concurrency_workload(
    args: MockEngineArgs,
    router_config: Option<KvRouterConfig>,
    trace: Trace,
    max_in_flight: usize,
    num_workers: usize,
    router_mode: ReplayRouterMode,
) -> anyhow::Result<TraceSimulationReport> {
    let single_vllm_worker =
        num_workers == 1 && args.engine_type == crate::common::protocols::EngineType::Vllm;

    if !single_vllm_worker {
        return multi::simulate_concurrency_workload_multi(
            args,
            router_config,
            trace,
            max_in_flight,
            num_workers,
            router_mode,
        );
    }

    single::simulate_concurrency_workload_single(args, trace, max_in_flight)
}
This diff is collapsed.
This diff is collapsed.
...@@ -12,6 +12,15 @@ pub(crate) struct OfflineWorkerState { ...@@ -12,6 +12,15 @@ pub(crate) struct OfflineWorkerState {
in_flight: usize, in_flight: usize,
} }
/// Test-only, comparable view of an `OfflineWorkerState`, as produced by
/// `OfflineWorkerState::debug_snapshot`.
#[cfg(test)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct OfflineWorkerSnapshot {
    /// Copy of the worker's `busy` flag at snapshot time.
    pub(crate) busy: bool,
    /// Copy of the worker's `in_flight` counter at snapshot time.
    pub(crate) in_flight: usize,
    /// Result of the worker's `is_ready()` at snapshot time.
    pub(crate) ready: bool,
    /// Result of the worker's `is_drained()` at snapshot time.
    pub(crate) drained: bool,
}
impl OfflineWorkerState { impl OfflineWorkerState {
pub(crate) fn new(worker_idx: usize, args: MockEngineArgs, capture_kv_events: bool) -> Self { pub(crate) fn new(worker_idx: usize, args: MockEngineArgs, capture_kv_events: bool) -> Self {
let core = match args.engine_type { let core = match args.engine_type {
...@@ -81,4 +90,14 @@ impl OfflineWorkerState { ...@@ -81,4 +90,14 @@ impl OfflineWorkerState {
) -> EnginePassResult { ) -> EnginePassResult {
self.core.execute_pass(collector, now_ms) self.core.execute_pass(collector, now_ms)
} }
/// Test-only helper that captures this worker's `busy` flag, `in_flight` count,
/// and the current results of `is_ready()` / `is_drained()` in one comparable value.
#[cfg(test)]
pub(crate) fn debug_snapshot(&self) -> OfflineWorkerSnapshot {
    let busy = self.busy;
    let in_flight = self.in_flight;
    let ready = self.is_ready();
    let drained = self.is_drained();
    OfflineWorkerSnapshot {
        busy,
        in_flight,
        ready,
        drained,
    }
}
} }
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
mod runtime; mod demux;
mod live_runtime;
mod state;
mod task;
pub(crate) use runtime::{simulate_concurrency_requests, simulate_trace_requests}; #[cfg(test)]
mod tests;
pub(crate) use live_runtime::{
simulate_concurrency_requests, simulate_concurrency_workload, simulate_trace_requests,
simulate_trace_workload,
};
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -6,4 +6,6 @@ mod online; ...@@ -6,4 +6,6 @@ mod online;
mod shared; mod shared;
pub(crate) use offline::OfflineReplayRouter; pub(crate) use offline::OfflineReplayRouter;
#[cfg(test)]
pub(crate) use offline::OfflineRouterSnapshot;
pub(crate) use online::ReplayRouter; pub(crate) use online::ReplayRouter;
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment