Unverified commit 236cb17d, authored by Yan Ru Pei, committed by GitHub
Browse files

fix(mocker): align vLLM scheduler with v1 — drop watermark, LIFO preemption, retry loop (#7020)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent b407b419
......@@ -21,7 +21,7 @@ The mocker engine now supports a vLLM-style CLI interface with individual argume
- `--max-num-batched-tokens`: Maximum number of batched tokens per iteration (default: 8192)
- `--enable-prefix-caching` / `--no-enable-prefix-caching`: Enable/disable automatic prefix caching (default: True)
- `--enable-chunked-prefill` / `--no-enable-chunked-prefill`: Enable/disable chunked prefill (default: True)
- `--watermark`: KV cache watermark threshold as a fraction (default: 0.01)
- `--preemption-mode`: Preemption mode for decode eviction under memory pressure: `lifo` (default, matches vLLM v1) or `fifo`
- `--speedup-ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster. Use `0` for infinite speedup (no simulation delays)
- `--data-parallel-size`: Number of data parallel workers to simulate (default: 1)
- `--num-workers`: Number of mocker workers to launch in the same process (default: 1). All workers share the same tokio runtime and thread pool
......
......@@ -107,7 +107,7 @@ def create_temp_engine_args_file(args: argparse.Namespace) -> Path:
"max_num_batched_tokens": getattr(args, "max_num_batched_tokens", None),
"enable_prefix_caching": getattr(args, "enable_prefix_caching", None),
"enable_chunked_prefill": getattr(args, "enable_chunked_prefill", None),
"watermark": getattr(args, "watermark", None),
"preemption_mode": getattr(args, "preemption_mode", None),
"speedup_ratio": getattr(args, "speedup_ratio", None),
"dp_size": getattr(args, "dp_size", None),
"startup_time": getattr(args, "startup_time", None),
......@@ -287,10 +287,13 @@ def parse_args() -> argparse.Namespace:
help="Disable chunked prefill",
)
parser.add_argument(
"--watermark",
type=float,
"--preemption-mode",
type=str,
default=None,
help="Watermark value for the mocker engine (default: 0.01)",
choices=["lifo", "fifo"],
help="Preemption mode for decode eviction under memory pressure. "
"'lifo' (default) evicts the newest request (matches vLLM v1), "
"'fifo' evicts the oldest request.",
)
parser.add_argument(
"--speedup-ratio",
......
......@@ -75,6 +75,16 @@ pub struct OutputSignal {
pub completed: bool,
}
/// Preemption policy for evicting decode requests under memory pressure.
///
/// The derived `Default` implementation yields [`PreemptionMode::Lifo`], the
/// variant marked `#[default]`.
// NOTE(review): no serde rename attribute is applied here, so the derived
// (de)serialization presumably uses the variant names `Lifo`/`Fifo`, whereas
// the extra-args parser in this file matches the lowercase strings
// "lifo"/"fifo" — confirm the two spellings are intended to differ.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum PreemptionMode {
/// Evict the newest request (matches vLLM v1 default)
#[default]
Lifo,
/// Evict the oldest request
Fifo,
}
/// Worker type for disaggregated serving configurations
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum WorkerType {
......@@ -147,10 +157,6 @@ pub struct MockEngineArgs {
#[builder(default = true)]
pub enable_chunked_prefill: bool,
#[builder(default = "0.01")]
#[validate(range(min = 0.0, max = 1.0))]
pub watermark: f64,
#[builder(default = "1.0")]
#[validate(range(min = 0.0))]
pub speedup_ratio: f64,
......@@ -205,6 +211,11 @@ pub struct MockEngineArgs {
/// A KvEventPublisher relay subscribes to this socket and forwards events to NATS.
#[builder(default = "None")]
pub zmq_kv_events_port: Option<u16>,
/// Preemption mode for decode eviction under memory pressure.
/// Lifo (default) evicts the newest request; Fifo evicts the oldest.
#[builder(default)]
pub preemption_mode: PreemptionMode,
}
impl Default for MockEngineArgs {
......@@ -248,7 +259,6 @@ impl MockEngineArgs {
"max_num_batched_tokens",
"enable_prefix_caching",
"enable_chunked_prefill",
"watermark",
"speedup_ratio",
"dp_size",
"startup_time",
......@@ -261,6 +271,7 @@ impl MockEngineArgs {
"kv_transfer_bandwidth",
"reasoning",
"zmq_kv_events_port",
"preemption_mode",
]
.iter()
.cloned()
......@@ -318,12 +329,6 @@ impl MockEngineArgs {
builder = builder.enable_chunked_prefill(enabled);
}
if let Some(value) = extra_args.get("watermark")
&& let Some(num) = value.as_f64()
{
builder = builder.watermark(num);
}
if let Some(value) = extra_args.get("speedup_ratio")
&& let Some(num) = value.as_f64()
{
......@@ -378,6 +383,22 @@ impl MockEngineArgs {
builder = builder.zmq_kv_events_port(Some(port as u16));
}
if let Some(value) = extra_args.get("preemption_mode")
&& let Some(mode_str) = value.as_str()
{
let mode = match mode_str {
"lifo" => PreemptionMode::Lifo,
"fifo" => PreemptionMode::Fifo,
_ => {
return Err(anyhow::anyhow!(
"Invalid preemption_mode: '{}'. Must be 'lifo' or 'fifo'.",
mode_str
));
}
};
builder = builder.preemption_mode(mode);
}
// Parse worker type from is_prefill and is_decode flags
let is_prefill = extra_args
.get("is_prefill")
......
......@@ -54,7 +54,8 @@ pub struct ActiveSequence {
#[getter(copy)]
num_input_tokens: usize,
creation_signal: Option<MoveBlock>,
#[getter(copy)]
num_allocated_tokens: usize,
#[getter(copy)]
enable_prefix_caching: bool,
......@@ -75,28 +76,9 @@ impl ActiveSequence {
let block_size = block_size.unwrap_or(64);
let num_input_tokens = tokens.len();
let block_token_ids: Option<Vec<Vec<u32>>> = if emit_token_ids {
let num_complete = tokens.len() / block_size;
Some(
tokens
.chunks(block_size)
.take(num_complete)
.map(|c| c.to_vec())
.collect(),
)
} else {
None
};
let tokens = Tokens::from(tokens).into_sequence(block_size as u32, Some(1337));
let unique_blocks =
create_unique_blocks_from_sequence(&tokens, block_size, enable_prefix_caching);
let block_hashes = tokens.blocks().iter().map(|b| b.block_hash()).collect();
let creation_signal = Some(MoveBlock::Use(
unique_blocks.clone(),
block_hashes,
block_token_ids,
));
let seq = Self {
unique_blocks,
......@@ -105,7 +87,7 @@ impl ActiveSequence {
max_output_tokens,
generated_tokens: 0,
num_input_tokens,
creation_signal,
num_allocated_tokens: 0,
enable_prefix_caching,
emit_token_ids,
};
......@@ -125,8 +107,60 @@ impl ActiveSequence {
self.tokens.total_tokens() == 0
}
/// Build a `MoveBlock::Use` signal for blocks up to `cumulative_tokens`
/// without updating internal state.
///
/// Both the already-allocated mark (`num_allocated_tokens`) and
/// `cumulative_tokens` are converted to block counts with `div_ceil`, so a
/// partially filled trailing block counts as a block, and both counts are
/// clamped to `unique_blocks.len()`.
///
/// Returns `None` if no new blocks are needed (the target block count does
/// not exceed the already-allocated block count). Otherwise the signal
/// carries:
/// * the unique blocks in `prev_blocks..target_blocks`,
/// * the hashes for that range, clamped to the number of hashes reported by
///   `block_hashes()` (so a trailing block without a hash contributes none),
/// * the token ids for the same clamped range when `emit_token_ids` is set
///   and the clamped range is non-empty, `None` otherwise.
///
/// Call `commit_allocation` after the signal is successfully processed.
pub fn prepare_allocation(&self, cumulative_tokens: usize) -> Option<MoveBlock> {
    let total_blocks = self.unique_blocks.len();
    let prev_blocks = self
        .num_allocated_tokens
        .div_ceil(self.block_size)
        .min(total_blocks);
    let target_blocks = cumulative_tokens
        .div_ceil(self.block_size)
        .min(total_blocks);
    if target_blocks <= prev_blocks {
        return None;
    }

    // The range is used exactly once, so slice with it directly.
    let blocks = self.unique_blocks[prev_blocks..target_blocks].to_vec();

    // Hashes exist only for the blocks reported by `block_hashes()`; clamp the
    // requested range so a trailing block without a hash is left out.
    let all_hashes = self.block_hashes();
    let num_full = all_hashes.len();
    let hash_start = prev_blocks.min(num_full);
    let hash_end = target_blocks.min(num_full);
    let hashes = all_hashes[hash_start..hash_end].to_vec();

    // Copy token ids only for the blocks in the clamped range instead of
    // materialising the token ids of every block and slicing afterwards.
    // NOTE(review): assumes `tokens.blocks()` has one entry per hash returned
    // by `block_hashes()` — confirm against `block_hashes`.
    let token_ids = if self.emit_token_ids && hash_start < hash_end {
        let selected: Vec<Vec<u32>> = self
            .tokens
            .blocks()
            .iter()
            .skip(hash_start)
            .take(hash_end - hash_start)
            .map(|b| b.tokens().to_vec())
            .collect();
        Some(selected)
    } else {
        None
    };

    Some(MoveBlock::Use(blocks, hashes, token_ids))
}
/// Commit a successful allocation by advancing `num_allocated_tokens`.
///
/// `cumulative_tokens` is the running total of tokens covered so far, not a
/// delta: it replaces the stored value rather than being added to it.
pub fn commit_allocation(&mut self, cumulative_tokens: usize) {
// Plain overwrite; there is no check that the new total is >= the old one.
self.num_allocated_tokens = cumulative_tokens;
}
/// Convenience wrapper that runs `prepare_allocation` and then
/// `commit_allocation` with the same `cumulative_tokens`.
///
/// The allocation mark is advanced unconditionally, even when no new blocks
/// were needed and `None` is returned, so this is meant for paths where
/// processing the returned signal cannot fail.
pub fn allocate_blocks_for_chunk(&mut self, cumulative_tokens: usize) -> Option<MoveBlock> {
    // Build the signal first: it must be computed against the old mark.
    let pending_signal = self.prepare_allocation(cumulative_tokens);
    self.commit_allocation(cumulative_tokens);
    pending_signal
}
/// Allocate all remaining blocks at once (backward compat).
pub fn take_creation_signal(&mut self) -> Option<MoveBlock> {
self.creation_signal.take()
self.allocate_blocks_for_chunk(self.len())
}
pub fn block_hashes(&self) -> Vec<u64> {
......@@ -262,31 +296,12 @@ impl ActiveSequence {
.collect()
}
/// Move the request to a preempted state and return the free signals from freeing current blocks
/// Move the request to a preempted state and return the free signals from freeing current blocks.
/// Upon preemption, the sequence retains the tokens generated during the decode phase (if any).
/// Resets `num_allocated_tokens` so re-admission will re-allocate from scratch.
pub fn reset_with_signal(&mut self) -> Vec<MoveBlock> {
let free_signal = self.free_signal();
// Don't reset generated_tokens since we're keeping the tokens in the sequence
let block_token_ids = if self.emit_token_ids {
Some(
self.tokens
.blocks()
.iter()
.map(|b| b.tokens().to_vec())
.collect(),
)
} else {
None
};
self.creation_signal = Some(MoveBlock::Use(
self.unique_blocks.clone(),
self.block_hashes(),
block_token_ids,
));
self.num_allocated_tokens = 0;
free_signal
}
......
......@@ -28,7 +28,7 @@
//! ## Preemption
//! If a Use operation fails (typically due to insufficient space), a false boolean signal
//! is returned to the scheduler for preemption. Initial KV block allocations for new requests
//! should not fail due to the watermark checking.
//! should not fail due to the capacity checking during scheduling.
//!
//! ## NOTE
//! For simplicity (or non-simplicity), reference counting is tracked manually instead of using
......@@ -177,8 +177,14 @@ impl KvManager {
}
}
/// Process a MoveBlock instruction synchronously
pub fn process(&mut self, event: &MoveBlock) -> bool {
/// Process a MoveBlock instruction synchronously.
///
/// For `MoveBlock::Use`, returns the number of blocks successfully allocated.
/// On partial failure with a return value of `N`, the first `N` blocks
/// (indices `0..N`) are committed and the block at index `N` could not be
/// allocated. Callers should use the return value to track partial progress.
///
/// For other variants (they always succeed or panic), the value returned at
/// the end of the function is the constant `1`, not a per-variant block count.
pub fn process(&mut self, event: &MoveBlock) -> usize {
match event {
MoveBlock::Use(hashes, local_hashes, token_ids) => {
let mut blocks_stored = Vec::<u64>::new();
......@@ -186,25 +192,28 @@ impl KvManager {
token_ids.as_ref().map(|_| Vec::new());
let mut parent_block: Option<&UniqueBlock> = None;
let mut allocated = 0;
for (i, hash) in hashes.iter().enumerate() {
// First check if it already exists in active blocks
if self.cache.contains_active(hash) {
// Block already active, just increment reference count
self.cache.increment_ref(hash);
parent_block = Some(hash);
allocated += 1;
continue;
}
// Then check if it exists in inactive and move it to active if found
if self.cache.reactivate(hash) {
parent_block = Some(hash);
allocated += 1;
continue;
}
// If at max capacity, evict the oldest entry from inactive blocks
if self.cache.is_at_capacity() {
let Some(evicted) = self.cache.evict_inactive() else {
return false;
return allocated;
};
tracing::trace!(
"Evicting block from inactive pool: {evicted:?}, dp_rank={}",
......@@ -217,6 +226,7 @@ impl KvManager {
// Now insert the new block in active blocks with reference count 1
self.cache.insert_active(hash.clone(), 1);
allocated += 1;
// Track blocks for trace/event
if let UniqueBlock::FullBlock(stored_full_block) = hash {
blocks_stored.push(*stored_full_block);
......@@ -238,6 +248,7 @@ impl KvManager {
true,
stored_token_ids,
);
return allocated;
}
MoveBlock::Destroy(hashes) => {
......@@ -306,8 +317,7 @@ impl KvManager {
}
}
// Return true if we made it this far
true
1
}
/// Get the count of blocks that aren't in active or inactive pools
......@@ -406,8 +416,8 @@ mod tests {
// Create a KvManager with 10 blocks capacity
let mut manager = KvManager::new(10, 16);
// Helper function to use multiple blocks that returns the response
fn use_blocks(manager: &mut KvManager, ids: Vec<u64>) -> bool {
// Helper function to use multiple blocks that returns the count allocated
fn use_blocks(manager: &mut KvManager, ids: Vec<u64>) -> usize {
let blocks: Vec<_> = ids.iter().map(|&id| UniqueBlock::FullBlock(id)).collect();
let hashes: Vec<_> = ids.into_iter().collect();
manager.process(&MoveBlock::Use(blocks, hashes, None))
......@@ -415,16 +425,16 @@ mod tests {
// First use 10 blocks (0 to 9) in a batch
let response = use_blocks(&mut manager, (0..10).collect());
assert!(response, "Expected success response");
assert_eq!(response, 10, "Expected all 10 blocks allocated");
// Verify we are at capacity
assert_eq!(manager.current_capacity(), 10);
// The 11th block should return false, not panic
// The 11th block should return 0, not panic
let response = use_blocks(&mut manager, vec![10]);
assert!(
!response,
"Expected failure response when exceeding max capacity"
assert_eq!(
response, 0,
"Expected 0 blocks allocated when exceeding max capacity"
);
}
......
This diff is collapsed.
......@@ -158,8 +158,8 @@ def _build_mocker_command(
command.append("--enable-chunked-prefill")
else:
command.append("--no-enable-chunked-prefill")
if "watermark" in mocker_args:
command.extend(["--watermark", str(mocker_args["watermark"])])
if "preemption_mode" in mocker_args:
command.extend(["--preemption-mode", str(mocker_args["preemption_mode"])])
if "dp_size" in mocker_args:
command.extend(["--data-parallel-size", str(mocker_args["dp_size"])])
# Use --durable-kv-events to enable JetStream mode (local indexer disabled)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment