fix(mocker): align vLLM scheduler with v1 — drop watermark, LIFO preemption, retry loop (#7020)

Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

fix(mocker): align vLLM scheduler with v1 — drop watermark, LIFO preemption, retry loop (#7020)
Signed-off-by: PeaBrane <yanrpei@gmail.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
236cb17d · Yan Ru Pei · GitHub · b407b419 · 236cb17d · 236cb17d
Unverified Commit 236cb17d authored Mar 10, 2026 by Yan Ru Pei Committed by GitHub Mar 10, 2026
7 changed files
--- a/components/src/dynamo/mocker/README.md
+++ b/components/src/dynamo/mocker/README.md
@@ -21,7 +21,7 @@ The mocker engine now supports a vLLM-style CLI interface with individual argume
 - `--max-num-batched-tokens`: Maximum number of batched tokens per iteration (default: 8192)
 - `--enable-prefix-caching` / `--no-enable-prefix-caching`: Enable/disable automatic prefix caching (default: True)
 - `--enable-chunked-prefill` / `--no-enable-chunked-prefill`: Enable/disable chunked prefill (default: True)
- `--watermark`: KV cache watermark threshold as a fraction (default: 0.01)
+- `--preemption-mode`: Preemption mode for decode eviction under memory pressure: `lifo` (default, matches vLLM v1) or `fifo`
 - `--speedup-ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster. Use `0` for infinite speedup (no simulation delays)
 - `--data-parallel-size`: Number of data parallel workers to simulate (default: 1)
 - `--num-workers`: Number of mocker workers to launch in the same process (default: 1). All workers share the same tokio runtime and thread pool

--- a/components/src/dynamo/mocker/args.py
+++ b/components/src/dynamo/mocker/args.py
@@ -107,7 +107,7 @@ def create_temp_engine_args_file(args: argparse.Namespace) -> Path:
        "max_num_batched_tokens": getattr(args, "max_num_batched_tokens", None),
        "enable_prefix_caching": getattr(args, "enable_prefix_caching", None),
        "enable_chunked_prefill": getattr(args, "enable_chunked_prefill", None),
-        "watermark": getattr(args, "watermark", None),
+        "preemption_mode": getattr(args, "preemption_mode", None),
        "speedup_ratio": getattr(args, "speedup_ratio", None),
        "dp_size": getattr(args, "dp_size", None),
        "startup_time": getattr(args, "startup_time", None),
@@ -287,10 +287,13 @@ def parse_args() -> argparse.Namespace:
        help="Disable chunked prefill",
    )
    parser.add_argument(
-        "--watermark",
-        type=float,
+        "--preemption-mode",
+        type=str,
        default=None,
-        help="Watermark value for the mocker engine (default: 0.01)",
+        choices=["lifo", "fifo"],
+        help="Preemption mode for decode eviction under memory pressure. "
+        "'lifo' (default) evicts the newest request (matches vLLM v1), "
+        "'fifo' evicts the oldest request.",
    )
    parser.add_argument(
        "--speedup-ratio",

--- a/lib/mocker/src/common/protocols.rs
+++ b/lib/mocker/src/common/protocols.rs
@@ -75,6 +75,16 @@ pub struct OutputSignal {
    pub completed: bool,
 }

+/// Preemption policy for evicting decode requests under memory pressure
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
+pub enum PreemptionMode {
+    /// Evict the newest request (matches vLLM v1 default)
+    #[default]
+    Lifo,
+    /// Evict the oldest request
+    Fifo,
+}
+
 /// Worker type for disaggregated serving configurations
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
 pub enum WorkerType {
@@ -147,10 +157,6 @@ pub struct MockEngineArgs {
    #[builder(default = true)]
    pub enable_chunked_prefill: bool,

-    #[builder(default = "0.01")]
-    #[validate(range(min = 0.0, max = 1.0))]
-    pub watermark: f64,
-
    #[builder(default = "1.0")]
    #[validate(range(min = 0.0))]
    pub speedup_ratio: f64,
@@ -205,6 +211,11 @@ pub struct MockEngineArgs {
    /// A KvEventPublisher relay subscribes to this socket and forwards events to NATS.
    #[builder(default = "None")]
    pub zmq_kv_events_port: Option<u16>,
+
+    /// Preemption mode for decode eviction under memory pressure.
+    /// Lifo (default) evicts the newest request; Fifo evicts the oldest.
+    #[builder(default)]
+    pub preemption_mode: PreemptionMode,
 }

 impl Default for MockEngineArgs {
@@ -248,7 +259,6 @@ impl MockEngineArgs {
            "max_num_batched_tokens",
            "enable_prefix_caching",
            "enable_chunked_prefill",
-            "watermark",
            "speedup_ratio",
            "dp_size",
            "startup_time",
@@ -261,6 +271,7 @@ impl MockEngineArgs {
            "kv_transfer_bandwidth",
            "reasoning",
            "zmq_kv_events_port",
+            "preemption_mode",
        ]
        .iter()
        .cloned()
@@ -318,12 +329,6 @@ impl MockEngineArgs {
            builder = builder.enable_chunked_prefill(enabled);
        }

-        if let Some(value) = extra_args.get("watermark")
-            && let Some(num) = value.as_f64()
-        {
-            builder = builder.watermark(num);
-        }
-
        if let Some(value) = extra_args.get("speedup_ratio")
            && let Some(num) = value.as_f64()
        {
@@ -378,6 +383,22 @@ impl MockEngineArgs {
            builder = builder.zmq_kv_events_port(Some(port as u16));
        }

+        if let Some(value) = extra_args.get("preemption_mode")
+            && let Some(mode_str) = value.as_str()
+        {
+            let mode = match mode_str {
+                "lifo" => PreemptionMode::Lifo,
+                "fifo" => PreemptionMode::Fifo,
+                _ => {
+                    return Err(anyhow::anyhow!(
+                        "Invalid preemption_mode: '{}'. Must be 'lifo' or 'fifo'.",
+                        mode_str
+                    ));
+                }
+            };
+            builder = builder.preemption_mode(mode);
+        }
+
        // Parse worker type from is_prefill and is_decode flags
        let is_prefill = extra_args
            .get("is_prefill")

--- a/lib/mocker/src/common/sequence.rs
+++ b/lib/mocker/src/common/sequence.rs
@@ -54,7 +54,8 @@ pub struct ActiveSequence {
    #[getter(copy)]
    num_input_tokens: usize,

-    creation_signal: Option<MoveBlock>,
+    #[getter(copy)]
+    num_allocated_tokens: usize,

    #[getter(copy)]
    enable_prefix_caching: bool,
@@ -75,28 +76,9 @@ impl ActiveSequence {
        let block_size = block_size.unwrap_or(64);
        let num_input_tokens = tokens.len();

-        let block_token_ids: Option<Vec<Vec<u32>>> = if emit_token_ids {
-            let num_complete = tokens.len() / block_size;
-            Some(
-                tokens
-                    .chunks(block_size)
-                    .take(num_complete)
-                    .map(|c| c.to_vec())
-                    .collect(),
-            )
-        } else {
-            None
-        };
-
        let tokens = Tokens::from(tokens).into_sequence(block_size as u32, Some(1337));
        let unique_blocks =
            create_unique_blocks_from_sequence(&tokens, block_size, enable_prefix_caching);
-        let block_hashes = tokens.blocks().iter().map(|b| b.block_hash()).collect();
-        let creation_signal = Some(MoveBlock::Use(
-            unique_blocks.clone(),
-            block_hashes,
-            block_token_ids,
-        ));

        let seq = Self {
            unique_blocks,
@@ -105,7 +87,7 @@ impl ActiveSequence {
            max_output_tokens,
            generated_tokens: 0,
            num_input_tokens,
-            creation_signal,
+            num_allocated_tokens: 0,
            enable_prefix_caching,
            emit_token_ids,
        };
@@ -125,8 +107,60 @@ impl ActiveSequence {
        self.tokens.total_tokens() == 0
    }

+    /// Build a `MoveBlock::Use` signal for blocks up to `cumulative_tokens`
+    /// without updating internal state. Returns `None` if no new blocks are needed.
+    /// Call `commit_allocation` after the signal is successfully processed.
+    pub fn prepare_allocation(&self, cumulative_tokens: usize) -> Option<MoveBlock> {
+        let prev_blocks = self
+            .num_allocated_tokens
+            .div_ceil(self.block_size)
+            .min(self.unique_blocks.len());
+        let target_blocks = cumulative_tokens
+            .div_ceil(self.block_size)
+            .min(self.unique_blocks.len());
+        if target_blocks <= prev_blocks {
+            return None;
+        }
+
+        let range = prev_blocks..target_blocks;
+        let blocks = self.unique_blocks[range.clone()].to_vec();
+
+        let all_hashes = self.block_hashes();
+        let num_full = all_hashes.len();
+        let hash_start = prev_blocks.min(num_full);
+        let hash_end = target_blocks.min(num_full);
+        let hashes = all_hashes[hash_start..hash_end].to_vec();
+
+        let token_ids = if self.emit_token_ids && hash_start < hash_end {
+            let all_token_ids: Vec<Vec<u32>> = self
+                .tokens
+                .blocks()
+                .iter()
+                .map(|b| b.tokens().to_vec())
+                .collect();
+            Some(all_token_ids[hash_start..hash_end].to_vec())
+        } else {
+            None
+        };
+
+        Some(MoveBlock::Use(blocks, hashes, token_ids))
+    }
+
+    /// Commit a successful allocation by advancing `num_allocated_tokens`.
+    pub fn commit_allocation(&mut self, cumulative_tokens: usize) {
+        self.num_allocated_tokens = cumulative_tokens;
+    }
+
+    /// Prepare + commit in one call (convenience for paths where failure is impossible).
+    pub fn allocate_blocks_for_chunk(&mut self, cumulative_tokens: usize) -> Option<MoveBlock> {
+        let signal = self.prepare_allocation(cumulative_tokens);
+        self.commit_allocation(cumulative_tokens);
+        signal
+    }
+
+    /// Allocate all remaining blocks at once (backward compat).
    pub fn take_creation_signal(&mut self) -> Option<MoveBlock> {
-        self.creation_signal.take()
+        self.allocate_blocks_for_chunk(self.len())
    }

    pub fn block_hashes(&self) -> Vec<u64> {
@@ -262,31 +296,12 @@ impl ActiveSequence {
            .collect()
    }

-    /// Move the request to a preempted state and return the free signals from freeing current blocks
+    /// Move the request to a preempted state and return the free signals from freeing current blocks.
    /// Upon preemption, the sequence retains the tokens generated during the decode phase (if any).
+    /// Resets `num_allocated_tokens` so re-admission will re-allocate from scratch.
    pub fn reset_with_signal(&mut self) -> Vec<MoveBlock> {
        let free_signal = self.free_signal();
-
-        // Don't reset generated_tokens since we're keeping the tokens in the sequence
-
-        let block_token_ids = if self.emit_token_ids {
-            Some(
-                self.tokens
-                    .blocks()
-                    .iter()
-                    .map(|b| b.tokens().to_vec())
-                    .collect(),
-            )
-        } else {
-            None
-        };
-
-        self.creation_signal = Some(MoveBlock::Use(
-            self.unique_blocks.clone(),
-            self.block_hashes(),
-            block_token_ids,
-        ));
-
+        self.num_allocated_tokens = 0;
        free_signal
    }


--- a/lib/mocker/src/kv_manager/vllm_backend.rs
+++ b/lib/mocker/src/kv_manager/vllm_backend.rs
@@ -28,7 +28,7 @@
 //! ## Preemption
 //! If a Use operation fails (typically due to insufficient space), a false boolean signal
 //! is returned to the scheduler for preemption. Initial KV block allocations for new requests
-//! should not fail due to the watermark checking.
+//! should not fail, because capacity is checked during scheduling.
 //!
 //! ## NOTE
 //! For simplicity (or non-simplicity), reference counting is tracked manually instead of using
@@ -177,8 +177,14 @@ impl KvManager {
        }
    }

-    /// Process a MoveBlock instruction synchronously
-    pub fn process(&mut self, event: &MoveBlock) -> bool {
+    /// Process a MoveBlock instruction synchronously.
+    ///
+    /// For `MoveBlock::Use`, returns N, the number of blocks successfully allocated.
+    /// On partial failure, blocks 0..N are committed but block N could not be
+    /// allocated. Callers should use the return value to track partial progress.
+    ///
+    /// For other variants, returns 1 (they always succeed or panic).
+    pub fn process(&mut self, event: &MoveBlock) -> usize {
        match event {
            MoveBlock::Use(hashes, local_hashes, token_ids) => {
                let mut blocks_stored = Vec::<u64>::new();
@@ -186,25 +192,28 @@ impl KvManager {
                    token_ids.as_ref().map(|_| Vec::new());

                let mut parent_block: Option<&UniqueBlock> = None;
+                let mut allocated = 0;
                for (i, hash) in hashes.iter().enumerate() {
                    // First check if it already exists in active blocks
                    if self.cache.contains_active(hash) {
                        // Block already active, just increment reference count
                        self.cache.increment_ref(hash);
                        parent_block = Some(hash);
+                        allocated += 1;
                        continue;
                    }

                    // Then check if it exists in inactive and move it to active if found
                    if self.cache.reactivate(hash) {
                        parent_block = Some(hash);
+                        allocated += 1;
                        continue;
                    }

                    // If at max capacity, evict the oldest entry from inactive blocks
                    if self.cache.is_at_capacity() {
                        let Some(evicted) = self.cache.evict_inactive() else {
-                            return false;
+                            return allocated;
                        };
                        tracing::trace!(
                            "Evicting block from inactive pool: {evicted:?}, dp_rank={}",
@@ -217,6 +226,7 @@ impl KvManager {

                    // Now insert the new block in active blocks with reference count 1
                    self.cache.insert_active(hash.clone(), 1);
+                    allocated += 1;
                    // Track blocks for trace/event
                    if let UniqueBlock::FullBlock(stored_full_block) = hash {
                        blocks_stored.push(*stored_full_block);
@@ -238,6 +248,7 @@ impl KvManager {
                    true,
                    stored_token_ids,
                );
+                return allocated;
            }

            MoveBlock::Destroy(hashes) => {
@@ -306,8 +317,7 @@ impl KvManager {
            }
        }

-        // Return true if we made it this far
-        true
+        1
    }

    /// Get the count of blocks that aren't in active or inactive pools
@@ -406,8 +416,8 @@ mod tests {
        // Create a KvManager with 10 blocks capacity
        let mut manager = KvManager::new(10, 16);

-        // Helper function to use multiple blocks that returns the response
-        fn use_blocks(manager: &mut KvManager, ids: Vec<u64>) -> bool {
+        // Helper function to use multiple blocks that returns the count allocated
+        fn use_blocks(manager: &mut KvManager, ids: Vec<u64>) -> usize {
            let blocks: Vec<_> = ids.iter().map(|&id| UniqueBlock::FullBlock(id)).collect();
            let hashes: Vec<_> = ids.into_iter().collect();
            manager.process(&MoveBlock::Use(blocks, hashes, None))
@@ -415,16 +425,16 @@ mod tests {

        // First use 10 blocks (0 to 9) in a batch
        let response = use_blocks(&mut manager, (0..10).collect());
-        assert!(response, "Expected success response");
+        assert_eq!(response, 10, "Expected all 10 blocks allocated");

        // Verify we are at capacity
        assert_eq!(manager.current_capacity(), 10);

-        // The 11th block should return false, not panic
+        // The 11th block should return 0, not panic
        let response = use_blocks(&mut manager, vec![10]);
-        assert!(
-            !response,
-            "Expected failure response when exceeding max capacity"
+        assert_eq!(
+            response, 0,
+            "Expected 0 blocks allocated when exceeding max capacity"
        );
    }


--- a/lib/mocker/src/scheduler/vllm.rs
+++ b/lib/mocker/src/scheduler/vllm.rs
--- a/tests/router/test_router_e2e_with_mockers.py
+++ b/tests/router/test_router_e2e_with_mockers.py
@@ -158,8 +158,8 @@ def _build_mocker_command(
            command.append("--enable-chunked-prefill")
        else:
            command.append("--no-enable-chunked-prefill")
-    if "watermark" in mocker_args:
-        command.extend(["--watermark", str(mocker_args["watermark"])])
+    if "preemption_mode" in mocker_args:
+        command.extend(["--preemption-mode", str(mocker_args["preemption_mode"])])
    if "dp_size" in mocker_args:
        command.extend(["--data-parallel-size", str(mocker_args["dp_size"])])
    # Use --durable-kv-events to enable JetStream mode (local indexer disabled)