fix: avoid offloading redundant prefill blocks | fix CUDA graph hang (#3632)

Signed-off-by: Ziqi Fan <ziqif@nvidia.com>

fix: avoid offloading redundant prefill blocks | fix CUDA graph hang (#3632)
Signed-off-by: Ziqi Fan <ziqif@nvidia.com>
f5854e17 · Ziqi Fan · GitHub · a6ac22e6 · f5854e17
Unverified Commit f5854e17 authored Oct 15, 2025 by Ziqi Fan Committed by GitHub Oct 15, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 2 deletions

lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs ...thon/rust/llm/block_manager/vllm/connector/leader/slot.rs +7 -2

No files found.
--- a/lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs
+++ b/lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs
 // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
-use std::{any::Any, sync::Arc};
+use std::{any::Any, cmp::max, sync::Arc};
 use dynamo_llm::{
    block_manager::{
@@ -479,7 +479,7 @@ impl Slot for VllmConnectorSlot {
        &mut self,
        tokens: &[u32],
        block_ids: &[BlockId],
-        _num_computed_tokens: usize,
+        num_computed_tokens: usize,
        num_scheduled_tokens: usize,
    ) -> Result<(), SlotError> {
        if !tokens.is_empty() {
@@ -493,6 +493,11 @@ impl Slot for VllmConnectorSlot {
            self.state = SlotState::Prefilling;
        }
+        // Use max so current_position is at least num_computed_tokens and evaluated_blocks is at
+        // least num_computed_tokens / block_size; this prevents redundant block offloading.
+        self.current_position = max(self.current_position, num_computed_tokens);
+        self.evaluated_blocks = max(self.evaluated_blocks, num_computed_tokens / self.block_size);
        // apply new block_ids
        if !block_ids.is_empty() {
            tracing::debug!("assigning {} new device blocks slot", block_ids.len());