Unverified commit f5854e17, authored by Ziqi Fan and committed by GitHub
Browse files

fix: avoid offload redundant prefill blocks | fix cuda graph hanging (#3632)


Signed-off-by: Ziqi Fan <ziqif@nvidia.com>
parent a6ac22e6
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
use std::{any::Any, sync::Arc}; use std::{any::Any, cmp::max, sync::Arc};
use dynamo_llm::{ use dynamo_llm::{
block_manager::{ block_manager::{
...@@ -479,7 +479,7 @@ impl Slot for VllmConnectorSlot { ...@@ -479,7 +479,7 @@ impl Slot for VllmConnectorSlot {
&mut self, &mut self,
tokens: &[u32], tokens: &[u32],
block_ids: &[BlockId], block_ids: &[BlockId],
_num_computed_tokens: usize, num_computed_tokens: usize,
num_scheduled_tokens: usize, num_scheduled_tokens: usize,
) -> Result<(), SlotError> { ) -> Result<(), SlotError> {
if !tokens.is_empty() { if !tokens.is_empty() {
...@@ -493,6 +493,11 @@ impl Slot for VllmConnectorSlot { ...@@ -493,6 +493,11 @@ impl Slot for VllmConnectorSlot {
self.state = SlotState::Prefilling; self.state = SlotState::Prefilling;
} }
// Use max so that current_position never falls below num_computed_tokens and evaluated_blocks
// never falls below num_computed_tokens / block_size (the full blocks those tokens cover).
// This prevents redundant block offloading.
self.current_position = max(self.current_position, num_computed_tokens);
self.evaluated_blocks = max(self.evaluated_blocks, num_computed_tokens / self.block_size);
// apply new block_ids // apply new block_ids
if !block_ids.is_empty() { if !block_ids.is_empty() {
tracing::debug!("assigning {} new device blocks slot", block_ids.len()); tracing::debug!("assigning {} new device blocks slot", block_ids.len());
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment