[fix] 修复 cudagraph 和 eager 分段模式下，开启 MLA 后报错的问题

e0c80c12 · 王敏 · f98420b4 · e0c80c12
Commit e0c80c12 authored May 07, 2025 by 王敏
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 1 deletion

vllm/worker/model_runner.py vllm/worker/model_runner.py +5 -1

No files found.
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -892,6 +892,10 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
            max_encoder_seq_len=max_encoder_seq_len)

        batch_size = len(input_tokens)
+
+        if batch_size + cuda_graph_pad_size >= self.runner.enforce_eager_bs_threshould:
+            cuda_graph_pad_size = -1
+
        if cuda_graph_pad_size != -1:
            # If cuda graph can be used, pad tensors accordingly.
            # See `capture_model` API for more details.
@@ -1709,7 +1713,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
        # virtual engines share the same kv cache.
        virtual_engine = model_input.virtual_engine
        if prefill_meta is None and decode_meta.use_cuda_graph and \
-                model_input.input_tokens.shape[0] <= self.enforce_eager_bs_threshould:
+                model_input.input_tokens.shape[0] < self.enforce_eager_bs_threshould:
            assert model_input.input_tokens is not None
            graph_batch_size = model_input.input_tokens.shape[0]
            model_executable = self.graph_runners[virtual_engine][