[full_graph] Fix query_start_loc padding (#19321)

Signed-off-by: Yinghai Lu <yinghai@thinkingmachines.ai>

[full_graph] Fix query_start_loc padding (#19321)
Signed-off-by: Yinghai Lu <yinghai@thinkingmachines.ai>
770e5dcd · Yinghai Lu · GitHub · c57c9415 · 770e5dcd
Unverified Commit 770e5dcd authored Jun 09, 2025 by Yinghai Lu Committed by GitHub Jun 09, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 1 deletion

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +4 -1

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -655,7 +655,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        # Fill unused with -1. Needed for reshape_and_cache
        self.seq_lens[num_reqs:].fill_(0)
-        self.query_start_loc[num_reqs + 1:].fill_(-1)
+        # Note: pad query_start_loc to be non-decreasing, as kernels
+        # like FlashAttention requires that
+        self.query_start_loc[num_reqs + 1:].fill_(
+            self.query_start_loc_cpu[num_reqs].item())
        query_start_loc = self.query_start_loc[:num_reqs + 1]
        seq_lens = self.seq_lens[:num_reqs]