[BugFix][V1] Fix int32 token index overflow when preparing input ids (#16806)

32d4b669 · Yong Hoon Shin · GitHub · 3cde34a4 · 32d4b669 · 32d4b669
Unverified commit 32d4b669 — authored Apr 23, 2025 by Yong Hoon Shin; committed by GitHub on Apr 23, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 2 deletions

vllm/v1/worker/gpu_model_runner.py (+2 -1)

vllm/v1/worker/tpu_model_runner.py (+2 -1)

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -241,10 +241,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            device=self.device)
        # OPTIMIZATION: Cache the tensors rather than creating them every step.
+        # Keep in int64 to avoid overflow with long context
        self.arange_np = np.arange(max(self.max_num_reqs + 1,
                                       self.max_model_len,
                                       self.max_num_tokens),
-                                   dtype=np.int32)
+                                   dtype=np.int64)
        # NOTE(woosuk): These tensors are "stateless", i.e., they are literally
        # a faster version of creating a new tensor every time. Thus, we should
        # not make any assumptions about the values in these tensors.

--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -219,7 +219,8 @@ class TPUModelRunner:
        # Range tensor with values [0 .. self.max_num_tokens - 1].
        # Used to initialize positions / context_lens / seq_lens
-        self.arange_np = np.arange(self.max_num_tokens, dtype=np.int32)
+        # Keep in int64 to avoid overflow with long context
+        self.arange_np = np.arange(self.max_num_tokens, dtype=np.int64)
        self.num_reqs_paddings = _get_req_paddings(
            min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs)