[V0 deprecation] Clean up LoRA (#25686)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>

[V0 deprecation] Clean up LoRA (#25686)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
0fa673af · Jee Jee Li · GitHub · 3468f17e · 0fa673af
Unverified commit 0fa673af, authored Sep 26, 2025 by Jee Jee Li and committed by GitHub Sep 25, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 1 addition and 8 deletions

vllm/lora/punica_wrapper/punica_gpu.py vllm/lora/punica_wrapper/punica_gpu.py +1 -8

No files found.
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -11,7 +11,6 @@ from typing import Optional, Union, final
 import torch
-import vllm.envs as envs
 from vllm.lora.layers import LoRAMapping
 from vllm.triton_utils import HAS_TRITON
@@ -41,14 +40,8 @@ class PunicaWrapperGPU(PunicaWrapperBase):
                                                      max_num_batched_tokens,
                                                      device=device)
-        # When cudagraph capture size is greater than max_num_seqs (max_batches,
-        # here), V0 captures the graph as if max_num_seqs is set to
-        # the capture size.
-        # V1 doesn't have this problem and always respects max_num_seqs.
-        max_num_prompts = (max_batches
-                           if envs.VLLM_USE_V1 else max_num_batched_tokens)
        self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras,
-                                                       max_num_prompts,
+                                                       max_batches,
                                                       device=device)
    def update_metadata(self, mapping: LoRAMapping,