Unverified commit 0fa673af, authored by Jee Jee Li and committed by GitHub
Browse files

[V0 deprecation] Clean up LoRA (#25686)


Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
parent 3468f17e
...@@ -11,7 +11,6 @@ from typing import Optional, Union, final ...@@ -11,7 +11,6 @@ from typing import Optional, Union, final
import torch import torch
import vllm.envs as envs
from vllm.lora.layers import LoRAMapping from vllm.lora.layers import LoRAMapping
from vllm.triton_utils import HAS_TRITON from vllm.triton_utils import HAS_TRITON
...@@ -41,14 +40,8 @@ class PunicaWrapperGPU(PunicaWrapperBase): ...@@ -41,14 +40,8 @@ class PunicaWrapperGPU(PunicaWrapperBase):
max_num_batched_tokens, max_num_batched_tokens,
device=device) device=device)
# When cudagraph capture size is greater than max_num_seqs (max_batches,
# here), V0 captures the graph as if max_num_seqs is set to
# the capture size.
# V1 doesn't have this problem and always respects max_num_seqs.
max_num_prompts = (max_batches
if envs.VLLM_USE_V1 else max_num_batched_tokens)
self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras, self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras,
max_num_prompts, max_batches,
device=device) device=device)
def update_metadata(self, mapping: LoRAMapping, def update_metadata(self, mapping: LoRAMapping,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment