[Frontend] remove max_num_batched_tokens limit for lora (#7288)

48abee9e · Cherilyn Buren · GitHub · 74670964 · 48abee9e
Unverified commit 48abee9e, authored Aug 08, 2024 by Cherilyn Buren; committed by GitHub Aug 08, 2024.
Hide whitespace changes
Inline Side-by-side

Showing with 0 additions and 5 deletions

vllm/config.py vllm/config.py +0 -5

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1377,11 +1377,6 @@ class LoRAConfig:
                           model_config.quantization)

    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        if scheduler_config.max_num_batched_tokens > 65528:
-            raise ValueError(
-                "Due to limitations of the custom LoRA CUDA kernel, "
-                "max_num_batched_tokens must be <= 65528 when "
-                "LoRA is enabled.")
        if scheduler_config.chunked_prefill_enabled:
            raise ValueError("LoRA is not supported with chunked prefill yet.")