[CI/Build][LoRA] Temporarily fix long context failure issue (#9579)

a48e3ec0 · Jee Jee Li · GitHub · 6c5af09b · a48e3ec0
Unverified commit a48e3ec0, authored Oct 22, 2024 by Jee Jee Li; committed by GitHub on Oct 22, 2024.
Hide whitespace changes
Inline Side-by-side

Showing with 20 additions and 11 deletions

tests/lora/test_long_context.py +20 -11

No files found.
--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
@@ -28,9 +28,15 @@ sampling_params = SamplingParams(
 def _create_lora_request(lora_id, long_context_infos):
    context_len = long_context_infos[lora_id]["context_length"]
    scaling_factor = context_len_to_scaling_factor[context_len]
-    return LoRARequest(context_len, lora_id,
+    return LoRARequest(
-                       long_context_infos[lora_id]["lora"], None,
+        # There are 2 LoRAs for 16K, we need to add lora_id to indicate
-                       4096 * scaling_factor)
+        # they are different LoRAs.
+        context_len + str(lora_id),
+        lora_id,
+        long_context_infos[lora_id]["lora"],
+        None,
+        4096 * scaling_factor,
+    )
 def evaluate_json_response(model_response, golden_response):
@@ -108,14 +114,17 @@ def lora_llm(long_context_infos):
        for info in long_context_infos.values()
    ]
-    llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf",
+    llm = vllm.LLM(
-                   enable_lora=True,
+        "meta-llama/Llama-2-13b-chat-hf",
-                   max_num_seqs=16,
+        enable_lora=True,
-                   max_loras=2,
+        max_num_seqs=16,
-                   long_lora_scaling_factors=tuple(scaling_factors),
+        max_loras=2,
-                   max_num_batched_tokens=4096 * 8,
+        long_lora_scaling_factors=tuple(scaling_factors),
-                   tensor_parallel_size=4,
+        max_num_batched_tokens=4096 * 8,
-                   distributed_executor_backend="mp")
+        tensor_parallel_size=4,
+        # FIXME enable async output processor
+        disable_async_output_proc=True,
+        distributed_executor_backend="mp")
    yield llm
    del llm