[CI/Build] Clean up LoRA test (#23890)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>

[CI/Build] Clean up LoRA test (#23890)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
b4f9e963 · Jee Jee Li · GitHub · 05d839c1 · b4f9e963 · b4f9e963
Unverified Commit b4f9e963 authored Aug 29, 2025 by Jee Jee Li Committed by GitHub Aug 28, 2025
4 changed files
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -164,7 +164,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
  --ignore=entrypoints/llm/test_chat.py \
  --ignore=entrypoints/llm/test_accuracy.py \
  --ignore=entrypoints/llm/test_init.py \
-  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi


--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -109,10 +109,9 @@ steps:
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Entrypoints Test (API Server) # 40min
@@ -326,7 +325,7 @@ steps:
  source_file_dependencies:
  - vllm/lora
  - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
  parallelism: 4

 - label: PyTorch Compilation Unit Tests
@@ -807,13 +806,13 @@ steps:
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
-    - pytest -v -s -x lora/test_multi_loras_with_tp.py
+    - pytest -v -s -x lora/test_llm_with_multi_loras.py


 - label: Weight Loading Multiple GPU Test  # 33min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_gpus: 2 
  optional: true
  source_file_dependencies:
  - vllm/

--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import weakref
-
-import pytest
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
-
-from vllm import LLM
-from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.lora.request import LoRARequest
-
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-
-PROMPTS = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
-
-
-@pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
-@pytest.fixture(scope="module", params=[False, True])
-def llm(request, monkeypatch_module):
-
-    use_v1 = request.param
-    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
-
-    # pytest caches the fixture so we use weakref.proxy to
-    # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              tensor_parallel_size=1,
-              max_model_len=8192,
-              enable_lora=True,
-              max_loras=4,
-              max_lora_rank=64,
-              max_num_seqs=128,
-              enforce_eager=True)
-
-    yield weakref.proxy(llm)
-
-    del llm
-
-    cleanup_dist_env_and_memory()
-
-
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
-@pytest.mark.skip_global_cleanup
-def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
-    lora_request = [
-        LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files)
-        for idx in range(len(PROMPTS))
-    ]
-    # Multiple SamplingParams should be matched with each prompt
-    outputs = llm.generate(PROMPTS, lora_request=lora_request)
-    assert len(PROMPTS) == len(outputs)
-
-    # Exception raised, if the size of params does not match the size of prompts
-    with pytest.raises(ValueError):
-        outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
-
-    # Single LoRARequest should be applied to every prompt
-    single_lora_request = lora_request[0]
-    outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
-    assert len(PROMPTS) == len(outputs)
--- a/tests/lora/test_multi_loras_with_tp.py
+++ b/tests/lora/test_llm_with_multi_loras.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-Script to test multi loras service with tp >= 2
+This script contains:
+1. a test of the multi-LoRA service with tp >= 2
+2. a test of multiple LoRA requests in a single generate() call
 """
+import pytest
+
 from tests.utils import multi_gpu_test
 from vllm import LLM, SamplingParams
 from vllm.lora.request import LoRARequest
@@ -156,3 +160,34 @@ def test_multi_loras_with_tp_sync():

        output_text = call_llm_get_outputs(prompt, "Alice")
        check_outputs(output_text, expected_output)
+
+
+def test_multiple_lora_requests():
+    llm = LLM(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=LORA_RANK,
+        max_model_len=512,
+        gpu_memory_utilization=0.5,
+        enforce_eager=True,
+    )
+    PROMPTS = ["Hello, my name is"] * 2
+    LORA_NAME = "Alice"
+    lora_request = [
+        LoRARequest(LORA_NAME + str(idx), idx + 1,
+                    LORA_NAME_PATH_MAP[LORA_NAME])
+        for idx in range(len(PROMPTS))
+    ]
+    # Multiple LoRARequests should be matched one-to-one with the prompts
+    outputs = llm.generate(PROMPTS, lora_request=lora_request)
+    assert len(PROMPTS) == len(outputs)
+
+    # ValueError is raised when len(lora_request) != len(prompts)
+    with pytest.raises(ValueError):
+        outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
+
+    # Single LoRARequest should be applied to every prompt
+    single_lora_request = lora_request[0]
+    outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
+    assert len(PROMPTS) == len(outputs)