[CI/Build] Fix CI LoRA failure (#16270)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>

[CI/Build] Fix CI LoRA failure (#16270)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
86c3369e · Jee Jee Li · GitHub · 2755c34a · 86c3369e · 86c3369e
Unverified commit 86c3369e, authored Apr 09, 2025 by Jee Jee Li; committed by GitHub on Apr 09, 2025
8 changed files
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -256,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch):
        monkeypatch.setenv('VLLM_USE_V1', '0')
    yield
+@pytest.fixture
+def reset_default_device():
+    """
+    Some tests, such as `test_punica_ops.py`, explicitly set the 
+    default device, which can affect subsequent tests. Adding this fixture 
+    helps avoid this problem.
+    """
+    original_device = torch.get_default_device()
+    yield
+    torch.set_default_device(original_device)
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -73,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
                       max_num_seqs=16,
                       max_loras=4,
                       max_lora_rank=64,
-                       tensor_parallel_size=1,
                       trust_remote_code=True,
                       fully_sharded_loras=fully_sharded)
    output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)

--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
                   enable_lora=True,
                   max_loras=4,
                   max_lora_rank=64,
-                   tensor_parallel_size=1,
                   trust_remote_code=True,
                   enable_chunked_prefill=True)

--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -65,7 +65,7 @@ VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
 @pytest.fixture(autouse=True)
-def clean_cache():
+def clean_cache_reset_device(reset_default_device):
    # Release any memory we might be holding on to. CI runs OOMs otherwise.
    from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                _LORA_B_PTR_DICT)

--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -88,7 +88,6 @@ def test_llama_lora(sql_lora_files):
        # also test odd max_num_seqs
        max_num_seqs=13,
        max_loras=4,
-        tensor_parallel_size=1,
        enable_chunked_prefill=True)
    generate_and_test(llm, sql_lora_files)

--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
@@ -13,6 +13,11 @@ from vllm.platforms import current_platform
 from .utils import PunicaTensors, assert_close, generate_data_for_nslices
+@pytest.fixture(autouse=True)
+def reset_device(reset_default_device):
+    pass
 # Utility shrink and expand operations used as reference implementations.
 def sgmv_shrink_for_nslices(
        nslices: int, inputs_tensor: torch.Tensor,

--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -78,12 +78,7 @@ def do_sample(llm: vllm.LLM,
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("tp_size", [1])
+def test_quant_model_lora(tinyllama_lora_files, model):
-def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
-                          tp_size):
-    if num_gpus_available < tp_size and \
-        tp_size > 1 and current_platform.is_cuda_alike():
-        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
    llm = vllm.LLM(
        model=model.model_path,
@@ -91,7 +86,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
        max_num_seqs=16,
        max_loras=4,
        max_model_len=400,
-        tensor_parallel_size=tp_size,
        gpu_memory_utilization=0.2,  #avoid OOM
        quantization=model.quantization,
        trust_remote_code=True,
@@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
-        tensor_parallel_size=1,
        gpu_memory_utilization=0.2,  #avoid OOM
        quantization=model.quantization,
        trust_remote_code=True,

--- a/tests/lora/test_transfomers_model.py
+++ b/tests/lora/test_transfomers_model.py
@@ -53,7 +53,6 @@ def test_ilama_lora(ilama_lora_files):
                   enable_lora=True,
                   max_loras=4,
                   max_lora_rank=16,
-                   tensor_parallel_size=1,
                   trust_remote_code=True,
                   enable_chunked_prefill=True)