[tests] fix tests

04629132 · zhuwenwen · 07c69390 · 04629132 · 04629132 · 04629132
Commit 04629132 authored Jun 12, 2025 by zhuwenwen
12 changed files
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -3,6 +3,7 @@
 import itertools
 from functools import partial

+import os
 import pytest
 from PIL import Image
 from pqdm.threads import pqdm
@@ -12,6 +13,7 @@ from vllm.multimodal.parse import ImageSize
 from vllm.multimodal.processing import BaseMultiModalProcessor

 from ...utils import build_model_context
+from ....utils import models_path_prefix


 def _validate_image_max_tokens_one(
@@ -33,7 +35,7 @@ def _validate_image_max_tokens_one(
 @pytest.mark.skip("This test takes around 5 minutes to run. "
                  "Comment this out to run it manually.")
 @pytest.mark.parametrize("model_id",
-                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+                         [os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")])
 def test_processor_max_tokens(model_id):
    ctx = build_model_context(
        model_id,
@@ -127,7 +129,7 @@ def _test_image_prompt_replacements(


 @pytest.mark.parametrize("model_id",
-                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+                         [os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")])
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_prompt_replacements_regression(model_id, num_imgs):
    ctx = build_model_context(
@@ -180,4 +182,4 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
-    )
+    )
\ No newline at end of file
--- a/tests/models/multimodal/processing/test_mllama.py
+++ b/tests/models/multimodal/processing/test_mllama.py
 # SPDX-License-Identifier: Apache-2.0
 """Tests for mllama's multimodal preprocessing and profiling."""
+import os
 import pytest
 from transformers import MllamaConfig

@@ -7,10 +8,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.profiling import MultiModalProfiler

 from ...utils import build_model_context
+from ....utils import models_path_prefix


 @pytest.mark.parametrize("model_id",
-                         ["meta-llama/Llama-3.2-11B-Vision-Instruct"])
+                         [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct")])
 @pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
 @pytest.mark.parametrize("max_num_seqs", [1, 2, 8])
 def test_profiling(
@@ -68,4 +70,4 @@ def test_profiling(
    # simulate mllama image-present prefill.
    for actual_len, last_group_len in zip(actual_encoder_seq_lens,
                                          encoder_seq_lens):
-        assert actual_len >= last_group_len
+        assert actual_len >= last_group_len
\ No newline at end of file
--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
 # SPDX-License-Identifier: Apache-2.0
 """Tests for phi3v's multimodal preprocessing kwargs."""
+import os
 import pytest

 from vllm.multimodal import MULTIMODAL_REGISTRY

 from ....conftest import _ImageAssets
 from ...utils import build_model_context
+from ....utils import models_path_prefix


-@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
+@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")])
 # yapf: disable
 @pytest.mark.parametrize(
    ("mm_processor_kwargs", "expected_toks_per_img"),
@@ -50,4 +52,4 @@ def test_processor_override(

    # Ensure we have the right number of placeholders per num_crops size
    img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
-    assert img_tok_count == expected_toks_per_img * num_imgs
+    assert img_tok_count == expected_toks_per_img * num_imgs
\ No newline at end of file
--- a/tests/models/multimodal/processing/test_phi4mm.py
+++ b/tests/models/multimodal/processing/test_phi4mm.py
 # SPDX-License-Identifier: Apache-2.0
 """Tests for phi4mm's multimodal preprocessing kwargs."""
+import os
 import pytest

 from vllm.multimodal import MULTIMODAL_REGISTRY

 from ....conftest import _ImageAssets
 from ...utils import build_model_context
+from ....utils import models_path_prefix


-@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
+@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "microsoft/Phi-4-multimodal-instruct")])
 # yapf: disable
 @pytest.mark.parametrize(
    ("mm_processor_kwargs", "expected_toks_per_img"),
@@ -56,4 +58,4 @@ def test_processor_override(
    # Ensure we have the right number of placeholders per num_crops size
    img_tok_count = processed_inputs["prompt_token_ids"].count(
        _IMAGE_PLACEHOLDER_TOKEN_ID)
-    assert img_tok_count == expected_toks_per_img * num_imgs
+    assert img_tok_count == expected_toks_per_img * num_imgs
\ No newline at end of file
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
 # SPDX-License-Identifier: Apache-2.0

+import os
 import pytest

 from vllm.multimodal import MULTIMODAL_REGISTRY

 from ....conftest import _ImageAssets
 from ...utils import build_model_context
+from ....utils import models_path_prefix


-@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
+@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")])
 # yapf: disable
 @pytest.mark.parametrize(
    ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [
@@ -51,4 +53,4 @@ def test_processor_override(

    assert img_tok_count == expected_toks_per_img * num_imgs
    assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
-    assert pixel_shape[1] == expected_pixels_shape[1]
+    assert pixel_shape[1] == expected_pixels_shape[1]
\ No newline at end of file
--- a/tests/models/multimodal/processing/test_smolvlm.py
+++ b/tests/models/multimodal/processing/test_smolvlm.py
 # SPDX-License-Identifier: Apache-2.0
 """Tests for smolvlm's multimodal preprocessing kwargs."""
+import os
 import pytest
 from transformers import SmolVLMConfig

@@ -7,9 +8,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY

 from ....conftest import _ImageAssets
 from ...utils import build_model_context
+from ....utils import models_path_prefix


-@pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
+@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM2-2.2B-Instruct")])
 # yapf: disable
 @pytest.mark.parametrize(
    ("mm_processor_kwargs", "expected_toks_per_img"),
@@ -62,4 +64,4 @@ def test_processor_override(
    # Ensure we have the right number of placeholders per num_crops size
    image_token_id = ctx.get_hf_config().image_token_id
    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    assert img_tok_count == expected_toks_per_img * num_imgs
+    assert img_tok_count == expected_toks_per_img * num_imgs
\ No newline at end of file
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -8,7 +8,9 @@ import os
 import pytest
 from packaging.version import Version
 from transformers import __version__ as TRANSFORMERS_VERSION
+# from ..utils import models_path_prefix

+models_path_prefix = os.getenv('VLLM_OPTEST_MODELS_PATH') or os.getenv("OPTEST_MODELS_PATH")


 @dataclass(frozen=True)
@@ -109,8 +111,6 @@ class _HfExamplesInfo:
                pytest.skip(msg)


-models_path_prefix = os.getenv('VLLM_OPTEST_MODELS_PATH') or os.getenv("OPTEST_MODELS_PATH")
-
 # yapf: disable
 _TEXT_GENERATION_EXAMPLE_MODELS = {
    # [Decoder-only]

--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -2,6 +2,7 @@

 import warnings

+import os
 import pytest
 import torch.cuda

@@ -20,6 +21,8 @@ from vllm.platforms import current_platform
 from ..utils import create_new_process_for_each_test
 from .registry import HF_EXAMPLE_MODELS

+models_path_prefix = os.getenv('VLLM_OPTEST_MODELS_PATH') or os.getenv("OPTEST_MODELS_PATH")
+

 @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
 def test_registry_imports(model_arch):
@@ -52,12 +55,12 @@ def test_registry_imports(model_arch):

 @create_new_process_for_each_test()
 @pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
-    ("LlamaForCausalLM", False, False, False),
-    ("MllamaForConditionalGeneration", True, False, False),
-    ("LlavaForConditionalGeneration", True, True, False),
-    ("BertForSequenceClassification", False, False, True),
-    ("RobertaForSequenceClassification", False, False, True),
-    ("XLMRobertaForSequenceClassification", False, False, True),
+    (os.path.join(models_path_prefix, "LlamaForCausalLM"), False, False, False),
+    (os.path.join(models_path_prefix, "MllamaForConditionalGeneration"), True, False, False),
+    (os.path.join(models_path_prefix, "LlavaForConditionalGeneration"), True, True, False),
+    (os.path.join(models_path_prefix, "BertForSequenceClassification"), False, False, True),
+    (os.path.join(models_path_prefix, "RobertaForSequenceClassification"), False, False, True),
+    (os.path.join(models_path_prefix, "XLMRobertaForSequenceClassification"), False, False, True),
 ])
 def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
    assert ModelRegistry.is_multimodal_model(model_arch) is is_mm
@@ -77,9 +80,9 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):

 @create_new_process_for_each_test()
 @pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
-    ("MLPSpeculatorPreTrainedModel", False, False),
-    ("DeepseekV2ForCausalLM", True, False),
-    ("Qwen2VLForConditionalGeneration", True, True),
+    (os.path.join(models_path_prefix, "MLPSpeculatorPreTrainedModel"), False, False),
+    (os.path.join(models_path_prefix, "DeepseekV2ForCausalLM"), True, False),
+    (os.path.join(models_path_prefix, "Qwen2VLForConditionalGeneration"), True, True),
 ])
 def test_registry_is_pp(model_arch, is_pp, init_cuda):
    assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp
@@ -104,4 +107,4 @@ def test_hf_registry_coverage():

    assert not untested_archs, (
        "Please add the following architectures to "
-        f"`tests/models/registry.py`: {untested_archs}")
+        f"`tests/models/registry.py`: {untested_archs}")
\ No newline at end of file
--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
-# SPDX-License-Identifier: Apache-2.0
-
-# Expanded quantized model tests for CPU offloading
-# Base tests: tests/basic_correctness/test_cpu_offload.py
-
-import pytest
-import os
-
-from tests.quantization.utils import is_quant_method_supported
-
-from ..utils import compare_two_settings, models_path_prefix
-from vllm.platforms import current_platform
-
-
-@pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
-                    reason="fp8 is not supported on this GPU type.")
-def test_cpu_offload_fp8():
-    # Test quantization of an unquantized checkpoint
-    compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
-                         ["--quantization", "fp8"],
-                         ["--quantization", "fp8", "--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-    # Test loading a quantized checkpoint
-    # compare_two_settings(os.path.join(models_path_prefix, "neuralmagic/Qwen2-1.5B-Instruct-FP8"), [],
-    #                      ["--cpu-offload-gb", "1"],
-    #                      max_wait_seconds=480)
-
-
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
-                    reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_gptq(monkeypatch):
-    # This quant method is sensitive to dummy weights, so we force real weights
-    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
-    # Test GPTQ Marlin
-    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"), [],
-                         ["--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-    # Test GPTQ
-    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"),
-                         ["--quantization", "gptq"],
-                         ["--quantization", "gptq", "--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-
-
-@pytest.mark.skipif(not is_quant_method_supported("awq_marlin") or current_platform.is_rocm(),
-                    reason="awq_marlin is not supported on this GPU type.")
-def test_cpu_offload_awq(monkeypatch):
-    # This quant method is sensitive to dummy weights, so we force real weights
-    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
-    # Test AWQ Marlin
-    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"), [],
-                         ["--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-    # Test AWQ
-    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"),
-                         ["--quantization", "awq"],
-                         ["--quantization", "awq", "--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-
-
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
-                    reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_compressed_tensors(monkeypatch):
-    # This quant method is sensitive to dummy weights, so we force real weights
-    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
-    # Test wNa16
-    compare_two_settings(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w4a16-channel-v2"), [],
-                         ["--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-    # Test w4a16_marlin24
-    compare_two_settings(os.path.join(models_path_prefix, "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"),
-                         [], ["--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-    # Test w8a8
-    compare_two_settings(
-        os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), [],
-        ["--cpu-offload-gb", "1"],
-        max_wait_seconds=480)
+# SPDX-License-Identifier: Apache-2.0
+
+# Expanded quantized model tests for CPU offloading
+# Base tests: tests/basic_correctness/test_cpu_offload.py
+
+import pytest
+import os
+
+from tests.quantization.utils import is_quant_method_supported
+
+from ..utils import compare_two_settings, models_path_prefix
+from vllm.platforms import current_platform
+
+
+@pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
+                    reason="fp8 is not supported on this GPU type.")
+def test_cpu_offload_fp8():
+    # Test quantization of an unquantized checkpoint
+    compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
+                         ["--quantization", "fp8"],
+                         ["--quantization", "fp8", "--cpu-offload-gb", "1"],
+                         max_wait_seconds=480)
+    # Test loading a quantized checkpoint
+    # compare_two_settings(os.path.join(models_path_prefix, "neuralmagic/Qwen2-1.5B-Instruct-FP8"), [],
+    #                      ["--cpu-offload-gb", "1"],
+    #                      max_wait_seconds=480)
+
+
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
+                    reason="gptq_marlin is not supported on this GPU type.")
+def test_cpu_offload_gptq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
+    # Test GPTQ Marlin
+    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"), [],
+                         ["--cpu-offload-gb", "1"],
+                         max_wait_seconds=480)
+    # Test GPTQ
+    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"),
+                         ["--quantization", "gptq"],
+                         ["--quantization", "gptq", "--cpu-offload-gb", "1"],
+                         max_wait_seconds=480)
+
+
+@pytest.mark.skipif(not is_quant_method_supported("awq_marlin") or current_platform.is_rocm(),
+                    reason="awq_marlin is not supported on this GPU type.")
+def test_cpu_offload_awq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
+    # Test AWQ Marlin
+    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"), [],
+                         ["--cpu-offload-gb", "1"],
+                         max_wait_seconds=480)
+    # Test AWQ
+    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"),
+                         ["--quantization", "awq"],
+                         ["--quantization", "awq", "--cpu-offload-gb", "1"],
+                         max_wait_seconds=480)
+
+
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
+                    reason="gptq_marlin is not supported on this GPU type.")
+def test_cpu_offload_compressed_tensors(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
+    # Test wNa16
+    compare_two_settings(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w4a16-channel-v2"), [],
+                         ["--cpu-offload-gb", "1"],
+                         max_wait_seconds=480)
+    # Test w4a16_marlin24
+    compare_two_settings(os.path.join(models_path_prefix, "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"),
+                         [], ["--cpu-offload-gb", "1"],
+                         max_wait_seconds=480)
+    # Test w8a8
+    compare_two_settings(
+        os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), [],
+        ["--cpu-offload-gb", "1"],
+        max_wait_seconds=480)
--- a/tests/test_embedded_commit.py
+++ b/tests/test_embedded_commit.py
 # SPDX-License-Identifier: Apache-2.0

-import vllm
-
-
-def test_embedded_commit_defined():
-    assert hasattr(vllm, "__version__")
-    assert hasattr(vllm, "__version_tuple__")
-    assert vllm.__version__ != "dev"
-    assert vllm.__version_tuple__ != (0, 0, "dev")
+import vllm
+
+
+def test_embedded_commit_defined():
+    assert hasattr(vllm, "__version__")
+    assert hasattr(vllm, "__version_tuple__")
+    assert vllm.__version__ != "dev"
+    assert vllm.__version_tuple__ != (0, 0, "dev")
--- a/tests/test_seed_behavior.py
+++ b/tests/test_seed_behavior.py
-# SPDX-License-Identifier: Apache-2.0
-import random
-
-import numpy as np
-import torch
-
-from vllm.platforms.interface import Platform
-
-
-def test_seed_behavior():
-    # Test with a specific seed
-    Platform.seed_everything(42)
-    random_value_1 = random.randint(0, 100)
-    np_random_value_1 = np.random.randint(0, 100)
-    torch_random_value_1 = torch.randint(0, 100, (1, )).item()
-
-    Platform.seed_everything(42)
-    random_value_2 = random.randint(0, 100)
-    np_random_value_2 = np.random.randint(0, 100)
-    torch_random_value_2 = torch.randint(0, 100, (1, )).item()
-
-    assert random_value_1 == random_value_2
-    assert np_random_value_1 == np_random_value_2
-    assert torch_random_value_1 == torch_random_value_2
+# SPDX-License-Identifier: Apache-2.0
+import random
+
+import numpy as np
+import torch
+
+from vllm.platforms.interface import Platform
+
+
+def test_seed_behavior():
+    # Test with a specific seed
+    Platform.seed_everything(42)
+    random_value_1 = random.randint(0, 100)
+    np_random_value_1 = np.random.randint(0, 100)
+    torch_random_value_1 = torch.randint(0, 100, (1, )).item()
+
+    Platform.seed_everything(42)
+    random_value_2 = random.randint(0, 100)
+    np_random_value_2 = np.random.randint(0, 100)
+    torch_random_value_2 = torch.randint(0, 100, (1, )).item()
+
+    assert random_value_1 == random_value_2
+    assert np_random_value_1 == np_random_value_2
+    assert torch_random_value_1 == torch_random_value_2
--- a/tests/weight_loading/run_model_weight_loading_test.sh
+++ b/tests/weight_loading/run_model_weight_loading_test.sh