[CI/Build] Reorganize models tests (#17459)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[CI/Build] Reorganize models tests (#17459)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
afb4429b · Cyrus Leung · GitHub · aa4502e7 · afb4429b · afb4429b
Unverified commit afb4429b, authored May 01, 2025 by Cyrus Leung; committed by GitHub Apr 30, 2025
20 changed files
--- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
--- a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
--- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
--- a/tests/models/decoder_only/vision_language/vlm_utils/runners.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py
--- a/tests/models/decoder_only/vision_language/vlm_utils/types.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py
--- a/tests/models/embedding/__init__.py
+++ b/tests/models/embedding/__init__.py
--- a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py
+++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py
@@ -10,7 +10,7 @@ from transformers import Qwen2VLForConditionalGeneration
 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
 from ....utils import large_gpu_test
-from ..utils import check_embeddings_close
+from ...utils import check_embeddings_close
 HF_TEXT_PROMPTS = [
    # T -> X

--- a/tests/models/embedding/vision_language/test_llava_next.py
+++ b/tests/models/embedding/vision_language/test_llava_next.py
@@ -8,7 +8,7 @@ from vllm.platforms import current_platform
 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
 from ....utils import large_gpu_test
-from ..utils import check_embeddings_close
+from ...utils import check_embeddings_close
 # Llava Next embedding implementation is only supported by CUDA.
 # If run on ROCm, hf_model.model.resize_token_embeddings will

--- a/tests/models/embedding/vision_language/test_phi3v.py
+++ b/tests/models/embedding/vision_language/test_phi3v.py
@@ -9,7 +9,7 @@ from vllm.assets.image import VLM_IMAGES_DIR
 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
 from ....utils import large_gpu_test
-from ..utils import check_embeddings_close
+from ...utils import check_embeddings_close
 HF_TEXT_PROMPTS = [
    # T -> X

--- a/tests/models/embedding/language/__init__.py
+++ b/tests/models/embedding/language/__init__.py
--- a/tests/models/decoder_only/language/test_aqlm.py
+++ b/tests/models/decoder_only/language/test_aqlm.py
@@ -39,7 +39,6 @@ ground_truth_generations = [
 ]
-@pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("aqlm"),
                    reason="AQLM is not supported on this GPU type.")
 @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])

--- a/tests/models/decoder_only/vision_language/test_awq.py
+++ b/tests/models/decoder_only/vision_language/test_awq.py
@@ -7,8 +7,8 @@ import torch
 from vllm.multimodal.image import rescale_image_size
-from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
+from ...conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
-from ...utils import check_logprobs_close
+from ..utils import check_logprobs_close
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -85,7 +85,6 @@ def run_awq_test(
        )
-@pytest.mark.quant_model
 @pytest.mark.parametrize(
    ("source_model", "quant_model"),
    [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],

--- a/tests/models/test_bitblas.py
+++ b/tests/models/test_bitblas.py
@@ -15,7 +15,7 @@ from dataclasses import dataclass
 import pytest
-from .utils import check_logprobs_close
+from ..utils import check_logprobs_close
 @dataclass

--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -4,20 +4,15 @@
 """Tests fp8 models against ground truth generation
 Note: these tests will only pass on L4 GPU.
 """
-import os
-from typing import Optional
 import pytest
-from tests.kernels.utils import override_backend_env_variable
 from tests.quantization.utils import is_quant_method_supported
 from vllm.platforms import current_platform
 from vllm.utils import STR_BACKEND_ENV_VAR
-from ...utils import check_logprobs_close
+from ..utils import check_logprobs_close
-@pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize(

--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -14,9 +14,9 @@ from transformers import AutoTokenizer
 from tests.quantization.utils import is_quant_method_supported
-from ....conftest import VllmRunner
+from ...conftest import VllmRunner
-from ....utils import multi_gpu_test
+from ...utils import multi_gpu_test
-from ...utils import check_logprobs_close
+from ..utils import check_logprobs_close
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
@@ -38,7 +38,6 @@ LLAMA_CONFIG = GGUFTestConfig(
    original_model="meta-llama/Llama-3.2-1B-Instruct",
    gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
    gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
-    marks=[pytest.mark.quant_model],
 )
 QWEN2_CONFIG = GGUFTestConfig(

--- a/tests/models/test_gptq_bitblas.py
+++ b/tests/models/test_gptq_bitblas.py
@@ -15,7 +15,7 @@ from dataclasses import dataclass
 import pytest
-from .utils import check_logprobs_close
+from ..utils import check_logprobs_close
 @dataclass

--- a/tests/models/decoder_only/language/test_gptq_marlin.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin.py
@@ -16,7 +16,7 @@ import pytest
 from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
-from ...utils import check_logprobs_close
+from ..utils import check_logprobs_close
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
@@ -34,7 +34,6 @@ MODELS = [
 ]
-@pytest.mark.quant_model
 @pytest.mark.flaky(reruns=3)
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")

--- a/tests/models/decoder_only/language/test_gptq_marlin_24.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py
@@ -13,7 +13,7 @@ import pytest
 from tests.quantization.utils import is_quant_method_supported
-from ...utils import check_logprobs_close
+from ..utils import check_logprobs_close
 @dataclass
@@ -39,7 +39,6 @@ model_pairs = [
 ]
-@pytest.mark.quant_model
 @pytest.mark.flaky(reruns=2)
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
                    reason="Marlin24 is not supported on this GPU type.")