[CI/Build] Reorganize models tests (#17459)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[CI/Build] Reorganize models tests (#17459)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
afb4429b · Cyrus Leung · GitHub · aa4502e7 · afb4429b · afb4429b
Unverified commit afb4429b, authored May 01, 2025 by Cyrus Leung; committed by GitHub Apr 30, 2025
20 changed files
--- a/tests/models/embedding/language/test_cls_models.py
+++ b/tests/models/embedding/language/test_cls_models.py
--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -8,7 +8,7 @@ import pytest
 from vllm.config import PoolerConfig
 from vllm.platforms import current_platform

-from ..utils import check_embeddings_close
+from ...utils import check_embeddings_close


 @pytest.mark.parametrize(

--- a/tests/models/embedding/language/test_gritlm.py
+++ b/tests/models/embedding/language/test_gritlm.py
@@ -7,11 +7,10 @@ from array import array

 import openai
 import pytest
-import pytest_asyncio
 from scipy.spatial.distance import cosine

-import vllm
-import vllm.config
+from vllm import LLM, SamplingParams
+from vllm.config import ModelConfig
 from vllm.utils import STR_BACKEND_ENV_VAR

 from ....utils import RemoteOpenAIServer
@@ -31,73 +30,45 @@ def _arr(arr):
    return array("i", arr)


-def test_find_array(monkeypatch: pytest.MonkeyPatch):
-    # GritLM embedding implementation is only supported by XFormers backend.
-    with monkeypatch.context() as m:
-        m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
-
-        from vllm.model_executor.models.gritlm import GritLMPooler
-
-        # Create an LLM object to get the model config.
-        llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
-        pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
-
-        arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
-
-        assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
-        assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
-        assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
-        assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
-
-        with pytest.raises(ValueError):
-            pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
-
-
-@pytest.fixture(scope="module")
-def server_embedding():
-    # GritLM embedding implementation is only supported by XFormers backend.
-    args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
-    with pytest.MonkeyPatch.context() as m:
-        m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
-        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-            yield remote_server
-
-
-@pytest.fixture(scope="module")
-def server_generate():
-    args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
-    with pytest.MonkeyPatch.context() as m:
-        m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
-        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-            yield remote_server
+def test_find_array():
+    from vllm.model_executor.models.gritlm import GritLMPooler

+    model_config = ModelConfig(
+        MODEL_NAME,
+        task="embed",
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        dtype="bfloat16",
+        seed=0,
+    )
+    pooler = GritLMPooler(model_config=model_config)

-@pytest_asyncio.fixture
-async def client_embedding(server_embedding: RemoteOpenAIServer):
-    async with server_embedding.get_async_client() as async_client:
-        yield async_client
+    arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

+    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
+    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
+    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
+    assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1

-@pytest_asyncio.fixture
-async def client_generate(server_generate: RemoteOpenAIServer):
-    async with server_generate.get_async_client() as async_client:
-        yield async_client
+    with pytest.raises(ValueError):
+        pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)


 def run_llm_encode(
-    llm: vllm.LLM,
+    llm: LLM,
    queries: list[str],
    instruction: str,
-) -> list[float]:
-    outputs = llm.encode([instruction + q for q in queries], )
+) -> list[list[float]]:
+    outputs = llm.embed([instruction + q for q in queries])
    return [output.outputs.embedding for output in outputs]


 async def run_client_embeddings(
-    client: vllm.LLM,
+    client: openai.AsyncOpenAI,
    queries: list[str],
    instruction: str,
-) -> list[float]:
+) -> list[list[float]]:
    outputs = await client.embeddings.create(
        model=MODEL_NAME,
        input=[instruction + q for q in queries],
@@ -132,7 +103,7 @@ def get_test_data():
    return queries, q_instruction, documents, d_instruction


-def validate_embed_output(q_rep: list[float], d_rep: list[float]):
+def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]):
    cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
    assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001)

@@ -143,70 +114,100 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
    assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001)

    cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1])
-    assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001)
+    assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001)


-def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
+def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch,
+                                  vllm_runner):
    # GritLM embedding implementation is only supported by XFormers backend.
    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")

        queries, q_instruction, documents, d_instruction = get_test_data()

-        llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
+        with vllm_runner(
+                MODEL_NAME,
+                task="embed",
+                max_model_len=MAX_MODEL_LEN,
+        ) as vllm_model:
+            llm = vllm_model.model
+
+            d_rep = run_llm_encode(
+                llm,
+                documents,
+                d_instruction,
+            )
+            q_rep = run_llm_encode(
+                llm,
+                queries,
+                q_instruction,
+            )
+
+        validate_embed_output(q_rep, d_rep)
+
+
+@pytest.mark.asyncio
+async def test_gritlm_api_server_embedding():
+    queries, q_instruction, documents, d_instruction = get_test_data()
+
+    # GritLM embedding implementation is only supported by XFormers backend.
+    args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
+    env_dict = {STR_BACKEND_ENV_VAR: "XFORMERS"}
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
+        client_embedding = server.get_async_client()

-        d_rep = run_llm_encode(
-            llm,
+        d_rep = await run_client_embeddings(
+            client_embedding,
            documents,
            d_instruction,
        )
-        q_rep = run_llm_encode(
-            llm,
+        q_rep = await run_client_embeddings(
+            client_embedding,
            queries,
            q_instruction,
        )

-        validate_embed_output(q_rep, d_rep)
-
-
-@pytest.mark.asyncio
-async def test_gritlm_api_server_embedding(
-    client_embedding: openai.AsyncOpenAI, ):
-    queries, q_instruction, documents, d_instruction = get_test_data()
+    validate_embed_output(q_rep, d_rep)

-    d_rep = await run_client_embeddings(
-        client_embedding,
-        documents,
-        d_instruction,
-    )
-    q_rep = await run_client_embeddings(
-        client_embedding,
-        queries,
-        q_instruction,
-    )

-    validate_embed_output(q_rep, d_rep)
+def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
+    # GritLM embedding implementation is only supported by XFormers backend.
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+        m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")

+        input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"

-def test_gritlm_offline_gen():
-    input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
+        with vllm_runner(
+                MODEL_NAME,
+                task="generate",
+                max_model_len=MAX_MODEL_LEN,
+        ) as vllm_model:
+            llm = vllm_model.model

-    llm = vllm.LLM(MODEL_NAME, max_model_len=MAX_MODEL_LEN)
-    sampling_params = vllm.SamplingParams(temperature=0.0, max_tokens=256)
-    outputs = llm.generate(input, sampling_params=sampling_params)
+            sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
+            outputs = llm.generate(input, sampling_params=sampling_params)

-    assert outputs[0].outputs[0].text == "The capital of France is Paris."
+        assert outputs[0].outputs[0].text == "The capital of France is Paris."


 @pytest.mark.asyncio
-async def test_gritlm_api_server_gen(client_generate: openai.AsyncOpenAI):
+async def test_gritlm_api_server_generate():
    input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"

-    outputs = await client_generate.completions.create(
-        model=MODEL_NAME,
-        prompt=input,
-        max_tokens=256,
-        temperature=0.0,
-    )
+    # GritLM embedding implementation is only supported by XFormers backend.
+    args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
+    env_dict = {"VLLM_USE_V1": "0", STR_BACKEND_ENV_VAR: "XFORMERS"}
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
+        client_generate = server.get_async_client()
+
+        outputs = await client_generate.completions.create(
+            model=MODEL_NAME,
+            prompt=input,
+            max_tokens=256,
+            temperature=0.0,
+        )

    assert outputs.choices[0].text == "The capital of France is Paris."
--- a/tests/models/embedding/language/test_jina.py
+++ b/tests/models/embedding/language/test_jina.py
@@ -8,9 +8,10 @@ import math

 import pytest

-from tests.models.embedding.utils import check_embeddings_close, matryoshka_fy
 from vllm import PoolingParams

+from ...utils import check_embeddings_close, matryoshka_fy
+
 SCORING_MODELS = [
    "jinaai/jina-reranker-v2-base-multilingual",  # Roberta
 ]

--- a/tests/models/embedding/language/test_scoring.py
+++ b/tests/models/embedding/language/test_scoring.py
--- a/tests/models/embedding/language/test_snowflake_arctic_embed.py
+++ b/tests/models/embedding/language/test_snowflake_arctic_embed.py
@@ -5,9 +5,7 @@ Run `pytest tests/models/embedding/language/test_snowflake_arctic_embed.py`.
 """
 import pytest

-from tests.models.embedding.utils import EmbedModelInfo
-
-from ..utils import check_embeddings_close
+from ...utils import EmbedModelInfo, check_embeddings_close

 EMBEDDING_PROMPTS = [
    'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!',

--- a/tests/models/embedding/language/test_truncation_control.py
+++ b/tests/models/embedding/language/test_truncation_control.py
--- a/tests/models/decoder_only/vision_language/__init__.py
+++ b/tests/models/decoder_only/vision_language/__init__.py
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -267,6 +267,7 @@ VLM_TEST_SETTINGS = {
        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
+        dtype="bfloat16",
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
@@ -423,6 +424,8 @@ VLM_TEST_SETTINGS = {
        get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
        patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
+        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
+        marks=[pytest.mark.skip("HF import fails")],
    ),
    "minicpmo_26": VLMTestInfo(
        models=["openbmb/MiniCPM-o-2_6"],
@@ -434,6 +437,8 @@ VLM_TEST_SETTINGS = {
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
        patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
+        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
+        marks=[pytest.mark.skip("HF import fails")],
    ),
    "minicpmv_26": VLMTestInfo(
        models=["openbmb/MiniCPM-V-2_6"],
@@ -445,6 +450,8 @@ VLM_TEST_SETTINGS = {
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
+        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
+        marks=[pytest.mark.skip("HF import fails")],
    ),
    "minimax_vl_01": VLMTestInfo(
        models=["MiniMaxAI/MiniMax-VL-01"],

--- a/tests/models/encoder_decoder/vision_language/test_florence2.py
+++ b/tests/models/encoder_decoder/vision_language/test_florence2.py
--- a/tests/models/decoder_only/audio_language/test_granite_speech.py
+++ b/tests/models/decoder_only/audio_language/test_granite_speech.py
--- a/tests/models/decoder_only/vision_language/test_interleaved.py
+++ b/tests/models/decoder_only/vision_language/test_interleaved.py
@@ -16,6 +16,7 @@ INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
 NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")


+@pytest.mark.core_model
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("dtype", ["float16"])
 @pytest.mark.parametrize("max_tokens", [128])

--- a/tests/models/decoder_only/vision_language/test_intern_vit.py
+++ b/tests/models/decoder_only/vision_language/test_intern_vit.py
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -17,7 +17,8 @@ from vllm.sequence import SampleLogprobs
 from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                          _ImageAssets)
 from ....quantization.utils import is_quant_method_supported
-from ....utils import large_gpu_test
+from ....utils import (create_new_process_for_each_test, large_gpu_test,
+                       multi_gpu_test)
 from ...utils import check_logprobs_close

 _LIMIT_IMAGE_PER_PROMPT = 3
@@ -393,6 +394,37 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
        )


+@create_new_process_for_each_test()
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models_distributed(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    distributed_executor_backend,
+    model,
+    dtype,
+    max_tokens,
+    num_logprobs,
+) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model=model,
+        size_factors=[0.25, 0.5, 1.0],
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=2,
+        distributed_executor_backend=distributed_executor_backend,
+    )
+
+
 @large_gpu_test(min_gb=48)
 @pytest.mark.core_model
 @pytest.mark.parametrize("model", models)

--- a/tests/models/decoder_only/vision_language/test_phi4mm.py
+++ b/tests/models/decoder_only/vision_language/test_phi4mm.py
--- a/tests/models/decoder_only/vision_language/test_pixtral.py
+++ b/tests/models/decoder_only/vision_language/test_pixtral.py
--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
--- a/tests/models/encoder_decoder/audio_language/test_whisper.py
+++ b/tests/models/encoder_decoder/audio_language/test_whisper.py
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
-
-Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
-"""
 from typing import Optional

 import pytest

-from vllm import LLM, SamplingParams
+from vllm import SamplingParams
 from vllm.assets.audio import AudioAsset

+from ....conftest import VllmRunner
 from ....utils import create_new_process_for_each_test, multi_gpu_test

 PROMPTS = [
@@ -92,6 +89,7 @@ EXPECTED = {


 def run_test(
+    vllm_runner: type[VllmRunner],
    model: str,
    *,
    tensor_parallel_size: int,
@@ -100,38 +98,52 @@ def run_test(
    prompt_list = PROMPTS * 10
    expected_list = EXPECTED[model] * 10

-    llm = LLM(
-        model=model,
-        tensor_parallel_size=tensor_parallel_size,
-        distributed_executor_backend=distributed_executor_backend,
-    )
+    with vllm_runner(
+            model,
+            max_model_len=448,
+            tensor_parallel_size=tensor_parallel_size,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        llm = vllm_model.model

-    sampling_params = SamplingParams(
-        temperature=0,
-        top_p=1.0,
-        max_tokens=200,
-    )
+        sampling_params = SamplingParams(
+            temperature=0,
+            top_p=1.0,
+            max_tokens=200,
+        )

-    outputs = llm.generate(prompt_list, sampling_params)
+        outputs = llm.generate(prompt_list, sampling_params)

    for output, expected in zip(outputs, expected_list):
        print(output.outputs[0].text)
        assert output.outputs[0].text == expected


-@create_new_process_for_each_test()
+@create_new_process_for_each_test("spawn")
 @pytest.mark.core_model
 @pytest.mark.parametrize(
    "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
-def test_models(model) -> None:
-    run_test(model, tensor_parallel_size=1)
+def test_models(vllm_runner, model) -> None:
+    run_test(
+        vllm_runner,
+        model,
+        tensor_parallel_size=1,
+    )


+@create_new_process_for_each_test("spawn")
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.core_model
 @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
 @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
-def test_models_distributed(model, distributed_executor_backend) -> None:
-    run_test(model,
-             tensor_parallel_size=2,
-             distributed_executor_backend=distributed_executor_backend)
+def test_models_distributed(
+    vllm_runner,
+    model,
+    distributed_executor_backend,
+) -> None:
+    run_test(
+        vllm_runner,
+        model,
+        tensor_parallel_size=2,
+        distributed_executor_backend=distributed_executor_backend,
+    )
--- a/tests/models/decoder_only/vision_language/vlm_utils/__init__.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/__init__.py