[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>

[CI] Prune tests/models/decoder_only/language/* tests (#9940)
Signed-off-by: mgoin <michael@neuralmagic.com>
02462465 · Michael Goin · GitHub · b9c64c0c · 02462465 · b9c64c0c
Unverified commit 02462465, authored by Michael Goin on Nov 05, 2024 and committed by GitHub on Nov 05, 2024.
9 changed files
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -321,7 +321,6 @@ steps:
  - tests/models/decoder_only/language
  commands:
    - pytest -v -s models/decoder_only/language/test_models.py
-    - pytest -v -s models/decoder_only/language/test_big_models.py

 - label: Decoder-only Language Models Test (Extended) # 1h20min
  nightly: true
@@ -329,7 +328,7 @@ steps:
  - vllm/
  - tests/models/decoder_only/language
  commands:
-    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
+    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py

 - label: Decoder-only Multi-Modal Models Test (Standard)
  #mirror_hardwares: [amd]

--- a/tests/models/decoder_only/language/test_big_models.py
+++ b/tests/models/decoder_only/language/test_big_models.py
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-This tests bigger models and use half precision.
-
-Run `pytest tests/models/test_big_models.py`.
-"""
-import pytest
-
-from vllm.platforms import current_platform
-
-from ...utils import check_logprobs_close, check_outputs_equal
-
-MODELS = [
-    "meta-llama/Llama-2-7b-hf",
-    # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
-    # "Deci/DeciLM-7b",  # Broken
-    # "tiiuae/falcon-7b",  # Broken
-    "EleutherAI/gpt-j-6b",
-    # "mosaicml/mpt-7b",  # Broken
-    # "Qwen/Qwen1.5-0.5B"  # Broken,
-]
-
-if not current_platform.is_cpu():
-    MODELS += [
-        # fused_moe which not supported on CPU
-        "openbmb/MiniCPM3-4B",
-        # Head size isn't supported on CPU
-        "h2oai/h2o-danube3-4b-base",
-    ]
-
-# TODO: remove this after CPU float16 support ready
-target_dtype = "float" if current_platform.is_cpu() else "half"
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [32])
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-
-    if model == "openbmb/MiniCPM3-4B":
-        # the output becomes slightly different when upgrading to
-        # pytorch 2.5 . Changing to logprobs checks instead of exact
-        # output checks.
-        NUM_LOG_PROBS = 8
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-    else:
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -21,11 +21,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
    "kv_cache_dtype,base_model,test_model,scale_path",
    [
        # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
-        ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
+        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
        # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "meta-llama/Meta-Llama-3-8B-Instruct", None),
+        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
+         "meta-llama/Llama-3.2-1B-Instruct", None),
        # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
        ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
         "meta-llama/Llama-2-7b-chat-hf",
@@ -33,7 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
    ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [True])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.

--- a/tests/models/decoder_only/language/test_gptq_marlin.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin.py
@@ -22,24 +22,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 MAX_MODEL_LEN = 1024

 MODELS = [
-    # act_order==False, group_size=channelwise
-    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
-    # act_order==False, group_size=128
-    ("TheBloke/Llama-2-7B-GPTQ", "main"),
-
    # act_order==True, group_size=128
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
-    # act_order==True, group_size=64
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
-    # act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),

    # 8-bit, act_order==True, group_size=channelwise
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
-    # 8-bit, act_order==True, group_size=128
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
-    # 8-bit, act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),

    # 4-bit, act_order==True, group_size=128
    ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")

--- a/tests/models/decoder_only/language/test_gptq_marlin_24.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py
@@ -25,16 +25,16 @@ model_pairs = [
    # 4-bit, group_size == 128
    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
-    # 4-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
+    # # 4-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),

    # 8-bit, group_size == 128
    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
-    # 8-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
+    # # 8-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
 ]



--- a/tests/models/decoder_only/language/test_marlin.py
+++ b/tests/models/decoder_only/language/test_marlin.py
-"""Compare the outputs of a GPTQ model to a Marlin model.
-
-Note: GPTQ and Marlin do not have bitwise correctness.
-As a result, in this test, we just confirm that the top selected tokens of the
-Marlin/GPTQ models are in the top 3 selections of each other.
-
-Note: Marlin internally uses locks to synchronize the threads. This can
-result in very slight nondeterminism for Marlin. As a result, we re-run the test
-up to 3 times to see if we pass.
-
-Run `pytest tests/models/test_marlin.py`.
-"""
-from dataclasses import dataclass
-
-import pytest
-
-from tests.quantization.utils import is_quant_method_supported
-
-from ...utils import check_logprobs_close
-
-
-@dataclass
-class ModelPair:
-    model_marlin: str
-    model_gptq: str
-
-
-model_pairs = [
-    ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128",
-              model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"),
-    ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin",
-              model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"),
-    ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin",
-              model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq")
-]
-
-
-@pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(not is_quant_method_supported("marlin"),
-                    reason="Marlin is not supported on this GPU type.")
-@pytest.mark.parametrize("model_pair", model_pairs)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(
-    vllm_runner,
-    example_prompts,
-    model_pair: ModelPair,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    with vllm_runner(model_pair.model_marlin,
-                     dtype=dtype,
-                     quantization="marlin") as marlin_model:
-        marlin_outputs = marlin_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-
-    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-                     quantization="gptq") as gptq_model:
-        gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-
-    check_logprobs_close(
-        outputs_0_lst=gptq_outputs,
-        outputs_1_lst=marlin_outputs,
-        name_0="gptq",
-        name_1="marlin",
-    )
--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
@@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
 """
 import pytest

-from vllm import LLM, SamplingParams
+from vllm import SamplingParams

 from ...utils import check_logprobs_close

@@ -15,6 +15,10 @@ MODELS = [
    # "mistralai/Mistral-Nemo-Instruct-2407"
 ]

+MISTRAL_FORMAT_MODELS = [
+    "mistralai/Mistral-7B-Instruct-v0.3",
+]
+
 SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
 SYMBOLIC_LANG_PROMPTS = [
    "勇敢な船乗りについての詩を書く",  # japanese
@@ -95,7 +99,7 @@ def test_models(
    )


-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
@@ -135,28 +139,29 @@ def test_mistral_format(
    )


-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS)
 def test_mistral_symbolic_languages(
+    vllm_runner,
    model: str,
    dtype: str,
-    prompt: str,
 ) -> None:
-    prompt = "hi"
-    msg = {"role": "user", "content": prompt}
-    llm = LLM(model=model,
-              dtype=dtype,
-              max_model_len=8192,
-              tokenizer_mode="mistral",
-              config_format="mistral",
-              load_format="mistral")
-    outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS)
-    assert "�" not in outputs[0].outputs[0].text.strip()
+    with vllm_runner(model,
+                     dtype=dtype,
+                     max_model_len=8192,
+                     tokenizer_mode="mistral",
+                     config_format="mistral",
+                     load_format="mistral") as vllm_model:
+        for prompt in SYMBOLIC_LANG_PROMPTS:
+            msg = {"role": "user", "content": prompt}
+            outputs = vllm_model.model.chat([msg],
+                                            sampling_params=SAMPLING_PARAMS)
+            assert "�" not in outputs[0].outputs[0].text.strip()


 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("model", MODELS[1:])  # v1 can't do func calling
+@pytest.mark.parametrize("model",
+                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling
 def test_mistral_function_calling(
    vllm_runner,
    model: str,

--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -7,25 +7,39 @@ Run `pytest tests/models/test_models.py`.
 """
 import pytest

-from ...utils import check_outputs_equal
+from vllm.platforms import current_platform
+
+from ...utils import check_logprobs_close

 MODELS = [
-    "facebook/opt-125m",
-    "gpt2",
-    "bigcode/tiny_starcoder_py",
-    "EleutherAI/pythia-70m",
-    "bigscience/bloom-560m",  # Testing alibi slopes.
-    "microsoft/phi-2",
-    "stabilityai/stablelm-3b-4e1t",
-    # "allenai/OLMo-1B",  # Broken
-    "bigcode/starcoder2-3b",
-    "google/gemma-1.1-2b-it",
+    "facebook/opt-125m",  # opt
+    "openai-community/gpt2",  # gpt2
+    # "Milos/slovak-gpt-j-405M",  # gptj
+    # "bigcode/tiny_starcoder_py",  # gpt_bigcode
+    # "EleutherAI/pythia-70m",  # gpt_neox
+    "bigscience/bloom-560m",  # bloom - testing alibi slopes
+    "microsoft/phi-2",  # phi
+    # "stabilityai/stablelm-3b-4e1t",  # stablelm
+    # "bigcode/starcoder2-3b",  # starcoder2
+    "google/gemma-1.1-2b-it",  # gemma
+    "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
+    "meta-llama/Llama-3.2-1B-Instruct",  # llama
 ]

+if not current_platform.is_cpu():
+    MODELS += [
+        # fused_moe which not supported on CPU
+        "openbmb/MiniCPM3-4B",
+    ]
+
+# TODO: remove this after CPU float16 support ready
+target_dtype = "float" if current_platform.is_cpu() else "half"
+

 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(
    hf_runner,
    vllm_runner,
@@ -33,33 +47,24 @@ def test_models(
    model: str,
    dtype: str,
    max_tokens: int,
+    num_logprobs: int,
 ) -> None:
-    # To pass the small model tests, we need full precision.
-    assert dtype == "float"

    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+              model_runner.model)

-    check_outputs_equal(
+    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
--- a/tests/models/decoder_only/language/test_qwen.py
+++ b/tests/models/decoder_only/language/test_qwen.py
-"""Ensure that a text-only Qwen model can be run without throwing an error.
-We explicitly test this because Qwen is implemented as a multimodal and
-supports a visual encoder for models like Qwen-VL.
-"""
-from typing import List, Type
-
-import pytest
-
-from ....conftest import VllmRunner
-
-models = [
-    "Qwen/Qwen-7B-Chat"  # Has no visual encoder
-]
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_text_only_qwen_model_can_be_loaded_and_run(
-    vllm_runner: Type[VllmRunner],
-    example_prompts: List[str],
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-):
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_model.generate_greedy_logprobs(
-            example_prompts,
-            max_tokens,
-            num_logprobs=num_logprobs,
-        )