[V1] V1 Enablement Oracle (#13726)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Michael Goin <michael@neuralmagic.com>

[V1] V1 Enablement Oracle (#13726)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Michael Goin <michael@neuralmagic.com>
d4d93db2 · Robert Shaw · GitHub · 8c0d15d5 · d4d93db2 · d4d93db2
Unverified Commit d4d93db2 authored Mar 15, 2025 by Robert Shaw Committed by GitHub Mar 14, 2025
20 changed files
--- a/tests/models/embedding/language/test_cls_models.py
+++ b/tests/models/embedding/language/test_cls_models.py
@@ -35,13 +35,6 @@ def test_classification_models(
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.classify(example_prompts)
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        def print_model(model):
-            print(model)
-        vllm_model.apply_model(print_model)
    with hf_runner(model,
                   dtype=dtype,
                   auto_cls=AutoModelForSequenceClassification) as hf_model:

--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -73,13 +73,6 @@ def test_models(
                     **vllm_extra_kwargs) as vllm_model:
        vllm_outputs = vllm_model.encode(example_prompts)
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        def print_model(model):
-            print(model)
-        vllm_model.apply_model(print_model)
    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -256,7 +256,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
                                                        {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}),  # noqa: E501
    "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
-                                                     extras={"mistral": "mistral-community/pixtral-12b"}),  # noqa: E501
+                                                     extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
+                                                             "mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}),  # noqa: E501
    "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"),  # noqa: E501
    "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"),  # noqa: E501
    "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),  # noqa: E501
@@ -274,8 +275,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                              trust_remote_code=True),
    "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224",  # noqa: E501
                                                         extras={"v2": "google/paligemma2-3b-ft-docci-448"}),  # noqa: E501
-    "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-vision-instruct",
+    "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
-                                        trust_remote_code=True),
+                                        trust_remote_code=True,
+                                        extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}),  # noqa: E501
    "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
                                        trust_remote_code=True),
    "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409",  # noqa: E501

--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -6,6 +6,8 @@ import pytest
 from transformers import PretrainedConfig
 from vllm import LLM
+from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
+from vllm.v1.engine.core import EngineCore as V1EngineCore
 from .registry import HF_EXAMPLE_MODELS
@@ -36,12 +38,18 @@ def test_can_initialize(model_arch):
        return hf_config
    # Avoid calling model.forward()
-    def _initialize_kv_caches(self) -> None:
+    def _initialize_kv_caches_v0(self) -> None:
        self.cache_config.num_gpu_blocks = 0
        self.cache_config.num_cpu_blocks = 0
-    with patch.object(LLM.get_engine_class(), "_initialize_kv_caches",
+    def _initialize_kv_caches_v1(self, vllm_config):
-                      _initialize_kv_caches):
+        # gpu_blocks (> 0), cpu_blocks
+        return 1, 0
+    with (patch.object(V0LLMEngine, "_initialize_kv_caches",
+                       _initialize_kv_caches_v0),
+          patch.object(V1EngineCore, "_initialize_kv_caches",
+                       _initialize_kv_caches_v1)):
        LLM(
            model_info.default,
            tokenizer=model_info.tokenizer,

--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -11,12 +11,14 @@ from ..utils import fork_new_process_for_each_test
 @fork_new_process_for_each_test
-def test_plugin(dummy_opt_path):
+def test_plugin(dummy_opt_path, monkeypatch):
+    # V1 shuts down rather than raising an error here.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    os.environ["VLLM_PLUGINS"] = ""
    with pytest.raises(Exception) as excinfo:
        LLM(model=dummy_opt_path, load_format="dummy")
    error_msg = "has no vLLM implementation and " \
-                "the Transformers implementation is not compatible with vLLM."
+                "the Transformers implementation is not compatible with vLLM"
    assert (error_msg in str(excinfo.value))
@@ -51,7 +53,7 @@ image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
 @fork_new_process_for_each_test
-def test_oot_registration_multimodal(dummy_llava_path):
+def test_oot_registration_multimodal(dummy_llava_path, monkeypatch):
    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
    prompts = [{
        "prompt": "What's in the image?<image>",

--- a/tests/mq_llm_engine/conftest.py
+++ b/tests/mq_llm_engine/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/plugins_tests/conftest.py
+++ b/tests/plugins_tests/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
\ No newline at end of file
--- a/tests/prefix_caching/test_disable_sliding_window.py
+++ b/tests/prefix_caching/test_disable_sliding_window.py
@@ -34,7 +34,10 @@ def test_disable_sliding_window(model_len_len, ):
    del vllm_disabled_model
    cleanup_dist_env_and_memory()
-    vllm_enabled_model = LLM(model, disable_sliding_window=False)
+    vllm_enabled_model = LLM(model,
+                             enforce_eager=True,
+                             disable_sliding_window=False,
+                             enable_prefix_caching=False)
    vllm_enabled_model.generate("Hi my name is")
    model_config = vllm_enabled_model.llm_engine.model_config
    assert model_config.max_model_len == full_len, (

--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -16,6 +16,15 @@ from vllm.platforms import current_platform
 from ..models.utils import check_outputs_equal
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This module relies on V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
 MODELS = [
    "distilbert/distilgpt2",
 ]

--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -21,6 +21,14 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
 from vllm.platforms import current_platform
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This module relies on V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
 @pytest.mark.parametrize(
    "model_args",
    [

--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -10,6 +10,13 @@ from tests.quantization.utils import is_quant_method_supported
 from ..utils import compare_two_settings
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    # Fall back to V0 if cpu offloading is enabled.
+    # Fixture is required so that the baseline uses V0.
+    monkeypatch.setenv('VLLM_USE_V1', '0')
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
 def test_cpu_offload_fp8():

--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -47,7 +47,9 @@ KV_CACHE_MODELS = [
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
-def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
+def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch):
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
        def check_model(model):
@@ -86,6 +88,9 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
 @pytest.mark.parametrize("force_marlin", [False, True])
 def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
                         monkeypatch) -> None:
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    if force_marlin:
        monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

--- a/tests/quantization/test_gptq_dynamic.py
+++ b/tests/quantization/test_gptq_dynamic.py
@@ -28,8 +28,10 @@ MODEL_QUANT = [
 @pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT)
-def test_gptq_with_dynamic(vllm_runner, model_id: str,
+def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
-                           use_marlin_kernel: bool):
+                           monkeypatch):
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048)

--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -29,7 +29,10 @@ def test_lm_head(
    vllm_runner,
    model_id: str,
    lm_head_quantized: bool,
+    monkeypatch,
 ) -> None:
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    with vllm_runner(model_id, dtype=torch.float16,
                     max_model_len=2048) as vllm_model:

--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -10,7 +10,9 @@ from vllm.model_executor.layers.quantization.quark.quark import (  # noqa: E501
    QuarkLinearMethod, QuarkW8A8Fp8)
-def test_quark_fp8(vllm_runner):
+def test_quark_fp8(vllm_runner, monkeypatch):
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
    with vllm_runner(model_path) as llm:

--- a/tests/quantization/test_register_quantization_config.py
+++ b/tests/quantization/test_register_quantization_config.py
@@ -101,8 +101,10 @@ def test_register_quantization_config():
                         argvalues=[
                             "meta-llama/Llama-3.2-1B-Instruct",
                         ])
-def test_custom_quant(vllm_runner, model):
+def test_custom_quant(vllm_runner, model, monkeypatch):
    """Test infer with the custom quantization method."""
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    with vllm_runner(model_name=model,
                     quantization="custom_quant",
                     enforce_eager=True) as llm:

--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -6,6 +6,13 @@ Run `pytest tests/samplers/test_beam_search.py`.
 import pytest
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
 # FIXME(zhuohan): The test can not pass if we:
 #   1. Increase max_tokens to 256.
 #   2. Increase beam_width to 8.
@@ -15,6 +22,7 @@ BEAM_WIDTHS = [4]
 MODELS = ["TinyLlama/TinyLlama-1.1B-Chat-v1.0"]
+@pytest.mark.skip_v1  # FIXME: This fails on V1 right now.
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", MAX_TOKENS)

--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -8,6 +8,13 @@ import pytest
 from vllm import SamplingParams
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
 # We also test with llama because it has generation_config to specify EOS
 # (past regression).
 MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"]

--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -8,6 +8,14 @@ from vllm import SamplingParams
 MODELS = ["distilbert/distilgpt2"]
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This file tests V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 def test_logits_processor_force_generate(

--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -10,6 +10,15 @@ from ..conftest import VllmRunner
 MODELS = ["distilbert/distilgpt2"]
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This module is V0 only since it uses dtype=float, so
+    set VLLM_USE_V1=0 for all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype",
                         ["float"])  # needed for comparing logprobs with HF