[V1] V1 Enablement Oracle (#13726)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Michael Goin <michael@neuralmagic.com>

[V1] V1 Enablement Oracle (#13726)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Michael Goin <michael@neuralmagic.com>
d4d93db2 · Robert Shaw · GitHub · 8c0d15d5 · d4d93db2 · d4d93db2
Unverified Commit d4d93db2 authored Mar 15, 2025 by Robert Shaw Committed by GitHub Mar 14, 2025
20 changed files
--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -6,11 +6,18 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
 """
 from typing import Optional

+import pytest
 from transformers import AutoTokenizer

 from vllm import LLM, SamplingParams


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
+
+
 def _generate(
    model: LLM,
    prompt: str,

--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -7,6 +7,12 @@ from vllm import SamplingParams
 MODELS = ["distilbert/distilgpt2"]


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 def test_ranks(

--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -8,6 +8,15 @@ import torch.nn.functional as F
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.utils import set_random_seed

+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This file tests V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]

--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -18,6 +18,14 @@ from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
 from vllm.utils import Counter, is_pin_memory_available


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This file tests V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 class MockLogitsSampler(Sampler):

    def __init__(self, fake_logits: torch.Tensor):

--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -17,7 +17,9 @@ RANDOM_SEEDS = list(range(5))


 @pytest.fixture
-def vllm_model(vllm_runner):
+def vllm_model(vllm_runner, monkeypatch):
+    # This file relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    with vllm_runner(MODEL, dtype="half") as vllm_model:
        yield vllm_model


--- a/tests/samplers/test_typical_acceptance_sampler.py
+++ b/tests/samplers/test_typical_acceptance_sampler.py
@@ -11,6 +11,14 @@ from vllm.model_executor.utils import set_random_seed
 CUDA_DEVICES = [f"cuda:{i}" for i in range(1)]


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This file tests V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 def get_zero_temperature_prob_dist(batch_size, k, vocab_size):
    """
    Generates a fake temperature zero probability distribution.

--- a/tests/spec_decode/conftest.py
+++ b/tests/spec_decode/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/tensorizer_loader/conftest.py
+++ b/tests/tensorizer_loader/conftest.py
@@ -12,6 +12,14 @@ from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Tensorizer has only been tested on V0 so far.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 @pytest.fixture(autouse=True)
 def cleanup():
    cleanup_dist_env_and_memory(shutdown_ray=True)

--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -7,11 +7,13 @@ will never happen again.
 """
 import gc

+import pytest
 import torch

 from vllm import LLM, SamplingParams


+@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
 def test_duplicated_ignored_sequence_group():
    """https://github.com/vllm-project/vllm/issues/1655"""


--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -366,7 +366,10 @@ def test_bind_kv_cache_non_attention():
    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]


-def test_bind_kv_cache_encoder_decoder():
+def test_bind_kv_cache_encoder_decoder(monkeypatch):
+    # V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
+
    from vllm.attention import Attention, AttentionType

    # example from bart

--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -279,7 +279,12 @@ def test_decode_prompt_logprobs_chunked_prefill(
    model,
    chunked_prefill_token_size: int,
    example_prompts,
+    monkeypatch,
 ):
+    # VLLM V1 does not use incremental detokenization for
+    # prompt logprobs, so this test strategy is irrelevant.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
+
    max_num_seqs = 256
    enable_chunked_prefill = False
    max_num_batched_tokens = None

--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -91,20 +91,22 @@ CONFIGS: dict[str, ServerConfig] = {
        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
        "to the user's question - just respond to it normally."
    },
-    "granite20b": {
-        "model":
-        "mbayser/granite-20b-functioncalling-FP8-KV",
-        "arguments": [
-            "--tool-call-parser", "granite-20b-fc", "--chat-template",
-            str(VLLM_PATH /
-                "examples/tool_chat_template_granite_20b_fc.jinja"),
-            "--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20"
-        ],
-        "supports_parallel":
-        False,
-        "supports_rocm":
-        False,
-    },
+    # V1 Test: Passing locally but failing in CI. This runs the
+    # V0 Engine because of CPU offloading. Need to debug why.
+    # "granite20b": {
+    #     "model":
+    #     "mbayser/granite-20b-functioncalling-FP8-KV",
+    #     "arguments": [
+    #         "--tool-call-parser", "granite-20b-fc", "--chat-template",
+    #         str(VLLM_PATH /
+    #             "examples/tool_chat_template_granite_20b_fc.jinja"),
+    #         "--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20"
+    #     ],
+    #     "supports_parallel":
+    #     False,
+    #     "supports_rocm":
+    #     False,
+    # },
    "granite-3.0-8b": {
        "model":
        "ibm-granite/granite-3.0-8b-instruct",

--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -19,6 +19,16 @@ from opentelemetry.sdk.environment_variables import (
 from vllm import LLM, SamplingParams
 from vllm.tracing import SpanAttributes

+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"

 FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',

--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@@ -18,19 +18,19 @@ if not envs.VLLM_USE_V1:
 def test_prefix_caching_from_cli():
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    args = parser.parse_args([])
-    engine_args = EngineArgs.from_cli_args(args=args)
-    assert (engine_args.enable_prefix_caching
+    vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
+    assert (vllm_config.cache_config.enable_prefix_caching
            ), "V1 turns on prefix caching by default."

    # Turn it off possible with flag.
    args = parser.parse_args(["--no-enable-prefix-caching"])
-    engine_args = EngineArgs.from_cli_args(args=args)
-    assert not engine_args.enable_prefix_caching
+    vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
+    assert not vllm_config.cache_config.enable_prefix_caching

    # Turn it on with flag.
    args = parser.parse_args(["--enable-prefix-caching"])
-    engine_args = EngineArgs.from_cli_args(args=args)
-    assert engine_args.enable_prefix_caching
+    vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
+    assert vllm_config.cache_config.enable_prefix_caching


 def test_defaults_with_usage_context():
@@ -38,11 +38,21 @@ def test_defaults_with_usage_context():
    vllm_config: VllmConfig = engine_args.create_engine_config(
        UsageContext.LLM_CLASS)

+    from vllm.platforms import current_platform
+    device_name = current_platform.get_device_name().lower()
+    if "h100" in device_name or "h200" in device_name:
+        # For H100 and H200, we use larger default values.
+        default_llm_tokens = 16384
+        default_server_tokens = 8192
+    else:
+        default_llm_tokens = 8192
+        default_server_tokens = 2048
+
    assert vllm_config.scheduler_config.max_num_seqs == 1024
-    assert vllm_config.scheduler_config.max_num_batched_tokens == 8192
+    assert vllm_config.scheduler_config.max_num_batched_tokens == default_llm_tokens  # noqa: E501

    engine_args = EngineArgs(model="facebook/opt-125m")
    vllm_config = engine_args.create_engine_config(
        UsageContext.OPENAI_API_SERVER)
    assert vllm_config.scheduler_config.max_num_seqs == 1024
-    assert vllm_config.scheduler_config.max_num_batched_tokens == 2048
+    assert vllm_config.scheduler_config.max_num_batched_tokens == default_server_tokens  # noqa: E501
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -6,7 +6,6 @@ from collections.abc import Generator
 import pytest
 import torch

-from tests.kernels.utils import override_backend_env_variable
 from tests.v1.sample.utils import (
    BatchLogprobsComposition, BatchLogprobsSpecType,
    assert_incr_detok_str_matches_non_incr_detok_str,
@@ -334,7 +333,7 @@ def test_get_logprobs_and_prompt_logprobs(
            do_apc=do_apc)


-def test_max_logprobs(monkeypatch):
+def test_max_logprobs():
    """vLLM v1 engine should fail a request with `logprobs > max_logprobs`
    
    Should also fail for `prompt_logprobs > max_logprobs`
@@ -344,7 +343,6 @@ def test_max_logprobs(monkeypatch):
    Args:
      monkeypatch
    """
-    override_backend_env_variable(monkeypatch, "FLASH_ATTN")

    runner = VllmRunner("facebook/opt-125m",
                        max_logprobs=1,

--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
+# SPDX-License-Identifier: Apache-2.0
+import os
+
+import pytest
+
+import vllm.envs as envs
+from vllm import LLM
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+
+UNSUPPORTED_MODELS_V1 = [
+    "openai/whisper-large-v3",  # transcription
+    "facebook/bart-large-cnn",  # encoder decoder
+    "mistralai/Mamba-Codestral-7B-v0.1",  # mamba
+    "ibm-ai-platform/Bamba-9B",  # hybrid
+    "BAAI/bge-m3",  # embedding
+]
+
+MODEL = "meta-llama/Llama-3.2-1B-Instruct"
+
+
+@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
+def test_reject_unsupported_models(monkeypatch, model):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        args = AsyncEngineArgs(model=model)
+
+        with pytest.raises(NotImplementedError):
+            _ = args.create_engine_config()
+        m.delenv("VLLM_USE_V1")
+
+
+def test_reject_bad_config(monkeypatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+
+
+def test_unsupported_configs(monkeypatch):
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        with pytest.raises(NotImplementedError):
+            AsyncEngineArgs(
+                model=MODEL,
+                kv_cache_dtype="fp8",
+            ).create_engine_config()
+
+        with pytest.raises(NotImplementedError):
+            AsyncEngineArgs(
+                model=MODEL,
+                speculative_model=MODEL,
+            ).create_engine_config()
+
+        with pytest.raises(NotImplementedError):
+            AsyncEngineArgs(
+                model=MODEL,
+                guided_decoding_backend="lm-format-enforcer:no-fallback",
+            ).create_engine_config()
+
+        with pytest.raises(NotImplementedError):
+            AsyncEngineArgs(
+                model=MODEL,
+                preemption_mode="swap",
+            ).create_engine_config()
+
+        with pytest.raises(NotImplementedError):
+            AsyncEngineArgs(
+                model=MODEL,
+                disable_async_output_proc=True,
+            ).create_engine_config()
+
+        with pytest.raises(NotImplementedError):
+            AsyncEngineArgs(
+                model=MODEL,
+                scheduling_policy="priority",
+            ).create_engine_config()
+
+        with pytest.raises(NotImplementedError):
+            AsyncEngineArgs(
+                model=MODEL,
+                num_scheduler_steps=5,
+            ).create_engine_config()
+
+        with pytest.raises(NotImplementedError):
+            AsyncEngineArgs(
+                model=MODEL,
+                scheduler_delay_factor=1.2,
+            ).create_engine_config()
+
+
+def test_enable_by_default_fallback(monkeypatch):
+    with monkeypatch.context() as m:
+        if os.getenv("VLLM_USE_V1", None):
+            m.delenv("VLLM_USE_V1")
+
+        # Should default to V1 for supported config.
+        _ = AsyncEngineArgs(
+            model=MODEL,
+            enforce_eager=True,
+        ).create_engine_config()
+        assert envs.VLLM_USE_V1
+        m.delenv("VLLM_USE_V1")
+
+        # Should fall back to V0 for experimental config.
+        _ = AsyncEngineArgs(
+            model=MODEL,
+            enable_lora=True,
+        ).create_engine_config()
+        assert not envs.VLLM_USE_V1
+        m.delenv("VLLM_USE_V1")
+
+        # Should fall back to V0 for unsupported model.
+        _ = AsyncEngineArgs(
+            model=UNSUPPORTED_MODELS_V1[0]).create_engine_config()
+        assert not envs.VLLM_USE_V1
+        m.delenv("VLLM_USE_V1")
+
+
+def test_v1_llm_by_default(monkeypatch):
+    with monkeypatch.context() as m:
+        if os.getenv("VLLM_USE_V1", None):
+            m.delenv("VLLM_USE_V1")
+
+        # Should default to V1 for supported config.
+        model = LLM(MODEL, enforce_eager=True)
+        print(model.generate("Hello my name is"))
+        assert hasattr(model.llm_engine, "engine_core")
+        m.delenv("VLLM_USE_V1")
+
+
+def test_v1_attn_backend(monkeypatch):
+    with monkeypatch.context() as m:
+        if os.getenv("VLLM_USE_V1", None):
+            m.delenv("VLLM_USE_V1")
+        m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
+
+        # Fall back to V0.
+        _ = AsyncEngineArgs(model=MODEL).create_engine_config()
+        assert not envs.VLLM_USE_V1
+        m.delenv("VLLM_USE_V1")
+
+        # Reject if V1.
+        m.setenv("VLLM_USE_V1", "1")
+        with pytest.raises(NotImplementedError):
+            AsyncEngineArgs(model=MODEL).create_engine_config()
+        m.delenv("VLLM_USE_V1")
+
+        m.setenv("VLLM_ATTENTION_BACKEND", "FLASHMLA")
+        _ = AsyncEngineArgs(model=MODEL).create_engine_config()
+        assert envs.VLLM_USE_V1
+        m.delenv("VLLM_USE_V1")
+
+
+def test_reject_using_constructor_directly(monkeypatch):
+    with monkeypatch.context() as m:
+        if os.getenv("VLLM_USE_V1", None):
+            m.delenv("VLLM_USE_V1")
+
+        # Sets VLLM_USE_V1=1.
+        vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config()
+
+        # This uses the V0 constructor directly.
+        with pytest.raises(ValueError):
+            AsyncLLMEngine(vllm_config,
+                           AsyncLLMEngine._get_executor_cls(vllm_config),
+                           log_stats=True)
+
+        m.delenv("VLLM_USE_V1")
--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py
@@ -15,6 +15,9 @@ QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin")
 MIN_CAPABILITY = os.environ.get("MIN_CAPABILITY", "80")


+@pytest.mark.skipif(
+    MODEL_NAME == "casperhansen/deepseek-coder-v2-instruct-awq",
+    reason="OOM in the CI")
 @pytest.mark.skipif(
    not current_platform.has_device_capability(int(MIN_CAPABILITY)),
    reason="Current system does not have minimum capability.")
@@ -22,10 +25,14 @@ def test_weight_loading(vllm_runner):
    """
    Test parameter weight loading with tp>1.
    """
+
+    # MoE models need fp16.
+    NEEDS_FP16 = (QUANTIZATION == "gptq" or MODEL_NAME
+                  == "nm-testing/test-w4a16-mixtral-actorder-group")
    with vllm_runner(
            model_name=MODEL_NAME,
            revision=REVISION,
-            dtype=torch.half if QUANTIZATION == "gptq" else "auto",
+            dtype=torch.half if NEEDS_FP16 else "auto",
            quantization=None if QUANTIZATION == "None" else QUANTIZATION,
            max_model_len=MAX_MODEL_LEN,
            tensor_parallel_size=2) as model:

--- a/tests/worker/conftest.py
+++ b/tests/worker/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This module tests V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
\ No newline at end of file
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1140,6 +1140,10 @@ class CacheConfig:
        if self.cache_dtype == "auto":
            pass
        elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"):
+            if envs.VLLM_USE_V1:
+                raise NotImplementedError(
+                    "V1 does not yet support fp8 KV cache. "
+                    "Set VLLM_USE_V1=0 to enable fp8 kv cache.")
            logger.info(
                "Using fp8 data type to store kv cache. It reduces the GPU "
                "memory footprint and boosts the performance. "
@@ -3142,15 +3146,6 @@ class CompilationConfig(BaseModel):
                self.inductor_compile_config[KEY] = False

        if self.splitting_ops is None:
-            if envs.VLLM_USE_V1:
-                # v1 must split the graph on attention ops
-                # for piecewise cudagraph
-                self.splitting_ops = [
-                    "vllm.unified_attention",
-                    "vllm.unified_attention_with_output",
-                ]
-            else:
-                # v0 uses full graph compilation
            self.splitting_ops = []

        for k, v in self.inductor_passes.items():
@@ -3246,6 +3241,15 @@ class CompilationConfig(BaseModel):
        self.bs_to_padded_graph_size[
            self.max_capture_size] = self.max_capture_size

+    def set_splitting_ops_for_v1(self):
+        # If default, override splitting ops for piecewise cudagraph on V1.
+        # NOTE: this function needs to be called explicitly for V1.
+        if not self.splitting_ops:
+            self.splitting_ops = [
+                "vllm.unified_attention",
+                "vllm.unified_attention_with_output",
+            ]
+

 @dataclass
 class VllmConfig:
@@ -3297,6 +3301,7 @@ class VllmConfig:
        vllm_factors: list[Any] = []
        from vllm import __version__
        vllm_factors.append(__version__)
+        vllm_factors.append(envs.VLLM_USE_V1)
        if self.model_config:
            vllm_factors.append(self.model_config.compute_hash())
        else:
@@ -3460,6 +3465,7 @@ class VllmConfig:
            # CUDA graphs do not work properly with the custom CUDA kernels.
            # FIXME(woosuk): Disable inductor to reduce the compilation time
            # and avoid any potential issues with the inductor.
+            # FIXME(rob): Add function to set all of these.
            self.compilation_config.custom_ops = ["none"]
            self.compilation_config.use_cudagraph = True
            self.compilation_config.use_inductor = True
@@ -3467,6 +3473,7 @@ class VllmConfig:
            self.compilation_config.pass_config.enable_fusion = False
            self.compilation_config.pass_config.enable_noop = False
            self.compilation_config.level = CompilationLevel.PIECEWISE
+            self.compilation_config.set_splitting_ops_for_v1()

        self._set_cudagraph_sizes()


--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py