Merge tag 'v0.8.2' into v0.8.2-dev

469e903b · zhuwenwen · 389ebcf7 · 25f560a6 · 469e903b · 469e903b
Commit 469e903b authored Mar 28, 2025 by zhuwenwen
15 changed files
--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -31,7 +31,10 @@ def test_lm_head(
    vllm_runner,
    model_id: str,
    lm_head_quantized: bool,
+    monkeypatch,
 ) -> None:
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    with vllm_runner(model_id, dtype=torch.float16,
                     max_model_len=2048) as vllm_model:


--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -10,7 +10,9 @@ from vllm.model_executor.layers.quantization.quark.quark import (  # noqa: E501
    QuarkLinearMethod, QuarkW8A8Fp8)


-def test_quark_fp8(vllm_runner):
+def test_quark_fp8(vllm_runner, monkeypatch):
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
    with vllm_runner(model_path) as llm:


--- a/tests/quantization/test_register_quantization_config.py
+++ b/tests/quantization/test_register_quantization_config.py
@@ -5,7 +5,7 @@ See https://github.com/vllm-project/vllm/issues/11926 for more details.

 Run `pytest tests/quantization/test_register_quantization_config.py`.
 """
-from typing import Any, Dict, List, Optional
+from typing import Any, Optional

 import pytest
 import torch
@@ -58,7 +58,7 @@ class CustomQuantConfig(QuantizationConfig):
        """Name of the quantization method."""
        return "custom_quant"

-    def get_supported_act_dtypes(self) -> List["torch.dtype"]:
+    def get_supported_act_dtypes(self) -> list["torch.dtype"]:
        """List of supported activation dtypes."""
        return [torch.float16, torch.bfloat16]

@@ -68,12 +68,12 @@ class CustomQuantConfig(QuantizationConfig):
        return -1

    @staticmethod
-    def get_config_filenames() -> List[str]:
+    def get_config_filenames() -> list[str]:
        """List of filenames to search for in the model directory."""
        return []

    @classmethod
-    def from_config(cls, config: Dict[str, Any]) -> "CustomQuantConfig":
+    def from_config(cls, config: dict[str, Any]) -> "CustomQuantConfig":
        """Create a config class from the model's quantization config."""
        return CustomQuantConfig(num_bits=config.get("num_bits", 8))

@@ -101,8 +101,10 @@ def test_register_quantization_config():
                         argvalues=[
                             "meta-llama/Llama-3.2-1B-Instruct",
                         ])
-def test_custom_quant(vllm_runner, model):
+def test_custom_quant(vllm_runner, model, monkeypatch):
    """Test infer with the custom quantization method."""
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    with vllm_runner(model_name=model,
                     quantization="custom_quant",
                     enforce_eager=True) as llm:

--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -8,6 +8,13 @@ import pytest
 import os
 from ..utils import models_path_prefix

+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
+
+
 # FIXME(zhuohan): The test can not pass if we:
 #   1. Increase max_tokens to 256.
 #   2. Increase beam_width to 8.
@@ -17,6 +24,7 @@ BEAM_WIDTHS = [4]
 MODELS = [os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0")]


+@pytest.mark.skip_v1  # FIXME: This fails on V1 right now.
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", MAX_TOKENS)

--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -10,6 +10,13 @@ import os
 from vllm import SamplingParams
 from ..utils import models_path_prefix

+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
+
+
 # We also test with llama because it has generation_config to specify EOS
 # (past regression).
 MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2"), os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]

--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -10,6 +10,14 @@ from ..utils import models_path_prefix
 MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This file tests V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 def test_logits_processor_force_generate(

--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import List
-
 import pytest
 import torch
 import os
@@ -14,6 +12,15 @@ from ..utils import models_path_prefix
 MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This module is V0 only since it uses dtype=float, so
+    set VLLM_USE_V1=0 for all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype",
                         ["half"])  # needed for comparing logprobs with HF
@@ -72,7 +79,7 @@ def test_get_prompt_logprobs(
            assert (len(logprobs) == num_top_logprobs
                    or len(logprobs) == num_top_logprobs + 1)
        output_text = result.outputs[0].text
-        output_string_from_most_likely_tokens_lst: List[str] = []
+        output_string_from_most_likely_tokens_lst: list[str] = []
        for top_logprobs in result.outputs[0].logprobs:
            top_logprob = next(iter(top_logprobs.values()))
            output_string_from_most_likely_tokens_lst.append(

--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -5,20 +5,27 @@ Run `pytest tests/samplers/test_no_bad_words.py`.

 """
 import os
-from typing import List, Optional
+from typing import Optional

+import pytest
 from transformers import AutoTokenizer

 from vllm import LLM, SamplingParams
 from ..utils import models_path_prefix

+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
+
+
 def _generate(
    model: LLM,
    prompt: str,
    num_prompt_tokens: int,
    temperature: float = 0,
-    bad_words: Optional[List[str]] = None,
-) -> List[int]:
+    bad_words: Optional[list[str]] = None,
+) -> list[int]:
    sampling_params = SamplingParams(
        temperature=temperature,
        bad_words=bad_words,
@@ -60,7 +67,7 @@ class TestOneTokenBadWord:

    def _generate(self,
                  model: LLM,
-                  bad_words: Optional[List[str]] = None) -> List[int]:
+                  bad_words: Optional[list[str]] = None) -> list[int]:
        return _generate(
            model=model,
            prompt=self.PROMPT,
@@ -70,7 +77,7 @@ class TestOneTokenBadWord:

    def _encode(self,
                prompt: str,
-                add_special_tokens: bool = True) -> List[int]:
+                add_special_tokens: bool = True) -> list[int]:
        return self.tokenizer(prompt,
                              add_special_tokens=add_special_tokens).input_ids

@@ -150,7 +157,7 @@ class TestTwoTokenBadWord:

    def _generate(self,
                  model: LLM,
-                  bad_words: Optional[List[str]] = None) -> List[int]:
+                  bad_words: Optional[list[str]] = None) -> list[int]:
        return _generate(
            model=model,
            prompt=self.PROMPT,
@@ -159,7 +166,7 @@ class TestTwoTokenBadWord:
        )

    @staticmethod
-    def _contains(sequence: List[int], subsequence: List[int]) -> bool:
+    def _contains(sequence: list[int], subsequence: list[int]) -> bool:
        searched = False

        for start in range(len(sequence)):
@@ -182,6 +189,6 @@ class TestTwoTokenBadWord:

    def _encode(self,
                prompt: str,
-                add_special_tokens: bool = True) -> List[int]:
+                add_special_tokens: bool = True) -> list[int]:
        return self.tokenizer(prompt,
                              add_special_tokens=add_special_tokens).input_ids
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -9,6 +9,12 @@ from ..utils import models_path_prefix
 MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 def test_ranks(

--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
 # SPDX-License-Identifier: Apache-2.0
 """Tests for rejection sampling."""
-from typing import List, Tuple

 import pytest
 import torch
@@ -8,7 +7,16 @@ import torch.nn.functional as F

 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.utils import set_random_seed
-from vllm.utils import is_hip
+from vllm.platforms import current_platform
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This file tests V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+

 CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
@@ -46,7 +54,7 @@ def mock_causal_accepted_tensor(
    "which_tokens_accepted",
    ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_flashinfer", [True, False]  if not is_hip() else [False])
+@pytest.mark.parametrize("use_flashinfer", [True, False]  if not current_platform.is_rocm() else [False])
 @torch.inference_mode()
 def test_correct_output_format(which_tokens_accepted: str, seed: int,
                               device: str, use_flashinfer: bool):
@@ -130,7 +138,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
 @pytest.mark.parametrize("vocab_size", [30_000, 50_000])
 @pytest.mark.parametrize("batch_size", list(range(1, 32)))
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_flashinfer", [True, False]  if not is_hip() else [False])
+@pytest.mark.parametrize("use_flashinfer", [True, False]  if not current_platform.is_rocm() else [False])
 @torch.inference_mode()
 def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
                                    device: str, use_flashinfer: bool):
@@ -162,7 +170,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
 @pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
 @pytest.mark.parametrize("n_rep", [100])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_flashinfer", [True, False]  if not is_hip() else [False])
+@pytest.mark.parametrize("use_flashinfer", [True, False]  if not current_platform.is_rocm() else [False])
 @torch.inference_mode()
 def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
                                   frac_seeded: float, n_rep: int, device: str,
@@ -203,7 +211,7 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
                assert torch.equal(results[j][i], results[0][i])


-@pytest.mark.skipif(is_hip(),
+@pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Consistent with NV.")
 @pytest.mark.parametrize("k", [1, 3, 6])
 @pytest.mark.parametrize("vocab_size", [30_000, 50_000])
@@ -305,7 +313,7 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
            for i in range(batch_size)
        }

-    for use_flashinfer in [True, False] if not is_hip() else [False]:
+    for use_flashinfer in [True, False] if not current_platform.is_rocm() else [False]:
        rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
        rejection_sampler.init_gpu_tensors(device=device)
        # We use seeded sequences to ensure the same tokens are accepted
@@ -326,7 +334,7 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
 @pytest.mark.parametrize("which_token_ids",
                         ["bonus_token_ids", "draft_token_ids"])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_flashinfer", [True, False]  if not is_hip() else [False])
+@pytest.mark.parametrize("use_flashinfer", [True, False]  if not current_platform.is_rocm() else [False])
 @torch.inference_mode()
 def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
                               which_token_ids: str, device: str,
@@ -378,7 +386,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,

 @pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
 @pytest.mark.parametrize("seed", list(range(5)))
-@pytest.mark.parametrize("use_flashinfer", [True, False]  if not is_hip() else [False])
+@pytest.mark.parametrize("use_flashinfer", [True, False]  if not current_platform.is_rocm() else [False])
 @torch.inference_mode()
 def test_rejection_sampling_approximates_target_distribution(
        seed: int, draft_and_target_probs_equal: bool, use_flashinfer: bool):
@@ -419,8 +427,8 @@ def test_rejection_sampling_approximates_target_distribution(
        draft_and_target_probs_equal)

    sample_sizes = [10, 100, 1_000, 10_000, 100_000]
-    distance_wrt_reference: List[float] = []
-    distance_wrt_target: List[float] = []
+    distance_wrt_reference: list[float] = []
+    distance_wrt_target: list[float] = []

    for num_samples in sample_sizes:
        (reference_vs_rejsample_dist,
@@ -455,7 +463,7 @@ def test_rejection_sampling_approximates_target_distribution(
            expected_improvement_multiplier)


-def get_ratio_first_to_last(elements: List[float]) -> float:
+def get_ratio_first_to_last(elements: list[float]) -> float:
    return elements[0] / elements[-1]


@@ -480,7 +488,7 @@ class _CorrectnessTestHelper:

    def generate_probs_for_test(
        self, draft_and_target_probs_equal: bool
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        draft_probs, target_probs = (F.softmax(
            torch.rand(self.vocab_size, dtype=torch.float32),
            dim=-1,
@@ -502,7 +510,7 @@ class _CorrectnessTestHelper:
    def run_and_compare_distributions(self, draft_probs: torch.Tensor,
                                      target_probs: torch.Tensor,
                                      reference_probs: torch.Tensor,
-                                      num_samples: int) -> Tuple[float, float]:
+                                      num_samples: int) -> tuple[float, float]:
        # Sample using rejection sampling.
        rej_sample_probs = self._estimate_rejection_sampling_pdf(
            draft_probs, target_probs, num_samples)

--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -3,7 +3,7 @@
 import itertools
 import random
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
+from typing import Optional
 from unittest.mock import Mock, patch

 import pytest
@@ -18,6 +18,14 @@ from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
 from vllm.utils import Counter, is_pin_memory_available


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This file tests V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 class MockLogitsSampler(Sampler):

    def __init__(self, fake_logits: torch.Tensor):
@@ -30,7 +38,7 @@ class MockLogitsSampler(Sampler):

 def _prepare_test(
        batch_size: int
-) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]:
+) -> tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]:
    input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
    fake_logits = torch.full((batch_size, VOCAB_SIZE),
                             1e-2,
@@ -53,8 +61,8 @@ def _do_sample(
    sampling_params: SamplingParams,
    device: str,
 ):
-    seq_group_metadata_list: List[SequenceGroupMetadata] = []
-    seq_lens: List[int] = []
+    seq_group_metadata_list: list[SequenceGroupMetadata] = []
+    seq_lens: list[int] = []
    for i in range(batch_size):
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
@@ -171,7 +179,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
    def create_sampling_params(min_tokens,
                               eos_token_id=0,
                               *,
-                               stop_token_ids: Optional[List[int]] = None,
+                               stop_token_ids: Optional[list[int]] = None,
                               prompt_logprobs: Optional[int] = None):
        sampling_params = SamplingParams(
            min_tokens=min_tokens,
@@ -196,7 +204,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
        batch_size = random.randint(1, 128)

        expected_penalization = []
-        sequence_metadata_list: List[SequenceGroupMetadata] = []
+        sequence_metadata_list: list[SequenceGroupMetadata] = []
        # 20% chance to generate seq group metadata list with all prompts
        is_prompt = random.random() < 0.2
        while batch_size > 0:
@@ -216,8 +224,8 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
                eos_token_id=eos_token_id,
                stop_token_ids=stop_token_ids)

-            seq_data: Dict[int, SequenceData] = {}
-            seq_group_penalization: List[bool] = []
+            seq_data: dict[int, SequenceData] = {}
+            seq_group_penalization: list[bool] = []
            for _ in range(num_seqs):
                num_input = random.randint(1, 100)
                num_generated = 0 if is_prompt else random.randint(1, 100)
@@ -376,16 +384,16 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
    else:
        test_cases = [generate_test_case()]

-    def run_test_case(*, expected_penalization: List[bool],
-                      seq_group_metadata_list: List[SequenceGroupMetadata]):
+    def run_test_case(*, expected_penalization: list[bool],
+                      seq_group_metadata_list: list[SequenceGroupMetadata]):
        assert expected_penalization, \
            "Invalid test case, need expected_penalization"
        assert seq_group_metadata_list, \
            "Invalid test case, need seq_group_metadata_list"

        batch_size = 0
-        seq_lens: List[int] = []
-        sampling_params_per_row: List[SamplingParams] = []
+        seq_lens: list[int] = []
+        sampling_params_per_row: list[SamplingParams] = []
        for sgm in seq_group_metadata_list:
            sampling_params = sgm.sampling_params

@@ -456,11 +464,11 @@ def test_sampler_mixed(seed: int, device: str):
    batch_size = random.randint(1, 256)
    input_tensor, fake_logits, sampler = _prepare_test(batch_size)

-    seq_group_metadata_list: List[SequenceGroupMetadata] = []
-    expected_tokens: List[Optional[List[int]]] = []
-    seq_lens: List[int] = []
+    seq_group_metadata_list: list[SequenceGroupMetadata] = []
+    expected_tokens: list[Optional[list[int]]] = []
+    seq_lens: list[int] = []
    for i in range(batch_size):
-        expected: Optional[List[int]] = None
+        expected: Optional[list[int]] = None
        sampling_type = random.randint(0, 2)
        if sampling_type == 0:
            sampling_params = SamplingParams(temperature=0)
@@ -492,7 +500,7 @@ def test_sampler_mixed(seed: int, device: str):
            ))
        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())

-    generators: Dict[str, torch.Generator] = {}
+    generators: dict[str, torch.Generator] = {}

    def test_sampling():
        sampling_metadata = SamplingMetadata.prepare(
@@ -587,8 +595,8 @@ def test_sampler_top_k_top_p(seed: int, device: str):
                                                        device=device)
    assert len(processors) == 2  # top_p and top_k

-    seq_group_metadata_list: List[SequenceGroupMetadata] = []
-    seq_lens: List[int] = []
+    seq_group_metadata_list: list[SequenceGroupMetadata] = []
+    seq_lens: list[int] = []
    for i in range(batch_size):
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
@@ -669,10 +677,10 @@ def test_sampler_repetition_penalty_mixed(device: str):

    vocab_size = 8

-    def test_sampling_params(sampling_params: List[SamplingParams]):
+    def test_sampling_params(sampling_params: list[SamplingParams]):

-        seq_group_metadata_list: List[SequenceGroupMetadata] = []
-        seq_lens: List[int] = []
+        seq_group_metadata_list: list[SequenceGroupMetadata] = []
+        seq_lens: list[int] = []
        for i in range(2):
            seq_group_metadata_list.append(
                SequenceGroupMetadata(

--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -19,7 +19,9 @@ RANDOM_SEEDS = list(range(5))


 @pytest.fixture
-def vllm_model(vllm_runner):
+def vllm_model(vllm_runner, monkeypatch):
+    # This file relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    with vllm_runner(MODEL, dtype="half") as vllm_model:
        yield vllm_model


--- a/tests/samplers/test_typical_acceptance_sampler.py
+++ b/tests/samplers/test_typical_acceptance_sampler.py
@@ -11,6 +11,14 @@ from vllm.model_executor.utils import set_random_seed
 CUDA_DEVICES = [f"cuda:{i}" for i in range(1)]


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This file tests V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 def get_zero_temperature_prob_dist(batch_size, k, vocab_size):
    """
    Generates a fake temperature zero probability distribution.

--- a/tests/spec_decode/conftest.py
+++ b/tests/spec_decode/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
 # SPDX-License-Identifier: Apache-2.0

+from collections.abc import Sequence
 from itertools import cycle
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import Optional, Union

 import pytest
 import torch
@@ -55,7 +56,7 @@ def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
 def maybe_assert_ngram_worker(llm):
    # Verify the proposer worker is ngram if ngram is specified.
    if (llm.llm_engine.speculative_config is not None
-            and llm.llm_engine.speculative_config.ngram_prompt_lookup_max > 0):
+            and llm.llm_engine.speculative_config.method == "ngram"):
        from vllm.spec_decode.ngram_worker import NGramWorker
        assert isinstance(
            llm.llm_engine.model_executor.driver_worker.proposer_worker,
@@ -64,9 +65,9 @@ def maybe_assert_ngram_worker(llm):

 def get_output_from_llm_generator(
        llm_generator, prompts,
-        sampling_params) -> Tuple[List[str], List[List[int]], float]:
-    tokens: List[str] = []
-    token_ids: List[List[int]] = []
+        sampling_params) -> tuple[list[str], list[list[int]], float]:
+    tokens: list[str] = []
+    token_ids: list[list[int]] = []
    acceptance_rate: float = -1.0
    for llm in llm_generator():
        maybe_assert_ngram_worker(llm)