Update deprecated Python 3.8 typing (#13971)

cf069aa8 · Harry Mellor · GitHub · bf33700e · cf069aa8 · cf069aa8
Unverified Commit cf069aa8 authored Mar 03, 2025 by Harry Mellor Committed by GitHub Mar 02, 2025
20 changed files
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -3,7 +3,6 @@
 # Adapted from
 # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
 from dataclasses import dataclass
-from typing import List
 import pytest
@@ -19,7 +18,7 @@ class ModelWithQuantization:
    quantization: str
-MODELS: List[ModelWithQuantization]
+MODELS: list[ModelWithQuantization]
 #AWQ quantization is currently not supported in ROCm.
 if current_platform.is_rocm():
    MODELS = [
@@ -41,7 +40,7 @@ else:
 def do_sample(llm: vllm.LLM,
              lora_path: str,
              lora_id: int,
-              max_tokens: int = 256) -> List[str]:
+              max_tokens: int = 256) -> list[str]:
    raw_prompts = [
        "Give me an orange-ish brown color",
        "Give me a neon pink color",
@@ -61,7 +60,7 @@ def do_sample(llm: vllm.LLM,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None)
    # Print the outputs.
-    generated_texts: List[str] = []
+    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text

--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
 # SPDX-License-Identifier: Apache-2.0
 from dataclasses import dataclass
-from typing import Dict, List, Optional
+from typing import Optional
 import pytest
 from packaging.version import Version
@@ -20,7 +20,7 @@ class TestConfig:
    max_loras: int = 2
    max_lora_rank: int = 16
    max_model_len: int = 4096
-    mm_processor_kwargs: Optional[Dict[str, int]] = None
+    mm_processor_kwargs: Optional[dict[str, int]] = None
    def __post_init__(self):
        if self.mm_processor_kwargs is None:
@@ -57,11 +57,11 @@ class Qwen2VLTester:
        )
    def run_test(self,
-                 images: List[ImageAsset],
+                 images: list[ImageAsset],
-                 expected_outputs: List[str],
+                 expected_outputs: list[str],
                 lora_id: Optional[int] = None,
                 temperature: float = 0,
-                 max_tokens: int = 5) -> List[str]:
+                 max_tokens: int = 5) -> list[str]:
        sampling_params = vllm.SamplingParams(
            temperature=temperature,

--- a/tests/lora/test_transfomers_model.py
+++ b/tests/lora/test_transfomers_model.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import List
 import pytest
 import vllm
@@ -21,7 +19,7 @@ EXPECTED_LORA_OUTPUT = [
 ]
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
        PROMPT_TEMPLATE.format(
@@ -40,7 +38,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None)
    # Print the outputs.
-    generated_texts: List[str] = []
+    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()

--- a/tests/lora/test_ultravox.py
+++ b/tests/lora/test_ultravox.py
@@ -3,7 +3,6 @@
 import shutil
 from os import path
 from tempfile import TemporaryDirectory
-from typing import List, Tuple
 import torch
 from huggingface_hub import snapshot_download
@@ -86,8 +85,8 @@ def test_ultravox_lora(vllm_runner):
                dtype="bfloat16",
                max_model_len=1024,
        ) as vllm_model:
-            ultravox_outputs: List[Tuple[
+            ultravox_outputs: list[tuple[
-                List[int], str]] = vllm_model.generate_greedy(
+                list[int], str]] = vllm_model.generate_greedy(
                    [
                        _get_prompt(0, PROMPT, VLLM_PLACEHOLDER,
                                    ULTRAVOX_MODEL_NAME)
@@ -108,7 +107,7 @@ def test_ultravox_lora(vllm_runner):
            dtype="bfloat16",
            max_model_len=1024,
    ) as vllm_model:
-        llama_outputs: List[Tuple[List[int], str]] = (
+        llama_outputs: list[tuple[list[int], str]] = (
            vllm_model.generate_greedy(
                [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)],
                256,

--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
 # SPDX-License-Identifier: Apache-2.0
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Optional, Union
 import torch
@@ -12,7 +12,7 @@ class DummyLoRAManager:
    def __init__(self, device: torch.device = "cuda:0"):
        super().__init__()
-        self._loras: Dict[str, LoRALayerWeights] = {}
+        self._loras: dict[str, LoRALayerWeights] = {}
        self._device = device
    def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
@@ -77,11 +77,11 @@ class DummyLoRAManager:
        self,
        module_name: str,
        input_dim: int,
-        output_dims: List[int],
+        output_dims: list[int],
-        noop_lora_index: Optional[List[int]] = None,
+        noop_lora_index: Optional[list[int]] = None,
        rank: int = 8,
    ):
-        base_loras: List[LoRALayerWeights] = []
+        base_loras: list[LoRALayerWeights] = []
        noop_lora_index_set = set(noop_lora_index or [])
        for i, out_dim in enumerate(output_dims):
@@ -110,7 +110,7 @@ def assert_close(a, b):
 @dataclass
 class PunicaTensors:
    inputs_tensor: torch.Tensor
-    lora_weights: Union[torch.Tensor, List[torch.Tensor]]
+    lora_weights: Union[torch.Tensor, list[torch.Tensor]]
    our_out_tensor: torch.Tensor
    ref_out_tensor: torch.Tensor
    b_seq_start_loc: torch.Tensor
@@ -118,7 +118,7 @@ class PunicaTensors:
    seq_len_tensor: torch.Tensor
    token_lora_mapping: torch.Tensor
-    def meta(self) -> Tuple[int, int]:
+    def meta(self) -> tuple[int, int]:
        """
        Infer max_seq_length and token_nums from the tensors
        and return them.

--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
 # SPDX-License-Identifier: Apache-2.0
 import time
-from typing import List
 import pytest
 import ray
@@ -133,7 +132,7 @@ def test_metric_counter_generation_tokens_multi_step(
    "served_model_name",
    [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
 def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
-                                   served_model_name: List[str]) -> None:
+                                   served_model_name: list[str]) -> None:
    with vllm_runner(model,
                     dtype=dtype,
                     disable_log_stats=False,

--- a/tests/mistral_tool_use/utils.py
+++ b/tests/mistral_tool_use/utils.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import Dict, List, Optional
+from typing import Optional
 from typing_extensions import TypedDict
 class ServerConfig(TypedDict, total=False):
    model: str
-    arguments: List[str]
+    arguments: list[str]
    system_prompt: Optional[str]
    supports_parallel: Optional[bool]
    supports_rocm: Optional[bool]
-ARGS: List[str] = ["--max-model-len", "1024"]
+ARGS: list[str] = ["--max-model-len", "1024"]
-CONFIGS: Dict[str, ServerConfig] = {
+CONFIGS: dict[str, ServerConfig] = {
    "mistral": {
        "model":
        "mistralai/Mistral-7B-Instruct-v0.3",

--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import List
 import pytest
 from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
@@ -51,7 +49,7 @@ class Relu3(ReLUSquaredActivation):
        # All but RMSNorm
        ("all,-rms_norm", 4, [0, 1, 1, 1], True),
    ])
-def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int],
+def test_enabled_ops(env: str, torch_level: int, ops_enabled: list[int],
                     default_on: bool):
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=torch_level, custom_ops=env.split(",")))

--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import List, Optional, Tuple, Type
+from typing import Optional
 import numpy as np
 import pytest
@@ -17,7 +17,7 @@ from ...utils import check_logprobs_close
 MODEL_NAME = "fixie-ai/ultravox-v0_4"
-AudioTuple = Tuple[np.ndarray, int]
+AudioTuple = tuple[np.ndarray, int]
 VLLM_PLACEHOLDER = "<|audio|>"
 HF_PLACEHOLDER = "<|audio|>"
@@ -78,7 +78,7 @@ def _get_prompt(audio_count, question, placeholder):
                                         add_generation_prompt=True)
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+def vllm_to_hf_output(vllm_output: tuple[list[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output."""
@@ -96,9 +96,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
 def run_test(
-    hf_runner: Type[HfRunner],
+    hf_runner: type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
-    prompts_and_audios: List[Tuple[str, str, AudioTuple]],
+    prompts_and_audios: list[tuple[str, str, AudioTuple]],
    model: str,
    *,
    dtype: str,
@@ -158,8 +158,8 @@ def run_test(
 def run_multi_audio_test(
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
-    prompts_and_audios: List[Tuple[str, List[AudioTuple]]],
+    prompts_and_audios: list[tuple[str, list[AudioTuple]]],
    model: str,
    *,
    dtype: str,

--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -5,7 +5,7 @@ Note: To pass the test, quantization higher than Q4 should be used
 """
 import os
-from typing import List, NamedTuple, Type
+from typing import NamedTuple
 import pytest
 from huggingface_hub import hf_hub_download
@@ -90,8 +90,8 @@ MODELS = [
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_models(
    num_gpus_available: int,
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
-    example_prompts: List[str],
+    example_prompts: list[str],
    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,

--- a/tests/models/decoder_only/language/test_modelopt.py
+++ b/tests/models/decoder_only/language/test_modelopt.py
@@ -5,7 +5,6 @@
 Note: these tests will only pass on H100
 """
 import os
-from typing import List
 import pytest
 from transformers import AutoTokenizer
@@ -65,7 +64,7 @@ def test_models(example_prompts, model_name) -> None:
        for prompt in example_prompts
    ]
    params = SamplingParams(max_tokens=20, temperature=0)
-    generations: List[str] = []
+    generations: list[str] = []
    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    for prompt in formatted_prompts:

--- a/tests/models/decoder_only/vision_language/test_awq.py
+++ b/tests/models/decoder_only/vision_language/test_awq.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import List, Optional, Type
+from typing import Optional
 import pytest
 import torch
@@ -19,12 +19,12 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
 def run_awq_test(
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
    image_assets: _ImageAssets,
    source_model: str,
    quant_model: str,
    *,
-    size_factors: List[float],
+    size_factors: list[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,

--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -6,7 +6,6 @@ import math
 import os
 from collections import defaultdict
 from pathlib import PosixPath
-from typing import Type
 import pytest
 from packaging.version import Version
@@ -562,8 +561,8 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
    ))
 def test_single_image_models(tmp_path: PosixPath, model_type: str,
                             test_case: ExpandableVLMTestArgs,
-                             hf_runner: Type[HfRunner],
+                             hf_runner: type[HfRunner],
-                             vllm_runner: Type[VllmRunner],
+                             vllm_runner: type[VllmRunner],
                             image_assets: _ImageAssets):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
@@ -585,8 +584,8 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
    ))
 def test_multi_image_models(tmp_path: PosixPath, model_type: str,
                            test_case: ExpandableVLMTestArgs,
-                            hf_runner: Type[HfRunner],
+                            hf_runner: type[HfRunner],
-                            vllm_runner: Type[VllmRunner],
+                            vllm_runner: type[VllmRunner],
                            image_assets: _ImageAssets):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
@@ -608,8 +607,8 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
    ))
 def test_image_embedding_models(model_type: str,
                                test_case: ExpandableVLMTestArgs,
-                                hf_runner: Type[HfRunner],
+                                hf_runner: type[HfRunner],
-                                vllm_runner: Type[VllmRunner],
+                                vllm_runner: type[VllmRunner],
                                image_assets: _ImageAssets):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
@@ -629,7 +628,7 @@ def test_image_embedding_models(model_type: str,
        fork_new_process_for_each_test=False,
    ))
 def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
-                      hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner],
+                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
                      video_assets: _VideoAssets):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
@@ -651,8 +650,8 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
 def test_custom_inputs_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
-    hf_runner: Type[HfRunner],
+    hf_runner: type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
 ):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
@@ -674,8 +673,8 @@ def test_custom_inputs_models(
 @fork_new_process_for_each_test
 def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                   test_case: ExpandableVLMTestArgs,
-                                   hf_runner: Type[HfRunner],
+                                   hf_runner: type[HfRunner],
-                                   vllm_runner: Type[VllmRunner],
+                                   vllm_runner: type[VllmRunner],
                                   image_assets: _ImageAssets):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
@@ -698,8 +697,8 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
 @fork_new_process_for_each_test
 def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                  test_case: ExpandableVLMTestArgs,
-                                  hf_runner: Type[HfRunner],
+                                  hf_runner: type[HfRunner],
-                                  vllm_runner: Type[VllmRunner],
+                                  vllm_runner: type[VllmRunner],
                                  image_assets: _ImageAssets):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
@@ -722,8 +721,8 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
 @fork_new_process_for_each_test
 def test_image_embedding_models_heavy(model_type: str,
                                      test_case: ExpandableVLMTestArgs,
-                                      hf_runner: Type[HfRunner],
+                                      hf_runner: type[HfRunner],
-                                      vllm_runner: Type[VllmRunner],
+                                      vllm_runner: type[VllmRunner],
                                      image_assets: _ImageAssets):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
@@ -743,8 +742,8 @@ def test_image_embedding_models_heavy(model_type: str,
        fork_new_process_for_each_test=True,
    ))
 def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
-                            hf_runner: Type[HfRunner],
+                            hf_runner: type[HfRunner],
-                            vllm_runner: Type[VllmRunner],
+                            vllm_runner: type[VllmRunner],
                            video_assets: _VideoAssets):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
@@ -767,8 +766,8 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
 def test_custom_inputs_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
-    hf_runner: Type[HfRunner],
+    hf_runner: type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
 ):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(

--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -2,7 +2,7 @@
 import os
 import re
-from typing import List, Optional, Tuple, Type
+from typing import Optional
 import pytest
 from transformers import AutoTokenizer
@@ -25,7 +25,7 @@ HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these
 models = ["microsoft/Phi-3.5-vision-instruct"]
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+def vllm_to_hf_output(vllm_output: tuple[list[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output."""
@@ -55,9 +55,9 @@ if current_platform.is_rocm():
 def run_test(
-    hf_runner: Type[HfRunner],
+    hf_runner: type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
-    inputs: List[Tuple[List[str], PromptImageInput]],
+    inputs: list[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,

--- a/tests/models/decoder_only/vision_language/test_pixtral.py
+++ b/tests/models/decoder_only/vision_language/test_pixtral.py
@@ -6,7 +6,7 @@ Run `pytest tests/models/test_mistral.py`.
 import json
 import uuid
 from dataclasses import asdict
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Optional
 import pytest
 from mistral_common.multimodal import download_image
@@ -38,7 +38,7 @@ IMG_URLS = [
 PROMPT = "Describe each image in one short sentence."
-def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
+def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
    return [{
        "role":
        "user",
@@ -54,7 +54,7 @@ def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
    }]
-def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]:
+def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
    return [{
        "role":
        "user",
@@ -68,7 +68,7 @@ def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]:
    }]
-def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
+def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
    msg = _create_msg_format(urls)
    tokenizer = MistralTokenizer.from_model("pixtral")
@@ -89,7 +89,7 @@ def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
    return engine_inputs
-def _create_engine_inputs_hf(urls: List[str]) -> TextPrompt:
+def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
    msg = _create_msg_format_hf(urls)
    tokenizer = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")
@@ -128,7 +128,7 @@ assert FIXTURES_PATH.exists()
 FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
 FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
-OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
+OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
 # For the test author to store golden output in JSON

--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import Any, List, Optional, Tuple, Type, TypedDict, Union
+from typing import Any, Optional, TypedDict, Union
 import numpy.typing as npt
 import pytest
@@ -69,21 +69,21 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
 def batch_make_image_embeddings(
-        image_batches: List[Union[Image.Image, List[Image.Image]]], processor,
+        image_batches: list[Union[Image.Image, list[Image.Image]]], processor,
-        llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]:
+        llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]:
    """batched image embeddings for Qwen2-VL
    This will infer all images' embeddings in a single batch, 
      and split the result according to input batches.
    image_batches:
-      - Single-image batches: `List[Image.Image]`
+      - Single-image batches: `list[Image.Image]`
-      - Multiple-image batches: `List[List[Image.Image]]]`
+      - Multiple-image batches: `list[list[Image.Image]]`
-    returns: `List[Qwen2VLPromptImageEmbeddingInput]`
+    returns: `list[Qwen2VLPromptImageEmbeddingInput]`
    """
-    image_batches_: List[Any] = image_batches[:]
+    image_batches_: list[Any] = image_batches[:]
    # convert single-image batches to multiple-image batches
    for idx in range(len(image_batches_)):
@@ -93,7 +93,7 @@ def batch_make_image_embeddings(
        assert isinstance(image_batches_[idx], list)
    # append all images into a list (as a batch)
-    images: List[Image.Image] = []
+    images: list[Image.Image] = []
    for image_batch in image_batches_:
        images += image_batch
@@ -121,7 +121,7 @@ def batch_make_image_embeddings(
    image_embeds = torch.concat(llm.apply_model(get_image_embeds))
    # split into original batches
-    result: List[Qwen2VLPromptImageEmbeddingInput] = []
+    result: list[Qwen2VLPromptImageEmbeddingInput] = []
    image_counter = 0
    embed_counter = 0
    for image_batch in image_batches_:
@@ -153,7 +153,7 @@ def batch_make_image_embeddings(
 def batch_make_video_embeddings(
        video_batches: PromptVideoInput, processor,
-        llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]:
+        llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]:
    """batched video embeddings for Qwen2-VL
    An NDArray holds all the frames of a single video.
@@ -162,21 +162,21 @@ def batch_make_video_embeddings(
      and split the result according to input batches.
    video_batches:
-      - Single-video batches: `List[NDArray]`
+      - Single-video batches: `list[NDArray]`
-      - Multiple-video batches: `List[List[NDArray]]`
+      - Multiple-video batches: `list[list[NDArray]]`
    """
-    video_batches_: List[Any] = video_batches[:]
+    video_batches_: list[Any] = video_batches[:]
    for idx in range(len(video_batches_)):
        if not isinstance(video_batches_[idx], list):
-            single_video_batch: List[npt.NDArray] = [video_batches_[idx]]
+            single_video_batch: list[npt.NDArray] = [video_batches_[idx]]
            video_batches_[idx] = single_video_batch
        assert isinstance(video_batches_[idx], list)
    # append all videos into a list (as a batch)
-    videos: List[npt.NDArray] = []
+    videos: list[npt.NDArray] = []
    for video_batch in video_batches_:
        videos += video_batch
@@ -204,7 +204,7 @@ def batch_make_video_embeddings(
    video_embeds = torch.concat(llm.apply_model(get_image_embeds))
    # split into original batches
-    result: List[Qwen2VLPromptVideoEmbeddingInput] = []
+    result: list[Qwen2VLPromptVideoEmbeddingInput] = []
    video_counter = 0
    embed_counter = 0
    for video_batch in video_batches_:
@@ -235,8 +235,8 @@ def batch_make_video_embeddings(
 def run_embedding_input_test(
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
-    inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]],
+    inputs: list[tuple[list[str], PromptImageInput, PromptVideoInput]],
    model: str,
    *,
    dtype: str,
@@ -323,8 +323,8 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
                                         num_logprobs: int) -> None:
    images = [asset.pil_image for asset in image_assets]
-    inputs_per_case: List[Tuple[
+    inputs_per_case: list[tuple[
-        List[str], PromptImageInput, PromptVideoInput]] = [(
+        list[str], PromptImageInput, PromptVideoInput]] = [(
            [prompt for _ in size_factors],
            [rescale_image_size(image, factor) for factor in size_factors],
            [],
@@ -365,7 +365,7 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
                                                  num_logprobs: int) -> None:
    images = [asset.pil_image for asset in image_assets]
-    inputs_per_case: List[Tuple[List[str], PromptImageInput,
+    inputs_per_case: list[tuple[list[str], PromptImageInput,
                                PromptVideoInput]] = [(
                                    [MULTIIMAGE_PROMPT for _ in size_factors],
                                    [[
@@ -413,8 +413,8 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
        for asset in video_assets
    ]
-    inputs_per_case: List[Tuple[
+    inputs_per_case: list[tuple[
-        List[str], PromptImageInput, PromptVideoInput]] = [(
+        list[str], PromptImageInput, PromptVideoInput]] = [(
            [prompt for _ in size_factors],
            [],
            [rescale_video_size(video, factor) for factor in size_factors],

--- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
 # SPDX-License-Identifier: Apache-2.0
 """Helpers for building inputs that can be leveraged for different test types.
 """
+from collections.abc import Iterable
 from pathlib import PosixPath
-from typing import Callable, Iterable, List, Optional, Tuple, Union
+from typing import Callable, Optional, Union
 import torch
@@ -33,7 +34,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
 def get_model_prompts(base_prompts: Iterable[str],
                      img_idx_to_prompt: Optional[Callable[[int], str]],
                      video_idx_to_prompt: Optional[Callable[[int], str]],
-                      prompt_formatter: Callable[[str], str]) -> List[str]:
+                      prompt_formatter: Callable[[str], str]) -> list[str]:
    """Given a model-agnostic base prompt and test configuration for a model(s)
    to be tested, update the media placeholders and apply the prompt formatting
    to get the test prompt string for this model.
@@ -218,7 +219,7 @@ def build_video_inputs_from_test_info(
    ) for video, prompt in zip(sampled_vids, model_prompts)]
-def apply_image_size_scaling(image, size: Union[float, Tuple[int, int]],
+def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
                             size_type: SizeType):
    """Applies a size scaler to one image; this can be an image size factor,
    which scales the image while maintaining the aspect ratio"""

--- a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
@@ -5,7 +5,7 @@ handling multimodal placeholder substitution, and so on.
 """
 import itertools
 from collections import OrderedDict
-from typing import Dict, Iterable, Tuple
+from collections.abc import Iterable
 import pytest
@@ -13,9 +13,9 @@ from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
                    ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
-def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo],
+def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo],
                               test_type: VLMTestType,
-                               fork_per_test: bool) -> Dict[str, VLMTestInfo]:
+                               fork_per_test: bool) -> dict[str, VLMTestInfo]:
    """Given the dict of potential test settings to run, return a subdict
    of tests that have the current test type enabled with the matching val for
    fork_per_test.
@@ -49,7 +49,7 @@ def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo],
    return matching_tests
-def get_parametrized_options(test_settings: Dict[str, VLMTestInfo],
+def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
                             test_type: VLMTestType,
                             fork_new_process_for_each_test: bool):
    """Converts all of our VLMTestInfo into an expanded list of parameters.
@@ -121,7 +121,7 @@ def get_parametrized_options(test_settings: Dict[str, VLMTestInfo],
 def get_wrapped_test_sizes(
        test_info: VLMTestInfo,
-        test_type: VLMTestType) -> Tuple[ImageSizeWrapper, ...]:
+        test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
    """Given a test info which may have size factors or fixed sizes, wrap them
    and combine them into an iterable, each of which will be used in parameter
    expansion.

--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
 # SPDX-License-Identifier: Apache-2.0
 """Core test implementation to be shared across modalities."""
-from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Callable, Optional, Union
 import torch
 from PIL.Image import Image
@@ -17,9 +17,9 @@ from .types import RunnerOutput
 def run_test(
    *,
-    hf_runner: Type[HfRunner],
+    hf_runner: type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
-    inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]],
+    inputs: list[tuple[list[str], list[Union[list[Image], Image]]]],
    model: str,
    dtype: str,
    max_tokens: int,
@@ -29,15 +29,15 @@ def run_test(
    max_num_seqs: int,
    hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
    vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
-    auto_cls: Type[_BaseAutoModelClass],
+    auto_cls: type[_BaseAutoModelClass],
    use_tokenizer_eos: bool,
    postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
    comparator: Callable[..., None],
    get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
-    stop_str: Optional[List[str]],
+    stop_str: Optional[list[str]],
-    limit_mm_per_prompt: Dict[str, int],
+    limit_mm_per_prompt: dict[str, int],
-    vllm_runner_kwargs: Optional[Dict[str, Any]],
+    vllm_runner_kwargs: Optional[dict[str, Any]],
-    hf_model_kwargs: Optional[Dict[str, Any]],
+    hf_model_kwargs: Optional[dict[str, Any]],
    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
    task: TaskOption = "auto",
    runner_mm_key: str = "images",
@@ -61,7 +61,7 @@ def run_test(
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
-    vllm_runner_kwargs_: Dict[str, Any] = {}
+    vllm_runner_kwargs_: dict[str, Any] = {}
    if model_info.tokenizer:
        vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
    if model_info.tokenizer_mode:
@@ -84,7 +84,7 @@ def run_test(
                     **vllm_runner_kwargs_) as vllm_model:
        tokenizer = vllm_model.model.get_tokenizer()
-        vllm_kwargs: Dict[str, Any] = {}
+        vllm_kwargs: dict[str, Any] = {}
        if get_stop_token_ids is not None:
            vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
        if stop_str:

--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -6,7 +6,7 @@ typically specific to a small subset of models.
 import re
 import types
 from pathlib import PosixPath
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, Optional, Union
 import torch
 from PIL.Image import Image
@@ -49,7 +49,7 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
 def qwen_vllm_to_hf_output(
        vllm_output: RunnerOutput,
-        model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
+        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output [qwen models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output
@@ -60,7 +60,7 @@ def qwen_vllm_to_hf_output(
 def qwen2_vllm_to_hf_output(
        vllm_output: RunnerOutput,
-        model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
+        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output [qwen2 models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output
@@ -78,7 +78,7 @@ def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
 def llava_video_vllm_to_hf_output(
        vllm_output: RunnerOutput,
-        model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
+        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    config = AutoConfig.from_pretrained(model)
    mm_token_id = config.video_token_index
    return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
@@ -247,7 +247,7 @@ def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
 ####### Prompt path encoders for models that need models on disk
 def qwen_prompt_path_encoder(
-        tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset],
+        tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset],
                                                        _ImageAssets]) -> str:
    """Given a temporary dir path, export one or more image assets into the
    tempdir & replace its contents with the local path to the string so that
@@ -257,7 +257,7 @@ def qwen_prompt_path_encoder(
    Args:
        tmp_path: Tempdir for test under consideration.
        prompt: Prompt with image placeholders.
-        assets: List of image assets whose len equals the num placeholders.
+        assets: list of image assets whose len equals the num placeholders.
    """
    # Ensure that the number of placeholders matches the number of assets;
    # If this is not true, the test is probably written incorrectly.
@@ -350,7 +350,7 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size
-        def __call__(self, text: str, images: Union[Image, List[Image]],
+        def __call__(self, text: str, images: Union[Image, list[Image]],
                     **kwargs):
            # yapf: disable
            from vllm.model_executor.models.h2ovl import (
@@ -410,7 +410,7 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size
-        def __call__(self, text: str, images: Union[Image, List[Image]],
+        def __call__(self, text: str, images: Union[Image, list[Image]],
                     **kwargs):
            from vllm.model_executor.models.internvl import (
                IMG_CONTEXT, IMG_END, IMG_START,