Update deprecated Python 3.8 typing (#13971)

cf069aa8 · Harry Mellor · GitHub · bf33700e · cf069aa8 · cf069aa8
Unverified commit cf069aa8, authored Mar 03, 2025 by Harry Mellor; committed by GitHub Mar 02, 2025
20 changed files
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,6 @@ import subprocess
 import sys
 from pathlib import Path
 from shutil import which
-from typing import Dict, List
 import torch
 from packaging.version import Version, parse
@@ -78,7 +77,7 @@ class CMakeExtension(Extension):
 class cmake_build_ext(build_ext):
    # A dict of extension directories that have been configured.
-    did_config: Dict[str, bool] = {}
+    did_config: dict[str, bool] = {}
    #
    # Determine number of compilation jobs and optionally nvcc compile threads.
@@ -548,10 +547,10 @@ def get_vllm_version() -> str:
    return version
-def get_requirements() -> List[str]:
+def get_requirements() -> list[str]:
    """Get Python package dependencies from requirements.txt."""
-    def _read_requirements(filename: str) -> List[str]:
+    def _read_requirements(filename: str) -> list[str]:
        with open(get_path(filename)) as f:
            requirements = f.read().strip().split("\n")
        resolved_requirements = []

--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
 # SPDX-License-Identifier: Apache-2.0
 """vllm.entrypoints.api_server with some extra logging for testing."""
-from typing import Any, Dict, Iterable
+from collections.abc import Iterable
+from typing import Any
 import uvicorn
 from fastapi.responses import JSONResponse, Response
@@ -24,7 +25,7 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
        self._num_aborts += len(ids)
        await super()._engine_abort(ids)
-    def testing_stats(self) -> Dict[str, Any]:
+    def testing_stats(self) -> dict[str, Any]:
        return {"num_aborted_requests": self._num_aborts}

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -6,7 +6,7 @@ import uuid
 from asyncio import CancelledError
 from copy import copy
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import Optional
 import pytest
 import pytest_asyncio
@@ -254,7 +254,7 @@ async def test_output_kinds(async_engine, stop):
        params.output_kind = RequestOutputKind.DELTA
        prompt_tokens = None
-        output_tokens: List[int] = []
+        output_tokens: list[int] = []
        output_text = ""
        output_count = 0
        final_output = None

--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -8,7 +8,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
 initialized randomly with a fixed seed.
 """
 from dataclasses import dataclass
-from typing import Any, List, Optional, Tuple
+from typing import Any, Optional
 import torch
 from torch import nn
@@ -56,7 +56,7 @@ class LlamaConfig:
    random_seed: int = 0
    def compute_hash(self) -> str:
-        factors: List[Any] = []
+        factors: list[Any] = []
        for k, v in self.__dict__.items():
            if k == "random_seed":
                continue
@@ -174,7 +174,7 @@ class LlamaDecoderLayer(nn.Module):
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        For tractable computation:
        - if residual is None, the outputs are:

--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
 # SPDX-License-Identifier: Apache-2.0
 import dataclasses
-from typing import Dict, List, Optional
+from typing import Optional
 import pytest
@@ -14,7 +14,7 @@ from ..utils import compare_all_settings
 @dataclasses.dataclass
 class TestSetting:
    model: str
-    model_args: List[str]
+    model_args: list[str]
    pp_size: int
    tp_size: int
    attn_backend: str
@@ -108,8 +108,8 @@ def test_compile_correctness(test_setting: TestSetting):
    final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
                ["-tp", str(tp_size)]
-    all_args: List[List[str]] = []
+    all_args: list[list[str]] = []
-    all_envs: List[Optional[Dict[str, str]]] = []
+    all_envs: list[Optional[dict[str, str]]] = []
    for level in [
            CompilationLevel.NO_COMPILATION,

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,8 +5,7 @@ import os
 import tempfile
 from collections import UserList
 from enum import Enum
-from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
+from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
-                    TypedDict, TypeVar, Union)
 import numpy as np
 import pytest
@@ -47,14 +46,14 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
 _M = TypeVar("_M")
-_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
+_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
 PromptImageInput = _PromptMultiModalInput[Image.Image]
-PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
+PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
 PromptVideoInput = _PromptMultiModalInput[np.ndarray]
-def _read_prompts(filename: str) -> List[str]:
+def _read_prompts(filename: str) -> list[str]:
    with open(filename) as f:
        prompts = f.readlines()
        return prompts
@@ -77,7 +76,7 @@ class _ImageAssets(_ImageAssetsBase):
            ImageAsset("cherry_blossom"),
        ])
-    def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
+    def prompts(self, prompts: _ImageAssetPrompts) -> list[str]:
        """
        Convenience method to define the prompt for each test image.
@@ -102,7 +101,7 @@ class _VideoAssets(_VideoAssetsBase):
            VideoAsset("sample_demo_1.mp4"),
        ])
-    def prompts(self, prompts: _VideoAssetPrompts) -> List[str]:
+    def prompts(self, prompts: _VideoAssetPrompts) -> list[str]:
        return [prompts["sample_demo_1"]]
@@ -175,7 +174,7 @@ def dynamo_reset():
 @pytest.fixture
-def example_prompts() -> List[str]:
+def example_prompts() -> list[str]:
    prompts = []
    for filename in _TEST_PROMPTS:
        prompts += _read_prompts(filename)
@@ -197,7 +196,7 @@ class DecoderPromptType(Enum):
 @pytest.fixture
 def example_encoder_decoder_prompts(
-) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]:
+) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
    '''
    Returns an encoder prompt list and a decoder prompt list, wherein each pair
    of same-index entries in both lists corresponds to an (encoder prompt,
@@ -229,7 +228,7 @@ def example_encoder_decoder_prompts(
 @pytest.fixture
-def example_long_prompts() -> List[str]:
+def example_long_prompts() -> list[str]:
    prompts = []
    for filename in _LONG_PROMPTS:
        prompts += _read_prompts(filename)
@@ -273,11 +272,11 @@ class HfRunner:
        model_name: str,
        dtype: str = "half",
        *,
-        model_kwargs: Optional[Dict[str, Any]] = None,
+        model_kwargs: Optional[dict[str, Any]] = None,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
-        auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
+        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
        postprocess_inputs: Callable[..., BatchEncoding] = identity,
    ) -> None:
        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
@@ -334,11 +333,11 @@ class HfRunner:
    def get_inputs(
        self,
-        prompts: List[str],
+        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
-    ) -> List[BatchEncoding]:
+    ) -> list[BatchEncoding]:
        if images is not None:
            assert len(prompts) == len(images)
@@ -348,9 +347,9 @@ class HfRunner:
        if audios is not None:
            assert len(prompts) == len(audios)
-        all_inputs: List[BatchEncoding] = []
+        all_inputs: list[BatchEncoding] = []
        for i, prompt in enumerate(prompts):
-            processor_kwargs: Dict[str, Any] = {
+            processor_kwargs: dict[str, Any] = {
                "text": prompt,
                "return_tensors": "pt",
            }
@@ -370,7 +369,7 @@ class HfRunner:
        return all_inputs
-    def classify(self, prompts: List[str]) -> List[str]:
+    def classify(self, prompts: list[str]) -> list[str]:
        # output is final logits
        all_inputs = self.get_inputs(prompts)
        outputs = []
@@ -383,18 +382,18 @@ class HfRunner:
    def generate(
        self,
-        prompts: List[str],
+        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+    ) -> list[tuple[list[list[int]], list[str]]]:
        all_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)
-        outputs: List[Tuple[List[List[int]], List[str]]] = []
+        outputs: list[tuple[list[list[int]], list[str]]] = []
        for inputs in all_inputs:
            output_ids = self.model.generate(
                **self.wrap_device(inputs, device=self.model.device.type),
@@ -412,13 +411,13 @@ class HfRunner:
    def generate_greedy(
        self,
-        prompts: List[str],
+        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
-    ) -> List[Tuple[List[int], str]]:
+    ) -> list[tuple[list[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
@@ -432,10 +431,10 @@ class HfRunner:
    def generate_beam_search(
        self,
-        prompts: List[str],
+        prompts: list[str],
        beam_width: int,
        max_tokens: int,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+    ) -> list[tuple[list[list[int]], list[str]]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
@@ -453,19 +452,19 @@ class HfRunner:
    def generate_greedy_logprobs(
        self,
-        prompts: List[str],
+        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
-    ) -> List[List[torch.Tensor]]:
+    ) -> list[list[torch.Tensor]]:
        all_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)
-        all_logprobs: List[List[torch.Tensor]] = []
+        all_logprobs: list[list[torch.Tensor]] = []
        for inputs in all_inputs:
            output = self.model.generate(
                **self.wrap_device(inputs, device=self.model.device.type),
@@ -483,11 +482,11 @@ class HfRunner:
    def _hidden_states_to_seq_logprobs(
        self,
-        hidden_states: Tuple[Tuple[torch.Tensor, ...], ...],
+        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
-    ) -> List[torch.Tensor]:
+    ) -> list[torch.Tensor]:
        output_embeddings = self.model.get_output_embeddings()
-        seq_logprobs: List[torch.Tensor] = []
+        seq_logprobs: list[torch.Tensor] = []
        for _, hidden_state in enumerate(hidden_states):
            last_hidden_states = hidden_state[-1][0]
            logits = torch.matmul(
@@ -503,14 +502,14 @@ class HfRunner:
    def _hidden_states_to_logprobs(
        self,
-        hidden_states: Tuple[Tuple[torch.Tensor, ...], ...],
+        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
        num_logprobs: int,
-    ) -> Tuple[List[Dict[int, float]], int]:
+    ) -> tuple[list[dict[int, float]], int]:
        seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
        output_len = len(hidden_states)
        # convert to dict
-        seq_logprobs_lst: List[Dict[int, float]] = []
+        seq_logprobs_lst: list[dict[int, float]] = []
        for tok_idx, tok_logprobs in enumerate(seq_logprobs):
            # drop prompt logprobs
            if tok_idx == 0:
@@ -530,22 +529,22 @@ class HfRunner:
    def generate_greedy_logprobs_limit(
        self,
-        prompts: List[str],
+        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
-    ) -> List[TokensTextLogprobs]:
+    ) -> list[TokensTextLogprobs]:
        all_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)
-        all_logprobs: List[List[Dict[int, float]]] = []
+        all_logprobs: list[list[dict[int, float]]] = []
-        all_output_ids: List[List[int]] = []
+        all_output_ids: list[list[int]] = []
-        all_output_strs: List[str] = []
+        all_output_strs: list[str] = []
        for inputs in all_inputs:
            output = self.model.generate(
@@ -577,23 +576,23 @@ class HfRunner:
    def generate_encoder_decoder_greedy_logprobs_limit(
        self,
-        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
-    ) -> List[TokensTextLogprobs]:
+    ) -> list[TokensTextLogprobs]:
        '''
        Greedy logprobs generation for vLLM encoder/decoder models
        '''
-        all_logprobs: List[List[Dict[int, float]]] = []
+        all_logprobs: list[list[dict[int, float]]] = []
-        all_output_ids: List[List[int]] = []
+        all_output_ids: list[list[int]] = []
-        all_output_strs: List[str] = []
+        all_output_strs: list[str] = []
        for i, (encoder_prompt, decoder_prompt) in enumerate(
                to_enc_dec_tuple_list(encoder_decoder_prompts)):
-            processor_kwargs: Dict[str, Any] = {
+            processor_kwargs: dict[str, Any] = {
                "text": encoder_prompt,
                "return_tensors": "pt",
            }
@@ -641,10 +640,10 @@ class HfRunner:
        return [(output_ids, output_str, output_logprobs)
                for output_ids, output_str, output_logprobs in outputs]
-    def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]:
+    def encode(self, prompts: list[str]) -> list[list[torch.Tensor]]:
        return self.model.encode(prompts)
-    def predict(self, prompts: List[List[str]]) -> torch.Tensor:
+    def predict(self, prompts: list[list[str]]) -> torch.Tensor:
        return self.model.predict(prompts, convert_to_tensor=True)
    def __enter__(self):
@@ -699,11 +698,11 @@ class VllmRunner:
    def get_inputs(
        self,
-        prompts: List[str],
+        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
-    ) -> List[TextPrompt]:
+    ) -> list[TextPrompt]:
        if images is not None:
            assert len(prompts) == len(images)
@@ -733,13 +732,13 @@ class VllmRunner:
    def generate(
        self,
-        prompts: List[str],
+        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+    ) -> list[tuple[list[list[int]], list[str]]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
@@ -749,12 +748,12 @@ class VllmRunner:
                                          sampling_params=sampling_params,
                                          **kwargs)
-        outputs: List[Tuple[List[List[int]], List[str]]] = []
+        outputs: list[tuple[list[list[int]], list[str]]] = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
            prompt_ids = req_output.prompt_token_ids
-            req_sample_output_ids: List[List[int]] = []
+            req_sample_output_ids: list[list[int]] = []
-            req_sample_output_strs: List[str] = []
+            req_sample_output_strs: list[str] = []
            for sample in req_output.outputs:
                output_str = sample.text
                output_ids = list(sample.token_ids)
@@ -765,9 +764,9 @@ class VllmRunner:
    @staticmethod
    def _final_steps_generate_w_logprobs(
-        req_outputs: List[RequestOutput],
+        req_outputs: list[RequestOutput],
-    ) -> List[TokensTextLogprobsPromptLogprobs]:
+    ) -> list[TokensTextLogprobsPromptLogprobs]:
-        outputs: List[TokensTextLogprobsPromptLogprobs] = []
+        outputs: list[TokensTextLogprobsPromptLogprobs] = []
        for req_output in req_outputs:
            assert len(req_output.outputs) > 0
            for sample in req_output.outputs:
@@ -780,14 +779,14 @@ class VllmRunner:
    def generate_w_logprobs(
        self,
-        prompts: List[str],
+        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
-    ) -> Union[List[TokensTextLogprobs],
+    ) -> Union[list[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
+               list[TokensTextLogprobsPromptLogprobs]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
@@ -806,10 +805,10 @@ class VllmRunner:
    def generate_encoder_decoder_w_logprobs(
        self,
-        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        sampling_params: SamplingParams,
-    ) -> Union[List[TokensTextLogprobs],
+    ) -> Union[list[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
+               list[TokensTextLogprobsPromptLogprobs]]:
        '''
        Logprobs generation for vLLM encoder/decoder models
        '''
@@ -826,13 +825,13 @@ class VllmRunner:
    def generate_greedy(
        self,
-        prompts: List[str],
+        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
-    ) -> List[Tuple[List[int], str]]:
+    ) -> list[tuple[list[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts,
                                greedy_params,
@@ -845,18 +844,18 @@ class VllmRunner:
    def generate_greedy_logprobs(
        self,
-        prompts: List[str],
+        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
-        stop_token_ids: Optional[List[int]] = None,
+        stop_token_ids: Optional[list[int]] = None,
-        stop: Optional[List[str]] = None,
+        stop: Optional[list[str]] = None,
        **kwargs: Any,
-    ) -> Union[List[TokensTextLogprobs],
+    ) -> Union[list[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
+               list[TokensTextLogprobsPromptLogprobs]]:
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
@@ -874,12 +873,12 @@ class VllmRunner:
    def generate_encoder_decoder_greedy_logprobs(
        self,
-        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
-    ) -> Union[List[TokensTextLogprobs],
+    ) -> Union[list[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
+               list[TokensTextLogprobsPromptLogprobs]]:
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
@@ -895,10 +894,10 @@ class VllmRunner:
    def generate_beam_search(
        self,
-        prompts: Union[List[str], List[List[int]]],
+        prompts: Union[list[str], list[list[int]]],
        beam_width: int,
        max_tokens: int,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+    ) -> list[tuple[list[list[int]], list[str]]]:
        if is_list_of(prompts, str, check="all"):
            prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
        else:
@@ -915,17 +914,17 @@ class VllmRunner:
            returned_outputs.append((token_ids, texts))
        return returned_outputs
-    def classify(self, prompts: List[str]) -> List[List[float]]:
+    def classify(self, prompts: list[str]) -> list[list[float]]:
        req_outputs = self.model.classify(prompts)
        return [req_output.outputs.probs for req_output in req_outputs]
    def encode(
        self,
-        prompts: List[str],
+        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
-    ) -> List[List[float]]:
+    ) -> list[list[float]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
@@ -936,9 +935,9 @@ class VllmRunner:
    def score(
        self,
-        text_1: Union[str, List[str]],
+        text_1: Union[str, list[str]],
-        text_2: Union[str, List[str]],
+        text_2: Union[str, list[str]],
-    ) -> List[float]:
+    ) -> list[float]:
        req_outputs = self.model.score(text_1, text_2)
        return [req_output.outputs.score for req_output in req_outputs]

--- a/tests/core/block/e2e/conftest.py
+++ b/tests/core/block/e2e/conftest.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import Callable, Iterable, Optional
+from collections.abc import Iterable
+from typing import Callable, Optional
 import pytest

--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
 # SPDX-License-Identifier: Apache-2.0
 import random
-from typing import List
 import pytest
@@ -137,9 +136,9 @@ def prep_prompts(batch_size: int):
    The prompt is just under 10k tokens; sliding window is 4k
    so the answer is outside sliding window, but should still be correct.
    """
-    prompts: List[str] = []
+    prompts: list[str] = []
-    answer: List[int] = []
+    answer: list[int] = []
-    indices: List[int] = []
+    indices: list[int] = []
    random.seed(1)
    for _ in range(batch_size):
        idx = random.randint(30, 90)
@@ -158,7 +157,7 @@ def prep_prompts(batch_size: int):
    return prompts, answer, indices
-def check_answers(indices: List[int], answer: List[int], outputs: List[str]):
+def check_answers(indices: list[int], answer: list[int], outputs: list[str]):
    answer2 = [int(text[0:2].strip()) for text in outputs]
    print(list(zip(indices, zip(answer, answer2))))
    numok = 0
@@ -170,7 +169,7 @@ def check_answers(indices: List[int], answer: List[int], outputs: List[str]):
    assert frac_ok > 0.7
-def check_window(prompts: List[str]):
+def check_window(prompts: list[str]):
    def inner(llm: LLM):
        sliding_window = llm.llm_engine.model_config.get_sliding_window()

--- a/tests/core/block/test_block_table.py
+++ b/tests/core/block/test_block_table.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import List
 import pytest
 from vllm.core.block.block_table import BlockTable
@@ -32,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int):
    token_ids = list(range(sequence_len))
    num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
-    block_tables: List[BlockTable] = []
+    block_tables: list[BlockTable] = []
    for i in range(5):
        assert allocator.get_num_free_blocks(
            device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
@@ -77,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
    num_immutable_blocks_per_alloc = len(
        chunked_tokens) - num_mutable_blocks_per_alloc
-    block_tables: List[BlockTable] = []
+    block_tables: list[BlockTable] = []
    for alloc_i in range(1, 6):
        block_tables.append(
@@ -272,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
    )
    block_table.allocate(token_ids=token_ids, device=Device.GPU)
-    appended_so_far: List[int] = []
+    appended_so_far: list[int] = []
    for append in chunk_list(token_ids_to_append, append_size):
        block_table.append_token_ids(append)
        appended_so_far.extend(append)

--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import List, Optional
+from typing import Optional
 import pytest
@@ -14,7 +14,7 @@ class TestNaiveBlockAllocator:
    def create_allocate_lambda(allocate_type: str,
                               allocator: NaiveBlockAllocator,
                               prev_block: Optional[Block],
-                               token_ids: List[int]):
+                               token_ids: list[int]):
        if allocate_type == "immutable":
            allocate_block = lambda: allocator.allocate_immutable_block(
                prev_block=prev_block, token_ids=token_ids)

--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -2,7 +2,7 @@
 import math
 import random
-from typing import List, Optional
+from typing import Optional
 from unittest.mock import MagicMock
 import pytest
@@ -123,11 +123,11 @@ class TestPrefixCachingBlock:
    @staticmethod
    def create_chain(block_size: int,
-                     token_ids: List[int],
+                     token_ids: list[int],
-                     num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
+                     num_empty_trailing_blocks=0) -> list[PrefixCachingBlock]:
        """Helper method which creates a chain of blocks.
        """
-        blocks: List[PrefixCachingBlock] = []
+        blocks: list[PrefixCachingBlock] = []
        num_blocks = math.ceil(
            len(token_ids) / block_size) + num_empty_trailing_blocks
@@ -161,7 +161,7 @@ class TestPrefixCachingBlockAllocator:
    @staticmethod
    def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator,
                               prev_block: Optional[Block],
-                               token_ids: List[int]):
+                               token_ids: list[int]):
        if allocate_type == "immutable":
            allocate_block = lambda: allocator.allocate_immutable_block(
                prev_block=prev_block, token_ids=token_ids)
@@ -839,13 +839,13 @@ class TestPrefixCachingBlockAllocator:
    @staticmethod
    def create_immutable_chain(
        block_size: int,
-        token_ids: List[int],
+        token_ids: list[int],
        allocator: PrefixCachingBlockAllocator,
        extra_hash: Optional[int] = None,
-    ) -> List[PrefixCachingBlock]:
+    ) -> list[PrefixCachingBlock]:
        """Helper method which creates a chain of blocks.
        """
-        blocks: List[Block] = []
+        blocks: list[Block] = []
        num_blocks = math.ceil(len(token_ids) / block_size)
        if num_blocks == 0:

--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import List
 from unittest.mock import MagicMock
 import pytest  # noqa
@@ -46,7 +45,7 @@ def test_simple():
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: List[SequenceGroup] = []
+    running: list[SequenceGroup] = []
    # Add seq groups to scheduler.
    for i in range(num_seq_group):
@@ -93,7 +92,7 @@ def test_chunk():
    cache_config.num_cpu_blocks = 32
    cache_config.num_gpu_blocks = 32
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: List[SequenceGroup] = []
+    running: list[SequenceGroup] = []
    # Add seq groups to scheduler.
    for i in range(2):
@@ -145,7 +144,7 @@ def test_concurrent_chunking():
    cache_config.num_cpu_blocks = 32
    cache_config.num_gpu_blocks = 32
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: List[SequenceGroup] = []
+    running: list[SequenceGroup] = []
    # Add seq groups to scheduler.
    for i in range(2):
@@ -226,8 +225,8 @@ def test_short_prompts_jump_long_prompts_in_queue():
    cache_config.num_cpu_blocks = 3200  # large KV cache size for large requests
    cache_config.num_gpu_blocks = 3200
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    long_seqs: List[SequenceGroup] = []
+    long_seqs: list[SequenceGroup] = []
-    short_seqs: List[SequenceGroup] = []
+    short_seqs: list[SequenceGroup] = []
    # Add 2 large seq groups to scheduler.
    for i in range(2):
@@ -368,7 +367,7 @@ def test_complex():
    cache_config.num_cpu_blocks = 64
    cache_config.num_gpu_blocks = 64
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: List[SequenceGroup] = []
+    running: list[SequenceGroup] = []
    # Add seq groups to scheduler.
    for i in range(2):
@@ -439,7 +438,7 @@ def test_maximal_decoding():
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: List[SequenceGroup] = []
+    running: list[SequenceGroup] = []
    # Add seq groups to scheduler.
    for i in range(2):
@@ -533,7 +532,7 @@ def test_prompt_limit():
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: List[SequenceGroup] = []
+    running: list[SequenceGroup] = []
    _, seq_group = create_dummy_prompt("1",
                                       prompt_length=48,
@@ -565,7 +564,7 @@ def test_prompt_limit_exceed():
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: List[SequenceGroup] = []
+    running: list[SequenceGroup] = []
    _, seq_group = create_dummy_prompt("2",
                                       prompt_length=48,
                                       block_size=block_size)
@@ -699,7 +698,7 @@ def test_chunked_prefill_max_seqs():
    cache_config.num_cpu_blocks = 128
    cache_config.num_gpu_blocks = 128
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: List[SequenceGroup] = []
+    running: list[SequenceGroup] = []
    _, seq_group = create_dummy_prompt("1",
                                       prompt_length=65,
@@ -758,7 +757,7 @@ def test_prefix_caching():
    cache_config.num_cpu_blocks = 0
    cache_config.num_gpu_blocks = 32
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: List[SequenceGroup] = []
+    running: list[SequenceGroup] = []
    # Add seq groups to scheduler.
    for i in range(2):
@@ -800,7 +799,7 @@ def test_prefix_caching_with_concurrent_partial_prefills():
    cache_config.num_cpu_blocks = 0
    cache_config.num_gpu_blocks = 32
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: List[SequenceGroup] = []
+    running: list[SequenceGroup] = []
    # Add seq groups to scheduler.
    for i in range(2):

--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -2,7 +2,6 @@
 import time
 from collections import deque
-from typing import List, Set, Tuple
 from unittest.mock import MagicMock
 import pytest  # noqa
@@ -57,7 +56,7 @@ def test_scheduler_abort_seq_group():
    # Add multiple seq groups to scheduler.
    num_seq_group = 4
-    request_ids: Set[str] = set()
+    request_ids: set[str] = set()
    for i in range(num_seq_group):
        _, seq_group = create_dummy_prompt(str(i), block_size)
        scheduler.add_seq_group(seq_group)
@@ -83,7 +82,7 @@ def test_scheduler_schedule_simple():
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: List[SequenceGroup] = []
+    running: list[SequenceGroup] = []
    # Add seq groups to scheduler.
    for i in range(num_seq_group):
@@ -221,7 +220,7 @@ def test_scheduler_max_seqs():
    cache_config.num_gpu_blocks = 8
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    all_seq_groups: List[SequenceGroup] = []
+    all_seq_groups: list[SequenceGroup] = []
    # Add seq groups to scheduler.
    for i in range(num_seq_group):
        _, seq_group = create_dummy_prompt(str(i),
@@ -480,7 +479,7 @@ def test_prefill_schedule_max_lora():
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    budget = create_token_budget(token_budget=120)
-    curr_loras: Set[int] = set()
+    curr_loras: set[int] = set()
    for i in range(2):
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=60,
@@ -651,8 +650,8 @@ def test_schedule_swapped_max_loras():
                                     block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
-    curr_loras: Set[int] = set()
+    curr_loras: set[int] = set()
-    blocks_to_swap_out: List[Tuple[int, int]] = []
+    blocks_to_swap_out: list[tuple[int, int]] = []
    for i in range(2):
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=60,
@@ -683,7 +682,7 @@ def test_schedule_swapped_cannot_swap_in():
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None
-    blocks_to_swap_out: List[Tuple[int, int]] = []
+    blocks_to_swap_out: list[tuple[int, int]] = []
    for i in range(2):
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=60,
@@ -714,7 +713,7 @@ def test_infeasible_swap():
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None
-    blocks_to_swap_out: List[Tuple[int, int]] = []
+    blocks_to_swap_out: list[tuple[int, int]] = []
    for i in range(2):
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=60,
@@ -752,7 +751,7 @@ def test_schedule_swapped_blocks_to_copy():
                                       block_size=block_size)
    scheduler._allocate_and_set_running(seq_group)
    append_new_token_seq_group(60, seq_group, 1)
-    blocks_to_swap_out: List[Tuple[int, int]] = []
+    blocks_to_swap_out: list[tuple[int, int]] = []
    scheduler._swap_out(seq_group, blocks_to_swap_out)
    scheduler._add_seq_group_to_swapped(seq_group)

--- a/tests/core/test_scheduler_encoder_decoder.py
+++ b/tests/core/test_scheduler_encoder_decoder.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import List
 import pytest  # noqa
 from vllm.config import CacheConfig, SchedulerConfig
@@ -48,7 +46,7 @@ def test_scheduler_schedule_simple_encoder_decoder():
    cache_config.num_cpu_blocks = 16  # enc and dec prompts per seq_group
    cache_config.num_gpu_blocks = 16  # enc and dec prompts per seq_group
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: List[SequenceGroup] = []
+    running: list[SequenceGroup] = []
    # Add seq groups to scheduler.
    req_id_list = []

--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -2,9 +2,8 @@
 import time
 from collections import defaultdict
-from typing import Any, Dict, List, Optional
+from collections.abc import Sequence as GenericSequence
-from typing import Sequence as GenericSequence
+from typing import Any, Optional
-from typing import Tuple
 from vllm import SamplingParams
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
@@ -20,10 +19,10 @@ def create_dummy_prompt(
    block_size: Optional[int] = None,
    lora_request: Optional[LoRARequest] = None,
    best_of: int = 1,
-    prompt_tokens: Optional[List[int]] = None,
+    prompt_tokens: Optional[list[int]] = None,
    min_tokens: int = 0,
    max_tokens: int = 16,
-) -> Tuple[Sequence, SequenceGroup]:
+) -> tuple[Sequence, SequenceGroup]:
    if not block_size:
        block_size = prompt_length
@@ -48,7 +47,7 @@ def create_dummy_prompt(
    return prompt, seq_group
-def create_dummy_lora_sequence(request_id: int, token_ids: List[int],
+def create_dummy_lora_sequence(request_id: int, token_ids: list[int],
                               block_size: int, lora_int_id: int) -> Sequence:
    return Sequence(seq_id=request_id,
                    inputs=token_inputs(token_ids),
@@ -58,7 +57,7 @@ def create_dummy_lora_sequence(request_id: int, token_ids: List[int],
                                             lora_int_id=lora_int_id))
-def create_dummy_sequence(request_id: int, token_ids: List[int],
+def create_dummy_sequence(request_id: int, token_ids: list[int],
                          block_size: int) -> Sequence:
    return Sequence(
        seq_id=request_id,
@@ -74,7 +73,7 @@ def create_dummy_prompt_encoder_decoder(
    block_size: Optional[int] = None,
    lora_request: Optional[LoRARequest] = None,
    best_of: int = 1,
-) -> Tuple[Sequence, Sequence, SequenceGroup]:
+) -> tuple[Sequence, Sequence, SequenceGroup]:
    if not block_size:
        block_size = decoder_prompt_length
@@ -125,7 +124,7 @@ def create_seq_group(
    prompt_token_ids = [0] * seq_prompt_len
-    seqs: List[Sequence] = []
+    seqs: list[Sequence] = []
    for seq_id_offset, output_len in enumerate(seq_output_lens):
        seq = Sequence(
            seq_id=seq_id_start + seq_id_offset,
@@ -241,7 +240,7 @@ class SchedulerProxy:
    def __init__(self, scheduler: Scheduler):
        self.scheduler_ = scheduler
-        self.call_history: Dict[str, List[Any]] = defaultdict(list)
+        self.call_history: dict[str, list[Any]] = defaultdict(list)
    def __getattr__(self, name: str) -> Any:
@@ -253,6 +252,6 @@ class SchedulerProxy:
        return wrapper
    def last_schedule_ret(
-        self, ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, Any]:
+        self, ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]:
        _, _, ret = self.call_history["schedule"][-1]
        return ret
--- a/tests/distributed/test_expert_parallel.py
+++ b/tests/distributed/test_expert_parallel.py
 # SPDX-License-Identifier: Apache-2.0
 from dataclasses import dataclass
-from typing import List, Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple, Optional
 import pytest
@@ -28,8 +28,8 @@ class EPTestOptions(NamedTuple):
 @dataclass
 class EPTestSettings:
-    parallel_setups: List[ParallelSetup]
+    parallel_setups: list[ParallelSetup]
-    distributed_backends: List[str]
+    distributed_backends: list[str]
    task: TaskOption
    test_options: EPTestOptions

--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -9,7 +9,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 import json
 import os
 from dataclasses import dataclass
-from typing import List, Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple, Optional
 import pytest
@@ -38,14 +38,14 @@ class PPTestOptions(NamedTuple):
 @dataclass
 class PPTestSettings:
-    parallel_setups: List[ParallelSetup]
+    parallel_setups: list[ParallelSetup]
    # NOTE: the length of distributed_backends and
    # vllm_major_versions should be the same, and they
    # are first zipped together to iterate over all
    # test settings.
-    distributed_backends: List[str]
+    distributed_backends: list[str]
    # vllm major version: "0" for V0, "1" for V1
-    vllm_major_versions: List[str]
+    vllm_major_versions: list[str]
    task: TaskOption
    test_options: PPTestOptions

--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -2,7 +2,6 @@
 import multiprocessing
 import os
-from typing import Dict, List
 import pytest
 import torch
@@ -20,9 +19,9 @@ from vllm.utils import update_environment_variables
 def distributed_run(fn, world_size):
    number_of_processes = world_size
-    processes: List[multiprocessing.Process] = []
+    processes: list[multiprocessing.Process] = []
    for i in range(number_of_processes):
-        env: Dict[str, str] = {}
+        env: dict[str, str] = {}
        env['RANK'] = str(i)
        env['LOCAL_RANK'] = str(i)
        env['WORLD_SIZE'] = str(number_of_processes)

--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
@@ -3,7 +3,6 @@
 import multiprocessing
 import random
 import time
-from typing import List
 import numpy as np
 import torch.distributed as dist
@@ -13,7 +12,7 @@ from vllm.distributed.utils import StatelessProcessGroup
 from vllm.utils import get_ip, get_open_port, update_environment_variables
-def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]:
+def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
    np.random.seed(seed)
    sizes = np.random.randint(1, 10_000, n)
    # on average, each array will have 5k elements

--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
@@ -3,7 +3,7 @@
 Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
 """
-from typing import List, Optional, Tuple
+from typing import Optional
 import pytest
 from transformers import AutoModelForSeq2SeqLM
@@ -22,7 +22,7 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [
 def vllm_to_hf_output(
-    vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
+    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
 ):
    """Sanitize vllm output to be comparable with hf output."""