Update deprecated Python 3.8 typing (#13971)

cf069aa8 · Harry Mellor · GitHub · bf33700e · cf069aa8 · cf069aa8
Unverified Commit cf069aa8 authored Mar 03, 2025 by Harry Mellor Committed by GitHub Mar 02, 2025
20 changed files
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -4,7 +4,6 @@ import math
 import pickle
 import re
 from collections import defaultdict
-from typing import List

 import matplotlib.pyplot as plt
 import pandas as pd
@@ -23,7 +22,7 @@ if __name__ == "__main__":

    with open(args.filename, 'rb') as f:
        data = pickle.load(f)
-        raw_results: List[TMeasurement] = data["results"]
+        raw_results: list[TMeasurement] = data["results"]

    results = defaultdict(lambda: list())
    for v in raw_results:

--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
 # SPDX-License-Identifier: Apache-2.0

 import dataclasses
-from typing import Any, Callable, Iterable, Optional
+from collections.abc import Iterable
+from typing import Any, Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark

--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
 # SPDX-License-Identifier: Apache-2.0

 import enum
-from typing import Dict, Union
+from typing import Union

 from cutlass_library import *

@@ -21,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
    TmaWarpSpecializedCooperative = enum_auto()


-VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
    **DataTypeNames,  # type: ignore
    **{
        VLLMDataType.u4b8: "u4b8",
@@ -29,7 +29,7 @@ VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = {
    }
 }

-VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
    **DataTypeTag,  # type: ignore
    **{
        VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
@@ -37,7 +37,7 @@ VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
    }
 }

-VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = {
+VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
    **DataTypeSize,  # type: ignore
    **{
        VLLMDataType.u4b8: 4,
@@ -45,7 +45,7 @@ VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = {
    }
 }

-VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
    VLLMDataType.u4b8: "vllm::kU4B8",
    VLLMDataType.u8b128: "vllm::kU8B128",
    DataType.u4: "vllm::kU4",
@@ -56,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
    DataType.bf16: "vllm::kBfloat16",
 }

-VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
    DataType.u8: "at::ScalarType::Byte",
    DataType.s8: "at::ScalarType::Char",
    DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
@@ -66,7 +66,7 @@ VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
    DataType.f32: "at::ScalarType::Float",
 }

-VLLMKernelScheduleTag: Dict[Union[
+VLLMKernelScheduleTag: dict[Union[
    MixedInputKernelScheduleType, KernelScheduleType], str] = {
        **KernelScheduleTag,  # type: ignore
        **{

--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -8,7 +8,7 @@ from collections.abc import Iterable
 from copy import deepcopy
 from dataclasses import dataclass, fields
 from functools import reduce
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Optional, Union

 import jinja2
 # yapf conflicts with isort for this block
@@ -247,8 +247,8 @@ TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative

 @dataclass(frozen=True)
 class ScheduleConfig:
-    tile_shape_mn: Tuple[int, int]
-    cluster_shape_mnk: Tuple[int, int, int]
+    tile_shape_mn: tuple[int, int]
+    cluster_shape_mnk: tuple[int, int, int]
    kernel_schedule: MixedInputKernelScheduleType
    epilogue_schedule: EpilogueScheduleType
    tile_scheduler: TileSchedulerType
@@ -277,8 +277,8 @@ class PrepackTypeConfig:
 @dataclass
 class ImplConfig:
    types: TypeConfig
-    schedules: List[ScheduleConfig]
-    heuristic: List[Tuple[Optional[str], ScheduleConfig]]
+    schedules: list[ScheduleConfig]
+    heuristic: list[tuple[Optional[str], ScheduleConfig]]


 def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
@@ -333,7 +333,7 @@ def is_power_of_two(n):
    return (n != 0) and (n & (n - 1) == 0)


-def to_cute_constant(value: List[int]):
+def to_cute_constant(value: list[int]):

    def _to_cute_constant(value: int):
        if is_power_of_two(value):
@@ -347,7 +347,7 @@ def to_cute_constant(value: List[int]):
        return _to_cute_constant(value)


-def unique_schedules(impl_configs: List[ImplConfig]):
+def unique_schedules(impl_configs: list[ImplConfig]):
    return list(
        set(sch for impl_config in impl_configs
            for sch in impl_config.schedules))
@@ -391,7 +391,7 @@ mm_impl_template = create_template(IMPL_TEMPLATE)
 prepack_dispatch_template = create_template(PREPACK_TEMPLATE)


-def create_sources(impl_configs: List[ImplConfig], num_impl_files=8):
+def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
    sources = []

    sources.append((
@@ -435,7 +435,7 @@ def create_sources(impl_configs: List[ImplConfig], num_impl_files=8):
    num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0)
    num_impls_per_file = math.ceil(num_impls / num_impl_files)

-    files_impls: List[List[ImplConfig]] = [[]]
+    files_impls: list[list[ImplConfig]] = [[]]

    curr_num_impls_assigned = 0
    curr_impl_in_file = 0
@@ -515,7 +515,7 @@ def generate():
        for cond, tile_config in default_tile_heuristic_config.items()
    ]

-    def get_unique_schedules(heuristic: Dict[str, ScheduleConfig]):
+    def get_unique_schedules(heuristic: dict[str, ScheduleConfig]):
        # Do not use schedules = list(set(...)) because we need to make sure
        # the output list is deterministic; otherwise the generated kernel file
        # will be non-deterministic and causes ccache miss.

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -17,7 +17,6 @@ import inspect
 import logging
 import os
 import sys
-from typing import List

 import requests
 from sphinx.ext import autodoc
@@ -58,7 +57,7 @@ templates_path = ['_templates']
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns: List[str] = ["**/*.template.md", "**/*.inc.md"]
+exclude_patterns: list[str] = ["**/*.template.md", "**/*.inc.md"]

 # Exclude the prompt "$" when copying code
 copybutton_prompt_text = r"\$ "

--- a/docs/source/features/reasoning_outputs.md
+++ b/docs/source/features/reasoning_outputs.md
@@ -123,7 +123,7 @@ class ExampleParser(ReasoningParser):

    def extract_reasoning_content(
            self, model_output: str, request: ChatCompletionRequest
-    ) -> Tuple[Optional[str], Optional[str]]:
+    ) -> tuple[Optional[str], Optional[str]]:
        """
        Extract reasoning content from a complete model-generated string.

@@ -138,7 +138,7 @@ class ExampleParser(ReasoningParser):
            The request object that was used to generate the model_output.

        Returns:
-        Tuple[Optional[str], Optional[str]]
+        tuple[Optional[str], Optional[str]]
            A tuple containing the reasoning content and the content.
        """
 ```

--- a/docs/source/features/structured_outputs.md
+++ b/docs/source/features/structured_outputs.md
@@ -193,7 +193,7 @@ class Step(BaseModel):


 class MathResponse(BaseModel):
-    steps: List[Step]
+    steps: list[Step]
    final_answer: str



--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@@ -74,7 +74,7 @@ class Example:
        path (Path): The path to the main directory or file.
        category (str): The category of the document.
        main_file (Path): The main file in the directory.
-        other_files (list[Path]): List of other files in the directory.
+        other_files (list[Path]): list of other files in the directory.
        title (str): The title of the document.

    Methods:

--- a/examples/offline_inference/distributed.py
+++ b/examples/offline_inference/distributed.py
@@ -6,7 +6,7 @@ distributively on a multi-nodes cluster.
 Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
 """

-from typing import Any, Dict, List
+from typing import Any

 import numpy as np
 import ray
@@ -36,13 +36,13 @@ class LLMPredictor:
        self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
                       tensor_parallel_size=tensor_parallel_size)

-    def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]:
+    def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, list]:
        # Generate texts from the prompts.
        # The output is a list of RequestOutput objects that contain the prompt,
        # generated text, and other information.
        outputs = self.llm.generate(batch["text"], sampling_params)
-        prompt: List[str] = []
-        generated_text: List[str] = []
+        prompt: list[str] = []
+        generated_text: list[str] = []
        for output in outputs:
            prompt.append(output.prompt)
            generated_text.append(' '.join([o.text for o in output.outputs]))
@@ -72,7 +72,7 @@ def scheduling_strategy_fn():
        pg, placement_group_capture_child_tasks=True))


-resources_kwarg: Dict[str, Any] = {}
+resources_kwarg: dict[str, Any] = {}
 if tensor_parallel_size == 1:
    # For tensor_parallel_size == 1, we simply set num_gpus=1.
    resources_kwarg["num_gpus"] = 1

--- a/examples/offline_inference/llm_engine_example.py
+++ b/examples/offline_inference/llm_engine_example.py
 # SPDX-License-Identifier: Apache-2.0

 import argparse
-from typing import List, Tuple

 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
 from vllm.utils import FlexibleArgumentParser


-def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
+def create_test_prompts() -> list[tuple[str, SamplingParams]]:
    """Create a list of test prompts with their sampling parameters."""
    return [
        ("A robot may not injure a human being",
@@ -24,7 +23,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:


 def process_requests(engine: LLMEngine,
-                     test_prompts: List[Tuple[str, SamplingParams]]):
+                     test_prompts: list[tuple[str, SamplingParams]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

@@ -34,7 +33,7 @@ def process_requests(engine: LLMEngine,
            engine.add_request(str(request_id), prompt, sampling_params)
            request_id += 1

-        request_outputs: List[RequestOutput] = engine.step()
+        request_outputs: list[RequestOutput] = engine.step()

        for request_output in request_outputs:
            if request_output.finished:

--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -7,7 +7,7 @@ Requires HuggingFace credentials for access.
 """

 import gc
-from typing import List, Optional, Tuple
+from typing import Optional

 import torch
 from huggingface_hub import snapshot_download
@@ -18,7 +18,7 @@ from vllm.lora.request import LoRARequest

 def create_test_prompts(
        lora_path: str
-) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
    return [
        # this is an example of using quantization without LoRA
        ("My name is",
@@ -49,7 +49,7 @@ def create_test_prompts(


 def process_requests(engine: LLMEngine,
-                     test_prompts: List[Tuple[str, SamplingParams,
+                     test_prompts: list[tuple[str, SamplingParams,
                                              Optional[LoRARequest]]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0
@@ -63,7 +63,7 @@ def process_requests(engine: LLMEngine,
                               lora_request=lora_request)
            request_id += 1

-        request_outputs: List[RequestOutput] = engine.step()
+        request_outputs: list[RequestOutput] = engine.step()
        for request_output in request_outputs:
            if request_output.finished:
                print("----------------------------------------------------")

--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
@@ -2,12 +2,11 @@

 import gc
 import time
-from typing import List

 from vllm import LLM, SamplingParams


-def time_generation(llm: LLM, prompts: List[str],
+def time_generation(llm: LLM, prompts: list[str],
                    sampling_params: SamplingParams):
    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.

--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -6,7 +6,7 @@ for offline inference.
 Requires HuggingFace credentials for access to Llama2.
 """

-from typing import List, Optional, Tuple
+from typing import Optional

 from huggingface_hub import snapshot_download

@@ -16,7 +16,7 @@ from vllm.lora.request import LoRARequest

 def create_test_prompts(
        lora_path: str
-) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
    """Create a list of test prompts with their sampling parameters.

    2 requests for base model, 4 requests for the LoRA. We define 2
@@ -56,7 +56,7 @@ def create_test_prompts(


 def process_requests(engine: LLMEngine,
-                     test_prompts: List[Tuple[str, SamplingParams,
+                     test_prompts: list[tuple[str, SamplingParams,
                                              Optional[LoRARequest]]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0
@@ -70,7 +70,7 @@ def process_requests(engine: LLMEngine,
                               lora_request=lora_request)
            request_id += 1

-        request_outputs: List[RequestOutput] = engine.step()
+        request_outputs: list[RequestOutput] = engine.step()

        for request_output in request_outputs:
            if request_output.finished:

--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -21,7 +21,7 @@ import argparse
 import datetime
 import os
 import re
-from typing import List, Union
+from typing import Union

 import albumentations
 import numpy as np
@@ -260,9 +260,9 @@ def _convert_np_uint8(float_image: torch.Tensor):


 def load_example(
-    file_paths: List[str],
-    mean: List[float] = None,
-    std: List[float] = None,
+    file_paths: list[str],
+    mean: list[float] = None,
+    std: list[float] = None,
    indices: Union[list[int], None] = None,
 ):
    """Build an input example by loading images in *file_paths*.

--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
@@ -5,8 +5,9 @@ import json
 import os
 import sys
 from argparse import RawTextHelpFormatter
+from collections.abc import Generator
 from dataclasses import asdict, dataclass
-from typing import Any, Dict, Generator, List, Optional, TypeAlias
+from typing import Any, Optional, TypeAlias

 import torch
 import tqdm
@@ -42,8 +43,8 @@ def get_dtype(dtype: str):
        return dtype


-OutputLen_NumReqs_Map: TypeAlias = Dict[int, int]
-def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
+OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
+def compute_request_output_lengths(batch_size: int, step_requests: list[int]) \
      -> OutputLen_NumReqs_Map:
    """
    Given the number of requests, batch_size, and the number of requests
@@ -63,7 +64,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
    Args:
        batch_size (int): Number of requests submitted for profile. This is
            args.batch_size.
-        step_requests (List[int]): step_requests[i] is the number of requests
+        step_requests (list[int]): step_requests[i] is the number of requests
            that the ith engine step should process.

    Returns:
@@ -114,7 +115,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
    return ol_nr


-def determine_requests_per_step(context: ProfileContext) -> List[int]:
+def determine_requests_per_step(context: ProfileContext) -> list[int]:
    """
    Determine number of requests each engine step should process.
    If context.num_steps is set, then all engine steps process the
@@ -130,7 +131,7 @@ def determine_requests_per_step(context: ProfileContext) -> List[int]:
        context: ProfileContext object.

    Returns:
-        List[int]: Number of requests to process for all engine-steps. 
+        list[int]: Number of requests to process for all engine-steps. 
         output[i], contains the number of requests that the ith step
         should process.
    """
@@ -170,7 +171,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
    for key, value in asdict(context).items():
        print(f"  {key} = {value}")

-    requests_per_step: List[int] = determine_requests_per_step(context)
+    requests_per_step: list[int] = determine_requests_per_step(context)

    ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
        context.batch_size, requests_per_step)

--- a/examples/offline_inference/profiling_tpu/profiling.py
+++ b/examples/offline_inference/profiling_tpu/profiling.py
@@ -4,7 +4,6 @@ import argparse
 import dataclasses
 import os
 import time
-from typing import List

 import numpy as np
 import torch_xla.debug.profiler as xp
@@ -35,7 +34,7 @@ def main(args: argparse.Namespace):
    dummy_prompt_token_ids = np.random.randint(10000,
                                               size=(args.batch_size,
                                                     args.input_len))
-    dummy_prompts: List[PromptType] = [{
+    dummy_prompts: list[PromptType] = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]


--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -5,7 +5,7 @@ multi-image input on vision language models for text generation,
 using the chat template defined by the model.
 """
 from argparse import Namespace
-from typing import List, NamedTuple, Optional
+from typing import NamedTuple, Optional

 from PIL.Image import Image
 from transformers import AutoProcessor, AutoTokenizer
@@ -24,8 +24,8 @@ IMAGE_URLS = [
 class ModelRequestData(NamedTuple):
    llm: LLM
    prompt: str
-    stop_token_ids: Optional[List[int]]
-    image_data: List[Image]
+    stop_token_ids: Optional[list[int]]
+    image_data: list[Image]
    chat_template: Optional[str]


@@ -34,7 +34,7 @@ class ModelRequestData(NamedTuple):
 # Unless specified, these settings have been tested to work on a single L4.


-def load_aria(question, image_urls: List[str]) -> ModelRequestData:
+def load_aria(question, image_urls: list[str]) -> ModelRequestData:
    model_name = "rhymes-ai/Aria"
    llm = LLM(model=model_name,
              tokenizer_mode="slow",
@@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
    )


-def load_deepseek_vl2(question: str, image_urls: List[str]):
+def load_deepseek_vl2(question: str, image_urls: list[str]):
    model_name = "deepseek-ai/deepseek-vl2-tiny"

    llm = LLM(model=model_name,
@@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]):
    )


-def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
+def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "h2oai/h2ovl-mississippi-800m"

    llm = LLM(
@@ -111,7 +111,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
    )


-def load_idefics3(question, image_urls: List[str]) -> ModelRequestData:
+def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    # The configuration below has been confirmed to launch on a single L40 GPU.
@@ -142,7 +142,7 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData:
    )


-def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
+def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "OpenGVLab/InternVL2-2B"

    llm = LLM(
@@ -179,7 +179,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
    )


-def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
+def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
@@ -201,7 +201,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
    )


-def load_nvlm_d(question: str, image_urls: List[str]):
+def load_nvlm_d(question: str, image_urls: list[str]):
    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
@@ -234,7 +234,7 @@ def load_nvlm_d(question: str, image_urls: List[str]):
    )


-def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData:
+def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "mistral-community/pixtral-12b"

    # Adjust this as necessary to fit in GPU
@@ -259,7 +259,7 @@ def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData:
    )


-def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
+def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
@@ -295,7 +295,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:


 def load_qwen_vl_chat(question: str,
-                      image_urls: List[str]) -> ModelRequestData:
+                      image_urls: list[str]) -> ModelRequestData:
    model_name = "Qwen/Qwen-VL-Chat"
    llm = LLM(
        model=model_name,
@@ -336,7 +336,7 @@ def load_qwen_vl_chat(question: str,
    )


-def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
+def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
@@ -393,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
    )


-def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
+def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
@@ -466,7 +466,7 @@ model_example_map = {
 }


-def run_generate(model, question: str, image_urls: List[str]):
+def run_generate(model, question: str, image_urls: list[str]):
    req_data = model_example_map[model](question, image_urls)

    sampling_params = SamplingParams(temperature=0.0,
@@ -487,7 +487,7 @@ def run_generate(model, question: str, image_urls: List[str]):
        print(generated_text)


-def run_chat(model: str, question: str, image_urls: List[str]):
+def run_chat(model: str, question: str, image_urls: list[str]):
    req_data = model_example_map[model](question, image_urls)

    sampling_params = SamplingParams(temperature=0.0,

--- a/examples/online_serving/api_client.py
+++ b/examples/online_serving/api_client.py
@@ -7,7 +7,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API.

 import argparse
 import json
-from typing import Iterable, List
+from collections.abc import Iterable

 import requests

@@ -39,7 +39,7 @@ def post_http_request(prompt: str,
    return response


-def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
+def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
    for chunk in response.iter_lines(chunk_size=8192,
                                     decode_unicode=False,
                                     delimiter=b"\0"):
@@ -49,7 +49,7 @@ def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
            yield output


-def get_response(response: requests.Response) -> List[str]:
+def get_response(response: requests.Response) -> list[str]:
    data = json.loads(response.content)
    output = data["text"]
    return output

--- a/examples/online_serving/openai_embedding_client.py
+++ b/examples/online_serving/openai_embedding_client.py
@@ -24,4 +24,4 @@ responses = client.embeddings.create(
 )

 for data in responses.data:
-    print(data.embedding)  # list of float of len 4096
+    print(data.embedding)  # List of float of len 4096
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,32 @@ exclude = [
 [tool.ruff.lint.per-file-ignores]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
+# Python 3.8 typing. TODO: Remove these excludes after v1.0.0
+"vllm/adapter_commons/**/*.py" = ["UP006", "UP035"]
+"vllm/attention/**/*.py" = ["UP006", "UP035"]
+"vllm/compilation/**/*.py" = ["UP006", "UP035"]
+"vllm/core/**/*.py" = ["UP006", "UP035"]
+"vllm/device_allocator/**/*.py" = ["UP006", "UP035"]
+"vllm/distributed/**/*.py" = ["UP006", "UP035"]
+"vllm/engine/**/*.py" = ["UP006", "UP035"]
+"vllm/executor/**/*.py" = ["UP006", "UP035"]
+"vllm/inputs/**/*.py" = ["UP006", "UP035"]
+"vllm/logging_utils/**/*.py" = ["UP006", "UP035"]
+"vllm/lora/**/*.py" = ["UP006", "UP035"]
+"vllm/model_executor/**/*.py" = ["UP006", "UP035"]
+"vllm/multimodal/**/*.py" = ["UP006", "UP035"]
+"vllm/platforms/**/*.py" = ["UP006", "UP035"]
+"vllm/plugins/**/*.py" = ["UP006", "UP035"]
+"vllm/profiler/**/*.py" = ["UP006", "UP035"]
+"vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"]
+"vllm/spec_decode/**/*.py" = ["UP006", "UP035"]
+"vllm/third_party/**/*.py" = ["UP006", "UP035"]
+"vllm/transformers_utils/**/*.py" = ["UP006", "UP035"]
+"vllm/triton_utils/**/*.py" = ["UP006", "UP035"]
+"vllm/usage/**/*.py" = ["UP006", "UP035"]
+"vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"]
+"vllm/assets/**/*.py" = ["UP006", "UP035"]
+"vllm/worker/**/*.py" = ["UP006", "UP035"]

 [tool.ruff.lint]
 select = [
@@ -91,8 +117,6 @@ ignore = [
    "B007",
    # f-string format
    "UP032",
-    # Python 3.8 typing
-    "UP006", "UP035",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
 ]