Unverified commit cf069aa8, authored by Harry Mellor and committed by GitHub
Browse files

Update deprecated Python 3.8 typing (#13971)

parent bf33700e
...@@ -4,7 +4,6 @@ import math ...@@ -4,7 +4,6 @@ import math
import pickle import pickle
import re import re
from collections import defaultdict from collections import defaultdict
from typing import List
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
...@@ -23,7 +22,7 @@ if __name__ == "__main__": ...@@ -23,7 +22,7 @@ if __name__ == "__main__":
with open(args.filename, 'rb') as f: with open(args.filename, 'rb') as f:
data = pickle.load(f) data = pickle.load(f)
raw_results: List[TMeasurement] = data["results"] raw_results: list[TMeasurement] = data["results"]
results = defaultdict(lambda: list()) results = defaultdict(lambda: list())
for v in raw_results: for v in raw_results:
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import dataclasses import dataclasses
from typing import Any, Callable, Iterable, Optional from collections.abc import Iterable
from typing import Any, Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import enum import enum
from typing import Dict, Union from typing import Union
from cutlass_library import * from cutlass_library import *
...@@ -21,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum): ...@@ -21,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
TmaWarpSpecializedCooperative = enum_auto() TmaWarpSpecializedCooperative = enum_auto()
VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = { VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
**DataTypeNames, # type: ignore **DataTypeNames, # type: ignore
**{ **{
VLLMDataType.u4b8: "u4b8", VLLMDataType.u4b8: "u4b8",
...@@ -29,7 +29,7 @@ VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = { ...@@ -29,7 +29,7 @@ VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = {
} }
} }
VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
**DataTypeTag, # type: ignore **DataTypeTag, # type: ignore
**{ **{
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
...@@ -37,7 +37,7 @@ VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { ...@@ -37,7 +37,7 @@ VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
} }
} }
VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = { VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
**DataTypeSize, # type: ignore **DataTypeSize, # type: ignore
**{ **{
VLLMDataType.u4b8: 4, VLLMDataType.u4b8: 4,
...@@ -45,7 +45,7 @@ VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = { ...@@ -45,7 +45,7 @@ VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = {
} }
} }
VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = { VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataType.u4b8: "vllm::kU4B8", VLLMDataType.u4b8: "vllm::kU4B8",
VLLMDataType.u8b128: "vllm::kU8B128", VLLMDataType.u8b128: "vllm::kU8B128",
DataType.u4: "vllm::kU4", DataType.u4: "vllm::kU4",
...@@ -56,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = { ...@@ -56,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
DataType.bf16: "vllm::kBfloat16", DataType.bf16: "vllm::kBfloat16",
} }
VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
DataType.u8: "at::ScalarType::Byte", DataType.u8: "at::ScalarType::Byte",
DataType.s8: "at::ScalarType::Char", DataType.s8: "at::ScalarType::Char",
DataType.e4m3: "at::ScalarType::Float8_e4m3fn", DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
...@@ -66,7 +66,7 @@ VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { ...@@ -66,7 +66,7 @@ VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
DataType.f32: "at::ScalarType::Float", DataType.f32: "at::ScalarType::Float",
} }
VLLMKernelScheduleTag: Dict[Union[ VLLMKernelScheduleTag: dict[Union[
MixedInputKernelScheduleType, KernelScheduleType], str] = { MixedInputKernelScheduleType, KernelScheduleType], str] = {
**KernelScheduleTag, # type: ignore **KernelScheduleTag, # type: ignore
**{ **{
......
...@@ -8,7 +8,7 @@ from collections.abc import Iterable ...@@ -8,7 +8,7 @@ from collections.abc import Iterable
from copy import deepcopy from copy import deepcopy
from dataclasses import dataclass, fields from dataclasses import dataclass, fields
from functools import reduce from functools import reduce
from typing import Dict, List, Optional, Tuple, Union from typing import Optional, Union
import jinja2 import jinja2
# yapf conflicts with isort for this block # yapf conflicts with isort for this block
...@@ -247,8 +247,8 @@ TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative ...@@ -247,8 +247,8 @@ TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative
@dataclass(frozen=True) @dataclass(frozen=True)
class ScheduleConfig: class ScheduleConfig:
tile_shape_mn: Tuple[int, int] tile_shape_mn: tuple[int, int]
cluster_shape_mnk: Tuple[int, int, int] cluster_shape_mnk: tuple[int, int, int]
kernel_schedule: MixedInputKernelScheduleType kernel_schedule: MixedInputKernelScheduleType
epilogue_schedule: EpilogueScheduleType epilogue_schedule: EpilogueScheduleType
tile_scheduler: TileSchedulerType tile_scheduler: TileSchedulerType
...@@ -277,8 +277,8 @@ class PrepackTypeConfig: ...@@ -277,8 +277,8 @@ class PrepackTypeConfig:
@dataclass @dataclass
class ImplConfig: class ImplConfig:
types: TypeConfig types: TypeConfig
schedules: List[ScheduleConfig] schedules: list[ScheduleConfig]
heuristic: List[Tuple[Optional[str], ScheduleConfig]] heuristic: list[tuple[Optional[str], ScheduleConfig]]
def generate_sch_sig(schedule_config: ScheduleConfig) -> str: def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
...@@ -333,7 +333,7 @@ def is_power_of_two(n): ...@@ -333,7 +333,7 @@ def is_power_of_two(n):
return (n != 0) and (n & (n - 1) == 0) return (n != 0) and (n & (n - 1) == 0)
def to_cute_constant(value: List[int]): def to_cute_constant(value: list[int]):
def _to_cute_constant(value: int): def _to_cute_constant(value: int):
if is_power_of_two(value): if is_power_of_two(value):
...@@ -347,7 +347,7 @@ def to_cute_constant(value: List[int]): ...@@ -347,7 +347,7 @@ def to_cute_constant(value: List[int]):
return _to_cute_constant(value) return _to_cute_constant(value)
def unique_schedules(impl_configs: List[ImplConfig]): def unique_schedules(impl_configs: list[ImplConfig]):
return list( return list(
set(sch for impl_config in impl_configs set(sch for impl_config in impl_configs
for sch in impl_config.schedules)) for sch in impl_config.schedules))
...@@ -391,7 +391,7 @@ mm_impl_template = create_template(IMPL_TEMPLATE) ...@@ -391,7 +391,7 @@ mm_impl_template = create_template(IMPL_TEMPLATE)
prepack_dispatch_template = create_template(PREPACK_TEMPLATE) prepack_dispatch_template = create_template(PREPACK_TEMPLATE)
def create_sources(impl_configs: List[ImplConfig], num_impl_files=8): def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
sources = [] sources = []
sources.append(( sources.append((
...@@ -435,7 +435,7 @@ def create_sources(impl_configs: List[ImplConfig], num_impl_files=8): ...@@ -435,7 +435,7 @@ def create_sources(impl_configs: List[ImplConfig], num_impl_files=8):
num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0) num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0)
num_impls_per_file = math.ceil(num_impls / num_impl_files) num_impls_per_file = math.ceil(num_impls / num_impl_files)
files_impls: List[List[ImplConfig]] = [[]] files_impls: list[list[ImplConfig]] = [[]]
curr_num_impls_assigned = 0 curr_num_impls_assigned = 0
curr_impl_in_file = 0 curr_impl_in_file = 0
...@@ -515,7 +515,7 @@ def generate(): ...@@ -515,7 +515,7 @@ def generate():
for cond, tile_config in default_tile_heuristic_config.items() for cond, tile_config in default_tile_heuristic_config.items()
] ]
def get_unique_schedules(heuristic: Dict[str, ScheduleConfig]): def get_unique_schedules(heuristic: dict[str, ScheduleConfig]):
# Do not use schedules = list(set(...)) because we need to make sure # Do not use schedules = list(set(...)) because we need to make sure
# the output list is deterministic; otherwise the generated kernel file # the output list is deterministic; otherwise the generated kernel file
# will be non-deterministic and causes ccache miss. # will be non-deterministic and causes ccache miss.
......
...@@ -17,7 +17,6 @@ import inspect ...@@ -17,7 +17,6 @@ import inspect
import logging import logging
import os import os
import sys import sys
from typing import List
import requests import requests
from sphinx.ext import autodoc from sphinx.ext import autodoc
...@@ -58,7 +57,7 @@ templates_path = ['_templates'] ...@@ -58,7 +57,7 @@ templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and # List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files. # directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path. # This pattern also affects html_static_path and html_extra_path.
exclude_patterns: List[str] = ["**/*.template.md", "**/*.inc.md"] exclude_patterns: list[str] = ["**/*.template.md", "**/*.inc.md"]
# Exclude the prompt "$" when copying code # Exclude the prompt "$" when copying code
copybutton_prompt_text = r"\$ " copybutton_prompt_text = r"\$ "
......
...@@ -123,7 +123,7 @@ class ExampleParser(ReasoningParser): ...@@ -123,7 +123,7 @@ class ExampleParser(ReasoningParser):
def extract_reasoning_content( def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest self, model_output: str, request: ChatCompletionRequest
) -> Tuple[Optional[str], Optional[str]]: ) -> tuple[Optional[str], Optional[str]]:
""" """
Extract reasoning content from a complete model-generated string. Extract reasoning content from a complete model-generated string.
...@@ -138,7 +138,7 @@ class ExampleParser(ReasoningParser): ...@@ -138,7 +138,7 @@ class ExampleParser(ReasoningParser):
The request object that was used to generate the model_output. The request object that was used to generate the model_output.
Returns: Returns:
Tuple[Optional[str], Optional[str]] tuple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content. A tuple containing the reasoning content and the content.
""" """
``` ```
......
...@@ -193,7 +193,7 @@ class Step(BaseModel): ...@@ -193,7 +193,7 @@ class Step(BaseModel):
class MathResponse(BaseModel): class MathResponse(BaseModel):
steps: List[Step] steps: list[Step]
final_answer: str final_answer: str
......
...@@ -74,7 +74,7 @@ class Example: ...@@ -74,7 +74,7 @@ class Example:
path (Path): The path to the main directory or file. path (Path): The path to the main directory or file.
category (str): The category of the document. category (str): The category of the document.
main_file (Path): The main file in the directory. main_file (Path): The main file in the directory.
other_files (list[Path]): List of other files in the directory. other_files (list[Path]): list of other files in the directory.
title (str): The title of the document. title (str): The title of the document.
Methods: Methods:
......
...@@ -6,7 +6,7 @@ distributively on a multi-nodes cluster. ...@@ -6,7 +6,7 @@ distributively on a multi-nodes cluster.
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
""" """
from typing import Any, Dict, List from typing import Any
import numpy as np import numpy as np
import ray import ray
...@@ -36,13 +36,13 @@ class LLMPredictor: ...@@ -36,13 +36,13 @@ class LLMPredictor:
self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
tensor_parallel_size=tensor_parallel_size) tensor_parallel_size=tensor_parallel_size)
def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]: def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, list]:
# Generate texts from the prompts. # Generate texts from the prompts.
# The output is a list of RequestOutput objects that contain the prompt, # The output is a list of RequestOutput objects that contain the prompt,
# generated text, and other information. # generated text, and other information.
outputs = self.llm.generate(batch["text"], sampling_params) outputs = self.llm.generate(batch["text"], sampling_params)
prompt: List[str] = [] prompt: list[str] = []
generated_text: List[str] = [] generated_text: list[str] = []
for output in outputs: for output in outputs:
prompt.append(output.prompt) prompt.append(output.prompt)
generated_text.append(' '.join([o.text for o in output.outputs])) generated_text.append(' '.join([o.text for o in output.outputs]))
...@@ -72,7 +72,7 @@ def scheduling_strategy_fn(): ...@@ -72,7 +72,7 @@ def scheduling_strategy_fn():
pg, placement_group_capture_child_tasks=True)) pg, placement_group_capture_child_tasks=True))
resources_kwarg: Dict[str, Any] = {} resources_kwarg: dict[str, Any] = {}
if tensor_parallel_size == 1: if tensor_parallel_size == 1:
# For tensor_parallel_size == 1, we simply set num_gpus=1. # For tensor_parallel_size == 1, we simply set num_gpus=1.
resources_kwarg["num_gpus"] = 1 resources_kwarg["num_gpus"] = 1
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import argparse import argparse
from typing import List, Tuple
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
def create_test_prompts() -> List[Tuple[str, SamplingParams]]: def create_test_prompts() -> list[tuple[str, SamplingParams]]:
"""Create a list of test prompts with their sampling parameters.""" """Create a list of test prompts with their sampling parameters."""
return [ return [
("A robot may not injure a human being", ("A robot may not injure a human being",
...@@ -24,7 +23,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]: ...@@ -24,7 +23,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
def process_requests(engine: LLMEngine, def process_requests(engine: LLMEngine,
test_prompts: List[Tuple[str, SamplingParams]]): test_prompts: list[tuple[str, SamplingParams]]):
"""Continuously process a list of prompts and handle the outputs.""" """Continuously process a list of prompts and handle the outputs."""
request_id = 0 request_id = 0
...@@ -34,7 +33,7 @@ def process_requests(engine: LLMEngine, ...@@ -34,7 +33,7 @@ def process_requests(engine: LLMEngine,
engine.add_request(str(request_id), prompt, sampling_params) engine.add_request(str(request_id), prompt, sampling_params)
request_id += 1 request_id += 1
request_outputs: List[RequestOutput] = engine.step() request_outputs: list[RequestOutput] = engine.step()
for request_output in request_outputs: for request_output in request_outputs:
if request_output.finished: if request_output.finished:
......
...@@ -7,7 +7,7 @@ Requires HuggingFace credentials for access. ...@@ -7,7 +7,7 @@ Requires HuggingFace credentials for access.
""" """
import gc import gc
from typing import List, Optional, Tuple from typing import Optional
import torch import torch
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
...@@ -18,7 +18,7 @@ from vllm.lora.request import LoRARequest ...@@ -18,7 +18,7 @@ from vllm.lora.request import LoRARequest
def create_test_prompts( def create_test_prompts(
lora_path: str lora_path: str
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: ) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
return [ return [
# this is an example of using quantization without LoRA # this is an example of using quantization without LoRA
("My name is", ("My name is",
...@@ -49,7 +49,7 @@ def create_test_prompts( ...@@ -49,7 +49,7 @@ def create_test_prompts(
def process_requests(engine: LLMEngine, def process_requests(engine: LLMEngine,
test_prompts: List[Tuple[str, SamplingParams, test_prompts: list[tuple[str, SamplingParams,
Optional[LoRARequest]]]): Optional[LoRARequest]]]):
"""Continuously process a list of prompts and handle the outputs.""" """Continuously process a list of prompts and handle the outputs."""
request_id = 0 request_id = 0
...@@ -63,7 +63,7 @@ def process_requests(engine: LLMEngine, ...@@ -63,7 +63,7 @@ def process_requests(engine: LLMEngine,
lora_request=lora_request) lora_request=lora_request)
request_id += 1 request_id += 1
request_outputs: List[RequestOutput] = engine.step() request_outputs: list[RequestOutput] = engine.step()
for request_output in request_outputs: for request_output in request_outputs:
if request_output.finished: if request_output.finished:
print("----------------------------------------------------") print("----------------------------------------------------")
......
...@@ -2,12 +2,11 @@ ...@@ -2,12 +2,11 @@
import gc import gc
import time import time
from typing import List
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
def time_generation(llm: LLM, prompts: List[str], def time_generation(llm: LLM, prompts: list[str],
sampling_params: SamplingParams): sampling_params: SamplingParams):
# Generate texts from the prompts. The output is a list of RequestOutput # Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information. # objects that contain the prompt, generated text, and other information.
......
...@@ -6,7 +6,7 @@ for offline inference. ...@@ -6,7 +6,7 @@ for offline inference.
Requires HuggingFace credentials for access to Llama2. Requires HuggingFace credentials for access to Llama2.
""" """
from typing import List, Optional, Tuple from typing import Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
...@@ -16,7 +16,7 @@ from vllm.lora.request import LoRARequest ...@@ -16,7 +16,7 @@ from vllm.lora.request import LoRARequest
def create_test_prompts( def create_test_prompts(
lora_path: str lora_path: str
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: ) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
"""Create a list of test prompts with their sampling parameters. """Create a list of test prompts with their sampling parameters.
2 requests for base model, 4 requests for the LoRA. We define 2 2 requests for base model, 4 requests for the LoRA. We define 2
...@@ -56,7 +56,7 @@ def create_test_prompts( ...@@ -56,7 +56,7 @@ def create_test_prompts(
def process_requests(engine: LLMEngine, def process_requests(engine: LLMEngine,
test_prompts: List[Tuple[str, SamplingParams, test_prompts: list[tuple[str, SamplingParams,
Optional[LoRARequest]]]): Optional[LoRARequest]]]):
"""Continuously process a list of prompts and handle the outputs.""" """Continuously process a list of prompts and handle the outputs."""
request_id = 0 request_id = 0
...@@ -70,7 +70,7 @@ def process_requests(engine: LLMEngine, ...@@ -70,7 +70,7 @@ def process_requests(engine: LLMEngine,
lora_request=lora_request) lora_request=lora_request)
request_id += 1 request_id += 1
request_outputs: List[RequestOutput] = engine.step() request_outputs: list[RequestOutput] = engine.step()
for request_output in request_outputs: for request_output in request_outputs:
if request_output.finished: if request_output.finished:
......
...@@ -21,7 +21,7 @@ import argparse ...@@ -21,7 +21,7 @@ import argparse
import datetime import datetime
import os import os
import re import re
from typing import List, Union from typing import Union
import albumentations import albumentations
import numpy as np import numpy as np
...@@ -260,9 +260,9 @@ def _convert_np_uint8(float_image: torch.Tensor): ...@@ -260,9 +260,9 @@ def _convert_np_uint8(float_image: torch.Tensor):
def load_example( def load_example(
file_paths: List[str], file_paths: list[str],
mean: List[float] = None, mean: list[float] = None,
std: List[float] = None, std: list[float] = None,
indices: Union[list[int], None] = None, indices: Union[list[int], None] = None,
): ):
"""Build an input example by loading images in *file_paths*. """Build an input example by loading images in *file_paths*.
......
...@@ -5,8 +5,9 @@ import json ...@@ -5,8 +5,9 @@ import json
import os import os
import sys import sys
from argparse import RawTextHelpFormatter from argparse import RawTextHelpFormatter
from collections.abc import Generator
from dataclasses import asdict, dataclass from dataclasses import asdict, dataclass
from typing import Any, Dict, Generator, List, Optional, TypeAlias from typing import Any, Optional, TypeAlias
import torch import torch
import tqdm import tqdm
...@@ -42,8 +43,8 @@ def get_dtype(dtype: str): ...@@ -42,8 +43,8 @@ def get_dtype(dtype: str):
return dtype return dtype
OutputLen_NumReqs_Map: TypeAlias = Dict[int, int] OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \ def compute_request_output_lengths(batch_size: int, step_requests: list[int]) \
-> OutputLen_NumReqs_Map: -> OutputLen_NumReqs_Map:
""" """
Given the number of requests, batch_size, and the number of requests Given the number of requests, batch_size, and the number of requests
...@@ -63,7 +64,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \ ...@@ -63,7 +64,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
Args: Args:
batch_size (int): Number of requests submitted for profile. This is batch_size (int): Number of requests submitted for profile. This is
args.batch_size. args.batch_size.
step_requests (List[int]): step_requests[i] is the number of requests step_requests (list[int]): step_requests[i] is the number of requests
that the ith engine step should process. that the ith engine step should process.
Returns: Returns:
...@@ -114,7 +115,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \ ...@@ -114,7 +115,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
return ol_nr return ol_nr
def determine_requests_per_step(context: ProfileContext) -> List[int]: def determine_requests_per_step(context: ProfileContext) -> list[int]:
""" """
Determine number of requests each engine step should process. Determine number of requests each engine step should process.
If context.num_steps is set, then all engine steps process the If context.num_steps is set, then all engine steps process the
...@@ -130,7 +131,7 @@ def determine_requests_per_step(context: ProfileContext) -> List[int]: ...@@ -130,7 +131,7 @@ def determine_requests_per_step(context: ProfileContext) -> List[int]:
context: ProfileContext object. context: ProfileContext object.
Returns: Returns:
List[int]: Number of requests to process for all engine-steps. list[int]: Number of requests to process for all engine-steps.
output[i], contains the number of requests that the ith step output[i], contains the number of requests that the ith step
should process. should process.
""" """
...@@ -170,7 +171,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], ...@@ -170,7 +171,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
for key, value in asdict(context).items(): for key, value in asdict(context).items():
print(f" {key} = {value}") print(f" {key} = {value}")
requests_per_step: List[int] = determine_requests_per_step(context) requests_per_step: list[int] = determine_requests_per_step(context)
ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths( ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
context.batch_size, requests_per_step) context.batch_size, requests_per_step)
......
...@@ -4,7 +4,6 @@ import argparse ...@@ -4,7 +4,6 @@ import argparse
import dataclasses import dataclasses
import os import os
import time import time
from typing import List
import numpy as np import numpy as np
import torch_xla.debug.profiler as xp import torch_xla.debug.profiler as xp
...@@ -35,7 +34,7 @@ def main(args: argparse.Namespace): ...@@ -35,7 +34,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000, dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size, size=(args.batch_size,
args.input_len)) args.input_len))
dummy_prompts: List[PromptType] = [{ dummy_prompts: list[PromptType] = [{
"prompt_token_ids": batch "prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()] } for batch in dummy_prompt_token_ids.tolist()]
......
...@@ -5,7 +5,7 @@ multi-image input on vision language models for text generation, ...@@ -5,7 +5,7 @@ multi-image input on vision language models for text generation,
using the chat template defined by the model. using the chat template defined by the model.
""" """
from argparse import Namespace from argparse import Namespace
from typing import List, NamedTuple, Optional from typing import NamedTuple, Optional
from PIL.Image import Image from PIL.Image import Image
from transformers import AutoProcessor, AutoTokenizer from transformers import AutoProcessor, AutoTokenizer
...@@ -24,8 +24,8 @@ IMAGE_URLS = [ ...@@ -24,8 +24,8 @@ IMAGE_URLS = [
class ModelRequestData(NamedTuple): class ModelRequestData(NamedTuple):
llm: LLM llm: LLM
prompt: str prompt: str
stop_token_ids: Optional[List[int]] stop_token_ids: Optional[list[int]]
image_data: List[Image] image_data: list[Image]
chat_template: Optional[str] chat_template: Optional[str]
...@@ -34,7 +34,7 @@ class ModelRequestData(NamedTuple): ...@@ -34,7 +34,7 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4. # Unless specified, these settings have been tested to work on a single L4.
def load_aria(question, image_urls: List[str]) -> ModelRequestData: def load_aria(question, image_urls: list[str]) -> ModelRequestData:
model_name = "rhymes-ai/Aria" model_name = "rhymes-ai/Aria"
llm = LLM(model=model_name, llm = LLM(model=model_name,
tokenizer_mode="slow", tokenizer_mode="slow",
...@@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData: ...@@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
) )
def load_deepseek_vl2(question: str, image_urls: List[str]): def load_deepseek_vl2(question: str, image_urls: list[str]):
model_name = "deepseek-ai/deepseek-vl2-tiny" model_name = "deepseek-ai/deepseek-vl2-tiny"
llm = LLM(model=model_name, llm = LLM(model=model_name,
...@@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]): ...@@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]):
) )
def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData: def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "h2oai/h2ovl-mississippi-800m" model_name = "h2oai/h2ovl-mississippi-800m"
llm = LLM( llm = LLM(
...@@ -111,7 +111,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -111,7 +111,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
) )
def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
model_name = "HuggingFaceM4/Idefics3-8B-Llama3" model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
...@@ -142,7 +142,7 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: ...@@ -142,7 +142,7 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData:
) )
def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B" model_name = "OpenGVLab/InternVL2-2B"
llm = LLM( llm = LLM(
...@@ -179,7 +179,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -179,7 +179,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
) )
def load_mllama(question, image_urls: List[str]) -> ModelRequestData: def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
...@@ -201,7 +201,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: ...@@ -201,7 +201,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
) )
def load_nvlm_d(question: str, image_urls: List[str]): def load_nvlm_d(question: str, image_urls: list[str]):
model_name = "nvidia/NVLM-D-72B" model_name = "nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
...@@ -234,7 +234,7 @@ def load_nvlm_d(question: str, image_urls: List[str]): ...@@ -234,7 +234,7 @@ def load_nvlm_d(question: str, image_urls: List[str]):
) )
def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData: def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistral-community/pixtral-12b" model_name = "mistral-community/pixtral-12b"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
...@@ -259,7 +259,7 @@ def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -259,7 +259,7 @@ def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData:
) )
def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
# num_crops is an override kwarg to the multimodal image processor; # num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# to use 16 for single frame scenarios, and 4 for multi-frame. # to use 16 for single frame scenarios, and 4 for multi-frame.
...@@ -295,7 +295,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -295,7 +295,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
def load_qwen_vl_chat(question: str, def load_qwen_vl_chat(question: str,
image_urls: List[str]) -> ModelRequestData: image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat" model_name = "Qwen/Qwen-VL-Chat"
llm = LLM( llm = LLM(
model=model_name, model=model_name,
...@@ -336,7 +336,7 @@ def load_qwen_vl_chat(question: str, ...@@ -336,7 +336,7 @@ def load_qwen_vl_chat(question: str,
) )
def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
try: try:
from qwen_vl_utils import process_vision_info from qwen_vl_utils import process_vision_info
except ModuleNotFoundError: except ModuleNotFoundError:
...@@ -393,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ...@@ -393,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
) )
def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
try: try:
from qwen_vl_utils import process_vision_info from qwen_vl_utils import process_vision_info
except ModuleNotFoundError: except ModuleNotFoundError:
...@@ -466,7 +466,7 @@ model_example_map = { ...@@ -466,7 +466,7 @@ model_example_map = {
} }
def run_generate(model, question: str, image_urls: List[str]): def run_generate(model, question: str, image_urls: list[str]):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
...@@ -487,7 +487,7 @@ def run_generate(model, question: str, image_urls: List[str]): ...@@ -487,7 +487,7 @@ def run_generate(model, question: str, image_urls: List[str]):
print(generated_text) print(generated_text)
def run_chat(model: str, question: str, image_urls: List[str]): def run_chat(model: str, question: str, image_urls: list[str]):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
......
...@@ -7,7 +7,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API. ...@@ -7,7 +7,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API.
import argparse import argparse
import json import json
from typing import Iterable, List from collections.abc import Iterable
import requests import requests
...@@ -39,7 +39,7 @@ def post_http_request(prompt: str, ...@@ -39,7 +39,7 @@ def post_http_request(prompt: str,
return response return response
def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
for chunk in response.iter_lines(chunk_size=8192, for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False, decode_unicode=False,
delimiter=b"\0"): delimiter=b"\0"):
...@@ -49,7 +49,7 @@ def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: ...@@ -49,7 +49,7 @@ def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
yield output yield output
def get_response(response: requests.Response) -> List[str]: def get_response(response: requests.Response) -> list[str]:
data = json.loads(response.content) data = json.loads(response.content)
output = data["text"] output = data["text"]
return output return output
......
...@@ -24,4 +24,4 @@ responses = client.embeddings.create( ...@@ -24,4 +24,4 @@ responses = client.embeddings.create(
) )
for data in responses.data: for data in responses.data:
print(data.embedding) # list of float of len 4096 print(data.embedding) # List of float of len 4096
...@@ -65,6 +65,32 @@ exclude = [ ...@@ -65,6 +65,32 @@ exclude = [
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
"vllm/version.py" = ["F401"] "vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"] "vllm/_version.py" = ["ALL"]
# Python 3.8 typing. TODO: Remove these excludes after v1.0.0
"vllm/adapter_commons/**/*.py" = ["UP006", "UP035"]
"vllm/attention/**/*.py" = ["UP006", "UP035"]
"vllm/compilation/**/*.py" = ["UP006", "UP035"]
"vllm/core/**/*.py" = ["UP006", "UP035"]
"vllm/device_allocator/**/*.py" = ["UP006", "UP035"]
"vllm/distributed/**/*.py" = ["UP006", "UP035"]
"vllm/engine/**/*.py" = ["UP006", "UP035"]
"vllm/executor/**/*.py" = ["UP006", "UP035"]
"vllm/inputs/**/*.py" = ["UP006", "UP035"]
"vllm/logging_utils/**/*.py" = ["UP006", "UP035"]
"vllm/lora/**/*.py" = ["UP006", "UP035"]
"vllm/model_executor/**/*.py" = ["UP006", "UP035"]
"vllm/multimodal/**/*.py" = ["UP006", "UP035"]
"vllm/platforms/**/*.py" = ["UP006", "UP035"]
"vllm/plugins/**/*.py" = ["UP006", "UP035"]
"vllm/profiler/**/*.py" = ["UP006", "UP035"]
"vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"]
"vllm/spec_decode/**/*.py" = ["UP006", "UP035"]
"vllm/third_party/**/*.py" = ["UP006", "UP035"]
"vllm/transformers_utils/**/*.py" = ["UP006", "UP035"]
"vllm/triton_utils/**/*.py" = ["UP006", "UP035"]
"vllm/usage/**/*.py" = ["UP006", "UP035"]
"vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"]
"vllm/assets/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
[tool.ruff.lint] [tool.ruff.lint]
select = [ select = [
...@@ -91,8 +117,6 @@ ignore = [ ...@@ -91,8 +117,6 @@ ignore = [
"B007", "B007",
# f-string format # f-string format
"UP032", "UP032",
# Python 3.8 typing
"UP006", "UP035",
# Can remove once 3.10+ is the minimum Python version # Can remove once 3.10+ is the minimum Python version
"UP007", "UP007",
] ]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment