Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cf069aa8
Unverified
Commit
cf069aa8
authored
Mar 03, 2025
by
Harry Mellor
Committed by
GitHub
Mar 02, 2025
Browse files
Update deprecated Python 3.8 typing (#13971)
parent
bf33700e
Changes
300
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
102 additions
and
81 deletions
+102
-81
benchmarks/kernels/graph_machete_bench.py
benchmarks/kernels/graph_machete_bench.py
+1
-2
benchmarks/kernels/utils.py
benchmarks/kernels/utils.py
+2
-1
csrc/cutlass_extensions/vllm_cutlass_library_extension.py
csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+7
-7
csrc/quantization/machete/generate.py
csrc/quantization/machete/generate.py
+10
-10
docs/source/conf.py
docs/source/conf.py
+1
-2
docs/source/features/reasoning_outputs.md
docs/source/features/reasoning_outputs.md
+2
-2
docs/source/features/structured_outputs.md
docs/source/features/structured_outputs.md
+1
-1
docs/source/generate_examples.py
docs/source/generate_examples.py
+1
-1
examples/offline_inference/distributed.py
examples/offline_inference/distributed.py
+5
-5
examples/offline_inference/llm_engine_example.py
examples/offline_inference/llm_engine_example.py
+3
-4
examples/offline_inference/lora_with_quantization_inference.py
...les/offline_inference/lora_with_quantization_inference.py
+4
-4
examples/offline_inference/mlpspeculator.py
examples/offline_inference/mlpspeculator.py
+1
-2
examples/offline_inference/multilora_inference.py
examples/offline_inference/multilora_inference.py
+4
-4
examples/offline_inference/prithvi_geospatial_mae.py
examples/offline_inference/prithvi_geospatial_mae.py
+4
-4
examples/offline_inference/profiling.py
examples/offline_inference/profiling.py
+8
-7
examples/offline_inference/profiling_tpu/profiling.py
examples/offline_inference/profiling_tpu/profiling.py
+1
-2
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+17
-17
examples/online_serving/api_client.py
examples/online_serving/api_client.py
+3
-3
examples/online_serving/openai_embedding_client.py
examples/online_serving/openai_embedding_client.py
+1
-1
pyproject.toml
pyproject.toml
+26
-2
No files found.
benchmarks/kernels/graph_machete_bench.py
View file @
cf069aa8
...
...
@@ -4,7 +4,6 @@ import math
import
pickle
import
re
from
collections
import
defaultdict
from
typing
import
List
import
matplotlib.pyplot
as
plt
import
pandas
as
pd
...
...
@@ -23,7 +22,7 @@ if __name__ == "__main__":
with
open
(
args
.
filename
,
'rb'
)
as
f
:
data
=
pickle
.
load
(
f
)
raw_results
:
L
ist
[
TMeasurement
]
=
data
[
"results"
]
raw_results
:
l
ist
[
TMeasurement
]
=
data
[
"results"
]
results
=
defaultdict
(
lambda
:
list
())
for
v
in
raw_results
:
...
...
benchmarks/kernels/utils.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
dataclasses
from
typing
import
Any
,
Callable
,
Iterable
,
Optional
from
collections.abc
import
Iterable
from
typing
import
Any
,
Callable
,
Optional
import
torch
import
torch.utils.benchmark
as
TBenchmark
...
...
csrc/cutlass_extensions/vllm_cutlass_library_extension.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
enum
from
typing
import
Dict
,
Union
from
typing
import
Union
from
cutlass_library
import
*
...
...
@@ -21,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
TmaWarpSpecializedCooperative
=
enum_auto
()
VLLMDataTypeNames
:
D
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
VLLMDataTypeNames
:
d
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
**
DataTypeNames
,
# type: ignore
**
{
VLLMDataType
.
u4b8
:
"u4b8"
,
...
...
@@ -29,7 +29,7 @@ VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = {
}
}
VLLMDataTypeTag
:
D
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
VLLMDataTypeTag
:
d
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
**
DataTypeTag
,
# type: ignore
**
{
VLLMDataType
.
u4b8
:
"cutlass::vllm_uint4b8_t"
,
...
...
@@ -37,7 +37,7 @@ VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
}
}
VLLMDataTypeSize
:
D
ict
[
Union
[
VLLMDataType
,
DataType
],
int
]
=
{
VLLMDataTypeSize
:
d
ict
[
Union
[
VLLMDataType
,
DataType
],
int
]
=
{
**
DataTypeSize
,
# type: ignore
**
{
VLLMDataType
.
u4b8
:
4
,
...
...
@@ -45,7 +45,7 @@ VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = {
}
}
VLLMDataTypeVLLMScalarTypeTag
:
D
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
VLLMDataTypeVLLMScalarTypeTag
:
d
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
VLLMDataType
.
u4b8
:
"vllm::kU4B8"
,
VLLMDataType
.
u8b128
:
"vllm::kU8B128"
,
DataType
.
u4
:
"vllm::kU4"
,
...
...
@@ -56,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
DataType
.
bf16
:
"vllm::kBfloat16"
,
}
VLLMDataTypeTorchDataTypeTag
:
D
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
VLLMDataTypeTorchDataTypeTag
:
d
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
DataType
.
u8
:
"at::ScalarType::Byte"
,
DataType
.
s8
:
"at::ScalarType::Char"
,
DataType
.
e4m3
:
"at::ScalarType::Float8_e4m3fn"
,
...
...
@@ -66,7 +66,7 @@ VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
DataType
.
f32
:
"at::ScalarType::Float"
,
}
VLLMKernelScheduleTag
:
D
ict
[
Union
[
VLLMKernelScheduleTag
:
d
ict
[
Union
[
MixedInputKernelScheduleType
,
KernelScheduleType
],
str
]
=
{
**
KernelScheduleTag
,
# type: ignore
**
{
...
...
csrc/quantization/machete/generate.py
View file @
cf069aa8
...
...
@@ -8,7 +8,7 @@ from collections.abc import Iterable
from
copy
import
deepcopy
from
dataclasses
import
dataclass
,
fields
from
functools
import
reduce
from
typing
import
Dict
,
List
,
Optional
,
Tuple
,
Union
from
typing
import
Optional
,
Union
import
jinja2
# yapf conflicts with isort for this block
...
...
@@ -247,8 +247,8 @@ TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative
@
dataclass
(
frozen
=
True
)
class
ScheduleConfig
:
tile_shape_mn
:
T
uple
[
int
,
int
]
cluster_shape_mnk
:
T
uple
[
int
,
int
,
int
]
tile_shape_mn
:
t
uple
[
int
,
int
]
cluster_shape_mnk
:
t
uple
[
int
,
int
,
int
]
kernel_schedule
:
MixedInputKernelScheduleType
epilogue_schedule
:
EpilogueScheduleType
tile_scheduler
:
TileSchedulerType
...
...
@@ -277,8 +277,8 @@ class PrepackTypeConfig:
@
dataclass
class
ImplConfig
:
types
:
TypeConfig
schedules
:
L
ist
[
ScheduleConfig
]
heuristic
:
L
ist
[
T
uple
[
Optional
[
str
],
ScheduleConfig
]]
schedules
:
l
ist
[
ScheduleConfig
]
heuristic
:
l
ist
[
t
uple
[
Optional
[
str
],
ScheduleConfig
]]
def
generate_sch_sig
(
schedule_config
:
ScheduleConfig
)
->
str
:
...
...
@@ -333,7 +333,7 @@ def is_power_of_two(n):
return
(
n
!=
0
)
and
(
n
&
(
n
-
1
)
==
0
)
def
to_cute_constant
(
value
:
L
ist
[
int
]):
def
to_cute_constant
(
value
:
l
ist
[
int
]):
def
_to_cute_constant
(
value
:
int
):
if
is_power_of_two
(
value
):
...
...
@@ -347,7 +347,7 @@ def to_cute_constant(value: List[int]):
return
_to_cute_constant
(
value
)
def
unique_schedules
(
impl_configs
:
L
ist
[
ImplConfig
]):
def
unique_schedules
(
impl_configs
:
l
ist
[
ImplConfig
]):
return
list
(
set
(
sch
for
impl_config
in
impl_configs
for
sch
in
impl_config
.
schedules
))
...
...
@@ -391,7 +391,7 @@ mm_impl_template = create_template(IMPL_TEMPLATE)
prepack_dispatch_template
=
create_template
(
PREPACK_TEMPLATE
)
def
create_sources
(
impl_configs
:
L
ist
[
ImplConfig
],
num_impl_files
=
8
):
def
create_sources
(
impl_configs
:
l
ist
[
ImplConfig
],
num_impl_files
=
8
):
sources
=
[]
sources
.
append
((
...
...
@@ -435,7 +435,7 @@ def create_sources(impl_configs: List[ImplConfig], num_impl_files=8):
num_impls
=
reduce
(
lambda
x
,
y
:
x
+
len
(
y
.
schedules
),
impl_configs
,
0
)
num_impls_per_file
=
math
.
ceil
(
num_impls
/
num_impl_files
)
files_impls
:
L
ist
[
L
ist
[
ImplConfig
]]
=
[[]]
files_impls
:
l
ist
[
l
ist
[
ImplConfig
]]
=
[[]]
curr_num_impls_assigned
=
0
curr_impl_in_file
=
0
...
...
@@ -515,7 +515,7 @@ def generate():
for
cond
,
tile_config
in
default_tile_heuristic_config
.
items
()
]
def
get_unique_schedules
(
heuristic
:
D
ict
[
str
,
ScheduleConfig
]):
def
get_unique_schedules
(
heuristic
:
d
ict
[
str
,
ScheduleConfig
]):
# Do not use schedules = list(set(...)) because we need to make sure
# the output list is deterministic; otherwise the generated kernel file
# will be non-deterministic and causes ccache miss.
...
...
docs/source/conf.py
View file @
cf069aa8
...
...
@@ -17,7 +17,6 @@ import inspect
import
logging
import
os
import
sys
from
typing
import
List
import
requests
from
sphinx.ext
import
autodoc
...
...
@@ -58,7 +57,7 @@ templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns
:
L
ist
[
str
]
=
[
"**/*.template.md"
,
"**/*.inc.md"
]
exclude_patterns
:
l
ist
[
str
]
=
[
"**/*.template.md"
,
"**/*.inc.md"
]
# Exclude the prompt "$" when copying code
copybutton_prompt_text
=
r
"\$ "
...
...
docs/source/features/reasoning_outputs.md
View file @
cf069aa8
...
...
@@ -123,7 +123,7 @@ class ExampleParser(ReasoningParser):
def
extract_reasoning_content
(
self
,
model_output
:
str
,
request
:
ChatCompletionRequest
)
->
T
uple
[
Optional
[
str
],
Optional
[
str
]]:
)
->
t
uple
[
Optional
[
str
],
Optional
[
str
]]:
"""
Extract reasoning content from a complete model-generated string.
...
...
@@ -138,7 +138,7 @@ class ExampleParser(ReasoningParser):
The request object that was used to generate the model_output.
Returns:
T
uple[Optional[str], Optional[str]]
t
uple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content.
"""
```
...
...
docs/source/features/structured_outputs.md
View file @
cf069aa8
...
...
@@ -193,7 +193,7 @@ class Step(BaseModel):
class
MathResponse
(
BaseModel
):
steps
:
L
ist
[
Step
]
steps
:
l
ist
[
Step
]
final_answer
:
str
...
...
docs/source/generate_examples.py
View file @
cf069aa8
...
...
@@ -74,7 +74,7 @@ class Example:
path (Path): The path to the main directory or file.
category (str): The category of the document.
main_file (Path): The main file in the directory.
other_files (list[Path]):
L
ist of other files in the directory.
other_files (list[Path]):
l
ist of other files in the directory.
title (str): The title of the document.
Methods:
...
...
examples/offline_inference/distributed.py
View file @
cf069aa8
...
...
@@ -6,7 +6,7 @@ distributively on a multi-nodes cluster.
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
"""
from
typing
import
Any
,
Dict
,
List
from
typing
import
Any
import
numpy
as
np
import
ray
...
...
@@ -36,13 +36,13 @@ class LLMPredictor:
self
.
llm
=
LLM
(
model
=
"meta-llama/Llama-2-7b-chat-hf"
,
tensor_parallel_size
=
tensor_parallel_size
)
def
__call__
(
self
,
batch
:
D
ict
[
str
,
np
.
ndarray
])
->
D
ict
[
str
,
list
]:
def
__call__
(
self
,
batch
:
d
ict
[
str
,
np
.
ndarray
])
->
d
ict
[
str
,
list
]:
# Generate texts from the prompts.
# The output is a list of RequestOutput objects that contain the prompt,
# generated text, and other information.
outputs
=
self
.
llm
.
generate
(
batch
[
"text"
],
sampling_params
)
prompt
:
L
ist
[
str
]
=
[]
generated_text
:
L
ist
[
str
]
=
[]
prompt
:
l
ist
[
str
]
=
[]
generated_text
:
l
ist
[
str
]
=
[]
for
output
in
outputs
:
prompt
.
append
(
output
.
prompt
)
generated_text
.
append
(
' '
.
join
([
o
.
text
for
o
in
output
.
outputs
]))
...
...
@@ -72,7 +72,7 @@ def scheduling_strategy_fn():
pg
,
placement_group_capture_child_tasks
=
True
))
resources_kwarg
:
D
ict
[
str
,
Any
]
=
{}
resources_kwarg
:
d
ict
[
str
,
Any
]
=
{}
if
tensor_parallel_size
==
1
:
# For tensor_parallel_size == 1, we simply set num_gpus=1.
resources_kwarg
[
"num_gpus"
]
=
1
...
...
examples/offline_inference/llm_engine_example.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
argparse
from
typing
import
List
,
Tuple
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm.utils
import
FlexibleArgumentParser
def
create_test_prompts
()
->
L
ist
[
T
uple
[
str
,
SamplingParams
]]:
def
create_test_prompts
()
->
l
ist
[
t
uple
[
str
,
SamplingParams
]]:
"""Create a list of test prompts with their sampling parameters."""
return
[
(
"A robot may not injure a human being"
,
...
...
@@ -24,7 +23,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
L
ist
[
T
uple
[
str
,
SamplingParams
]]):
test_prompts
:
l
ist
[
t
uple
[
str
,
SamplingParams
]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
...
...
@@ -34,7 +33,7 @@ def process_requests(engine: LLMEngine,
engine
.
add_request
(
str
(
request_id
),
prompt
,
sampling_params
)
request_id
+=
1
request_outputs
:
L
ist
[
RequestOutput
]
=
engine
.
step
()
request_outputs
:
l
ist
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
...
...
examples/offline_inference/lora_with_quantization_inference.py
View file @
cf069aa8
...
...
@@ -7,7 +7,7 @@ Requires HuggingFace credentials for access.
"""
import
gc
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
import
torch
from
huggingface_hub
import
snapshot_download
...
...
@@ -18,7 +18,7 @@ from vllm.lora.request import LoRARequest
def
create_test_prompts
(
lora_path
:
str
)
->
L
ist
[
T
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
)
->
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
return
[
# this is an example of using quantization without LoRA
(
"My name is"
,
...
...
@@ -49,7 +49,7 @@ def create_test_prompts(
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
L
ist
[
T
uple
[
str
,
SamplingParams
,
test_prompts
:
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
...
...
@@ -63,7 +63,7 @@ def process_requests(engine: LLMEngine,
lora_request
=
lora_request
)
request_id
+=
1
request_outputs
:
L
ist
[
RequestOutput
]
=
engine
.
step
()
request_outputs
:
l
ist
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
print
(
"----------------------------------------------------"
)
...
...
examples/offline_inference/mlpspeculator.py
View file @
cf069aa8
...
...
@@ -2,12 +2,11 @@
import
gc
import
time
from
typing
import
List
from
vllm
import
LLM
,
SamplingParams
def
time_generation
(
llm
:
LLM
,
prompts
:
L
ist
[
str
],
def
time_generation
(
llm
:
LLM
,
prompts
:
l
ist
[
str
],
sampling_params
:
SamplingParams
):
# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
...
...
examples/offline_inference/multilora_inference.py
View file @
cf069aa8
...
...
@@ -6,7 +6,7 @@ for offline inference.
Requires HuggingFace credentials for access to Llama2.
"""
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
from
huggingface_hub
import
snapshot_download
...
...
@@ -16,7 +16,7 @@ from vllm.lora.request import LoRARequest
def
create_test_prompts
(
lora_path
:
str
)
->
L
ist
[
T
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
)
->
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
"""Create a list of test prompts with their sampling parameters.
2 requests for base model, 4 requests for the LoRA. We define 2
...
...
@@ -56,7 +56,7 @@ def create_test_prompts(
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
L
ist
[
T
uple
[
str
,
SamplingParams
,
test_prompts
:
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
...
...
@@ -70,7 +70,7 @@ def process_requests(engine: LLMEngine,
lora_request
=
lora_request
)
request_id
+=
1
request_outputs
:
L
ist
[
RequestOutput
]
=
engine
.
step
()
request_outputs
:
l
ist
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
...
...
examples/offline_inference/prithvi_geospatial_mae.py
View file @
cf069aa8
...
...
@@ -21,7 +21,7 @@ import argparse
import
datetime
import
os
import
re
from
typing
import
List
,
Union
from
typing
import
Union
import
albumentations
import
numpy
as
np
...
...
@@ -260,9 +260,9 @@ def _convert_np_uint8(float_image: torch.Tensor):
def
load_example
(
file_paths
:
L
ist
[
str
],
mean
:
L
ist
[
float
]
=
None
,
std
:
L
ist
[
float
]
=
None
,
file_paths
:
l
ist
[
str
],
mean
:
l
ist
[
float
]
=
None
,
std
:
l
ist
[
float
]
=
None
,
indices
:
Union
[
list
[
int
],
None
]
=
None
,
):
"""Build an input example by loading images in *file_paths*.
...
...
examples/offline_inference/profiling.py
View file @
cf069aa8
...
...
@@ -5,8 +5,9 @@ import json
import
os
import
sys
from
argparse
import
RawTextHelpFormatter
from
collections.abc
import
Generator
from
dataclasses
import
asdict
,
dataclass
from
typing
import
Any
,
Dict
,
Generator
,
List
,
Optional
,
TypeAlias
from
typing
import
Any
,
Optional
,
TypeAlias
import
torch
import
tqdm
...
...
@@ -42,8 +43,8 @@ def get_dtype(dtype: str):
return
dtype
OutputLen_NumReqs_Map
:
TypeAlias
=
D
ict
[
int
,
int
]
def
compute_request_output_lengths
(
batch_size
:
int
,
step_requests
:
L
ist
[
int
])
\
OutputLen_NumReqs_Map
:
TypeAlias
=
d
ict
[
int
,
int
]
def
compute_request_output_lengths
(
batch_size
:
int
,
step_requests
:
l
ist
[
int
])
\
->
OutputLen_NumReqs_Map
:
"""
Given the number of requests, batch_size, and the number of requests
...
...
@@ -63,7 +64,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
Args:
batch_size (int): Number of requests submitted for profile. This is
args.batch_size.
step_requests (
L
ist[int]): step_requests[i] is the number of requests
step_requests (
l
ist[int]): step_requests[i] is the number of requests
that the ith engine step should process.
Returns:
...
...
@@ -114,7 +115,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
return
ol_nr
def
determine_requests_per_step
(
context
:
ProfileContext
)
->
L
ist
[
int
]:
def
determine_requests_per_step
(
context
:
ProfileContext
)
->
l
ist
[
int
]:
"""
Determine number of requests each engine step should process.
If context.num_steps is set, then all engine steps process the
...
...
@@ -130,7 +131,7 @@ def determine_requests_per_step(context: ProfileContext) -> List[int]:
context: ProfileContext object.
Returns:
L
ist[int]: Number of requests to process for all engine-steps.
l
ist[int]: Number of requests to process for all engine-steps.
output[i], contains the number of requests that the ith step
should process.
"""
...
...
@@ -170,7 +171,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
for
key
,
value
in
asdict
(
context
).
items
():
print
(
f
"
{
key
}
=
{
value
}
"
)
requests_per_step
:
L
ist
[
int
]
=
determine_requests_per_step
(
context
)
requests_per_step
:
l
ist
[
int
]
=
determine_requests_per_step
(
context
)
ol_nr
:
OutputLen_NumReqs_Map
=
compute_request_output_lengths
(
context
.
batch_size
,
requests_per_step
)
...
...
examples/offline_inference/profiling_tpu/profiling.py
View file @
cf069aa8
...
...
@@ -4,7 +4,6 @@ import argparse
import
dataclasses
import
os
import
time
from
typing
import
List
import
numpy
as
np
import
torch_xla.debug.profiler
as
xp
...
...
@@ -35,7 +34,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
size
=
(
args
.
batch_size
,
args
.
input_len
))
dummy_prompts
:
L
ist
[
PromptType
]
=
[{
dummy_prompts
:
l
ist
[
PromptType
]
=
[{
"prompt_token_ids"
:
batch
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
...
...
examples/offline_inference/vision_language_multi_image.py
View file @
cf069aa8
...
...
@@ -5,7 +5,7 @@ multi-image input on vision language models for text generation,
using the chat template defined by the model.
"""
from
argparse
import
Namespace
from
typing
import
List
,
NamedTuple
,
Optional
from
typing
import
NamedTuple
,
Optional
from
PIL.Image
import
Image
from
transformers
import
AutoProcessor
,
AutoTokenizer
...
...
@@ -24,8 +24,8 @@ IMAGE_URLS = [
class
ModelRequestData
(
NamedTuple
):
llm
:
LLM
prompt
:
str
stop_token_ids
:
Optional
[
L
ist
[
int
]]
image_data
:
L
ist
[
Image
]
stop_token_ids
:
Optional
[
l
ist
[
int
]]
image_data
:
l
ist
[
Image
]
chat_template
:
Optional
[
str
]
...
...
@@ -34,7 +34,7 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4.
def
load_aria
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_aria
(
question
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"rhymes-ai/Aria"
llm
=
LLM
(
model
=
model_name
,
tokenizer_mode
=
"slow"
,
...
...
@@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
)
def
load_deepseek_vl2
(
question
:
str
,
image_urls
:
L
ist
[
str
]):
def
load_deepseek_vl2
(
question
:
str
,
image_urls
:
l
ist
[
str
]):
model_name
=
"deepseek-ai/deepseek-vl2-tiny"
llm
=
LLM
(
model
=
model_name
,
...
...
@@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]):
)
def
load_h2ovl
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_h2ovl
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"h2oai/h2ovl-mississippi-800m"
llm
=
LLM
(
...
...
@@ -111,7 +111,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
)
def
load_idefics3
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_idefics3
(
question
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"HuggingFaceM4/Idefics3-8B-Llama3"
# The configuration below has been confirmed to launch on a single L40 GPU.
...
...
@@ -142,7 +142,7 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData:
)
def
load_internvl
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_internvl
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"OpenGVLab/InternVL2-2B"
llm
=
LLM
(
...
...
@@ -179,7 +179,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
)
def
load_mllama
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_mllama
(
question
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"meta-llama/Llama-3.2-11B-Vision-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU.
...
...
@@ -201,7 +201,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
)
def
load_nvlm_d
(
question
:
str
,
image_urls
:
L
ist
[
str
]):
def
load_nvlm_d
(
question
:
str
,
image_urls
:
l
ist
[
str
]):
model_name
=
"nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU
...
...
@@ -234,7 +234,7 @@ def load_nvlm_d(question: str, image_urls: List[str]):
)
def
load_pixtral_hf
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_pixtral_hf
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"mistral-community/pixtral-12b"
# Adjust this as necessary to fit in GPU
...
...
@@ -259,7 +259,7 @@ def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData:
)
def
load_phi3v
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_phi3v
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
# num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# to use 16 for single frame scenarios, and 4 for multi-frame.
...
...
@@ -295,7 +295,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
def
load_qwen_vl_chat
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"Qwen/Qwen-VL-Chat"
llm
=
LLM
(
model
=
model_name
,
...
...
@@ -336,7 +336,7 @@ def load_qwen_vl_chat(question: str,
)
def
load_qwen2_vl
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_qwen2_vl
(
question
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
try
:
from
qwen_vl_utils
import
process_vision_info
except
ModuleNotFoundError
:
...
...
@@ -393,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
)
def
load_qwen2_5_vl
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_qwen2_5_vl
(
question
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
try
:
from
qwen_vl_utils
import
process_vision_info
except
ModuleNotFoundError
:
...
...
@@ -466,7 +466,7 @@ model_example_map = {
}
def
run_generate
(
model
,
question
:
str
,
image_urls
:
L
ist
[
str
]):
def
run_generate
(
model
,
question
:
str
,
image_urls
:
l
ist
[
str
]):
req_data
=
model_example_map
[
model
](
question
,
image_urls
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
...
...
@@ -487,7 +487,7 @@ def run_generate(model, question: str, image_urls: List[str]):
print
(
generated_text
)
def
run_chat
(
model
:
str
,
question
:
str
,
image_urls
:
L
ist
[
str
]):
def
run_chat
(
model
:
str
,
question
:
str
,
image_urls
:
l
ist
[
str
]):
req_data
=
model_example_map
[
model
](
question
,
image_urls
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
...
...
examples/online_serving/api_client.py
View file @
cf069aa8
...
...
@@ -7,7 +7,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API.
import
argparse
import
json
from
typing
import
Iterable
,
List
from
collections.abc
import
Iterable
import
requests
...
...
@@ -39,7 +39,7 @@ def post_http_request(prompt: str,
return
response
def
get_streaming_response
(
response
:
requests
.
Response
)
->
Iterable
[
L
ist
[
str
]]:
def
get_streaming_response
(
response
:
requests
.
Response
)
->
Iterable
[
l
ist
[
str
]]:
for
chunk
in
response
.
iter_lines
(
chunk_size
=
8192
,
decode_unicode
=
False
,
delimiter
=
b
"
\0
"
):
...
...
@@ -49,7 +49,7 @@ def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
yield
output
def
get_response
(
response
:
requests
.
Response
)
->
L
ist
[
str
]:
def
get_response
(
response
:
requests
.
Response
)
->
l
ist
[
str
]:
data
=
json
.
loads
(
response
.
content
)
output
=
data
[
"text"
]
return
output
...
...
examples/online_serving/openai_embedding_client.py
View file @
cf069aa8
...
...
@@ -24,4 +24,4 @@ responses = client.embeddings.create(
)
for
data
in
responses
.
data
:
print
(
data
.
embedding
)
#
l
ist of float of len 4096
print
(
data
.
embedding
)
#
L
ist of float of len 4096
pyproject.toml
View file @
cf069aa8
...
...
@@ -65,6 +65,32 @@ exclude = [
[tool.ruff.lint.per-file-ignores]
"vllm/version.py"
=
["F401"]
"vllm/_version.py"
=
["ALL"]
# Python 3.8 typing. TODO: Remove these excludes after v1.0.0
"vllm/adapter_commons/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/attention/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/compilation/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/core/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/device_allocator/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/distributed/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/engine/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/executor/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/inputs/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/logging_utils/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/lora/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/model_executor/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/multimodal/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/platforms/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/plugins/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/profiler/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/prompt_adapter/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/spec_decode/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/third_party/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/transformers_utils/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/triton_utils/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/usage/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/vllm_flash_attn/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/assets/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/worker/**/*.py"
=
[
"UP006"
,
"UP035"
]
[tool.ruff.lint]
select
=
[
...
...
@@ -91,8 +117,6 @@ ignore = [
"B007"
,
# f-string format
"UP032"
,
# Python 3.8 typing
"UP006"
,
"UP035"
,
# Can remove once 3.10+ is the minimum Python version
"UP007"
,
]
...
...
Prev
1
2
3
4
5
6
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment