Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cf069aa8
Unverified
Commit
cf069aa8
authored
Mar 03, 2025
by
Harry Mellor
Committed by
GitHub
Mar 02, 2025
Browse files
Update deprecated Python 3.8 typing (#13971)
parent
bf33700e
Changes
300
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
102 additions
and
81 deletions
+102
-81
benchmarks/kernels/graph_machete_bench.py
benchmarks/kernels/graph_machete_bench.py
+1
-2
benchmarks/kernels/utils.py
benchmarks/kernels/utils.py
+2
-1
csrc/cutlass_extensions/vllm_cutlass_library_extension.py
csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+7
-7
csrc/quantization/machete/generate.py
csrc/quantization/machete/generate.py
+10
-10
docs/source/conf.py
docs/source/conf.py
+1
-2
docs/source/features/reasoning_outputs.md
docs/source/features/reasoning_outputs.md
+2
-2
docs/source/features/structured_outputs.md
docs/source/features/structured_outputs.md
+1
-1
docs/source/generate_examples.py
docs/source/generate_examples.py
+1
-1
examples/offline_inference/distributed.py
examples/offline_inference/distributed.py
+5
-5
examples/offline_inference/llm_engine_example.py
examples/offline_inference/llm_engine_example.py
+3
-4
examples/offline_inference/lora_with_quantization_inference.py
...les/offline_inference/lora_with_quantization_inference.py
+4
-4
examples/offline_inference/mlpspeculator.py
examples/offline_inference/mlpspeculator.py
+1
-2
examples/offline_inference/multilora_inference.py
examples/offline_inference/multilora_inference.py
+4
-4
examples/offline_inference/prithvi_geospatial_mae.py
examples/offline_inference/prithvi_geospatial_mae.py
+4
-4
examples/offline_inference/profiling.py
examples/offline_inference/profiling.py
+8
-7
examples/offline_inference/profiling_tpu/profiling.py
examples/offline_inference/profiling_tpu/profiling.py
+1
-2
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+17
-17
examples/online_serving/api_client.py
examples/online_serving/api_client.py
+3
-3
examples/online_serving/openai_embedding_client.py
examples/online_serving/openai_embedding_client.py
+1
-1
pyproject.toml
pyproject.toml
+26
-2
No files found.
benchmarks/kernels/graph_machete_bench.py
View file @
cf069aa8
...
@@ -4,7 +4,6 @@ import math
...
@@ -4,7 +4,6 @@ import math
import
pickle
import
pickle
import
re
import
re
from
collections
import
defaultdict
from
collections
import
defaultdict
from
typing
import
List
import
matplotlib.pyplot
as
plt
import
matplotlib.pyplot
as
plt
import
pandas
as
pd
import
pandas
as
pd
...
@@ -23,7 +22,7 @@ if __name__ == "__main__":
...
@@ -23,7 +22,7 @@ if __name__ == "__main__":
with
open
(
args
.
filename
,
'rb'
)
as
f
:
with
open
(
args
.
filename
,
'rb'
)
as
f
:
data
=
pickle
.
load
(
f
)
data
=
pickle
.
load
(
f
)
raw_results
:
L
ist
[
TMeasurement
]
=
data
[
"results"
]
raw_results
:
l
ist
[
TMeasurement
]
=
data
[
"results"
]
results
=
defaultdict
(
lambda
:
list
())
results
=
defaultdict
(
lambda
:
list
())
for
v
in
raw_results
:
for
v
in
raw_results
:
...
...
benchmarks/kernels/utils.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
dataclasses
import
dataclasses
from
typing
import
Any
,
Callable
,
Iterable
,
Optional
from
collections.abc
import
Iterable
from
typing
import
Any
,
Callable
,
Optional
import
torch
import
torch
import
torch.utils.benchmark
as
TBenchmark
import
torch.utils.benchmark
as
TBenchmark
...
...
csrc/cutlass_extensions/vllm_cutlass_library_extension.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
enum
import
enum
from
typing
import
Dict
,
Union
from
typing
import
Union
from
cutlass_library
import
*
from
cutlass_library
import
*
...
@@ -21,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
...
@@ -21,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
TmaWarpSpecializedCooperative
=
enum_auto
()
TmaWarpSpecializedCooperative
=
enum_auto
()
VLLMDataTypeNames
:
D
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
VLLMDataTypeNames
:
d
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
**
DataTypeNames
,
# type: ignore
**
DataTypeNames
,
# type: ignore
**
{
**
{
VLLMDataType
.
u4b8
:
"u4b8"
,
VLLMDataType
.
u4b8
:
"u4b8"
,
...
@@ -29,7 +29,7 @@ VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = {
...
@@ -29,7 +29,7 @@ VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = {
}
}
}
}
VLLMDataTypeTag
:
D
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
VLLMDataTypeTag
:
d
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
**
DataTypeTag
,
# type: ignore
**
DataTypeTag
,
# type: ignore
**
{
**
{
VLLMDataType
.
u4b8
:
"cutlass::vllm_uint4b8_t"
,
VLLMDataType
.
u4b8
:
"cutlass::vllm_uint4b8_t"
,
...
@@ -37,7 +37,7 @@ VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
...
@@ -37,7 +37,7 @@ VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
}
}
}
}
VLLMDataTypeSize
:
D
ict
[
Union
[
VLLMDataType
,
DataType
],
int
]
=
{
VLLMDataTypeSize
:
d
ict
[
Union
[
VLLMDataType
,
DataType
],
int
]
=
{
**
DataTypeSize
,
# type: ignore
**
DataTypeSize
,
# type: ignore
**
{
**
{
VLLMDataType
.
u4b8
:
4
,
VLLMDataType
.
u4b8
:
4
,
...
@@ -45,7 +45,7 @@ VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = {
...
@@ -45,7 +45,7 @@ VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = {
}
}
}
}
VLLMDataTypeVLLMScalarTypeTag
:
D
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
VLLMDataTypeVLLMScalarTypeTag
:
d
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
VLLMDataType
.
u4b8
:
"vllm::kU4B8"
,
VLLMDataType
.
u4b8
:
"vllm::kU4B8"
,
VLLMDataType
.
u8b128
:
"vllm::kU8B128"
,
VLLMDataType
.
u8b128
:
"vllm::kU8B128"
,
DataType
.
u4
:
"vllm::kU4"
,
DataType
.
u4
:
"vllm::kU4"
,
...
@@ -56,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
...
@@ -56,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
DataType
.
bf16
:
"vllm::kBfloat16"
,
DataType
.
bf16
:
"vllm::kBfloat16"
,
}
}
VLLMDataTypeTorchDataTypeTag
:
D
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
VLLMDataTypeTorchDataTypeTag
:
d
ict
[
Union
[
VLLMDataType
,
DataType
],
str
]
=
{
DataType
.
u8
:
"at::ScalarType::Byte"
,
DataType
.
u8
:
"at::ScalarType::Byte"
,
DataType
.
s8
:
"at::ScalarType::Char"
,
DataType
.
s8
:
"at::ScalarType::Char"
,
DataType
.
e4m3
:
"at::ScalarType::Float8_e4m3fn"
,
DataType
.
e4m3
:
"at::ScalarType::Float8_e4m3fn"
,
...
@@ -66,7 +66,7 @@ VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
...
@@ -66,7 +66,7 @@ VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
DataType
.
f32
:
"at::ScalarType::Float"
,
DataType
.
f32
:
"at::ScalarType::Float"
,
}
}
VLLMKernelScheduleTag
:
D
ict
[
Union
[
VLLMKernelScheduleTag
:
d
ict
[
Union
[
MixedInputKernelScheduleType
,
KernelScheduleType
],
str
]
=
{
MixedInputKernelScheduleType
,
KernelScheduleType
],
str
]
=
{
**
KernelScheduleTag
,
# type: ignore
**
KernelScheduleTag
,
# type: ignore
**
{
**
{
...
...
csrc/quantization/machete/generate.py
View file @
cf069aa8
...
@@ -8,7 +8,7 @@ from collections.abc import Iterable
...
@@ -8,7 +8,7 @@ from collections.abc import Iterable
from
copy
import
deepcopy
from
copy
import
deepcopy
from
dataclasses
import
dataclass
,
fields
from
dataclasses
import
dataclass
,
fields
from
functools
import
reduce
from
functools
import
reduce
from
typing
import
Dict
,
List
,
Optional
,
Tuple
,
Union
from
typing
import
Optional
,
Union
import
jinja2
import
jinja2
# yapf conflicts with isort for this block
# yapf conflicts with isort for this block
...
@@ -247,8 +247,8 @@ TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative
...
@@ -247,8 +247,8 @@ TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative
@
dataclass
(
frozen
=
True
)
@
dataclass
(
frozen
=
True
)
class
ScheduleConfig
:
class
ScheduleConfig
:
tile_shape_mn
:
T
uple
[
int
,
int
]
tile_shape_mn
:
t
uple
[
int
,
int
]
cluster_shape_mnk
:
T
uple
[
int
,
int
,
int
]
cluster_shape_mnk
:
t
uple
[
int
,
int
,
int
]
kernel_schedule
:
MixedInputKernelScheduleType
kernel_schedule
:
MixedInputKernelScheduleType
epilogue_schedule
:
EpilogueScheduleType
epilogue_schedule
:
EpilogueScheduleType
tile_scheduler
:
TileSchedulerType
tile_scheduler
:
TileSchedulerType
...
@@ -277,8 +277,8 @@ class PrepackTypeConfig:
...
@@ -277,8 +277,8 @@ class PrepackTypeConfig:
@
dataclass
@
dataclass
class
ImplConfig
:
class
ImplConfig
:
types
:
TypeConfig
types
:
TypeConfig
schedules
:
L
ist
[
ScheduleConfig
]
schedules
:
l
ist
[
ScheduleConfig
]
heuristic
:
L
ist
[
T
uple
[
Optional
[
str
],
ScheduleConfig
]]
heuristic
:
l
ist
[
t
uple
[
Optional
[
str
],
ScheduleConfig
]]
def
generate_sch_sig
(
schedule_config
:
ScheduleConfig
)
->
str
:
def
generate_sch_sig
(
schedule_config
:
ScheduleConfig
)
->
str
:
...
@@ -333,7 +333,7 @@ def is_power_of_two(n):
...
@@ -333,7 +333,7 @@ def is_power_of_two(n):
return
(
n
!=
0
)
and
(
n
&
(
n
-
1
)
==
0
)
return
(
n
!=
0
)
and
(
n
&
(
n
-
1
)
==
0
)
def
to_cute_constant
(
value
:
L
ist
[
int
]):
def
to_cute_constant
(
value
:
l
ist
[
int
]):
def
_to_cute_constant
(
value
:
int
):
def
_to_cute_constant
(
value
:
int
):
if
is_power_of_two
(
value
):
if
is_power_of_two
(
value
):
...
@@ -347,7 +347,7 @@ def to_cute_constant(value: List[int]):
...
@@ -347,7 +347,7 @@ def to_cute_constant(value: List[int]):
return
_to_cute_constant
(
value
)
return
_to_cute_constant
(
value
)
def
unique_schedules
(
impl_configs
:
L
ist
[
ImplConfig
]):
def
unique_schedules
(
impl_configs
:
l
ist
[
ImplConfig
]):
return
list
(
return
list
(
set
(
sch
for
impl_config
in
impl_configs
set
(
sch
for
impl_config
in
impl_configs
for
sch
in
impl_config
.
schedules
))
for
sch
in
impl_config
.
schedules
))
...
@@ -391,7 +391,7 @@ mm_impl_template = create_template(IMPL_TEMPLATE)
...
@@ -391,7 +391,7 @@ mm_impl_template = create_template(IMPL_TEMPLATE)
prepack_dispatch_template
=
create_template
(
PREPACK_TEMPLATE
)
prepack_dispatch_template
=
create_template
(
PREPACK_TEMPLATE
)
def
create_sources
(
impl_configs
:
L
ist
[
ImplConfig
],
num_impl_files
=
8
):
def
create_sources
(
impl_configs
:
l
ist
[
ImplConfig
],
num_impl_files
=
8
):
sources
=
[]
sources
=
[]
sources
.
append
((
sources
.
append
((
...
@@ -435,7 +435,7 @@ def create_sources(impl_configs: List[ImplConfig], num_impl_files=8):
...
@@ -435,7 +435,7 @@ def create_sources(impl_configs: List[ImplConfig], num_impl_files=8):
num_impls
=
reduce
(
lambda
x
,
y
:
x
+
len
(
y
.
schedules
),
impl_configs
,
0
)
num_impls
=
reduce
(
lambda
x
,
y
:
x
+
len
(
y
.
schedules
),
impl_configs
,
0
)
num_impls_per_file
=
math
.
ceil
(
num_impls
/
num_impl_files
)
num_impls_per_file
=
math
.
ceil
(
num_impls
/
num_impl_files
)
files_impls
:
L
ist
[
L
ist
[
ImplConfig
]]
=
[[]]
files_impls
:
l
ist
[
l
ist
[
ImplConfig
]]
=
[[]]
curr_num_impls_assigned
=
0
curr_num_impls_assigned
=
0
curr_impl_in_file
=
0
curr_impl_in_file
=
0
...
@@ -515,7 +515,7 @@ def generate():
...
@@ -515,7 +515,7 @@ def generate():
for
cond
,
tile_config
in
default_tile_heuristic_config
.
items
()
for
cond
,
tile_config
in
default_tile_heuristic_config
.
items
()
]
]
def
get_unique_schedules
(
heuristic
:
D
ict
[
str
,
ScheduleConfig
]):
def
get_unique_schedules
(
heuristic
:
d
ict
[
str
,
ScheduleConfig
]):
# Do not use schedules = list(set(...)) because we need to make sure
# Do not use schedules = list(set(...)) because we need to make sure
# the output list is deterministic; otherwise the generated kernel file
# the output list is deterministic; otherwise the generated kernel file
# will be non-deterministic and causes ccache miss.
# will be non-deterministic and causes ccache miss.
...
...
docs/source/conf.py
View file @
cf069aa8
...
@@ -17,7 +17,6 @@ import inspect
...
@@ -17,7 +17,6 @@ import inspect
import
logging
import
logging
import
os
import
os
import
sys
import
sys
from
typing
import
List
import
requests
import
requests
from
sphinx.ext
import
autodoc
from
sphinx.ext
import
autodoc
...
@@ -58,7 +57,7 @@ templates_path = ['_templates']
...
@@ -58,7 +57,7 @@ templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns
:
L
ist
[
str
]
=
[
"**/*.template.md"
,
"**/*.inc.md"
]
exclude_patterns
:
l
ist
[
str
]
=
[
"**/*.template.md"
,
"**/*.inc.md"
]
# Exclude the prompt "$" when copying code
# Exclude the prompt "$" when copying code
copybutton_prompt_text
=
r
"\$ "
copybutton_prompt_text
=
r
"\$ "
...
...
docs/source/features/reasoning_outputs.md
View file @
cf069aa8
...
@@ -123,7 +123,7 @@ class ExampleParser(ReasoningParser):
...
@@ -123,7 +123,7 @@ class ExampleParser(ReasoningParser):
def
extract_reasoning_content
(
def
extract_reasoning_content
(
self
,
model_output
:
str
,
request
:
ChatCompletionRequest
self
,
model_output
:
str
,
request
:
ChatCompletionRequest
)
->
T
uple
[
Optional
[
str
],
Optional
[
str
]]:
)
->
t
uple
[
Optional
[
str
],
Optional
[
str
]]:
"""
"""
Extract reasoning content from a complete model-generated string.
Extract reasoning content from a complete model-generated string.
...
@@ -138,7 +138,7 @@ class ExampleParser(ReasoningParser):
...
@@ -138,7 +138,7 @@ class ExampleParser(ReasoningParser):
The request object that was used to generate the model_output.
The request object that was used to generate the model_output.
Returns:
Returns:
T
uple[Optional[str], Optional[str]]
t
uple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content.
A tuple containing the reasoning content and the content.
"""
"""
```
```
...
...
docs/source/features/structured_outputs.md
View file @
cf069aa8
...
@@ -193,7 +193,7 @@ class Step(BaseModel):
...
@@ -193,7 +193,7 @@ class Step(BaseModel):
class
MathResponse
(
BaseModel
):
class
MathResponse
(
BaseModel
):
steps
:
L
ist
[
Step
]
steps
:
l
ist
[
Step
]
final_answer
:
str
final_answer
:
str
...
...
docs/source/generate_examples.py
View file @
cf069aa8
...
@@ -74,7 +74,7 @@ class Example:
...
@@ -74,7 +74,7 @@ class Example:
path (Path): The path to the main directory or file.
path (Path): The path to the main directory or file.
category (str): The category of the document.
category (str): The category of the document.
main_file (Path): The main file in the directory.
main_file (Path): The main file in the directory.
other_files (list[Path]):
L
ist of other files in the directory.
other_files (list[Path]):
l
ist of other files in the directory.
title (str): The title of the document.
title (str): The title of the document.
Methods:
Methods:
...
...
examples/offline_inference/distributed.py
View file @
cf069aa8
...
@@ -6,7 +6,7 @@ distributively on a multi-nodes cluster.
...
@@ -6,7 +6,7 @@ distributively on a multi-nodes cluster.
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
"""
"""
from
typing
import
Any
,
Dict
,
List
from
typing
import
Any
import
numpy
as
np
import
numpy
as
np
import
ray
import
ray
...
@@ -36,13 +36,13 @@ class LLMPredictor:
...
@@ -36,13 +36,13 @@ class LLMPredictor:
self
.
llm
=
LLM
(
model
=
"meta-llama/Llama-2-7b-chat-hf"
,
self
.
llm
=
LLM
(
model
=
"meta-llama/Llama-2-7b-chat-hf"
,
tensor_parallel_size
=
tensor_parallel_size
)
tensor_parallel_size
=
tensor_parallel_size
)
def
__call__
(
self
,
batch
:
D
ict
[
str
,
np
.
ndarray
])
->
D
ict
[
str
,
list
]:
def
__call__
(
self
,
batch
:
d
ict
[
str
,
np
.
ndarray
])
->
d
ict
[
str
,
list
]:
# Generate texts from the prompts.
# Generate texts from the prompts.
# The output is a list of RequestOutput objects that contain the prompt,
# The output is a list of RequestOutput objects that contain the prompt,
# generated text, and other information.
# generated text, and other information.
outputs
=
self
.
llm
.
generate
(
batch
[
"text"
],
sampling_params
)
outputs
=
self
.
llm
.
generate
(
batch
[
"text"
],
sampling_params
)
prompt
:
L
ist
[
str
]
=
[]
prompt
:
l
ist
[
str
]
=
[]
generated_text
:
L
ist
[
str
]
=
[]
generated_text
:
l
ist
[
str
]
=
[]
for
output
in
outputs
:
for
output
in
outputs
:
prompt
.
append
(
output
.
prompt
)
prompt
.
append
(
output
.
prompt
)
generated_text
.
append
(
' '
.
join
([
o
.
text
for
o
in
output
.
outputs
]))
generated_text
.
append
(
' '
.
join
([
o
.
text
for
o
in
output
.
outputs
]))
...
@@ -72,7 +72,7 @@ def scheduling_strategy_fn():
...
@@ -72,7 +72,7 @@ def scheduling_strategy_fn():
pg
,
placement_group_capture_child_tasks
=
True
))
pg
,
placement_group_capture_child_tasks
=
True
))
resources_kwarg
:
D
ict
[
str
,
Any
]
=
{}
resources_kwarg
:
d
ict
[
str
,
Any
]
=
{}
if
tensor_parallel_size
==
1
:
if
tensor_parallel_size
==
1
:
# For tensor_parallel_size == 1, we simply set num_gpus=1.
# For tensor_parallel_size == 1, we simply set num_gpus=1.
resources_kwarg
[
"num_gpus"
]
=
1
resources_kwarg
[
"num_gpus"
]
=
1
...
...
examples/offline_inference/llm_engine_example.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
argparse
from
typing
import
List
,
Tuple
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
def
create_test_prompts
()
->
L
ist
[
T
uple
[
str
,
SamplingParams
]]:
def
create_test_prompts
()
->
l
ist
[
t
uple
[
str
,
SamplingParams
]]:
"""Create a list of test prompts with their sampling parameters."""
"""Create a list of test prompts with their sampling parameters."""
return
[
return
[
(
"A robot may not injure a human being"
,
(
"A robot may not injure a human being"
,
...
@@ -24,7 +23,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
...
@@ -24,7 +23,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
def
process_requests
(
engine
:
LLMEngine
,
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
L
ist
[
T
uple
[
str
,
SamplingParams
]]):
test_prompts
:
l
ist
[
t
uple
[
str
,
SamplingParams
]]):
"""Continuously process a list of prompts and handle the outputs."""
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
request_id
=
0
...
@@ -34,7 +33,7 @@ def process_requests(engine: LLMEngine,
...
@@ -34,7 +33,7 @@ def process_requests(engine: LLMEngine,
engine
.
add_request
(
str
(
request_id
),
prompt
,
sampling_params
)
engine
.
add_request
(
str
(
request_id
),
prompt
,
sampling_params
)
request_id
+=
1
request_id
+=
1
request_outputs
:
L
ist
[
RequestOutput
]
=
engine
.
step
()
request_outputs
:
l
ist
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
if
request_output
.
finished
:
...
...
examples/offline_inference/lora_with_quantization_inference.py
View file @
cf069aa8
...
@@ -7,7 +7,7 @@ Requires HuggingFace credentials for access.
...
@@ -7,7 +7,7 @@ Requires HuggingFace credentials for access.
"""
"""
import
gc
import
gc
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
import
torch
import
torch
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
...
@@ -18,7 +18,7 @@ from vllm.lora.request import LoRARequest
...
@@ -18,7 +18,7 @@ from vllm.lora.request import LoRARequest
def
create_test_prompts
(
def
create_test_prompts
(
lora_path
:
str
lora_path
:
str
)
->
L
ist
[
T
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
)
->
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
return
[
return
[
# this is an example of using quantization without LoRA
# this is an example of using quantization without LoRA
(
"My name is"
,
(
"My name is"
,
...
@@ -49,7 +49,7 @@ def create_test_prompts(
...
@@ -49,7 +49,7 @@ def create_test_prompts(
def
process_requests
(
engine
:
LLMEngine
,
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
L
ist
[
T
uple
[
str
,
SamplingParams
,
test_prompts
:
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]):
Optional
[
LoRARequest
]]]):
"""Continuously process a list of prompts and handle the outputs."""
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
request_id
=
0
...
@@ -63,7 +63,7 @@ def process_requests(engine: LLMEngine,
...
@@ -63,7 +63,7 @@ def process_requests(engine: LLMEngine,
lora_request
=
lora_request
)
lora_request
=
lora_request
)
request_id
+=
1
request_id
+=
1
request_outputs
:
L
ist
[
RequestOutput
]
=
engine
.
step
()
request_outputs
:
l
ist
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
if
request_output
.
finished
:
print
(
"----------------------------------------------------"
)
print
(
"----------------------------------------------------"
)
...
...
examples/offline_inference/mlpspeculator.py
View file @
cf069aa8
...
@@ -2,12 +2,11 @@
...
@@ -2,12 +2,11 @@
import
gc
import
gc
import
time
import
time
from
typing
import
List
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
def
time_generation
(
llm
:
LLM
,
prompts
:
L
ist
[
str
],
def
time_generation
(
llm
:
LLM
,
prompts
:
l
ist
[
str
],
sampling_params
:
SamplingParams
):
sampling_params
:
SamplingParams
):
# Generate texts from the prompts. The output is a list of RequestOutput
# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
# objects that contain the prompt, generated text, and other information.
...
...
examples/offline_inference/multilora_inference.py
View file @
cf069aa8
...
@@ -6,7 +6,7 @@ for offline inference.
...
@@ -6,7 +6,7 @@ for offline inference.
Requires HuggingFace credentials for access to Llama2.
Requires HuggingFace credentials for access to Llama2.
"""
"""
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
...
@@ -16,7 +16,7 @@ from vllm.lora.request import LoRARequest
...
@@ -16,7 +16,7 @@ from vllm.lora.request import LoRARequest
def
create_test_prompts
(
def
create_test_prompts
(
lora_path
:
str
lora_path
:
str
)
->
L
ist
[
T
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
)
->
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
"""Create a list of test prompts with their sampling parameters.
"""Create a list of test prompts with their sampling parameters.
2 requests for base model, 4 requests for the LoRA. We define 2
2 requests for base model, 4 requests for the LoRA. We define 2
...
@@ -56,7 +56,7 @@ def create_test_prompts(
...
@@ -56,7 +56,7 @@ def create_test_prompts(
def
process_requests
(
engine
:
LLMEngine
,
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
L
ist
[
T
uple
[
str
,
SamplingParams
,
test_prompts
:
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]):
Optional
[
LoRARequest
]]]):
"""Continuously process a list of prompts and handle the outputs."""
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
request_id
=
0
...
@@ -70,7 +70,7 @@ def process_requests(engine: LLMEngine,
...
@@ -70,7 +70,7 @@ def process_requests(engine: LLMEngine,
lora_request
=
lora_request
)
lora_request
=
lora_request
)
request_id
+=
1
request_id
+=
1
request_outputs
:
L
ist
[
RequestOutput
]
=
engine
.
step
()
request_outputs
:
l
ist
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
if
request_output
.
finished
:
...
...
examples/offline_inference/prithvi_geospatial_mae.py
View file @
cf069aa8
...
@@ -21,7 +21,7 @@ import argparse
...
@@ -21,7 +21,7 @@ import argparse
import
datetime
import
datetime
import
os
import
os
import
re
import
re
from
typing
import
List
,
Union
from
typing
import
Union
import
albumentations
import
albumentations
import
numpy
as
np
import
numpy
as
np
...
@@ -260,9 +260,9 @@ def _convert_np_uint8(float_image: torch.Tensor):
...
@@ -260,9 +260,9 @@ def _convert_np_uint8(float_image: torch.Tensor):
def
load_example
(
def
load_example
(
file_paths
:
L
ist
[
str
],
file_paths
:
l
ist
[
str
],
mean
:
L
ist
[
float
]
=
None
,
mean
:
l
ist
[
float
]
=
None
,
std
:
L
ist
[
float
]
=
None
,
std
:
l
ist
[
float
]
=
None
,
indices
:
Union
[
list
[
int
],
None
]
=
None
,
indices
:
Union
[
list
[
int
],
None
]
=
None
,
):
):
"""Build an input example by loading images in *file_paths*.
"""Build an input example by loading images in *file_paths*.
...
...
examples/offline_inference/profiling.py
View file @
cf069aa8
...
@@ -5,8 +5,9 @@ import json
...
@@ -5,8 +5,9 @@ import json
import
os
import
os
import
sys
import
sys
from
argparse
import
RawTextHelpFormatter
from
argparse
import
RawTextHelpFormatter
from
collections.abc
import
Generator
from
dataclasses
import
asdict
,
dataclass
from
dataclasses
import
asdict
,
dataclass
from
typing
import
Any
,
Dict
,
Generator
,
List
,
Optional
,
TypeAlias
from
typing
import
Any
,
Optional
,
TypeAlias
import
torch
import
torch
import
tqdm
import
tqdm
...
@@ -42,8 +43,8 @@ def get_dtype(dtype: str):
...
@@ -42,8 +43,8 @@ def get_dtype(dtype: str):
return
dtype
return
dtype
OutputLen_NumReqs_Map
:
TypeAlias
=
D
ict
[
int
,
int
]
OutputLen_NumReqs_Map
:
TypeAlias
=
d
ict
[
int
,
int
]
def
compute_request_output_lengths
(
batch_size
:
int
,
step_requests
:
L
ist
[
int
])
\
def
compute_request_output_lengths
(
batch_size
:
int
,
step_requests
:
l
ist
[
int
])
\
->
OutputLen_NumReqs_Map
:
->
OutputLen_NumReqs_Map
:
"""
"""
Given the number of requests, batch_size, and the number of requests
Given the number of requests, batch_size, and the number of requests
...
@@ -63,7 +64,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
...
@@ -63,7 +64,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
Args:
Args:
batch_size (int): Number of requests submitted for profile. This is
batch_size (int): Number of requests submitted for profile. This is
args.batch_size.
args.batch_size.
step_requests (
L
ist[int]): step_requests[i] is the number of requests
step_requests (
l
ist[int]): step_requests[i] is the number of requests
that the ith engine step should process.
that the ith engine step should process.
Returns:
Returns:
...
@@ -114,7 +115,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
...
@@ -114,7 +115,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
return
ol_nr
return
ol_nr
def
determine_requests_per_step
(
context
:
ProfileContext
)
->
L
ist
[
int
]:
def
determine_requests_per_step
(
context
:
ProfileContext
)
->
l
ist
[
int
]:
"""
"""
Determine number of requests each engine step should process.
Determine number of requests each engine step should process.
If context.num_steps is set, then all engine steps process the
If context.num_steps is set, then all engine steps process the
...
@@ -130,7 +131,7 @@ def determine_requests_per_step(context: ProfileContext) -> List[int]:
...
@@ -130,7 +131,7 @@ def determine_requests_per_step(context: ProfileContext) -> List[int]:
context: ProfileContext object.
context: ProfileContext object.
Returns:
Returns:
L
ist[int]: Number of requests to process for all engine-steps.
l
ist[int]: Number of requests to process for all engine-steps.
output[i], contains the number of requests that the ith step
output[i], contains the number of requests that the ith step
should process.
should process.
"""
"""
...
@@ -170,7 +171,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
...
@@ -170,7 +171,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
for
key
,
value
in
asdict
(
context
).
items
():
for
key
,
value
in
asdict
(
context
).
items
():
print
(
f
"
{
key
}
=
{
value
}
"
)
print
(
f
"
{
key
}
=
{
value
}
"
)
requests_per_step
:
L
ist
[
int
]
=
determine_requests_per_step
(
context
)
requests_per_step
:
l
ist
[
int
]
=
determine_requests_per_step
(
context
)
ol_nr
:
OutputLen_NumReqs_Map
=
compute_request_output_lengths
(
ol_nr
:
OutputLen_NumReqs_Map
=
compute_request_output_lengths
(
context
.
batch_size
,
requests_per_step
)
context
.
batch_size
,
requests_per_step
)
...
...
examples/offline_inference/profiling_tpu/profiling.py
View file @
cf069aa8
...
@@ -4,7 +4,6 @@ import argparse
...
@@ -4,7 +4,6 @@ import argparse
import
dataclasses
import
dataclasses
import
os
import
os
import
time
import
time
from
typing
import
List
import
numpy
as
np
import
numpy
as
np
import
torch_xla.debug.profiler
as
xp
import
torch_xla.debug.profiler
as
xp
...
@@ -35,7 +34,7 @@ def main(args: argparse.Namespace):
...
@@ -35,7 +34,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
size
=
(
args
.
batch_size
,
size
=
(
args
.
batch_size
,
args
.
input_len
))
args
.
input_len
))
dummy_prompts
:
L
ist
[
PromptType
]
=
[{
dummy_prompts
:
l
ist
[
PromptType
]
=
[{
"prompt_token_ids"
:
batch
"prompt_token_ids"
:
batch
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
...
...
examples/offline_inference/vision_language_multi_image.py
View file @
cf069aa8
...
@@ -5,7 +5,7 @@ multi-image input on vision language models for text generation,
...
@@ -5,7 +5,7 @@ multi-image input on vision language models for text generation,
using the chat template defined by the model.
using the chat template defined by the model.
"""
"""
from
argparse
import
Namespace
from
argparse
import
Namespace
from
typing
import
List
,
NamedTuple
,
Optional
from
typing
import
NamedTuple
,
Optional
from
PIL.Image
import
Image
from
PIL.Image
import
Image
from
transformers
import
AutoProcessor
,
AutoTokenizer
from
transformers
import
AutoProcessor
,
AutoTokenizer
...
@@ -24,8 +24,8 @@ IMAGE_URLS = [
...
@@ -24,8 +24,8 @@ IMAGE_URLS = [
class
ModelRequestData
(
NamedTuple
):
class
ModelRequestData
(
NamedTuple
):
llm
:
LLM
llm
:
LLM
prompt
:
str
prompt
:
str
stop_token_ids
:
Optional
[
L
ist
[
int
]]
stop_token_ids
:
Optional
[
l
ist
[
int
]]
image_data
:
L
ist
[
Image
]
image_data
:
l
ist
[
Image
]
chat_template
:
Optional
[
str
]
chat_template
:
Optional
[
str
]
...
@@ -34,7 +34,7 @@ class ModelRequestData(NamedTuple):
...
@@ -34,7 +34,7 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4.
# Unless specified, these settings have been tested to work on a single L4.
def
load_aria
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_aria
(
question
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"rhymes-ai/Aria"
model_name
=
"rhymes-ai/Aria"
llm
=
LLM
(
model
=
model_name
,
llm
=
LLM
(
model
=
model_name
,
tokenizer_mode
=
"slow"
,
tokenizer_mode
=
"slow"
,
...
@@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
...
@@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
)
)
def
load_deepseek_vl2
(
question
:
str
,
image_urls
:
L
ist
[
str
]):
def
load_deepseek_vl2
(
question
:
str
,
image_urls
:
l
ist
[
str
]):
model_name
=
"deepseek-ai/deepseek-vl2-tiny"
model_name
=
"deepseek-ai/deepseek-vl2-tiny"
llm
=
LLM
(
model
=
model_name
,
llm
=
LLM
(
model
=
model_name
,
...
@@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]):
...
@@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]):
)
)
def
load_h2ovl
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_h2ovl
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"h2oai/h2ovl-mississippi-800m"
model_name
=
"h2oai/h2ovl-mississippi-800m"
llm
=
LLM
(
llm
=
LLM
(
...
@@ -111,7 +111,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
...
@@ -111,7 +111,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
)
)
def
load_idefics3
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_idefics3
(
question
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"HuggingFaceM4/Idefics3-8B-Llama3"
model_name
=
"HuggingFaceM4/Idefics3-8B-Llama3"
# The configuration below has been confirmed to launch on a single L40 GPU.
# The configuration below has been confirmed to launch on a single L40 GPU.
...
@@ -142,7 +142,7 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData:
...
@@ -142,7 +142,7 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData:
)
)
def
load_internvl
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_internvl
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"OpenGVLab/InternVL2-2B"
model_name
=
"OpenGVLab/InternVL2-2B"
llm
=
LLM
(
llm
=
LLM
(
...
@@ -179,7 +179,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
...
@@ -179,7 +179,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
)
)
def
load_mllama
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_mllama
(
question
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"meta-llama/Llama-3.2-11B-Vision-Instruct"
model_name
=
"meta-llama/Llama-3.2-11B-Vision-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU.
# The configuration below has been confirmed to launch on a single L40 GPU.
...
@@ -201,7 +201,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
...
@@ -201,7 +201,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
)
)
def
load_nvlm_d
(
question
:
str
,
image_urls
:
L
ist
[
str
]):
def
load_nvlm_d
(
question
:
str
,
image_urls
:
l
ist
[
str
]):
model_name
=
"nvidia/NVLM-D-72B"
model_name
=
"nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU
# Adjust this as necessary to fit in GPU
...
@@ -234,7 +234,7 @@ def load_nvlm_d(question: str, image_urls: List[str]):
...
@@ -234,7 +234,7 @@ def load_nvlm_d(question: str, image_urls: List[str]):
)
)
def
load_pixtral_hf
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_pixtral_hf
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"mistral-community/pixtral-12b"
model_name
=
"mistral-community/pixtral-12b"
# Adjust this as necessary to fit in GPU
# Adjust this as necessary to fit in GPU
...
@@ -259,7 +259,7 @@ def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData:
...
@@ -259,7 +259,7 @@ def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData:
)
)
def
load_phi3v
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_phi3v
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
# num_crops is an override kwarg to the multimodal image processor;
# num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# to use 16 for single frame scenarios, and 4 for multi-frame.
# to use 16 for single frame scenarios, and 4 for multi-frame.
...
@@ -295,7 +295,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
...
@@ -295,7 +295,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
def
load_qwen_vl_chat
(
question
:
str
,
def
load_qwen_vl_chat
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"Qwen/Qwen-VL-Chat"
model_name
=
"Qwen/Qwen-VL-Chat"
llm
=
LLM
(
llm
=
LLM
(
model
=
model_name
,
model
=
model_name
,
...
@@ -336,7 +336,7 @@ def load_qwen_vl_chat(question: str,
...
@@ -336,7 +336,7 @@ def load_qwen_vl_chat(question: str,
)
)
def
load_qwen2_vl
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_qwen2_vl
(
question
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
try
:
try
:
from
qwen_vl_utils
import
process_vision_info
from
qwen_vl_utils
import
process_vision_info
except
ModuleNotFoundError
:
except
ModuleNotFoundError
:
...
@@ -393,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
...
@@ -393,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
)
)
def
load_qwen2_5_vl
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_qwen2_5_vl
(
question
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
try
:
try
:
from
qwen_vl_utils
import
process_vision_info
from
qwen_vl_utils
import
process_vision_info
except
ModuleNotFoundError
:
except
ModuleNotFoundError
:
...
@@ -466,7 +466,7 @@ model_example_map = {
...
@@ -466,7 +466,7 @@ model_example_map = {
}
}
def
run_generate
(
model
,
question
:
str
,
image_urls
:
L
ist
[
str
]):
def
run_generate
(
model
,
question
:
str
,
image_urls
:
l
ist
[
str
]):
req_data
=
model_example_map
[
model
](
question
,
image_urls
)
req_data
=
model_example_map
[
model
](
question
,
image_urls
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
...
@@ -487,7 +487,7 @@ def run_generate(model, question: str, image_urls: List[str]):
...
@@ -487,7 +487,7 @@ def run_generate(model, question: str, image_urls: List[str]):
print
(
generated_text
)
print
(
generated_text
)
def
run_chat
(
model
:
str
,
question
:
str
,
image_urls
:
L
ist
[
str
]):
def
run_chat
(
model
:
str
,
question
:
str
,
image_urls
:
l
ist
[
str
]):
req_data
=
model_example_map
[
model
](
question
,
image_urls
)
req_data
=
model_example_map
[
model
](
question
,
image_urls
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
...
...
examples/online_serving/api_client.py
View file @
cf069aa8
...
@@ -7,7 +7,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API.
...
@@ -7,7 +7,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API.
import
argparse
import
argparse
import
json
import
json
from
typing
import
Iterable
,
List
from
collections.abc
import
Iterable
import
requests
import
requests
...
@@ -39,7 +39,7 @@ def post_http_request(prompt: str,
...
@@ -39,7 +39,7 @@ def post_http_request(prompt: str,
return
response
return
response
def
get_streaming_response
(
response
:
requests
.
Response
)
->
Iterable
[
L
ist
[
str
]]:
def
get_streaming_response
(
response
:
requests
.
Response
)
->
Iterable
[
l
ist
[
str
]]:
for
chunk
in
response
.
iter_lines
(
chunk_size
=
8192
,
for
chunk
in
response
.
iter_lines
(
chunk_size
=
8192
,
decode_unicode
=
False
,
decode_unicode
=
False
,
delimiter
=
b
"
\0
"
):
delimiter
=
b
"
\0
"
):
...
@@ -49,7 +49,7 @@ def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
...
@@ -49,7 +49,7 @@ def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
yield
output
yield
output
def
get_response
(
response
:
requests
.
Response
)
->
L
ist
[
str
]:
def
get_response
(
response
:
requests
.
Response
)
->
l
ist
[
str
]:
data
=
json
.
loads
(
response
.
content
)
data
=
json
.
loads
(
response
.
content
)
output
=
data
[
"text"
]
output
=
data
[
"text"
]
return
output
return
output
...
...
examples/online_serving/openai_embedding_client.py
View file @
cf069aa8
...
@@ -24,4 +24,4 @@ responses = client.embeddings.create(
...
@@ -24,4 +24,4 @@ responses = client.embeddings.create(
)
)
for
data
in
responses
.
data
:
for
data
in
responses
.
data
:
print
(
data
.
embedding
)
#
l
ist of float of len 4096
print
(
data
.
embedding
)
#
L
ist of float of len 4096
pyproject.toml
View file @
cf069aa8
...
@@ -65,6 +65,32 @@ exclude = [
...
@@ -65,6 +65,32 @@ exclude = [
[tool.ruff.lint.per-file-ignores]
[tool.ruff.lint.per-file-ignores]
"vllm/version.py"
=
["F401"]
"vllm/version.py"
=
["F401"]
"vllm/_version.py"
=
["ALL"]
"vllm/_version.py"
=
["ALL"]
# Python 3.8 typing. TODO: Remove these excludes after v1.0.0
"vllm/adapter_commons/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/attention/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/compilation/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/core/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/device_allocator/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/distributed/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/engine/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/executor/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/inputs/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/logging_utils/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/lora/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/model_executor/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/multimodal/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/platforms/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/plugins/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/profiler/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/prompt_adapter/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/spec_decode/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/third_party/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/transformers_utils/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/triton_utils/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/usage/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/vllm_flash_attn/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/assets/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/worker/**/*.py"
=
[
"UP006"
,
"UP035"
]
[tool.ruff.lint]
[tool.ruff.lint]
select
=
[
select
=
[
...
@@ -91,8 +117,6 @@ ignore = [
...
@@ -91,8 +117,6 @@ ignore = [
"B007"
,
"B007"
,
# f-string format
# f-string format
"UP032"
,
"UP032"
,
# Python 3.8 typing
"UP006"
,
"UP035"
,
# Can remove once 3.10+ is the minimum Python version
# Can remove once 3.10+ is the minimum Python version
"UP007"
,
"UP007"
,
]
]
...
...
Prev
1
2
3
4
5
6
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment