Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
dcb5624a
Commit
dcb5624a
authored
Apr 29, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.5' into v0.8.5-dev
parents
55880ca2
ba41cc90
Changes
554
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1096 additions
and
135 deletions
+1096
-135
tests/benchmarks/test_latency_cli.py
tests/benchmarks/test_latency_cli.py
+19
-0
tests/benchmarks/test_serve_cli.py
tests/benchmarks/test_serve_cli.py
+44
-0
tests/benchmarks/test_throughput_cli.py
tests/benchmarks/test_throughput_cli.py
+19
-0
tests/compile/test_full_graph.py
tests/compile/test_full_graph.py
+1
-5
tests/compile/test_functionalization.py
tests/compile/test_functionalization.py
+8
-6
tests/compile/test_fusion.py
tests/compile/test_fusion.py
+5
-4
tests/compile/test_pass_manager.py
tests/compile/test_pass_manager.py
+5
-4
tests/compile/test_sequence_parallelism.py
tests/compile/test_sequence_parallelism.py
+190
-0
tests/conftest.py
tests/conftest.py
+58
-32
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+3
-3
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+30
-1
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+3
-3
tests/distributed/test_sequence_parallel.py
tests/distributed/test_sequence_parallel.py
+296
-0
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+123
-7
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+28
-0
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+116
-2
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
.../openai/correctness/test_transcription_api_correctness.py
+1
-0
tests/entrypoints/openai/test_audio.py
tests/entrypoints/openai/test_audio.py
+32
-1
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+25
-18
tests/entrypoints/openai/test_embedding_dimensions.py
tests/entrypoints/openai/test_embedding_dimensions.py
+90
-49
No files found.
Too many changes to show.
To preserve performance only
554 of 554+
files are displayed.
Plain diff
Email patch
tests/benchmarks/test_latency_cli.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
subprocess
import
pytest
# Model id passed as `--model` to the `vllm bench latency` command below.
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.mark.benchmark
def test_bench_latency():
    """Smoke-test the ``vllm bench latency`` command line.

    The command is run in a child process; its captured stdout and stderr
    are echoed, and the test fails when the exit status is non-zero.
    """
    subcommand = ("vllm", "bench", "latency")
    workload = ("--model", MODEL_NAME, "--input-len", "32", "--output-len",
                "1")
    engine_flags = ("--enforce-eager", "--load-format", "dummy")
    command = [*subcommand, *workload, *engine_flags]

    result = subprocess.run(command, capture_output=True, text=True)
    for stream in (result.stdout, result.stderr):
        print(stream)

    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
tests/benchmarks/test_serve_cli.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
subprocess
import
pytest
from
..utils
import
RemoteOpenAIServer
# Model id used both for the server fixture and for `vllm bench serve` below.
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.fixture(scope="module")
def server():
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME``, shared per module.

    The server is entered as a context manager, so it is exited once the
    fixture is torn down.
    """
    server_args = ["--max-model-len", "1024"]
    server_args += ["--enforce-eager"]
    server_args += ["--load-format", "dummy"]

    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
        yield remote_server
@pytest.mark.benchmark
def test_bench_serve(server):
    """Smoke-test ``vllm bench serve`` against the module-scoped server.

    Args:
        server: fixture value; its ``host`` and ``port`` attributes are
            forwarded to the benchmark command.
    """
    subcommand = ("vllm", "bench", "serve")
    target = ("--model", MODEL_NAME, "--host", server.host, "--port",
              str(server.port))
    workload = ("--random-input-len", "32", "--random-output-len", "4",
                "--num-prompts", "5")
    command = [*subcommand, *target, *workload]

    result = subprocess.run(command, capture_output=True, text=True)
    for stream in (result.stdout, result.stderr):
        print(stream)

    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
tests/benchmarks/test_throughput_cli.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
subprocess
import
pytest
# Model id passed as `--model` to the `vllm bench throughput` command below.
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.mark.benchmark
def test_bench_throughput():
    """Smoke-test the ``vllm bench throughput`` command line.

    The command is run in a child process; its captured stdout and stderr
    are echoed, and the test fails when the exit status is non-zero.
    """
    subcommand = ("vllm", "bench", "throughput")
    workload = ("--model", MODEL_NAME, "--input-len", "32", "--output-len",
                "1")
    engine_flags = ("--enforce-eager", "--load-format", "dummy")
    command = [*subcommand, *workload, *engine_flags]

    result = subprocess.run(command, capture_output=True, text=True)
    for stream in (result.stdout, result.stderr):
        print(stream)

    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
tests/compile/test_full_graph.py
View file @
dcb5624a
...
...
@@ -20,15 +20,11 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
(
"facebook/opt-125m"
,
{}),
(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
{
"dtype"
:
torch
.
float16
,
"quantization"
:
"compressed-tensors"
}),
(
"neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic"
,
{
"dtype"
:
torch
.
float16
,
"quantization"
:
"compressed-tensors"
}),
(
"neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
,
{
"quantization"
:
"compressed-tensors"
}),
(
"neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
,
{}),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
{}),
]
...
...
tests/compile/test_functionalization.py
View file @
dcb5624a
...
...
@@ -11,7 +11,7 @@ from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey,
kFp8DynamicTokenSym
,
kFp8StaticTensorSym
)
from
vllm.compilation.fx_utils
import
find_auto_fn
,
find_auto_fn_maybe
,
is_func
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.config
import
CompilationConfig
from
vllm.config
import
CompilationConfig
,
VllmConfig
from
.backend
import
TestBackend
from
..utils
import
models_path_prefix
...
...
@@ -51,13 +51,15 @@ def test_fix_functionalization(model: str, quant_key: QuantKey,
do_fusion
:
bool
):
torch
.
set_default_device
(
"cuda"
)
config
=
CompilationConfig
.
PassConfig
(
enable_fusion
=
do_fusion
,
enable_noop
=
True
)
noop_pass
=
NoOpEliminationPass
(
config
)
fusion_pass
=
FusionPass
.
instance
(
config
)
vllm_config
=
VllmConfig
()
vllm_config
.
compilation_config
=
CompilationConfig
(
pass_config
=
\
CompilationConfig
.
PassConfig
(
enable_fusion
=
do_fusion
,
enable_noop
=
True
))
noop_pass
=
NoOpEliminationPass
(
vllm_config
)
fusion_pass
=
FusionPass
.
instance
(
vllm_config
)
passes
=
[
noop_pass
,
fusion_pass
]
if
do_fusion
else
[
noop_pass
]
func_pass
=
FixFunctionalizationPass
(
config
)
func_pass
=
FixFunctionalizationPass
(
vllm_
config
)
backend_func
=
TestBackend
(
*
passes
,
func_pass
)
backend_no_func
=
TestBackend
(
*
passes
)
...
...
tests/compile/test_fusion.py
View file @
dcb5624a
...
...
@@ -77,12 +77,13 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
level
=
CompilationLevel
.
PIECEWISE
,
custom_ops
=
[
"+rms_norm"
]))
vllm_config
.
compilation_config
.
pass_config
=
\
CompilationConfig
.
PassConfig
(
enable_fusion
=
True
,
enable_noop
=
True
)
with
vllm
.
config
.
set_current_vllm_config
(
vllm_config
):
# Reshape pass is needed for the fusion pass to work
config
=
CompilationConfig
.
PassConfig
(
enable_fusion
=
True
,
enable_noop
=
True
)
noop_pass
=
NoOpEliminationPass
(
config
)
fusion_pass
=
FusionPass
.
instance
(
config
)
noop_pass
=
NoOpEliminationPass
(
vllm_config
)
fusion_pass
=
FusionPass
.
instance
(
vllm_config
)
backend
=
TestBackend
(
noop_pass
,
fusion_pass
)
model
=
TestModel
(
hidden_size
,
eps
,
static
,
cutlass_fp8_enabled
)
...
...
tests/compile/test_pass_manager.py
View file @
dcb5624a
...
...
@@ -6,7 +6,7 @@ import torch
from
vllm.compilation.inductor_pass
import
CallableInductorPass
,
InductorPass
from
vllm.compilation.pass_manager
import
PostGradPassManager
from
vllm.config
import
Compilation
Config
from
vllm.config
import
Vllm
Config
# dummy custom pass that doesn't inherit
...
...
@@ -16,7 +16,7 @@ def simple_callable(graph: torch.fx.Graph):
# Should fail to add directly to the pass manager
def
test_bad_callable
():
config
=
CompilationConfig
().
pass_config
config
=
VllmConfig
()
pass_manager
=
PostGradPassManager
()
pass_manager
.
configure
(
config
)
...
...
@@ -43,7 +43,7 @@ class ProperPass(InductorPass):
],
)
def
test_pass_manager_uuid
(
callable
):
config
=
CompilationConfig
().
pass_config
config
=
VllmConfig
()
pass_manager
=
PostGradPassManager
()
pass_manager
.
configure
(
config
)
...
...
@@ -64,7 +64,8 @@ def test_pass_manager_uuid(callable):
# UUID should be different due to config change
config2
=
copy
.
deepcopy
(
config
)
config2
.
enable_fusion
=
not
config2
.
enable_fusion
config2
.
compilation_config
.
pass_config
.
enable_fusion
=
not
\
config2
.
compilation_config
.
pass_config
.
enable_fusion
pass_manager3
=
PostGradPassManager
()
pass_manager3
.
configure
(
config2
)
pass_manager3
.
add
(
callable
)
...
...
tests/compile/test_sequence_parallelism.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
import
vllm.envs
as
envs
from
vllm.compilation.fix_functionalization
import
FixFunctionalizationPass
from
vllm.compilation.fx_utils
import
(
find_auto_fn
,
find_auto_fn_maybe
,
find_specified_fn
,
find_specified_fn_maybe
,
is_func
)
from
vllm.compilation.sequence_parallelism
import
SequenceParallelismPass
from
vllm.config
import
(
CompilationConfig
,
DeviceConfig
,
ModelConfig
,
VllmConfig
)
from
vllm.distributed
import
tensor_model_parallel_all_reduce
from
vllm.distributed.parallel_state
import
(
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.platforms
import
current_platform
from
vllm.utils
import
update_environment_variables
from
..utils
import
multi_gpu_test
from
.backend
import
TestBackend
# Ops the test expects in the graph before the sequence-parallelism pass
# (and expects to be absent afterwards).
OPS_IN_MODEL_BEFORE = [
    torch.ops.vllm.all_reduce.default,
]

# Ops the test expects in the graph after the sequence-parallelism pass
# (and expects to be absent beforehand).
OPS_IN_MODEL_AFTER = [
    torch.ops.vllm.reduce_scatter.default,
    torch.ops.vllm.all_gather.default,
]

# Ops whose functionalized / de-functionalized form is checked by the test.
OPS_IN_MODEL = [torch.ops._C.fused_add_rms_norm.default]

# NOTE(review): not referenced in the visible part of this file — confirm
# whether it is still needed.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
class TestModel(torch.nn.Module):
    """Small module: projection matmul, tensor-parallel all-reduce, RMSNorm.

    It is compiled by the test in this file so that the resulting graph can
    be inspected before and after the sequence-parallelism pass.
    """

    def __init__(self, hidden_size=16, intermediate_size=32):
        super().__init__()
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size

        # Projection weight, stored as (intermediate_size, hidden_size).
        weight = torch.empty((intermediate_size, hidden_size))
        self.gate_proj = torch.nn.Parameter(weight)
        self.norm = RMSNorm(hidden_size, 1e-05)

        # Fill the (uninitialised) projection weight.
        torch.nn.init.normal_(self.gate_proj, std=0.02)

    def forward(self, hidden_states, residual):
        """Project, all-reduce and normalise the input.

        Args:
            hidden_states: input tensor; reshaped to ``(-1, hidden_size)``.
            residual: residual tensor handed to the norm layer together with
                the all-reduced projection.

        Returns:
            The ``(norm_output, residual_output)`` pair produced by the norm
            layer.
        """
        # Flatten every leading dimension into a single one.
        flat_states = hidden_states.reshape(-1, self.hidden_size)

        # Multiply by the transposed projection weight.
        weight_t = self.gate_proj.permute(1, 0)
        projected = torch.mm(flat_states, weight_t)

        # Tensor-parallel all-reduce of the projection.
        reduced = tensor_model_parallel_all_reduce(projected)

        # Norm layer called with the residual.
        norm_output, residual_output = self.norm(reduced, residual)
        return norm_output, residual_output
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seq_len", [16])
@pytest.mark.parametrize("hidden_size", [16])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda",
                    reason="Only test on CUDA")
def test_sequence_parallelism_pass(batch_size: int, seq_len: int,
                                   hidden_size: int, dtype: torch.dtype):
    """Run the sequence-parallelism pass check in two spawned processes.

    Each process runs ``sequence_parallelism_pass_on_test_model``; the
    spawner supplies the process index as the first argument and the tuple
    below supplies the remaining ones.
    """
    num_processes = 2
    worker_args = (num_processes, batch_size, seq_len, hidden_size, dtype)

    # need to use torch.mp.spawn otherwise will have problems with
    # torch.distributed and cuda
    torch.multiprocessing.spawn(sequence_parallelism_pass_on_test_model,
                                args=worker_args,
                                nprocs=num_processes)
def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
                                            batch_size: int, seq_len: int,
                                            hidden_size: int,
                                            dtype: torch.dtype):
    """Per-process body of the sequence-parallelism pass test.

    Sets up the device and the distributed environment for this rank,
    compiles ``TestModel`` with and without the fix-functionalization pass,
    and checks which ops appear in the graphs recorded by the test backends.

    Args:
        local_rank: index of this process; also selects ``cuda:<rank>``.
        world_size: number of processes, used as the tensor-parallel size.
        batch_size: multiplied by ``seq_len`` to give the input row count.
        seq_len: see ``batch_size``.
        hidden_size: width of the inputs and of the test model.
        dtype: default dtype and dtype of the random inputs.

    Raises:
        AssertionError: when an expected op is missing or an unexpected op
            is present in one of the inspected graphs.
    """
    current_platform.seed_everything(0)

    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

    update_environment_variables({
        'RANK': str(local_rank),
        'LOCAL_RANK': str(local_rank),
        'WORLD_SIZE': str(world_size),
        'MASTER_ADDR': 'localhost',
        'MASTER_PORT': '12345',
    })

    # initialize distributed
    init_distributed_environment()
    initialize_model_parallel(tensor_model_parallel_size=world_size)

    # configure vllm config for SequenceParallelismPass
    vllm_config = VllmConfig()
    vllm_config.compilation_config = CompilationConfig(
        pass_config=CompilationConfig.PassConfig(
            enable_sequence_parallelism=True, ), )
    vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))

    # this is a fake model name to construct the model config
    # in the vllm_config, it's not really used.
    # Kept under its own name so it is not confused with the module below.
    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
    vllm_config.model_config = ModelConfig(model=model_name,
                                           task="auto",
                                           tokenizer=model_name,
                                           tokenizer_mode="auto",
                                           trust_remote_code=True,
                                           dtype=dtype,
                                           seed=42)

    sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
    backend_no_func = TestBackend(sequence_parallelism_pass)
    func_pass = FixFunctionalizationPass(vllm_config)
    backend_func = TestBackend(sequence_parallelism_pass, func_pass)

    model = TestModel(hidden_size, hidden_size * 2)
    hidden_states = torch.randn((batch_size * seq_len, hidden_size),
                                dtype=dtype)
    residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)

    compiled_model_no_func = torch.compile(model, backend=backend_no_func)
    compiled_model_no_func(hidden_states, residual)
    compiled_model_func = torch.compile(model, backend=backend_func)
    compiled_model_func(hidden_states, residual)

    # Check substitution worked
    pre_nodes = backend_no_func.graph_pre_pass.nodes
    post_nodes = backend_no_func.graph_post_pass.nodes

    # In pre-nodes, all reduce should be there,
    # reduce scatter and all gather should not
    for op in OPS_IN_MODEL_BEFORE:
        find_specified_fn(pre_nodes, op)
    for op in OPS_IN_MODEL_AFTER:
        assert find_specified_fn_maybe(pre_nodes, op) is None

    # In post-nodes, reduce scatter and all gather should be there,
    # all reduce should not
    for op in OPS_IN_MODEL_AFTER:
        find_specified_fn(post_nodes, op)
    for op in OPS_IN_MODEL_BEFORE:
        assert find_specified_fn_maybe(post_nodes, op) is None

    # check if the functionalization pass is applied
    func_post_nodes = backend_func.graph_post_pass.nodes
    for op in OPS_IN_MODEL:
        find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
        assert find_auto_fn_maybe(func_post_nodes, op) is None

    # make sure the ops were all de-functionalized; report the missing ops
    # by name instead of failing with a bare KeyError
    found = {
        op
        for node in func_post_nodes for op in OPS_IN_MODEL
        if is_func(node, op)
    }
    missing = [op for op in OPS_IN_MODEL if op not in found]
    assert not missing, f"ops not de-functionalized: {missing}"
tests/conftest.py
View file @
dcb5624a
...
...
@@ -24,23 +24,24 @@ from transformers.models.auto.auto_factory import _BaseAutoModelClass
from
tests.models.utils
import
(
TokensTextLogprobs
,
TokensTextLogprobsPromptLogprobs
)
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
TaskOption
,
TokenizerPoolConfig
,
_get_and_verify_dtype
from
vllm.config
import
TaskOption
,
_get_and_verify_dtype
from
vllm.connections
import
global_http_connection
from
vllm.distributed
import
(
cleanup_dist_env_and_memory
,
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.inputs
import
(
ExplicitEncoderDecoderPrompt
,
TextPrompt
,
TokensPrompt
,
to_enc_dec_tuple_list
,
zip_enc_dec_prompts
)
to_enc_dec_tuple_list
,
zip_enc_dec_prompts
)
from
vllm.logger
import
init_logger
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.utils
import
cuda_device_count_stateless
,
is_list_of
from
vllm.utils
import
cuda_device_count_stateless
from
.utils
import
models_path_prefix
logger
=
init_logger
(
__name__
)
_TEST_DIR
=
os
.
path
.
dirname
(
__file__
)
...
...
@@ -109,10 +110,25 @@ class _VideoAssets(_VideoAssetsBase):
return
[
prompts
[
"sample_demo_1"
]]
class
_AudioAssetsBase
(
UserList
[
AudioAsset
]):
pass
class
_AudioAssets
(
_AudioAssetsBase
):
def
__init__
(
self
)
->
None
:
super
().
__init__
([
AudioAsset
(
"mary_had_lamb"
),
AudioAsset
(
"winning_call"
),
])
IMAGE_ASSETS
=
_ImageAssets
()
"""Singleton instance of :class:`_ImageAssets`."""
VIDEO_ASSETS
=
_VideoAssets
()
"""Singleton instance of :class:`_VideoAssets`."""
AUDIO_ASSETS
=
_AudioAssets
()
"""Singleton instance of :class:`_AudioAssets`."""
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
...
...
@@ -269,6 +285,11 @@ def video_assets() -> _VideoAssets:
return
VIDEO_ASSETS
@
pytest
.
fixture
(
scope
=
"session"
)
def
audio_assets
()
->
_AudioAssets
:
return
AUDIO_ASSETS
_T
=
TypeVar
(
"_T"
,
nn
.
Module
,
torch
.
Tensor
,
BatchEncoding
,
BatchFeature
,
dict
)
_R
=
TypeVar
(
"_R"
)
...
...
@@ -396,10 +417,15 @@ class HfRunner:
processor_kwargs
[
"images"
]
=
image
if
videos
is
not
None
and
(
video
:
=
videos
[
i
])
is
not
None
:
processor_kwargs
[
"videos"
]
=
video
if
audios
is
not
None
and
(
audio_tuple
:
=
audios
[
i
])
is
not
None
:
audio
,
sr
=
audio_tuple
processor_kwargs
[
"audio"
]
=
audio
processor_kwargs
[
"sampling_rate"
]
=
sr
if
audios
is
not
None
and
(
audio_inputs
:
=
audios
[
i
])
is
not
None
:
# HACK - not all processors take sampling_rate; we should
# clean this up in the future.
if
len
(
audio_inputs
)
==
2
:
audio
,
sr
=
audio_inputs
processor_kwargs
[
"audio"
]
=
audio
processor_kwargs
[
"sampling_rate"
]
=
sr
else
:
processor_kwargs
[
"audio"
]
=
audio_inputs
inputs
=
self
.
processor
(
**
processor_kwargs
)
if
isinstance
(
inputs
,
BatchFeature
):
...
...
@@ -474,12 +500,19 @@ class HfRunner:
prompts
:
list
[
str
],
beam_width
:
int
,
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
list
[
tuple
[
list
[
list
[
int
]],
list
[
str
]]]:
outputs
=
self
.
generate
(
prompts
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
num_beams
=
beam_width
,
num_return_sequences
=
beam_width
)
num_return_sequences
=
beam_width
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
for
i
in
range
(
len
(
outputs
)):
output_ids
,
output_str
=
outputs
[
i
]
for
j
in
range
(
len
(
output_ids
)):
...
...
@@ -530,7 +563,10 @@ class HfRunner:
for
_
,
hidden_state
in
enumerate
(
hidden_states
):
last_hidden_states
=
hidden_state
[
-
1
][
0
]
logits
=
torch
.
matmul
(
last_hidden_states
.
to
(
output_embeddings
.
weight
.
device
),
last_hidden_states
.
to
(
device
=
output_embeddings
.
weight
.
device
,
dtype
=
output_embeddings
.
weight
.
dtype
,
),
output_embeddings
.
weight
.
t
(),
)
if
getattr
(
output_embeddings
,
"bias"
,
None
)
is
not
None
:
...
...
@@ -924,6 +960,7 @@ class VllmRunner:
max_tokens
:
int
,
num_logprobs
:
int
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
skip_special_tokens
:
bool
=
True
,
)
->
Union
[
list
[
TokensTextLogprobs
],
list
[
TokensTextLogprobsPromptLogprobs
]]:
greedy_logprobs_params
=
SamplingParams
(
...
...
@@ -931,6 +968,7 @@ class VllmRunner:
max_tokens
=
max_tokens
,
logprobs
=
num_logprobs
,
prompt_logprobs
=
(
num_prompt_logprobs
),
skip_special_tokens
=
skip_special_tokens
,
)
'''
Greedy logprobs generation for vLLM encoder/decoder models
...
...
@@ -941,18 +979,20 @@ class VllmRunner:
def
generate_beam_search
(
self
,
prompts
:
Union
[
list
[
str
],
list
[
list
[
int
]]],
prompts
:
list
[
str
],
beam_width
:
int
,
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
list
[
tuple
[
list
[
list
[
int
]],
list
[
str
]]]:
if
is_list_of
(
prompts
,
str
,
check
=
"all"
):
prompts
=
[
TextPrompt
(
prompt
=
prompt
)
for
prompt
in
prompts
]
else
:
prompts
=
[
TokensPrompt
(
prompt_token_ids
=
tokens
)
for
tokens
in
prompts
]
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
outputs
=
self
.
model
.
beam_search
(
promp
ts
,
inpu
ts
,
BeamSearchParams
(
beam_width
=
beam_width
,
max_tokens
=
max_tokens
))
returned_outputs
=
[]
for
output
in
outputs
:
...
...
@@ -1005,20 +1045,6 @@ def vllm_runner():
return
VllmRunner
def
get_tokenizer_pool_config
(
tokenizer_group_type
):
if
tokenizer_group_type
is
None
:
return
None
if
tokenizer_group_type
==
"ray"
:
return
TokenizerPoolConfig
(
pool_size
=
1
,
pool_type
=
"ray"
,
extra_config
=
{})
if
isinstance
(
tokenizer_group_type
,
type
):
return
TokenizerPoolConfig
(
pool_size
=
1
,
pool_type
=
tokenizer_group_type
,
extra_config
=
{})
raise
ValueError
(
f
"Unknown tokenizer_group_type:
{
tokenizer_group_type
}
"
)
@
pytest
.
fixture
()
def
temporary_enable_log_propagate
():
import
logging
...
...
tests/core/block/e2e/test_correctness.py
View file @
dcb5624a
...
...
@@ -197,15 +197,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{
"block_size"
:
8
,
"block_size"
:
16
,
"max_num_batched_tokens"
:
2
,
"max_num_seqs"
:
2
,
},
{
"block_size"
:
8
,
"block_size"
:
16
,
"max_num_batched_tokens"
:
3
,
"max_num_seqs"
:
2
,
},
{
"block_size"
:
8
,
"block_size"
:
16
,
"max_num_batched_tokens"
:
256
,
"max_num_seqs"
:
10
,
}])
...
...
tests/distributed/test_comm_ops.py
View file @
dcb5624a
...
...
@@ -14,7 +14,8 @@ import torch
from
vllm.distributed
import
(
broadcast_tensor_dict
,
get_pp_group
,
tensor_model_parallel_all_gather
,
tensor_model_parallel_all_reduce
)
tensor_model_parallel_all_reduce
,
tensor_model_parallel_reduce_scatter
)
from
..utils
import
init_test_distributed_environment
,
multi_process_parallel
...
...
@@ -47,6 +48,34 @@ def all_reduce_test_worker(
torch
.
testing
.
assert_close
(
t
,
expected
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
reduce_scatter_test_worker
(
monkeypatch
:
pytest
.
MonkeyPatch
,
tp_size
:
int
,
pp_size
:
int
,
rank
:
int
,
distributed_init_port
:
str
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
monkeypatch
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
num_elements
=
8
all_tensors
=
[
torch
.
arange
(
num_elements
,
dtype
=
torch
.
float32
,
device
=
"cuda"
)
*
(
r
+
1
)
for
r
in
range
(
tp_size
)
]
index
=
rank
%
tp_size
partition_size
=
num_elements
//
tp_size
all_reduce
=
torch
.
sum
(
torch
.
stack
(
all_tensors
,
dim
=
0
),
dim
=
0
)
expected
=
all_reduce
[
index
*
partition_size
:(
index
+
1
)
*
partition_size
]
t
=
all_tensors
[
index
]
t
=
tensor_model_parallel_reduce_scatter
(
t
,
0
)
torch
.
testing
.
assert_close
(
t
,
expected
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
all_gather_test_worker
(
monkeypatch
:
pytest
.
MonkeyPatch
,
...
...
tests/distributed/test_pipeline_parallel.py
View file @
dcb5624a
...
...
@@ -161,12 +161,12 @@ TEXT_GENERATION_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/DeepSeek-V2-Lite-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-
2b
"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-
1.1-2b-it
"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-9b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"EleutherAI/gpt-j-6b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"EleutherAI/pythia-1
2
b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"EleutherAI/pythia-1
.4
b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerLM-3b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerMoE-3b"
):
PPTestSettings
.
fast
(),
# Uses Llama
...
...
@@ -195,7 +195,7 @@ TEXT_GENERATION_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-small-8k-instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
):
PPTestSettings
.
detailed
(
multi_node_only
=
True
,
load_format
=
"dummy"
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-7B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2
-7
B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2
.5-0.5
B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"stabilityai/stablelm-3b-4e1t"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
):
PPTestSettings
.
fast
(),
...
...
tests/distributed/test_sequence_parallel.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
(2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
important to set the distributed backend to "mp" to avoid Ray scheduling
all workers in a node other than the head node, which can cause the test
to fail.
"""
import
json
import
os
from
dataclasses
import
dataclass
from
typing
import
Literal
,
NamedTuple
,
Optional
import
pytest
from
vllm.config
import
TaskOption
from
vllm.logger
import
init_logger
from
..models.registry
import
HF_EXAMPLE_MODELS
from
..utils
import
compare_two_settings
,
create_new_process_for_each_test
# Module-level logger for this test file.
logger = init_logger("test_sequence_parallel")

# True only when the VLLM_MULTI_NODE environment variable is exactly "1".
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
class ParallelSetup(NamedTuple):
    """One parallel configuration to run the sequence-parallel test with."""

    # Tensor-parallel size.
    tp_size: int
    # Whether sequence parallelism is requested.
    sp_enabled: bool
    # Whether the server is started in eager mode.
    eager_mode: bool
    # Whether chunked prefill is turned on.
    chunked_prefill: bool
class SPTestOptions(NamedTuple):
    """Options shared by every parallel setup of one model."""

    # When true, the test only makes sense in a multi-node run.
    multi_node_only: bool
    # Optional load format; None means "not specified".
    load_format: Optional[str] = None
@dataclass
class SPTestSettings:
    """Bundle of parallel setups, backends and options for one model.

    ``iter_params`` expands an instance into one parameter tuple per
    (parallel setup, backend/version pair).
    """

    parallel_setups: list[ParallelSetup]
    # NOTE: the length of distributed_backends and
    # vllm_major_versions should be the same, and they
    # are first zipped together to iterate over all
    # test settings.
    distributed_backends: list[str]
    # vllm major version: "0" for V0, "1" for V1
    vllm_major_versions: list[str]
    task: TaskOption
    test_options: SPTestOptions

    def __post_init__(self):
        """Reject settings whose backend and version lists differ in length."""
        num_backends = len(self.distributed_backends)
        num_versions = len(self.vllm_major_versions)
        if num_backends == num_versions:
            return
        raise ValueError(f"Length mismatch: distributed_backends "
                         f"({num_backends}) != "
                         f"vllm_major_versions ({num_versions})")

    @staticmethod
    def detailed(
        *,
        tp_base: int = 2,
        multi_node_only: bool = False,
        task: TaskOption = "auto",
        load_format: Optional[str] = None,
    ):
        """Settings covering every eager-mode / chunked-prefill combination."""
        # Outer loop is eager_mode and inner loop is chunked_prefill, giving
        # (F, F), (F, T), (T, F), (T, T) in that order.
        setups = [
            ParallelSetup(tp_size=tp_base,
                          sp_enabled=True,
                          eager_mode=eager,
                          chunked_prefill=chunked)
            for eager in (False, True) for chunked in (False, True)
        ]
        options = SPTestOptions(multi_node_only=multi_node_only,
                                load_format=load_format)
        return SPTestSettings(
            parallel_setups=setups,
            distributed_backends=["mp", "ray"],
            vllm_major_versions=["1", "1"],
            task=task,
            test_options=options,
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 2,
        task: TaskOption = "auto",
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ):
        """Settings with a single setup: no eager mode, no chunked prefill."""
        only_setup = ParallelSetup(tp_size=tp_base,
                                   sp_enabled=True,
                                   eager_mode=False,
                                   chunked_prefill=False)
        options = SPTestOptions(multi_node_only=multi_node_only,
                                load_format=load_format)
        return SPTestSettings(
            parallel_setups=[only_setup],
            distributed_backends=["mp", "ray"],
            vllm_major_versions=["1", "1"],
            task=task,
            test_options=options,
        )

    def iter_params(self, model_id: str):
        """Yield one parameter tuple per setup and backend/version pair.

        Each tuple is ``(model_id, parallel_setup, backend,
        vllm_major_version, task, test_options)``.
        """
        options = self.test_options
        # Materialised once so the same pairs are reused for every setup.
        backend_pairs = list(
            zip(self.distributed_backends, self.vllm_major_versions))

        for setup in self.parallel_setups:
            for backend, major_version in backend_pairs:
                yield (model_id, setup, backend, major_version, self.task,
                       options)
def _compare_sp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    task: TaskOption,
    test_options: SPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
    is_multimodal: bool,
):
    """Compare a tensor-parallel run with and without sequence parallelism.

    Builds two argument lists for ``compare_two_settings``: one with the
    compilation config that requests sequence parallelism, and one plain
    tensor-parallel baseline on the "mp" backend.

    Args:
        model_id: model to look up in ``HF_EXAMPLE_MODELS`` and to serve.
        parallel_setup: unpacked into tp size, SP flag, eager flag and
            chunked-prefill flag.
        distributed_backend: executor backend for the SP run.
        vllm_major_version: value exported as ``VLLM_USE_V1`` to both runs.
        task: forwarded as ``--task`` unless it is ``"auto"``.
        test_options: unpacked into ``multi_node_only`` and ``load_format``.
        num_gpus_available: the test is skipped when fewer than
            ``tp_size * pp_size`` GPUs are available.
        method: forwarded to ``compare_two_settings``.
        is_multimodal: selects where the dummy-load overrides are placed.
    """
    (
        tp_size,
        sp_enabled,
        eager_mode,
        chunked_prefill,
    ) = parallel_setup

    multi_node_only, load_format = test_options

    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")

    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    hf_overrides = model_info.hf_overrides

    if load_format == "dummy":
        # Avoid OOM
        text_overrides = {
            "num_hidden_layers": 4,
            "hidden_size": 512,
            "intermediate_size": 800,
            "num_attention_heads": 4,
            "num_key_value_heads": 1,
        }

        if is_multimodal:
            hf_overrides.update({"text_config": text_overrides})
        else:
            hf_overrides.update(text_overrides)
    else:
        model_info.check_available_online(on_fail="skip")

    pp_size = 1
    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if task != "auto":
        common_args.extend(["--task", task])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])

    compilation_config = {
        'level': 3,
        'custom_ops': ["+rms_norm"],
        'compile_sizes': [4, 8],
        'splitting_ops': [],
        'pass_config': {
            # Spelled like the PassConfig field; the previous key
            # 'enable_sequence_parallism' did not match it.
            'enable_sequence_parallelism': sp_enabled,
            'enable_noop': True,
            'enable_fusion': True,
        },
    }

    # Two separate dicts so that neither run can affect the other's env.
    tp_sp_env = {
        "VLLM_USE_V1": vllm_major_version,
    }
    tp_env = {
        "VLLM_USE_V1": vllm_major_version,
    }

    tp_sp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
        "--compilation_config",
        # NOTE(review): passed as the dict's Python repr, not JSON —
        # presumably what the CLI parser expects; confirm before changing.
        str(compilation_config),
    ]

    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    try:
        compare_two_settings(model_id,
                             tp_sp_args,
                             tp_args,
                             tp_sp_env,
                             tp_env,
                             method=method)
    except Exception:
        # tp_sp_env is always a dict here, so only the version decides
        # whether the failure is tolerated.
        if vllm_major_version == "0":
            # Ray Compiled Graph tests are flaky for V0,
            # so we don't want to fail the test
            logger.exception("Ray Compiled Graph tests failed")
        else:
            raise
# Per-model settings; the parametrization below expands each entry through
# SPTestSettings.iter_params.
SP_TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.detailed(),
}

# Only model ids listed here are turned into test cases.
SP_TEST_MODELS = [
    # TODO support other models
    # [LANGUAGE GENERATION]
    "meta-llama/Llama-3.2-1B-Instruct",
]
def _collect_sp_generation_params():
    """Build the parametrize cases for `test_tp_sp_generation`.

    Walks every entry of `SP_TEXT_GENERATION_MODELS`, expands its settings via
    `iter_params(model_id)`, and keeps the cases whose model id is listed in
    `SP_TEST_MODELS`.
    """
    collected = []
    for model_id, settings in SP_TEXT_GENERATION_MODELS.items():
        for params in settings.iter_params(model_id):
            if model_id in SP_TEST_MODELS:
                collected.append(params)
    return collected


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend",
     "vllm_major_version", "task", "test_options"),
    _collect_sp_generation_params(),
)
@create_new_process_for_each_test()
def test_tp_sp_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    task: TaskOption,
    test_options: SPTestOptions,
    num_gpus_available,
):
    """Run the sequence-parallel comparison for one parametrized case.

    All arguments are forwarded unchanged to `_compare_sp`, using the
    "generate" method with `is_multimodal=False`.
    """
    _compare_sp(
        model_id,
        parallel_setup,
        distributed_backend,
        vllm_major_version,
        task,
        test_options,
        num_gpus_available,
        method="generate",
        is_multimodal=False,
    )
tests/engine/test_arg_utils.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
json
from
argparse
import
ArgumentError
,
ArgumentTypeError
from
contextlib
import
nullcontext
from
dataclasses
import
dataclass
,
field
from
typing
import
Literal
,
Optional
import
pytest
from
vllm.config
import
PoolerConfig
from
vllm.engine.arg_utils
import
EngineArgs
,
nullable_kvs
from
vllm.config
import
PoolerConfig
,
config
from
vllm.engine.arg_utils
import
(
EngineArgs
,
contains_type
,
get_kwargs
,
get_type
,
is_not_builtin
,
is_type
,
nullable_kvs
,
optional_type
)
from
vllm.utils
import
FlexibleArgumentParser
@pytest.mark.parametrize(
    ("type", "value", "expected"),
    [
        (int, "42", 42),
        (int, "None", None),
        (float, "3.14", 3.14),
        (float, "None", None),
        (str, "Hello World!", "Hello World!"),
        (str, "None", None),
        (json.loads, '{"foo":1,"bar":2}', {"foo": 1, "bar": 2}),
        (json.loads, "foo=1,bar=2", {"foo": 1, "bar": 2}),
        (json.loads, "None", None),
    ],
)
def test_optional_type(type, value, expected):
    """Check that `optional_type(type)` converts `value` to `expected`.

    The `"foo=1,bar=2"` input is additionally expected to emit a
    `DeprecationWarning`; every other input runs under a no-op context.
    """
    convert = optional_type(type)
    expects_deprecation = value == "foo=1,bar=2"
    if expects_deprecation:
        guard = pytest.warns(DeprecationWarning)
    else:
        guard = nullcontext()
    with guard:
        assert convert(value) == expected
@pytest.mark.parametrize(
    ("type_hint", "type", "expected"),
    [
        (int, int, True),
        (int, float, False),
        (list[int], list, True),
        (list[int], tuple, False),
        (Literal[0, 1], Literal, True),
    ],
)
def test_is_type(type_hint, type, expected):
    """Check `is_type(type_hint, type)` against the expected boolean."""
    outcome = is_type(type_hint, type)
    assert outcome == expected
@pytest.mark.parametrize(
    ("type_hints", "type", "expected"),
    [
        ({float, int}, int, True),
        ({int, tuple[int]}, int, True),
        ({int, tuple[int]}, float, False),
        ({str, Literal["x", "y"]}, Literal, True),
    ],
)
def test_contains_type(type_hints, type, expected):
    """Check `contains_type(type_hints, type)` against the expected boolean."""
    outcome = contains_type(type_hints, type)
    assert outcome == expected
@pytest.mark.parametrize(
    ("type_hints", "type", "expected"),
    [
        ({int, float}, int, int),
        ({int, float}, str, None),
        ({str, Literal["x", "y"]}, Literal, Literal["x", "y"]),
    ],
)
def test_get_type(type_hints, type, expected):
    """Check that `get_type(type_hints, type)` returns the expected hint.

    The cases cover a direct match, no match (`None`), and a `Literal` lookup
    that should return the full parametrized `Literal[...]` hint.
    """
    found = get_type(type_hints, type)
    assert found == expected
# Small config dataclass used by the tests in this module
# (`test_is_not_builtin` and `test_get_kwargs`). Every field has a default.
# NOTE(review): the string literal after each field is presumably an attribute
# docstring that `config` / `get_kwargs` reads — keep them verbatim; confirm
# against the vllm.config implementation.
@config
@dataclass
class DummyConfigClass:
    # Plain boolean field.
    regular_bool: bool = True
    """Regular bool with default True"""
    # Boolean that may also be None.
    optional_bool: Optional[bool] = None
    """Optional bool with default None"""
    # Literal choice that may also be None.
    optional_literal: Optional[Literal["x", "y"]] = None
    """Optional literal with default None"""
    # Variable-length tuple of ints.
    tuple_n: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3))
    """Tuple with default (1, 2, 3)"""
    # Fixed-length (two element) tuple of ints.
    tuple_2: tuple[int, int] = field(default_factory=lambda: (1, 2))
    """Tuple with default (1, 2)"""
    # List of ints.
    list_n: list[int] = field(default_factory=lambda: [1, 2, 3])
    """List with default [1, 2, 3]"""
@pytest.mark.parametrize(
    ("type_hint", "expected"),
    [
        (int, False),
        (DummyConfigClass, True),
    ],
)
def test_is_not_builtin(type_hint, expected):
    """Check `is_not_builtin` for a builtin type and a user-defined class."""
    outcome = is_not_builtin(type_hint)
    assert outcome == expected
def test_get_kwargs():
    """Check the per-field kwargs that `get_kwargs` builds for the dummy config."""
    kwargs = get_kwargs(DummyConfigClass)
    print(kwargs)

    # bools should not have their type set
    for bool_field in ("regular_bool", "optional_bool"):
        assert kwargs[bool_field].get("type") is None

    # optional literals should have None as a choice
    literal_kwargs = kwargs["optional_literal"]
    assert literal_kwargs["choices"] == ["x", "y", "None"]

    # tuples should have the correct nargs
    assert kwargs["tuple_n"]["nargs"] == "+"
    assert kwargs["tuple_2"]["nargs"] == 2

    # lists should work
    list_kwargs = kwargs["list_n"]
    assert list_kwargs["type"] is int
    assert list_kwargs["nargs"] == "+"
@
pytest
.
mark
.
parametrize
((
"arg"
,
"expected"
),
[
(
None
,
None
),
(
None
,
dict
()
),
(
"image=16"
,
{
"image"
:
16
}),
...
...
@@ -24,6 +128,10 @@ from vllm.utils import FlexibleArgumentParser
}),
])
def
test_limit_mm_per_prompt_parser
(
arg
,
expected
):
"""This functionality is deprecated and will be removed in the future.
This argument should be passed as JSON string instead.
TODO: Remove with nullable_kvs."""
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
if
arg
is
None
:
args
=
parser
.
parse_args
([])
...
...
@@ -53,12 +161,20 @@ def test_compilation_config():
assert
args
.
compilation_config
.
level
==
3
# set to string form of a dict
args
=
parser
.
parse_args
([
"--compilation-config"
,
"{'level': 3}"
])
assert
args
.
compilation_config
.
level
==
3
args
=
parser
.
parse_args
([
"--compilation-config"
,
"{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}"
,
])
assert
(
args
.
compilation_config
.
level
==
3
and
args
.
compilation_config
.
cudagraph_capture_sizes
==
[
1
,
2
,
4
,
8
])
# set to string form of a dict
args
=
parser
.
parse_args
([
"--compilation-config={'level': 3}"
])
assert
args
.
compilation_config
.
level
==
3
args
=
parser
.
parse_args
([
"--compilation-config="
"{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}"
,
])
assert
(
args
.
compilation_config
.
level
==
3
and
args
.
compilation_config
.
cudagraph_capture_sizes
==
[
1
,
2
,
4
,
8
])
def
test_prefix_cache_default
():
...
...
tests/entrypoints/llm/test_chat.py
View file @
dcb5624a
...
...
@@ -91,3 +91,31 @@ def test_chat_multi_image(image_urls: list[str]):
}]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
>=
0
def test_llm_chat_tokenization_no_double_bos():
    """
    LLM.chat() must not add special tokens on top of the chat template.
    Verify the llama chat prompt starts with exactly one BOS token.
    """
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True)
    conversation = [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": "Hello!"
        },
    ]

    results = llm.chat(conversation)
    assert len(results) == 1

    token_ids = getattr(results[0], "prompt_token_ids", None)
    assert token_ids is not None

    bos_id = llm.get_tokenizer().bos_token_id

    # The first token is BOS and the second one is not, i.e. a single BOS.
    assert token_ids[0] == bos_id
    assert token_ids[1] != bos_id, "Double BOS"
tests/entrypoints/llm/test_guided_generate.py
View file @
dcb5624a
...
...
@@ -308,7 +308,7 @@ def test_disable_guided_decoding_fallback(sample_regex, llm):
with
pytest
.
raises
(
ValueError
,
match
=
"xgrammar does not support advanced JSON schema features "
"like
enums, patterns or numeric range
s."
):
"like
string length, item limits, or property bound
s."
):
llm
.
generate
(
prompts
=
"This should fail"
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
...
...
@@ -386,4 +386,118 @@ def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str):
assert
generated_text
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
json_schema
)
\ No newline at end of file
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
json_schema
)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
def test_guided_number_range_json_completion(llm,
                                             guided_decoding_backend: str):
    """Generate JSON under a schema with numeric ranges and a string pattern.

    Each output must parse as JSON, validate against the schema, and satisfy
    the integer range, the float range and the zipcode regex when checked
    explicitly.
    """
    zipcode_pattern = r"^\d{5}(-\d{4})?$"
    sample_output_schema = {
        "type": "object",
        "properties": {
            "age": {
                "type": "integer",
                "minimum": 18,
                "maximum": 99
            },
            "score": {
                "type": "number",
                "minimum": 0.0,
                "maximum": 100.0
            },
            "zipcode": {
                "type": "string",
                "pattern": zipcode_pattern
            },
        },
        "required": ["age", "score", "zipcode"],
    }

    guided_params = GuidedDecodingParams(json=sample_output_schema,
                                         backend=guided_decoding_backend)
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
        guided_decoding=guided_params,
    )

    # The same prompt is submitted twice in one batch.
    request_prompts = [
        "Create a JSON object for a user with age, score, and zipcode."
    ] * 2
    outputs = llm.generate(
        prompts=request_prompts,
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None

    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt

        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

        output_json = json.loads(generated_text)
        jsonschema.validate(instance=output_json, schema=sample_output_schema)

        # Re-check each constraint directly, independent of jsonschema.
        assert 18 <= output_json["age"] <= 99
        assert 0.0 <= output_json["score"] <= 100.0
        zip_match = re.fullmatch(zipcode_pattern, output_json["zipcode"])
        assert zip_match is not None
@pytest.mark.skip_global_cleanup
def test_guidance_no_additional_properties(llm):
    """Compare the guidance backend with and without `no-additional-properties`.

    The schema declares only a1..a3 while the prompt asks for a1..a20. Without
    the option the extra keys a4..a6 are expected in the output; with it they
    must be absent.
    """
    schema = {
        'type': 'object',
        'properties': {
            'a1': {
                'type': 'string'
            },
            'a2': {
                'type': 'string'
            },
            'a3': {
                'type': 'string'
            }
        },
        'required': ['a1', 'a2', 'a3'],
    }

    prompt = (
        "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a "
        "helpful assistant.<|im_end|>\n<|im_start|>user\nPlease generate a "
        "large JSON object with key-value pairs a1=b1, a2=b2, ..., a20=b20"
        "<|im_end|>\n<|im_start|>assistant\n")

    def generate_with_backend(backend):
        """Generate once with `backend` and return the schema-valid JSON dict."""
        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=256,
            guided_decoding=GuidedDecodingParams(json=schema,
                                                 backend=backend),
        )

        outputs = llm.generate(prompts=prompt,
                               sampling_params=sampling_params)
        assert outputs is not None

        generated_text = outputs[0].outputs[0].text
        assert generated_text is not None

        parsed_json = json.loads(generated_text)
        assert isinstance(parsed_json, dict)
        jsonschema.validate(instance=parsed_json, schema=schema)
        return parsed_json

    declared_keys = ("a1", "a2", "a3")
    extra_keys = ("a4", "a5", "a6")

    base_generated = generate_with_backend('guidance:disable-any-whitespace')
    for key in declared_keys:
        assert key in base_generated
    # by default additional keys are generated
    for key in extra_keys:
        assert key in base_generated

    generated = generate_with_backend(
        'guidance:no-additional-properties,disable-any-whitespace')
    for key in declared_keys:
        assert key in generated
    for key in extra_keys:
        assert key not in generated
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
View file @
dcb5624a
...
...
@@ -150,6 +150,7 @@ def test_wer_correctness(model_name,
expected_wer
,
n_examples
=-
1
,
max_concurrent_request
=
None
):
# TODO refactor to use `ASRDataset`
with
RemoteOpenAIServer
(
model_name
,
[
'--enforce-eager'
])
as
remote_server
:
dataset
=
load_hf_dataset
(
dataset_repo
)
...
...
tests/entrypoints/openai/test_audio.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
json
import
openai
import
pytest
import
os
...
...
@@ -27,7 +29,7 @@ def server():
"--enforce-eager"
,
"--trust-remote-code"
,
"--limit-mm-per-prompt"
,
f
"audio
=
{
MAXIMUM_AUDIOS
}
"
,
json
.
dumps
({
"audio
"
:
MAXIMUM_AUDIOS
}
)
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
...
...
@@ -102,6 +104,35 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI,
                                               model_name: str,
                                               audio_url: str):
    """A bare-string `audio_url` content part must raise `BadRequestError`."""
    audio_part = {"type": "audio_url", "audio_url": audio_url}
    text_part = {"type": "text", "text": "What's happening in this audio?"}
    messages = [{
        "role": "user",
        "content": [audio_part, text_part],
    }]

    # audio_url should be a dict {"url": "some url"}, not directly a string
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(model=model_name,
                                             messages=messages,
                                             max_completion_tokens=10,
                                             temperature=0.0)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
[
TEST_AUDIO_URLS
[
0
]])
...
...
tests/entrypoints/openai/test_embedding.py
View file @
dcb5624a
...
...
@@ -12,11 +12,13 @@ import requests
from
vllm.entrypoints.openai.protocol
import
EmbeddingResponse
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...models.embedding.utils
import
c
heck_embeddings_close
from
...models.embedding.utils
import
c
orrectness_test
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"intfloat/multilingual-e5-small"
)
DUMMY_CHAT_TEMPLATE
=
"""{% for message in messages %}{{message['role'] + ': ' + message['content'] + '
\\
n'}}{% endfor %}"""
# noqa: E501
DTYPE
=
"bfloat16"
@
pytest
.
fixture
(
scope
=
"module"
)
...
...
@@ -26,7 +28,7 @@ def server():
"embed"
,
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
DTYPE
,
"--enforce-eager"
,
"--max-model-len"
,
"512"
,
...
...
@@ -44,9 +46,17 @@ async def client(server):
yield
async_client
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """Module-scoped fixture yielding an `hf_runner` context for `MODEL_NAME`.

    The runner is created with `DTYPE` and `is_sentence_transformer=True`.
    """
    runner_ctx = hf_runner(MODEL_NAME,
                           dtype=DTYPE,
                           is_sentence_transformer=True)
    with runner_ctx as reference_model:
        yield reference_model
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_single_embedding
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_single_embedding
(
hf_model
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
input_texts
=
[
"The chef prepared a delicious meal."
,
]
...
...
@@ -67,6 +77,9 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
assert
embeddings
.
usage
.
prompt_tokens
==
11
assert
embeddings
.
usage
.
total_tokens
==
11
vllm_outputs
=
[
d
.
embedding
for
d
in
embeddings
.
data
]
correctness_test
(
hf_model
,
input_texts
,
vllm_outputs
)
# test using token IDs
input_tokens
=
[
1
,
1
,
1
,
1
,
1
]
embedding_response
=
await
client
.
embeddings
.
create
(
...
...
@@ -87,7 +100,8 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_batch_embedding
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_batch_embedding
(
hf_model
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test list[str]
input_texts
=
[
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
...
...
@@ -108,6 +122,9 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
assert
embeddings
.
usage
.
prompt_tokens
==
33
assert
embeddings
.
usage
.
total_tokens
==
33
vllm_outputs
=
[
d
.
embedding
for
d
in
embeddings
.
data
]
correctness_test
(
hf_model
,
input_texts
,
vllm_outputs
)
# test list[list[int]]
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
[
25
,
32
,
64
,
77
]]
...
...
@@ -182,7 +199,7 @@ async def test_conversation_embedding(server: RemoteOpenAIServer,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_batch_base64_embedding
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_batch_base64_embedding
(
hf_model
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
input_texts
=
[
"Hello my name is"
,
...
...
@@ -193,6 +210,7 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
model
=
model_name
,
encoding_format
=
"float"
)
float_data
=
[
d
.
embedding
for
d
in
responses_float
.
data
]
correctness_test
(
hf_model
,
input_texts
,
float_data
)
responses_base64
=
await
client
.
embeddings
.
create
(
input
=
input_texts
,
model
=
model_name
,
...
...
@@ -203,24 +221,13 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
np
.
frombuffer
(
base64
.
b64decode
(
data
.
embedding
),
dtype
=
"float32"
).
tolist
())
check_embeddings_close
(
embeddings_0_lst
=
float_data
,
embeddings_1_lst
=
base64_data
,
name_0
=
"float"
,
name_1
=
"base64"
,
)
correctness_test
(
hf_model
,
input_texts
,
base64_data
)
# Default response is float32 decoded from base64 by OpenAI Client
responses_default
=
await
client
.
embeddings
.
create
(
input
=
input_texts
,
model
=
model_name
)
default_data
=
[
d
.
embedding
for
d
in
responses_default
.
data
]
check_embeddings_close
(
embeddings_0_lst
=
float_data
,
embeddings_1_lst
=
default_data
,
name_0
=
"float"
,
name_1
=
"default"
,
)
correctness_test
(
hf_model
,
input_texts
,
default_data
)
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_embedding_dimensions.py
View file @
dcb5624a
...
...
@@ -3,80 +3,121 @@
Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
"""
from
typing
import
NamedTuple
from
typing
import
Optional
import
openai
import
pytest
from
vllm.entrypoints.openai.protocol
import
EmbeddingResponse
from
...conftest
import
HfRunner
from
...models.embedding.utils
import
EmbedModelInfo
,
correctness_test
from
...utils
import
RemoteOpenAIServer
class
ModelInfo
(
NamedTuple
):
name
:
str
is_matryoshka
:
bool
MODELS
=
[
ModelInfo
(
name
=
"BAAI/bge-m3"
,
is_matryoshka
=
False
),
ModelInfo
(
name
=
"jinaai/jina-embeddings-v3"
,
is_matryoshka
=
True
),
EmbedModelInfo
(
"intfloat/multilingual-e5-small"
,
is_matryoshka
=
False
),
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-v1.5"
,
is_matryoshka
=
True
,
matryoshka_dimensions
=
[
256
]),
]
input_texts
=
[
"The chef prepared a delicious meal."
,
]
*
3
]
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
async
def
test_validating_dimensions
(
model
:
ModelInfo
):
@pytest.fixture(scope="module", params=MODELS)
def model_info(request):
    """Module-scoped fixture parametrized over each entry of `MODELS`."""
    current_model = request.param
    return current_model
@pytest.fixture(scope="module", params=["bfloat16"])
def dtype(request):
    """Module-scoped fixture supplying the dtype string (only "bfloat16")."""
    current_dtype = request.param
    return current_dtype
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
model_info
,
dtype
:
str
):
args
=
[
"--task"
,
"embed"
,
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
dtype
,
"--enforce-eager"
,
"--max-model-len"
,
"512"
,
"--trust_remote_code"
"512"
]
with
RemoteOpenAIServer
(
model
.
name
,
args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
async
def
make_request
(
dimensions
):
embedding_response
=
await
client
.
embeddings
.
create
(
model
=
model
.
name
,
input
=
input_texts
,
dimensions
=
dimensions
,
encoding_format
=
"float"
,
)
embeddings
=
EmbeddingResponse
.
model_validate
(
embedding_response
.
model_dump
(
mode
=
"json"
))
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
[
0
].
embedding
)
>
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
>
0
assert
embeddings
.
usage
.
total_tokens
>
0
if
dimensions
is
not
None
:
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
dimensions
if
model
.
is_matryoshka
:
for
dimensions
in
[
None
,
16
]:
await
make_request
(
dimensions
)
if
model_info
.
name
==
"Snowflake/snowflake-arctic-embed-m-v1.5"
:
# Manually enable Matryoshka Embeddings
args
.
extend
([
"--trust_remote_code"
,
"--hf_overrides"
,
'{"matryoshka_dimensions":[256]}'
])
with
RemoteOpenAIServer
(
model_info
.
name
,
args
)
as
remote_server
:
yield
remote_server
@pytest.fixture(scope="module")
def hf_model(hf_runner, model_info, dtype: str):
    """Module-scoped fixture yielding an `hf_runner` context for `model_info.name`.

    The runner is created with the requested `dtype` and
    `is_sentence_transformer=True`.
    """
    runner_ctx = hf_runner(model_info.name,
                           dtype=dtype,
                           is_sentence_transformer=True)
    with runner_ctx as reference_model:
        yield reference_model
@
pytest
.
mark
.
asyncio
async
def
test_matryoshka
(
model_info
:
EmbedModelInfo
,
server
:
RemoteOpenAIServer
,
hf_model
:
HfRunner
):
client
=
server
.
get_async_client
()
async
def
make_request_and_correctness_test
(
dimensions
):
prompts
=
input_texts
*
3
embedding_response
=
await
client
.
embeddings
.
create
(
model
=
model_info
.
name
,
input
=
prompts
,
dimensions
=
dimensions
,
encoding_format
=
"float"
,
)
embeddings
=
EmbeddingResponse
.
model_validate
(
embedding_response
.
model_dump
(
mode
=
"json"
))
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
[
0
].
embedding
)
>
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
>
0
assert
embeddings
.
usage
.
total_tokens
>
0
if
dimensions
is
not
None
:
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
dimensions
vllm_outputs
=
[
d
.
embedding
for
d
in
embeddings
.
data
]
correctness_test
(
hf_model
,
prompts
,
vllm_outputs
,
dimensions
)
if
model_info
.
is_matryoshka
:
valid_dimensions
:
list
[
Optional
[
int
]]
=
[
None
]
if
model_info
.
matryoshka_dimensions
is
not
None
:
valid_dimensions
+=
model_info
.
matryoshka_dimensions
[:
2
]
for
dimensions
in
valid_dimensions
:
await
make_request_and_correctness_test
(
dimensions
)
invalid_dimensions
:
list
[
Optional
[
int
]]
=
[
-
1
]
if
model_info
.
matryoshka_dimensions
is
not
None
:
assert
5
not
in
model_info
.
matryoshka_dimensions
invalid_dimensions
.
append
(
5
)
for
dimensions
in
invalid_dimensions
:
with
pytest
.
raises
(
openai
.
BadRequestError
):
for
dimensions
in
[
-
1
]:
await
make_request
(
dimensions
)
await
make_request_and_correctness_test
(
dimensions
)
else
:
for
dimensions
in
[
None
]:
await
make_request
(
dimensions
)
else
:
for
dimensions
in
[
None
]:
await
make_request
_and_correctness_test
(
dimensions
)
for
dimensions
in
[
-
1
,
16
]:
with
pytest
.
raises
(
openai
.
BadRequestError
):
for
dimensions
in
[
-
1
,
16
]:
await
make_request
(
dimensions
)
await
make_request_and_correctness_test
(
dimensions
)
Prev
1
…
7
8
9
10
11
12
13
14
15
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment