Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
dcb5624a
Commit
dcb5624a
authored
Apr 29, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.5' into v0.8.5-dev
parents
55880ca2
ba41cc90
Changes
554
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1096 additions
and
135 deletions
+1096
-135
tests/benchmarks/test_latency_cli.py
tests/benchmarks/test_latency_cli.py
+19
-0
tests/benchmarks/test_serve_cli.py
tests/benchmarks/test_serve_cli.py
+44
-0
tests/benchmarks/test_throughput_cli.py
tests/benchmarks/test_throughput_cli.py
+19
-0
tests/compile/test_full_graph.py
tests/compile/test_full_graph.py
+1
-5
tests/compile/test_functionalization.py
tests/compile/test_functionalization.py
+8
-6
tests/compile/test_fusion.py
tests/compile/test_fusion.py
+5
-4
tests/compile/test_pass_manager.py
tests/compile/test_pass_manager.py
+5
-4
tests/compile/test_sequence_parallelism.py
tests/compile/test_sequence_parallelism.py
+190
-0
tests/conftest.py
tests/conftest.py
+58
-32
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+3
-3
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+30
-1
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+3
-3
tests/distributed/test_sequence_parallel.py
tests/distributed/test_sequence_parallel.py
+296
-0
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+123
-7
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+28
-0
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+116
-2
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
.../openai/correctness/test_transcription_api_correctness.py
+1
-0
tests/entrypoints/openai/test_audio.py
tests/entrypoints/openai/test_audio.py
+32
-1
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+25
-18
tests/entrypoints/openai/test_embedding_dimensions.py
tests/entrypoints/openai/test_embedding_dimensions.py
+90
-49
No files found.
Too many changes to show.
To preserve performance only
554 of 554+
files are displayed.
Plain diff
Email patch
tests/benchmarks/test_latency_cli.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
subprocess
import
pytest
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"


@pytest.mark.benchmark
def test_bench_latency():
    """Smoke-test the ``vllm bench latency`` command-line entry point.

    The CLI is launched as a child process with a small workload
    (input length 32, output length 1), ``--enforce-eager`` and
    ``--load-format dummy``.  Captured stdout/stderr are echoed so they
    show up in the pytest report, and the test fails if the process exits
    with a non-zero status.
    """
    # NOTE(review): "--load-format dummy" presumably avoids loading real
    # weights; confirm against the vLLM CLI documentation.
    bench_options = [
        "--model", MODEL_NAME,
        "--input-len", "32",
        "--output-len", "1",
        "--enforce-eager",
        "--load-format", "dummy",
    ]
    completed = subprocess.run(["vllm", "bench", "latency", *bench_options],
                               capture_output=True,
                               text=True)

    # Echo both captured streams (stdout first, then stderr).
    for captured in (completed.stdout, completed.stderr):
        print(captured)

    assert completed.returncode == 0, f"Benchmark failed: {completed.stderr}"
tests/benchmarks/test_serve_cli.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
subprocess
import
pytest
from
..utils
import
RemoteOpenAIServer
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"


@pytest.fixture(scope="module")
def server():
    """Module-scoped fixture yielding a running ``RemoteOpenAIServer``.

    The server is started for ``MODEL_NAME`` with a 1024-token maximum
    model length, ``--enforce-eager`` and ``--load-format dummy``.  It is
    used as a context manager, so it is torn down when the fixture
    generator is finalized.
    """
    server_cli_args = [
        "--max-model-len", "1024",
        "--enforce-eager",
        "--load-format", "dummy",
    ]

    with RemoteOpenAIServer(MODEL_NAME, server_cli_args) as remote_server:
        yield remote_server
@pytest.mark.benchmark
def test_bench_serve(server):
    """Smoke-test ``vllm bench serve`` against the ``server`` fixture.

    Args:
        server: Running server object; its ``host`` and ``port``
            attributes are forwarded to the benchmark CLI.

    The benchmark sends 5 prompts with random input length 32 and random
    output length 4.  Captured output is echoed and the test fails when
    the CLI exits with a non-zero status.
    """
    endpoint_options = ["--host", server.host, "--port", str(server.port)]
    workload_options = [
        "--random-input-len", "32",
        "--random-output-len", "4",
        "--num-prompts", "5",
    ]
    command = [
        "vllm", "bench", "serve",
        "--model", MODEL_NAME,
        *endpoint_options,
        *workload_options,
    ]

    completed = subprocess.run(command, capture_output=True, text=True)

    # Echo both captured streams (stdout first, then stderr).
    for captured in (completed.stdout, completed.stderr):
        print(captured)

    assert completed.returncode == 0, f"Benchmark failed: {completed.stderr}"
tests/benchmarks/test_throughput_cli.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
subprocess
import
pytest
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"


@pytest.mark.benchmark
def test_bench_throughput():
    """Smoke-test the ``vllm bench throughput`` command-line entry point.

    The CLI is launched as a child process with a small workload
    (input length 32, output length 1), ``--enforce-eager`` and
    ``--load-format dummy``.  Captured stdout/stderr are echoed so they
    show up in the pytest report, and the test fails if the process exits
    with a non-zero status.
    """
    bench_options = [
        "--model", MODEL_NAME,
        "--input-len", "32",
        "--output-len", "1",
        "--enforce-eager",
        "--load-format", "dummy",
    ]
    completed = subprocess.run(["vllm", "bench", "throughput", *bench_options],
                               capture_output=True,
                               text=True)

    # Echo both captured streams (stdout first, then stderr).
    for captured in (completed.stdout, completed.stderr):
        print(captured)

    assert completed.returncode == 0, f"Benchmark failed: {completed.stderr}"
tests/compile/test_full_graph.py
View file @
dcb5624a
...
@@ -20,15 +20,11 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
...
@@ -20,15 +20,11 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
(
"facebook/opt-125m"
,
{}),
(
"facebook/opt-125m"
,
{}),
(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
{
(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
{
"dtype"
:
torch
.
float16
,
"dtype"
:
torch
.
float16
,
"quantization"
:
"compressed-tensors"
}),
}),
(
"neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic"
,
{
(
"neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic"
,
{
"dtype"
:
torch
.
float16
,
"dtype"
:
torch
.
float16
,
"quantization"
:
"compressed-tensors"
}),
(
"neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
,
{
"quantization"
:
"compressed-tensors"
}),
}),
(
"neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
,
{}),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
{}),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
{}),
]
]
...
...
tests/compile/test_functionalization.py
View file @
dcb5624a
...
@@ -11,7 +11,7 @@ from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey,
...
@@ -11,7 +11,7 @@ from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey,
kFp8DynamicTokenSym
,
kFp8StaticTensorSym
)
kFp8DynamicTokenSym
,
kFp8StaticTensorSym
)
from
vllm.compilation.fx_utils
import
find_auto_fn
,
find_auto_fn_maybe
,
is_func
from
vllm.compilation.fx_utils
import
find_auto_fn
,
find_auto_fn_maybe
,
is_func
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.config
import
CompilationConfig
from
vllm.config
import
CompilationConfig
,
VllmConfig
from
.backend
import
TestBackend
from
.backend
import
TestBackend
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
...
@@ -51,13 +51,15 @@ def test_fix_functionalization(model: str, quant_key: QuantKey,
...
@@ -51,13 +51,15 @@ def test_fix_functionalization(model: str, quant_key: QuantKey,
do_fusion
:
bool
):
do_fusion
:
bool
):
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
config
=
CompilationConfig
.
PassConfig
(
enable_fusion
=
do_fusion
,
vllm_config
=
VllmConfig
()
enable_noop
=
True
)
vllm_config
.
compilation_config
=
CompilationConfig
(
pass_config
=
\
noop_pass
=
NoOpEliminationPass
(
config
)
CompilationConfig
.
PassConfig
(
enable_fusion
=
do_fusion
,
fusion_pass
=
FusionPass
.
instance
(
config
)
enable_noop
=
True
))
noop_pass
=
NoOpEliminationPass
(
vllm_config
)
fusion_pass
=
FusionPass
.
instance
(
vllm_config
)
passes
=
[
noop_pass
,
fusion_pass
]
if
do_fusion
else
[
noop_pass
]
passes
=
[
noop_pass
,
fusion_pass
]
if
do_fusion
else
[
noop_pass
]
func_pass
=
FixFunctionalizationPass
(
config
)
func_pass
=
FixFunctionalizationPass
(
vllm_
config
)
backend_func
=
TestBackend
(
*
passes
,
func_pass
)
backend_func
=
TestBackend
(
*
passes
,
func_pass
)
backend_no_func
=
TestBackend
(
*
passes
)
backend_no_func
=
TestBackend
(
*
passes
)
...
...
tests/compile/test_fusion.py
View file @
dcb5624a
...
@@ -77,12 +77,13 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
...
@@ -77,12 +77,13 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
level
=
CompilationLevel
.
PIECEWISE
,
custom_ops
=
[
"+rms_norm"
]))
level
=
CompilationLevel
.
PIECEWISE
,
custom_ops
=
[
"+rms_norm"
]))
vllm_config
.
compilation_config
.
pass_config
=
\
CompilationConfig
.
PassConfig
(
enable_fusion
=
True
,
enable_noop
=
True
)
with
vllm
.
config
.
set_current_vllm_config
(
vllm_config
):
with
vllm
.
config
.
set_current_vllm_config
(
vllm_config
):
# Reshape pass is needed for the fusion pass to work
# Reshape pass is needed for the fusion pass to work
config
=
CompilationConfig
.
PassConfig
(
enable_fusion
=
True
,
noop_pass
=
NoOpEliminationPass
(
vllm_config
)
enable_noop
=
True
)
fusion_pass
=
FusionPass
.
instance
(
vllm_config
)
noop_pass
=
NoOpEliminationPass
(
config
)
fusion_pass
=
FusionPass
.
instance
(
config
)
backend
=
TestBackend
(
noop_pass
,
fusion_pass
)
backend
=
TestBackend
(
noop_pass
,
fusion_pass
)
model
=
TestModel
(
hidden_size
,
eps
,
static
,
cutlass_fp8_enabled
)
model
=
TestModel
(
hidden_size
,
eps
,
static
,
cutlass_fp8_enabled
)
...
...
tests/compile/test_pass_manager.py
View file @
dcb5624a
...
@@ -6,7 +6,7 @@ import torch
...
@@ -6,7 +6,7 @@ import torch
from
vllm.compilation.inductor_pass
import
CallableInductorPass
,
InductorPass
from
vllm.compilation.inductor_pass
import
CallableInductorPass
,
InductorPass
from
vllm.compilation.pass_manager
import
PostGradPassManager
from
vllm.compilation.pass_manager
import
PostGradPassManager
from
vllm.config
import
Compilation
Config
from
vllm.config
import
Vllm
Config
# dummy custom pass that doesn't inherit
# dummy custom pass that doesn't inherit
...
@@ -16,7 +16,7 @@ def simple_callable(graph: torch.fx.Graph):
...
@@ -16,7 +16,7 @@ def simple_callable(graph: torch.fx.Graph):
# Should fail to add directly to the pass manager
# Should fail to add directly to the pass manager
def
test_bad_callable
():
def
test_bad_callable
():
config
=
CompilationConfig
().
pass_config
config
=
VllmConfig
()
pass_manager
=
PostGradPassManager
()
pass_manager
=
PostGradPassManager
()
pass_manager
.
configure
(
config
)
pass_manager
.
configure
(
config
)
...
@@ -43,7 +43,7 @@ class ProperPass(InductorPass):
...
@@ -43,7 +43,7 @@ class ProperPass(InductorPass):
],
],
)
)
def
test_pass_manager_uuid
(
callable
):
def
test_pass_manager_uuid
(
callable
):
config
=
CompilationConfig
().
pass_config
config
=
VllmConfig
()
pass_manager
=
PostGradPassManager
()
pass_manager
=
PostGradPassManager
()
pass_manager
.
configure
(
config
)
pass_manager
.
configure
(
config
)
...
@@ -64,7 +64,8 @@ def test_pass_manager_uuid(callable):
...
@@ -64,7 +64,8 @@ def test_pass_manager_uuid(callable):
# UUID should be different due to config change
# UUID should be different due to config change
config2
=
copy
.
deepcopy
(
config
)
config2
=
copy
.
deepcopy
(
config
)
config2
.
enable_fusion
=
not
config2
.
enable_fusion
config2
.
compilation_config
.
pass_config
.
enable_fusion
=
not
\
config2
.
compilation_config
.
pass_config
.
enable_fusion
pass_manager3
=
PostGradPassManager
()
pass_manager3
=
PostGradPassManager
()
pass_manager3
.
configure
(
config2
)
pass_manager3
.
configure
(
config2
)
pass_manager3
.
add
(
callable
)
pass_manager3
.
add
(
callable
)
...
...
tests/compile/test_sequence_parallelism.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
import
vllm.envs
as
envs
from
vllm.compilation.fix_functionalization
import
FixFunctionalizationPass
from
vllm.compilation.fx_utils
import
(
find_auto_fn
,
find_auto_fn_maybe
,
find_specified_fn
,
find_specified_fn_maybe
,
is_func
)
from
vllm.compilation.sequence_parallelism
import
SequenceParallelismPass
from
vllm.config
import
(
CompilationConfig
,
DeviceConfig
,
ModelConfig
,
VllmConfig
)
from
vllm.distributed
import
tensor_model_parallel_all_reduce
from
vllm.distributed.parallel_state
import
(
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.platforms
import
current_platform
from
vllm.utils
import
update_environment_variables
from
..utils
import
multi_gpu_test
from
.backend
import
TestBackend
# Ops expected in the graph before the sequence-parallelism pass runs
# (and expected to be gone afterwards).
OPS_IN_MODEL_BEFORE = [torch.ops.vllm.all_reduce.default]

# Ops expected in the graph after the sequence-parallelism pass has run
# (and expected to be absent beforehand).
OPS_IN_MODEL_AFTER = [
    torch.ops.vllm.reduce_scatter.default,
    torch.ops.vllm.all_gather.default,
]

# Ops checked by the functionalization / de-functionalization assertions.
OPS_IN_MODEL = [torch.ops._C.fused_add_rms_norm.default]

# NOTE(review): not referenced anywhere else in this file as shown.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
class TestModel(torch.nn.Module):
    """Small module used as the compilation target for the pass tests.

    Its forward pass is: reshape -> matmul with ``gate_proj`` ->
    tensor-parallel all-reduce -> ``RMSNorm`` with a residual input.
    """

    def __init__(self, hidden_size=16, intermediate_size=32):
        """Create the projection weight and the norm layer.

        Args:
            hidden_size: Size of the last input dimension; also the size
                the ``RMSNorm`` layer is constructed with.
            intermediate_size: Number of rows of ``gate_proj``.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size

        projection_shape = (intermediate_size, hidden_size)
        self.gate_proj = torch.nn.Parameter(torch.empty(projection_shape))
        # NOTE(review): the matmul in forward() yields a last dimension of
        # ``intermediate_size`` while this norm is built with
        # ``hidden_size`` -- confirm this is intended.
        self.norm = RMSNorm(hidden_size, 1e-05)

        # Initialize weights
        torch.nn.init.normal_(self.gate_proj, std=0.02)

    def forward(self, hidden_states, residual):
        """Run the reshape / matmul / all-reduce / norm sequence.

        Args:
            hidden_states: Input tensor; reshaped to
                ``(-1, self.hidden_size)`` before the matmul.
            residual: Residual tensor passed to the norm layer together
                with the all-reduced matmul result.

        Returns:
            The ``(norm_output, residual_output)`` pair produced by
            ``self.norm``.
        """
        # Reshape input
        flat_input = hidden_states.reshape(-1, self.hidden_size)

        # Matrix multiplication with the transposed projection weight
        transposed_weight = self.gate_proj.permute(1, 0)
        projected = torch.mm(flat_input, transposed_weight)

        # Tensor parallel all-reduce
        reduced = tensor_model_parallel_all_reduce(projected)

        # Layer normalization
        norm_output, residual_output = self.norm(reduced, residual)

        return norm_output, residual_output
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seq_len", [16])
@pytest.mark.parametrize("hidden_size", [16])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
                    reason="Only test on CUDA")
def test_sequence_parallelism_pass(batch_size: int, seq_len: int,
                                   hidden_size: int, dtype: torch.dtype):
    """Run the sequence-parallelism pass check in two spawned processes.

    Each spawned process executes
    ``sequence_parallelism_pass_on_test_model`` with the process count
    followed by the parametrized ``batch_size``, ``seq_len``,
    ``hidden_size`` and ``dtype``.
    """
    num_processes = 2
    worker_args = (num_processes, batch_size, seq_len, hidden_size, dtype)

    # need to use torch.mp.spawn otherwise will have problems with
    # torch.distributed and cuda
    torch.multiprocessing.spawn(sequence_parallelism_pass_on_test_model,
                                args=worker_args,
                                nprocs=num_processes)
def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
                                            batch_size: int, seq_len: int,
                                            hidden_size: int,
                                            dtype: torch.dtype):
    """Per-process body of the sequence-parallelism pass test.

    Sets up the CUDA device and the distributed environment for this
    rank, compiles ``TestModel`` twice (with and without the
    fix-functionalization pass) and checks the op substitutions recorded
    by the test backends.

    Args:
        local_rank: Rank of this process; selects ``cuda:{local_rank}``
            and is exported as ``RANK`` / ``LOCAL_RANK``.
        world_size: Number of processes; used as the tensor-parallel size.
        batch_size: Together with ``seq_len`` gives the number of input
            rows (``batch_size * seq_len``).
        seq_len: See ``batch_size``.
        hidden_size: Width of the random inputs and of the test model.
        dtype: Default dtype and dtype of the random inputs.

    Raises:
        AssertionError: If an expected op substitution is not observed.
    """
    current_platform.seed_everything(0)

    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

    # NOTE(review): the rendezvous port is hard-coded; a concurrent run on
    # the same host using the same port would presumably clash.
    update_environment_variables({
        'RANK': str(local_rank),
        'LOCAL_RANK': str(local_rank),
        'WORLD_SIZE': str(world_size),
        'MASTER_ADDR': 'localhost',
        'MASTER_PORT': '12345',
    })

    # initialize distributed
    init_distributed_environment()
    initialize_model_parallel(tensor_model_parallel_size=world_size)

    # configure vllm config for SequenceParallelismPass
    vllm_config = VllmConfig()
    vllm_config.compilation_config = CompilationConfig(
        pass_config=CompilationConfig.PassConfig(
            enable_sequence_parallelism=True, ), )
    vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))

    # this is a fake model name to construct the model config
    # in the vllm_config, it's not really used.
    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
    vllm_config.model_config = ModelConfig(model=model_name,
                                           task="auto",
                                           tokenizer=model_name,
                                           tokenizer_mode="auto",
                                           trust_remote_code=True,
                                           dtype=dtype,
                                           seed=42)

    sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
    backend_no_func = TestBackend(sequence_parallelism_pass)
    func_pass = FixFunctionalizationPass(vllm_config)
    backend_func = TestBackend(sequence_parallelism_pass, func_pass)

    model = TestModel(hidden_size, hidden_size * 2)
    hidden_states = torch.randn((batch_size * seq_len, hidden_size),
                                dtype=dtype)
    residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)

    compiled_model_no_func = torch.compile(model, backend=backend_no_func)
    compiled_model_no_func(hidden_states, residual)
    compiled_model_func = torch.compile(model, backend=backend_func)
    compiled_model_func(hidden_states, residual)

    # Check substitution worked
    pre_nodes = backend_no_func.graph_pre_pass.nodes
    post_nodes = backend_no_func.graph_post_pass.nodes

    # In pre-nodes, all reduce should be there,
    # reduce scatter and all gather should not
    # (the find_* helpers without "_maybe" presumably fail on a miss;
    # their return value is not used here)
    for op in OPS_IN_MODEL_BEFORE:
        find_specified_fn(pre_nodes, op)
    for op in OPS_IN_MODEL_AFTER:
        assert find_specified_fn_maybe(pre_nodes, op) is None

    # In post-nodes, reduce scatter and all gather should be there,
    # all reduce should not
    for op in OPS_IN_MODEL_AFTER:
        find_specified_fn(post_nodes, op)
    for op in OPS_IN_MODEL_BEFORE:
        assert find_specified_fn_maybe(post_nodes, op) is None

    # check if the functionalization pass is applied
    for op in OPS_IN_MODEL:
        find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
        assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes,
                                  op) is None

    # make sure the ops were all de-functionalized
    found = set()
    for node in backend_func.graph_post_pass.nodes:
        for op in OPS_IN_MODEL:
            if is_func(node, op):
                found.add(op)
    # Report missing ops through the assertion itself rather than through
    # a KeyError from indexing a dict that never saw the op.
    missing = [op for op in OPS_IN_MODEL if op not in found]
    assert not missing, f"Ops not de-functionalized: {missing}"
tests/conftest.py
View file @
dcb5624a
...
@@ -24,23 +24,24 @@ from transformers.models.auto.auto_factory import _BaseAutoModelClass
...
@@ -24,23 +24,24 @@ from transformers.models.auto.auto_factory import _BaseAutoModelClass
from
tests.models.utils
import
(
TokensTextLogprobs
,
from
tests.models.utils
import
(
TokensTextLogprobs
,
TokensTextLogprobsPromptLogprobs
)
TokensTextLogprobsPromptLogprobs
)
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
TaskOption
,
TokenizerPoolConfig
,
_get_and_verify_dtype
from
vllm.config
import
TaskOption
,
_get_and_verify_dtype
from
vllm.connections
import
global_http_connection
from
vllm.connections
import
global_http_connection
from
vllm.distributed
import
(
cleanup_dist_env_and_memory
,
from
vllm.distributed
import
(
cleanup_dist_env_and_memory
,
init_distributed_environment
,
init_distributed_environment
,
initialize_model_parallel
)
initialize_model_parallel
)
from
vllm.inputs
import
(
ExplicitEncoderDecoderPrompt
,
TextPrompt
,
from
vllm.inputs
import
(
ExplicitEncoderDecoderPrompt
,
TextPrompt
,
TokensPrompt
,
to_enc_dec_tuple_list
,
to_enc_dec_tuple_list
,
zip_enc_dec_prompts
)
zip_enc_dec_prompts
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.utils
import
cuda_device_count_stateless
,
is_list_of
from
vllm.utils
import
cuda_device_count_stateless
from
.utils
import
models_path_prefix
from
.utils
import
models_path_prefix
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
_TEST_DIR
=
os
.
path
.
dirname
(
__file__
)
_TEST_DIR
=
os
.
path
.
dirname
(
__file__
)
...
@@ -109,10 +110,25 @@ class _VideoAssets(_VideoAssetsBase):
...
@@ -109,10 +110,25 @@ class _VideoAssets(_VideoAssetsBase):
return
[
prompts
[
"sample_demo_1"
]]
return
[
prompts
[
"sample_demo_1"
]]
class _AudioAssetsBase(UserList[AudioAsset]):
    """``UserList`` specialised to hold ``AudioAsset`` items."""
class _AudioAssets(_AudioAssetsBase):
    """List of the audio assets used by the tests."""

    def __init__(self) -> None:
        """Populate the list with the two named audio assets."""
        asset_names = ("mary_had_lamb", "winning_call")
        super().__init__([AudioAsset(name) for name in asset_names])
IMAGE_ASSETS
=
_ImageAssets
()
IMAGE_ASSETS
=
_ImageAssets
()
"""Singleton instance of :class:`_ImageAssets`."""
"""Singleton instance of :class:`_ImageAssets`."""
VIDEO_ASSETS
=
_VideoAssets
()
VIDEO_ASSETS
=
_VideoAssets
()
"""Singleton instance of :class:`_VideoAssets`."""
"""Singleton instance of :class:`_VideoAssets`."""
AUDIO_ASSETS
=
_AudioAssets
()
"""Singleton instance of :class:`_AudioAssets`."""
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
...
@@ -269,6 +285,11 @@ def video_assets() -> _VideoAssets:
...
@@ -269,6 +285,11 @@ def video_assets() -> _VideoAssets:
return
VIDEO_ASSETS
return
VIDEO_ASSETS
@pytest.fixture(scope="session")
def audio_assets() -> _AudioAssets:
    """Session-scoped fixture returning the module-level ``AUDIO_ASSETS``."""
    return AUDIO_ASSETS
_T
=
TypeVar
(
"_T"
,
nn
.
Module
,
torch
.
Tensor
,
BatchEncoding
,
BatchFeature
,
dict
)
_T
=
TypeVar
(
"_T"
,
nn
.
Module
,
torch
.
Tensor
,
BatchEncoding
,
BatchFeature
,
dict
)
_R
=
TypeVar
(
"_R"
)
_R
=
TypeVar
(
"_R"
)
...
@@ -396,10 +417,15 @@ class HfRunner:
...
@@ -396,10 +417,15 @@ class HfRunner:
processor_kwargs
[
"images"
]
=
image
processor_kwargs
[
"images"
]
=
image
if
videos
is
not
None
and
(
video
:
=
videos
[
i
])
is
not
None
:
if
videos
is
not
None
and
(
video
:
=
videos
[
i
])
is
not
None
:
processor_kwargs
[
"videos"
]
=
video
processor_kwargs
[
"videos"
]
=
video
if
audios
is
not
None
and
(
audio_tuple
:
=
audios
[
i
])
is
not
None
:
if
audios
is
not
None
and
(
audio_inputs
:
=
audios
[
i
])
is
not
None
:
audio
,
sr
=
audio_tuple
# HACK - not all processors take sampling_rate; we should
processor_kwargs
[
"audio"
]
=
audio
# clean this up in the future.
processor_kwargs
[
"sampling_rate"
]
=
sr
if
len
(
audio_inputs
)
==
2
:
audio
,
sr
=
audio_inputs
processor_kwargs
[
"audio"
]
=
audio
processor_kwargs
[
"sampling_rate"
]
=
sr
else
:
processor_kwargs
[
"audio"
]
=
audio_inputs
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
processor
(
**
processor_kwargs
)
if
isinstance
(
inputs
,
BatchFeature
):
if
isinstance
(
inputs
,
BatchFeature
):
...
@@ -474,12 +500,19 @@ class HfRunner:
...
@@ -474,12 +500,19 @@ class HfRunner:
prompts
:
list
[
str
],
prompts
:
list
[
str
],
beam_width
:
int
,
beam_width
:
int
,
max_tokens
:
int
,
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
list
[
tuple
[
list
[
list
[
int
]],
list
[
str
]]]:
)
->
list
[
tuple
[
list
[
list
[
int
]],
list
[
str
]]]:
outputs
=
self
.
generate
(
prompts
,
outputs
=
self
.
generate
(
prompts
,
do_sample
=
False
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
max_new_tokens
=
max_tokens
,
num_beams
=
beam_width
,
num_beams
=
beam_width
,
num_return_sequences
=
beam_width
)
num_return_sequences
=
beam_width
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
for
i
in
range
(
len
(
outputs
)):
for
i
in
range
(
len
(
outputs
)):
output_ids
,
output_str
=
outputs
[
i
]
output_ids
,
output_str
=
outputs
[
i
]
for
j
in
range
(
len
(
output_ids
)):
for
j
in
range
(
len
(
output_ids
)):
...
@@ -530,7 +563,10 @@ class HfRunner:
...
@@ -530,7 +563,10 @@ class HfRunner:
for
_
,
hidden_state
in
enumerate
(
hidden_states
):
for
_
,
hidden_state
in
enumerate
(
hidden_states
):
last_hidden_states
=
hidden_state
[
-
1
][
0
]
last_hidden_states
=
hidden_state
[
-
1
][
0
]
logits
=
torch
.
matmul
(
logits
=
torch
.
matmul
(
last_hidden_states
.
to
(
output_embeddings
.
weight
.
device
),
last_hidden_states
.
to
(
device
=
output_embeddings
.
weight
.
device
,
dtype
=
output_embeddings
.
weight
.
dtype
,
),
output_embeddings
.
weight
.
t
(),
output_embeddings
.
weight
.
t
(),
)
)
if
getattr
(
output_embeddings
,
"bias"
,
None
)
is
not
None
:
if
getattr
(
output_embeddings
,
"bias"
,
None
)
is
not
None
:
...
@@ -924,6 +960,7 @@ class VllmRunner:
...
@@ -924,6 +960,7 @@ class VllmRunner:
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
skip_special_tokens
:
bool
=
True
,
)
->
Union
[
list
[
TokensTextLogprobs
],
)
->
Union
[
list
[
TokensTextLogprobs
],
list
[
TokensTextLogprobsPromptLogprobs
]]:
list
[
TokensTextLogprobsPromptLogprobs
]]:
greedy_logprobs_params
=
SamplingParams
(
greedy_logprobs_params
=
SamplingParams
(
...
@@ -931,6 +968,7 @@ class VllmRunner:
...
@@ -931,6 +968,7 @@ class VllmRunner:
max_tokens
=
max_tokens
,
max_tokens
=
max_tokens
,
logprobs
=
num_logprobs
,
logprobs
=
num_logprobs
,
prompt_logprobs
=
(
num_prompt_logprobs
),
prompt_logprobs
=
(
num_prompt_logprobs
),
skip_special_tokens
=
skip_special_tokens
,
)
)
'''
'''
Greedy logprobs generation for vLLM encoder/decoder models
Greedy logprobs generation for vLLM encoder/decoder models
...
@@ -941,18 +979,20 @@ class VllmRunner:
...
@@ -941,18 +979,20 @@ class VllmRunner:
def
generate_beam_search
(
def
generate_beam_search
(
self
,
self
,
prompts
:
Union
[
list
[
str
],
list
[
list
[
int
]]],
prompts
:
list
[
str
],
beam_width
:
int
,
beam_width
:
int
,
max_tokens
:
int
,
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
list
[
tuple
[
list
[
list
[
int
]],
list
[
str
]]]:
)
->
list
[
tuple
[
list
[
list
[
int
]],
list
[
str
]]]:
if
is_list_of
(
prompts
,
str
,
check
=
"all"
):
inputs
=
self
.
get_inputs
(
prompts
,
prompts
=
[
TextPrompt
(
prompt
=
prompt
)
for
prompt
in
prompts
]
images
=
images
,
else
:
videos
=
videos
,
prompts
=
[
audios
=
audios
)
TokensPrompt
(
prompt_token_ids
=
tokens
)
for
tokens
in
prompts
]
outputs
=
self
.
model
.
beam_search
(
outputs
=
self
.
model
.
beam_search
(
promp
ts
,
inpu
ts
,
BeamSearchParams
(
beam_width
=
beam_width
,
max_tokens
=
max_tokens
))
BeamSearchParams
(
beam_width
=
beam_width
,
max_tokens
=
max_tokens
))
returned_outputs
=
[]
returned_outputs
=
[]
for
output
in
outputs
:
for
output
in
outputs
:
...
@@ -1005,20 +1045,6 @@ def vllm_runner():
...
@@ -1005,20 +1045,6 @@ def vllm_runner():
return
VllmRunner
return
VllmRunner
def get_tokenizer_pool_config(tokenizer_group_type):
    """Build a ``TokenizerPoolConfig`` for the given group type.

    Args:
        tokenizer_group_type: ``None``, the string ``"ray"``, or a class.

    Returns:
        ``None`` when ``tokenizer_group_type`` is ``None``; otherwise a
        ``TokenizerPoolConfig`` with ``pool_size=1``, an empty
        ``extra_config`` and ``pool_type`` set to ``"ray"`` or to the
        given class.

    Raises:
        ValueError: If ``tokenizer_group_type`` is none of the above.
    """
    if tokenizer_group_type is None:
        return None

    is_ray = tokenizer_group_type == "ray"
    if is_ray or isinstance(tokenizer_group_type, type):
        pool_type = "ray" if is_ray else tokenizer_group_type
        return TokenizerPoolConfig(pool_size=1,
                                   pool_type=pool_type,
                                   extra_config={})

    raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}")
@
pytest
.
fixture
()
@
pytest
.
fixture
()
def
temporary_enable_log_propagate
():
def
temporary_enable_log_propagate
():
import
logging
import
logging
...
...
tests/core/block/e2e/test_correctness.py
View file @
dcb5624a
...
@@ -197,15 +197,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
...
@@ -197,15 +197,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
])
])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{
[{
"block_size"
:
8
,
"block_size"
:
16
,
"max_num_batched_tokens"
:
2
,
"max_num_batched_tokens"
:
2
,
"max_num_seqs"
:
2
,
"max_num_seqs"
:
2
,
},
{
},
{
"block_size"
:
8
,
"block_size"
:
16
,
"max_num_batched_tokens"
:
3
,
"max_num_batched_tokens"
:
3
,
"max_num_seqs"
:
2
,
"max_num_seqs"
:
2
,
},
{
},
{
"block_size"
:
8
,
"block_size"
:
16
,
"max_num_batched_tokens"
:
256
,
"max_num_batched_tokens"
:
256
,
"max_num_seqs"
:
10
,
"max_num_seqs"
:
10
,
}])
}])
...
...
tests/distributed/test_comm_ops.py
View file @
dcb5624a
...
@@ -14,7 +14,8 @@ import torch
...
@@ -14,7 +14,8 @@ import torch
from
vllm.distributed
import
(
broadcast_tensor_dict
,
get_pp_group
,
from
vllm.distributed
import
(
broadcast_tensor_dict
,
get_pp_group
,
tensor_model_parallel_all_gather
,
tensor_model_parallel_all_gather
,
tensor_model_parallel_all_reduce
)
tensor_model_parallel_all_reduce
,
tensor_model_parallel_reduce_scatter
)
from
..utils
import
init_test_distributed_environment
,
multi_process_parallel
from
..utils
import
init_test_distributed_environment
,
multi_process_parallel
...
@@ -47,6 +48,34 @@ def all_reduce_test_worker(
...
@@ -47,6 +48,34 @@ def all_reduce_test_worker(
torch
.
testing
.
assert_close
(
t
,
expected
)
torch
.
testing
.
assert_close
(
t
,
expected
)
@ray.remote(num_gpus=1, max_calls=1)
def reduce_scatter_test_worker(monkeypatch: pytest.MonkeyPatch, tp_size: int,
                               pp_size: int, rank: int,
                               distributed_init_port: str):
    """Check ``tensor_model_parallel_reduce_scatter`` on one worker.

    Every rank ``r`` contributes ``arange(8) * (r + 1)``; the expected
    result on this worker is its partition of the element-wise sum of all
    contributions.

    Args:
        monkeypatch: Used to drop ``CUDA_VISIBLE_DEVICES`` from the
            environment.
        tp_size: Tensor-parallel size; number of contributing tensors.
        pp_size: Pipeline-parallel size, forwarded to the environment
            initialisation.
        rank: Rank of this worker; selects ``cuda:{rank}``.
        distributed_init_port: Port forwarded to the environment
            initialisation.
    """
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)

    worker_device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(worker_device)
    init_test_distributed_environment(tp_size, pp_size, rank,
                                      distributed_init_port)

    num_elements = 8
    # One input tensor per tensor-parallel rank.
    per_rank_inputs = []
    for r in range(tp_size):
        base = torch.arange(num_elements, dtype=torch.float32, device="cuda")
        per_rank_inputs.append(base * (r + 1))

    tp_index = rank % tp_size
    chunk = num_elements // tp_size

    # Reference result: full element-wise sum, then this rank's slice.
    summed = torch.sum(torch.stack(per_rank_inputs, dim=0), dim=0)
    start = tp_index * chunk
    stop = (tp_index + 1) * chunk
    expected = summed[start:stop]

    local_input = per_rank_inputs[tp_index]
    scattered = tensor_model_parallel_reduce_scatter(local_input, 0)
    torch.testing.assert_close(scattered, expected)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
all_gather_test_worker
(
def
all_gather_test_worker
(
monkeypatch
:
pytest
.
MonkeyPatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
...
...
tests/distributed/test_pipeline_parallel.py
View file @
dcb5624a
...
@@ -161,12 +161,12 @@ TEXT_GENERATION_MODELS = {
...
@@ -161,12 +161,12 @@ TEXT_GENERATION_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/DeepSeek-V2-Lite-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/DeepSeek-V2-Lite-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-
2b
"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-
1.1-2b-it
"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-9b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-9b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"EleutherAI/gpt-j-6b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"EleutherAI/gpt-j-6b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"EleutherAI/pythia-1
2
b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"EleutherAI/pythia-1
.4
b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerLM-3b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerLM-3b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerMoE-3b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerMoE-3b"
):
PPTestSettings
.
fast
(),
# Uses Llama
# Uses Llama
...
@@ -195,7 +195,7 @@ TEXT_GENERATION_MODELS = {
...
@@ -195,7 +195,7 @@ TEXT_GENERATION_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-small-8k-instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-small-8k-instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
):
PPTestSettings
.
detailed
(
multi_node_only
=
True
,
load_format
=
"dummy"
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
):
PPTestSettings
.
detailed
(
multi_node_only
=
True
,
load_format
=
"dummy"
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-7B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-7B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2
-7
B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2
.5-0.5
B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"stabilityai/stablelm-3b-4e1t"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"stabilityai/stablelm-3b-4e1t"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
):
PPTestSettings
.
fast
(),
...
...
tests/distributed/test_sequence_parallel.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
(2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
important to set the distributed backend to "mp" to avoid Ray scheduling
all workers in a node other than the head node, which can cause the test
to fail.
"""
import
json
import
os
from
dataclasses
import
dataclass
from
typing
import
Literal
,
NamedTuple
,
Optional
import
pytest
from
vllm.config
import
TaskOption
from
vllm.logger
import
init_logger
from
..models.registry
import
HF_EXAMPLE_MODELS
from
..utils
import
compare_two_settings
,
create_new_process_for_each_test
# Logger dedicated to this test module.
logger = init_logger("test_sequence_parallel")

# True only when the VLLM_MULTI_NODE environment variable is exactly "1".
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
class ParallelSetup(NamedTuple):
    """One parallel configuration exercised by the sequence-parallel tests."""

    # Passed to the server as --tensor-parallel-size.
    tp_size: int
    # Forwarded into the compilation pass config of the SP run.
    sp_enabled: bool
    # Adds --enforce-eager to the server arguments when true.
    eager_mode: bool
    # Adds --enable-chunked-prefill to the server arguments when true.
    chunked_prefill: bool
class SPTestOptions(NamedTuple):
    """Per-model switches that apply to every parallel setup of that model."""

    # Skip the test unless VLLM_MULTI_NODE is enabled.
    multi_node_only: bool
    # Optional value for --load-format; "dummy" also shrinks the model config.
    load_format: Optional[str] = None
@dataclass
class SPTestSettings:
    """Everything needed to enumerate the test cases of a single model."""

    parallel_setups: list[ParallelSetup]
    # NOTE: distributed_backends and vllm_major_versions must have the
    # same length; they are zipped pairwise when enumerating test cases.
    distributed_backends: list[str]
    # vllm major version: "0" for V0, "1" for V1
    vllm_major_versions: list[str]
    task: TaskOption
    test_options: SPTestOptions

    def __post_init__(self):
        """Reject settings whose backend and version lists differ in size."""
        n_backends = len(self.distributed_backends)
        n_versions = len(self.vllm_major_versions)
        if n_backends == n_versions:
            return
        raise ValueError(f"Length mismatch: distributed_backends "
                         f"({n_backends}) != "
                         f"vllm_major_versions ({n_versions})")

    @staticmethod
    def detailed(
        *,
        tp_base: int = 2,
        multi_node_only: bool = False,
        task: TaskOption = "auto",
        load_format: Optional[str] = None,
    ):
        """Build settings covering every eager/chunked-prefill combination."""
        # Order matches the explicit listing: eager_mode is the outer axis.
        setups = [
            ParallelSetup(tp_size=tp_base,
                          sp_enabled=True,
                          eager_mode=eager,
                          chunked_prefill=chunked)
            for eager in (False, True)
            for chunked in (False, True)
        ]
        options = SPTestOptions(multi_node_only=multi_node_only,
                                load_format=load_format)
        return SPTestSettings(
            parallel_setups=setups,
            distributed_backends=["mp", "ray"],
            vllm_major_versions=["1", "1"],
            task=task,
            test_options=options,
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 2,
        task: TaskOption = "auto",
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ):
        """Build settings with a single non-eager, non-chunked setup."""
        only_setup = ParallelSetup(tp_size=tp_base,
                                   sp_enabled=True,
                                   eager_mode=False,
                                   chunked_prefill=False)
        options = SPTestOptions(multi_node_only=multi_node_only,
                                load_format=load_format)
        return SPTestSettings(
            parallel_setups=[only_setup],
            distributed_backends=["mp", "ray"],
            vllm_major_versions=["1", "1"],
            task=task,
            test_options=options,
        )

    def iter_params(self, model_id: str):
        """Yield one parameter tuple per (setup, backend/version pair)."""
        options = self.test_options

        for setup in self.parallel_setups:
            # zip() is rebuilt per setup because its iterator is single-use.
            pairs = zip(self.distributed_backends, self.vllm_major_versions)
            for backend, major_version in pairs:
                yield (model_id, setup, backend, major_version, self.task,
                       options)
def _compare_sp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    task: TaskOption,
    test_options: SPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
    is_multimodal: bool,
):
    """Compare a TP run with sequence parallelism against a plain TP run.

    Two argument lists are built from the same common arguments: one that
    additionally passes a compilation config (with the sequence-parallel
    pass toggled by ``parallel_setup.sp_enabled``) and the requested
    distributed backend, and a baseline that uses the "mp" backend without
    any compilation config. Both are handed to ``compare_two_settings``.

    The test is skipped when there are not enough GPUs, when running
    multi-node with the "mp" backend, or when the model is marked
    multi-node-only and the run is single-node.

    Args:
        model_id: Model name looked up in ``HF_EXAMPLE_MODELS``.
        parallel_setup: TP size plus SP/eager/chunked-prefill switches.
        distributed_backend: Executor backend for the sequence-parallel run.
        vllm_major_version: Value exported as ``VLLM_USE_V1`` for both runs.
        task: Passed as ``--task`` unless it is "auto".
        test_options: Multi-node-only flag and optional load format.
        num_gpus_available: Number of GPUs usable by this test.
        method: Forwarded to ``compare_two_settings``.
        is_multimodal: When true, dummy-size overrides are nested under
            ``text_config`` instead of applied at the top level.
    """
    (
        tp_size,
        sp_enabled,
        eager_mode,
        chunked_prefill,
    ) = parallel_setup

    multi_node_only, load_format = test_options

    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")

    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    # Work on a copy: the overrides below must not leak into the registry
    # entry, which is shared by every test that looks up this model.
    hf_overrides = dict(model_info.hf_overrides or {})

    if load_format == "dummy":
        # Avoid OOM
        text_overrides = {
            "num_hidden_layers": 4,
            "hidden_size": 512,
            "intermediate_size": 800,
            "num_attention_heads": 4,
            "num_key_value_heads": 1,
        }

        if is_multimodal:
            hf_overrides.update({"text_config": text_overrides})
        else:
            hf_overrides.update(text_overrides)
    else:
        model_info.check_available_online(on_fail="skip")

    # Sequence parallelism is only combined with tensor parallelism here.
    pp_size = 1
    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node sequence parallel test for "
                    "multiprocessing distributed backend")
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if task != "auto":
        common_args.extend(["--task", task])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])

    compilation_config = {
        'level': 3,
        'custom_ops': ["+rms_norm"],
        'compile_sizes': [4, 8],
        'splitting_ops': [],
        'pass_config': {
            # Was misspelled 'enable_sequence_parallism', which presumably
            # left the sequence-parallel pass disabled in the "SP" run.
            'enable_sequence_parallelism': sp_enabled,
            'enable_noop': True,
            'enable_fusion': True,
        },
    }

    # Separate dict objects so neither run can affect the other's env.
    tp_sp_env = {
        "VLLM_USE_V1": vllm_major_version,
    }

    tp_sp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
        "--compilation_config",
        # Python-literal form of the dict (not JSON), as in the original.
        str(compilation_config),
    ]

    tp_env = {
        "VLLM_USE_V1": vllm_major_version,
    }
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    try:
        compare_two_settings(model_id,
                             tp_sp_args,
                             tp_args,
                             tp_sp_env,
                             tp_env,
                             method=method)
    except Exception:
        if vllm_major_version == "0":
            # Ray Compiled Graph tests are flaky for V0,
            # so we don't want to fail the test
            logger.exception("Ray Compiled Graph tests failed")
        else:
            raise
# Sequence-parallel settings per model id.
SP_TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.detailed(),
}

# Model ids from the table above that are actually parametrized.
SP_TEST_MODELS = [
    # TODO support other models
    # [LANGUAGE GENERATION]
    "meta-llama/Llama-3.2-1B-Instruct",
]
@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend",
     "vllm_major_version", "task", "test_options"),
    [
        params for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
        for params in settings.iter_params(model_id)
        if model_id in SP_TEST_MODELS
    ],
)
@create_new_process_for_each_test()
def test_tp_sp_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    task: TaskOption,
    test_options: SPTestOptions,
    num_gpus_available,
):
    """Run the SP-vs-TP comparison for text generation models."""
    _compare_sp(
        model_id,
        parallel_setup,
        distributed_backend,
        vllm_major_version,
        task,
        test_options,
        num_gpus_available,
        method="generate",
        is_multimodal=False,
    )
tests/engine/test_arg_utils.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
json
from
argparse
import
ArgumentError
,
ArgumentTypeError
from
argparse
import
ArgumentError
,
ArgumentTypeError
from
contextlib
import
nullcontext
from
dataclasses
import
dataclass
,
field
from
typing
import
Literal
,
Optional
import
pytest
import
pytest
from
vllm.config
import
PoolerConfig
from
vllm.config
import
PoolerConfig
,
config
from
vllm.engine.arg_utils
import
EngineArgs
,
nullable_kvs
from
vllm.engine.arg_utils
import
(
EngineArgs
,
contains_type
,
get_kwargs
,
get_type
,
is_not_builtin
,
is_type
,
nullable_kvs
,
optional_type
)
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
@pytest.mark.parametrize(("type", "value", "expected"), [
    (int, "42", 42),
    (int, "None", None),
    (float, "3.14", 3.14),
    (float, "None", None),
    (str, "Hello World!", "Hello World!"),
    (str, "None", None),
    (json.loads, '{"foo":1,"bar":2}', {
        "foo": 1,
        "bar": 2
    }),
    (json.loads, "foo=1,bar=2", {
        "foo": 1,
        "bar": 2
    }),
    (json.loads, "None", None),
])
def test_optional_type(type, value, expected):
    """Check the parser returned by optional_type for each sample value."""
    parse = optional_type(type)
    # Only the key=value form is expected to emit a DeprecationWarning.
    uses_legacy_syntax = value == "foo=1,bar=2"
    if uses_legacy_syntax:
        guard = pytest.warns(DeprecationWarning)
    else:
        guard = nullcontext()
    with guard:
        assert parse(value) == expected
@pytest.mark.parametrize(("type_hint", "type", "expected"), [
    (int, int, True),
    (int, float, False),
    (list[int], list, True),
    (list[int], tuple, False),
    (Literal[0, 1], Literal, True),
])
def test_is_type(type_hint, type, expected):
    """is_type should report whether the hint matches the given type."""
    outcome = is_type(type_hint, type)
    assert outcome == expected
@pytest.mark.parametrize(("type_hints", "type", "expected"), [
    ({float, int}, int, True),
    ({int, tuple[int]}, int, True),
    ({int, tuple[int]}, float, False),
    ({str, Literal["x", "y"]}, Literal, True),
])
def test_contains_type(type_hints, type, expected):
    """contains_type should report whether any hint matches the type."""
    outcome = contains_type(type_hints, type)
    assert outcome == expected
@pytest.mark.parametrize(("type_hints", "type", "expected"), [
    ({int, float}, int, int),
    ({int, float}, str, None),
    ({str, Literal["x", "y"]}, Literal, Literal["x", "y"]),
])
def test_get_type(type_hints, type, expected):
    """get_type should return the matching hint, or None if none match."""
    outcome = get_type(type_hints, type)
    assert outcome == expected
# Minimal config dataclass used as input for the get_kwargs tests below.
# The string after each field is kept verbatim as its attribute docstring.
@config
@dataclass
class DummyConfigClass:
    regular_bool: bool = True
    """Regular bool with default True"""
    optional_bool: Optional[bool] = None
    """Optional bool with default None"""
    optional_literal: Optional[Literal["x", "y"]] = None
    """Optional literal with default None"""
    tuple_n: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3))
    """Tuple with default (1, 2, 3)"""
    tuple_2: tuple[int, int] = field(default_factory=lambda: (1, 2))
    """Tuple with default (1, 2)"""
    list_n: list[int] = field(default_factory=lambda: [1, 2, 3])
    """List with default [1, 2, 3]"""
@pytest.mark.parametrize(("type_hint", "expected"), [
    (int, False),
    (DummyConfigClass, True),
])
def test_is_not_builtin(type_hint, expected):
    """is_not_builtin should be False for int, True for the dummy config."""
    outcome = is_not_builtin(type_hint)
    assert outcome == expected
def test_get_kwargs():
    """Check the argparse kwargs derived from DummyConfigClass fields."""
    kwargs = get_kwargs(DummyConfigClass)
    print(kwargs)

    # bools should not have their type set
    for bool_field in ("regular_bool", "optional_bool"):
        assert kwargs[bool_field].get("type") is None

    # optional literals should have None as a choice
    literal_choices = kwargs["optional_literal"]["choices"]
    assert literal_choices == ["x", "y", "None"]

    # tuples should have the correct nargs
    assert kwargs["tuple_n"]["nargs"] == "+"
    assert kwargs["tuple_2"]["nargs"] == 2

    # lists should work
    list_kwargs = kwargs["list_n"]
    assert list_kwargs["type"] is int
    assert list_kwargs["nargs"] == "+"
@
pytest
.
mark
.
parametrize
((
"arg"
,
"expected"
),
[
@
pytest
.
mark
.
parametrize
((
"arg"
,
"expected"
),
[
(
None
,
None
),
(
None
,
dict
()
),
(
"image=16"
,
{
(
"image=16"
,
{
"image"
:
16
"image"
:
16
}),
}),
...
@@ -24,6 +128,10 @@ from vllm.utils import FlexibleArgumentParser
...
@@ -24,6 +128,10 @@ from vllm.utils import FlexibleArgumentParser
}),
}),
])
])
def
test_limit_mm_per_prompt_parser
(
arg
,
expected
):
def
test_limit_mm_per_prompt_parser
(
arg
,
expected
):
"""This functionality is deprecated and will be removed in the future.
This argument should be passed as JSON string instead.
TODO: Remove with nullable_kvs."""
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
if
arg
is
None
:
if
arg
is
None
:
args
=
parser
.
parse_args
([])
args
=
parser
.
parse_args
([])
...
@@ -53,12 +161,20 @@ def test_compilation_config():
...
@@ -53,12 +161,20 @@ def test_compilation_config():
assert
args
.
compilation_config
.
level
==
3
assert
args
.
compilation_config
.
level
==
3
# set to string form of a dict
# set to string form of a dict
args
=
parser
.
parse_args
([
"--compilation-config"
,
"{'level': 3}"
])
args
=
parser
.
parse_args
([
assert
args
.
compilation_config
.
level
==
3
"--compilation-config"
,
"{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}"
,
])
assert
(
args
.
compilation_config
.
level
==
3
and
args
.
compilation_config
.
cudagraph_capture_sizes
==
[
1
,
2
,
4
,
8
])
# set to string form of a dict
# set to string form of a dict
args
=
parser
.
parse_args
([
"--compilation-config={'level': 3}"
])
args
=
parser
.
parse_args
([
assert
args
.
compilation_config
.
level
==
3
"--compilation-config="
"{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}"
,
])
assert
(
args
.
compilation_config
.
level
==
3
and
args
.
compilation_config
.
cudagraph_capture_sizes
==
[
1
,
2
,
4
,
8
])
def
test_prefix_cache_default
():
def
test_prefix_cache_default
():
...
...
tests/entrypoints/llm/test_chat.py
View file @
dcb5624a
...
@@ -91,3 +91,31 @@ def test_chat_multi_image(image_urls: list[str]):
...
@@ -91,3 +91,31 @@ def test_chat_multi_image(image_urls: list[str]):
}]
}]
outputs
=
llm
.
chat
(
messages
)
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
>=
0
assert
len
(
outputs
)
>=
0
def test_llm_chat_tokenization_no_double_bos():
    """
    LLM.chat() must not add special tokens on top of the chat template.
    For the llama chat model the prompt should start with exactly one BOS.
    """
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True)
    system_turn = {"role": "system", "content": "You are a helpful assistant"}
    user_turn = {"role": "user", "content": "Hello!"}
    messages = [system_turn, user_turn]

    outputs = llm.chat(messages)
    assert len(outputs) == 1

    first_output = outputs[0]
    prompt_token_ids = getattr(first_output, "prompt_token_ids", None)
    assert prompt_token_ids is not None

    bos_token = llm.get_tokenizer().bos_token_id

    # Ensure we have a single BOS
    leading, following = prompt_token_ids[0], prompt_token_ids[1]
    assert leading == bos_token
    assert following != bos_token, "Double BOS"
tests/entrypoints/llm/test_guided_generate.py
View file @
dcb5624a
...
@@ -308,7 +308,7 @@ def test_disable_guided_decoding_fallback(sample_regex, llm):
...
@@ -308,7 +308,7 @@ def test_disable_guided_decoding_fallback(sample_regex, llm):
with
pytest
.
raises
(
with
pytest
.
raises
(
ValueError
,
ValueError
,
match
=
"xgrammar does not support advanced JSON schema features "
match
=
"xgrammar does not support advanced JSON schema features "
"like
enums, patterns or numeric range
s."
):
"like
string length, item limits, or property bound
s."
):
llm
.
generate
(
prompts
=
"This should fail"
,
llm
.
generate
(
prompts
=
"This should fail"
,
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
use_tqdm
=
True
)
...
@@ -386,4 +386,118 @@ def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str):
...
@@ -386,4 +386,118 @@ def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str):
assert
generated_text
is
not
None
assert
generated_text
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
json_schema
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
json_schema
)
\ No newline at end of file
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
def test_guided_number_range_json_completion(llm,
                                             guided_decoding_backend: str):
    """Guided JSON output must respect numeric ranges and a string pattern."""
    age_spec = {"type": "integer", "minimum": 18, "maximum": 99}
    score_spec = {"type": "number", "minimum": 0.0, "maximum": 100.0}
    zipcode_spec = {"type": "string", "pattern": r"^\d{5}(-\d{4})?$"}
    sample_output_schema = {
        "type": "object",
        "properties": {
            "age": age_spec,
            "score": score_spec,
            "zipcode": zipcode_spec,
        },
        "required": ["age", "score", "zipcode"],
    }

    guided_params = GuidedDecodingParams(json=sample_output_schema,
                                         backend=guided_decoding_backend)
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
        guided_decoding=guided_params,
    )
    # The same prompt is submitted twice in one batch.
    prompts = [
        "Create a JSON object for a user with age, score, and zipcode."
    ] * 2
    outputs = llm.generate(
        prompts=prompts,
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None

    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt

        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

        output_json = json.loads(generated_text)
        jsonschema.validate(instance=output_json, schema=sample_output_schema)

        # Re-check each constraint explicitly, in addition to jsonschema.
        assert 18 <= output_json["age"] <= 99
        assert 0.0 <= output_json["score"] <= 100.0
        zipcode_match = re.fullmatch(r"^\d{5}(-\d{4})?$",
                                     output_json["zipcode"])
        assert zipcode_match is not None
@pytest.mark.skip_global_cleanup
def test_guidance_no_additional_properties(llm):
    """The guidance backend option must suppress keys outside the schema."""
    schema = {
        'type': 'object',
        'properties': {
            'a1': {
                'type': 'string'
            },
            'a2': {
                'type': 'string'
            },
            'a3': {
                'type': 'string'
            }
        },
        'required': ['a1', 'a2', 'a3'],
    }

    prompt = (
        "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a "
        "helpful assistant.<|im_end|>\n<|im_start|>user\nPlease generate a "
        "large JSON object with key-value pairs a1=b1, a2=b2, ..., a20=b20"
        "<|im_end|>\n<|im_start|>assistant\n")

    def generate_with_backend(backend):
        """Generate once with the given backend and return the parsed JSON."""
        guided_params = GuidedDecodingParams(json=schema, backend=backend)
        sampling_params = SamplingParams(temperature=0,
                                         max_tokens=256,
                                         guided_decoding=guided_params)

        outputs = llm.generate(prompts=prompt,
                               sampling_params=sampling_params)
        assert outputs is not None
        generated_text = outputs[0].outputs[0].text
        assert generated_text is not None

        parsed_json = json.loads(generated_text)
        assert isinstance(parsed_json, dict)
        jsonschema.validate(instance=parsed_json, schema=schema)
        return parsed_json

    schema_keys = ("a1", "a2", "a3")
    extra_keys = ("a4", "a5", "a6")

    base_generated = generate_with_backend('guidance:disable-any-whitespace')
    for key in schema_keys:
        assert key in base_generated
    # by default additional keys are generated
    for key in extra_keys:
        assert key in base_generated

    generated = generate_with_backend(
        'guidance:no-additional-properties,disable-any-whitespace')
    for key in schema_keys:
        assert key in generated
    for key in extra_keys:
        assert key not in generated
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
View file @
dcb5624a
...
@@ -150,6 +150,7 @@ def test_wer_correctness(model_name,
...
@@ -150,6 +150,7 @@ def test_wer_correctness(model_name,
expected_wer
,
expected_wer
,
n_examples
=-
1
,
n_examples
=-
1
,
max_concurrent_request
=
None
):
max_concurrent_request
=
None
):
# TODO refactor to use `ASRDataset`
with
RemoteOpenAIServer
(
model_name
,
[
'--enforce-eager'
])
as
remote_server
:
with
RemoteOpenAIServer
(
model_name
,
[
'--enforce-eager'
])
as
remote_server
:
dataset
=
load_hf_dataset
(
dataset_repo
)
dataset
=
load_hf_dataset
(
dataset_repo
)
...
...
tests/entrypoints/openai/test_audio.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
json
import
openai
import
openai
import
pytest
import
pytest
import
os
import
os
...
@@ -27,7 +29,7 @@ def server():
...
@@ -27,7 +29,7 @@ def server():
"--enforce-eager"
,
"--enforce-eager"
,
"--trust-remote-code"
,
"--trust-remote-code"
,
"--limit-mm-per-prompt"
,
"--limit-mm-per-prompt"
,
f
"audio
=
{
MAXIMUM_AUDIOS
}
"
,
json
.
dumps
({
"audio
"
:
MAXIMUM_AUDIOS
}
)
,
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
...
@@ -102,6 +104,35 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
...
@@ -102,6 +104,35 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI,
                                               model_name: str,
                                               audio_url: str):
    """A bare-string audio_url must be rejected with a BadRequestError."""
    audio_part = {"type": "audio_url", "audio_url": audio_url}
    text_part = {"type": "text", "text": "What's happening in this audio?"}
    messages = [{
        "role": "user",
        "content": [audio_part, text_part],
    }]

    # audio_url should be a dict {"url": "some url"}, not directly a string
    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
[
TEST_AUDIO_URLS
[
0
]])
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
[
TEST_AUDIO_URLS
[
0
]])
...
...
tests/entrypoints/openai/test_embedding.py
View file @
dcb5624a
...
@@ -12,11 +12,13 @@ import requests
...
@@ -12,11 +12,13 @@ import requests
from
vllm.entrypoints.openai.protocol
import
EmbeddingResponse
from
vllm.entrypoints.openai.protocol
import
EmbeddingResponse
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...models.embedding.utils
import
c
heck_embeddings_close
from
...models.embedding.utils
import
c
orrectness_test
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"intfloat/multilingual-e5-small"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"intfloat/multilingual-e5-small"
)
DUMMY_CHAT_TEMPLATE
=
"""{% for message in messages %}{{message['role'] + ': ' + message['content'] + '
\\
n'}}{% endfor %}"""
# noqa: E501
DUMMY_CHAT_TEMPLATE
=
"""{% for message in messages %}{{message['role'] + ': ' + message['content'] + '
\\
n'}}{% endfor %}"""
# noqa: E501
DTYPE
=
"bfloat16"
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
...
@@ -26,7 +28,7 @@ def server():
...
@@ -26,7 +28,7 @@ def server():
"embed"
,
"embed"
,
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
"bfloat16"
,
DTYPE
,
"--enforce-eager"
,
"--enforce-eager"
,
"--max-model-len"
,
"--max-model-len"
,
"512"
,
"512"
,
...
@@ -44,9 +46,17 @@ async def client(server):
...
@@ -44,9 +46,17 @@ async def client(server):
yield
async_client
yield
async_client
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """Yield a module-scoped HF reference runner for MODEL_NAME."""
    runner_ctx = hf_runner(MODEL_NAME,
                           dtype=DTYPE,
                           is_sentence_transformer=True)
    with runner_ctx as reference_model:
        yield reference_model
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_single_embedding
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_single_embedding
(
hf_model
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
input_texts
=
[
input_texts
=
[
"The chef prepared a delicious meal."
,
"The chef prepared a delicious meal."
,
]
]
...
@@ -67,6 +77,9 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
...
@@ -67,6 +77,9 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
assert
embeddings
.
usage
.
prompt_tokens
==
11
assert
embeddings
.
usage
.
prompt_tokens
==
11
assert
embeddings
.
usage
.
total_tokens
==
11
assert
embeddings
.
usage
.
total_tokens
==
11
vllm_outputs
=
[
d
.
embedding
for
d
in
embeddings
.
data
]
correctness_test
(
hf_model
,
input_texts
,
vllm_outputs
)
# test using token IDs
# test using token IDs
input_tokens
=
[
1
,
1
,
1
,
1
,
1
]
input_tokens
=
[
1
,
1
,
1
,
1
,
1
]
embedding_response
=
await
client
.
embeddings
.
create
(
embedding_response
=
await
client
.
embeddings
.
create
(
...
@@ -87,7 +100,8 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
...
@@ -87,7 +100,8 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_batch_embedding
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_batch_embedding
(
hf_model
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test list[str]
# test list[str]
input_texts
=
[
input_texts
=
[
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
...
@@ -108,6 +122,9 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
...
@@ -108,6 +122,9 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
assert
embeddings
.
usage
.
prompt_tokens
==
33
assert
embeddings
.
usage
.
prompt_tokens
==
33
assert
embeddings
.
usage
.
total_tokens
==
33
assert
embeddings
.
usage
.
total_tokens
==
33
vllm_outputs
=
[
d
.
embedding
for
d
in
embeddings
.
data
]
correctness_test
(
hf_model
,
input_texts
,
vllm_outputs
)
# test list[list[int]]
# test list[list[int]]
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
[
25
,
32
,
64
,
77
]]
[
25
,
32
,
64
,
77
]]
...
@@ -182,7 +199,7 @@ async def test_conversation_embedding(server: RemoteOpenAIServer,
...
@@ -182,7 +199,7 @@ async def test_conversation_embedding(server: RemoteOpenAIServer,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_batch_base64_embedding
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_batch_base64_embedding
(
hf_model
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
input_texts
=
[
input_texts
=
[
"Hello my name is"
,
"Hello my name is"
,
...
@@ -193,6 +210,7 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
...
@@ -193,6 +210,7 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
model
=
model_name
,
model
=
model_name
,
encoding_format
=
"float"
)
encoding_format
=
"float"
)
float_data
=
[
d
.
embedding
for
d
in
responses_float
.
data
]
float_data
=
[
d
.
embedding
for
d
in
responses_float
.
data
]
correctness_test
(
hf_model
,
input_texts
,
float_data
)
responses_base64
=
await
client
.
embeddings
.
create
(
input
=
input_texts
,
responses_base64
=
await
client
.
embeddings
.
create
(
input
=
input_texts
,
model
=
model_name
,
model
=
model_name
,
...
@@ -203,24 +221,13 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
...
@@ -203,24 +221,13 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
np
.
frombuffer
(
base64
.
b64decode
(
data
.
embedding
),
np
.
frombuffer
(
base64
.
b64decode
(
data
.
embedding
),
dtype
=
"float32"
).
tolist
())
dtype
=
"float32"
).
tolist
())
check_embeddings_close
(
correctness_test
(
hf_model
,
input_texts
,
base64_data
)
embeddings_0_lst
=
float_data
,
embeddings_1_lst
=
base64_data
,
name_0
=
"float"
,
name_1
=
"base64"
,
)
# Default response is float32 decoded from base64 by OpenAI Client
# Default response is float32 decoded from base64 by OpenAI Client
responses_default
=
await
client
.
embeddings
.
create
(
input
=
input_texts
,
responses_default
=
await
client
.
embeddings
.
create
(
input
=
input_texts
,
model
=
model_name
)
model
=
model_name
)
default_data
=
[
d
.
embedding
for
d
in
responses_default
.
data
]
default_data
=
[
d
.
embedding
for
d
in
responses_default
.
data
]
correctness_test
(
hf_model
,
input_texts
,
default_data
)
check_embeddings_close
(
embeddings_0_lst
=
float_data
,
embeddings_1_lst
=
default_data
,
name_0
=
"float"
,
name_1
=
"default"
,
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_embedding_dimensions.py
View file @
dcb5624a
...
@@ -3,80 +3,121 @@
...
@@ -3,80 +3,121 @@
Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
"""
"""
from
typing
import
NamedTuple
from
typing
import
Optional
import
openai
import
openai
import
pytest
import
pytest
from
vllm.entrypoints.openai.protocol
import
EmbeddingResponse
from
vllm.entrypoints.openai.protocol
import
EmbeddingResponse
from
...conftest
import
HfRunner
from
...models.embedding.utils
import
EmbedModelInfo
,
correctness_test
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
class
ModelInfo
(
NamedTuple
):
name
:
str
is_matryoshka
:
bool
MODELS
=
[
MODELS
=
[
ModelInfo
(
name
=
"BAAI/bge-m3"
,
is_matryoshka
=
False
),
EmbedModelInfo
(
"intfloat/multilingual-e5-small"
,
is_matryoshka
=
False
),
ModelInfo
(
name
=
"jinaai/jina-embeddings-v3"
,
is_matryoshka
=
True
),
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-v1.5"
,
is_matryoshka
=
True
,
matryoshka_dimensions
=
[
256
]),
]
]
input_texts
=
[
input_texts
=
[
"The chef prepared a delicious meal."
,
"The chef prepared a delicious meal."
,
]
*
3
]
@
pytest
.
mark
.
asyncio
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
def
model_info
(
request
):
async
def
test_validating_dimensions
(
model
:
ModelInfo
):
return
request
.
param
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
"bfloat16"
])
def
dtype
(
request
):
return
request
.
param
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
model_info
,
dtype
:
str
):
args
=
[
args
=
[
"--task"
,
"--task"
,
"embed"
,
"embed"
,
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
"bfloat16"
,
dtype
,
"--enforce-eager"
,
"--enforce-eager"
,
"--max-model-len"
,
"--max-model-len"
,
"512"
,
"512"
"--trust_remote_code"
]
]
with
RemoteOpenAIServer
(
model
.
name
,
args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
async
def
make_request
(
dimensions
):
embedding_response
=
await
client
.
embeddings
.
create
(
model
=
model
.
name
,
input
=
input_texts
,
dimensions
=
dimensions
,
encoding_format
=
"float"
,
)
embeddings
=
EmbeddingResponse
.
model_validate
(
embedding_response
.
model_dump
(
mode
=
"json"
))
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
[
0
].
embedding
)
>
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
>
0
assert
embeddings
.
usage
.
total_tokens
>
0
if
dimensions
is
not
None
:
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
dimensions
if
model
.
is_matryoshka
:
for
dimensions
in
[
None
,
16
]:
await
make_request
(
dimensions
)
if
model_info
.
name
==
"Snowflake/snowflake-arctic-embed-m-v1.5"
:
# Manually enable Matryoshka Embeddings
args
.
extend
([
"--trust_remote_code"
,
"--hf_overrides"
,
'{"matryoshka_dimensions":[256]}'
])
with
RemoteOpenAIServer
(
model_info
.
name
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
hf_model
(
hf_runner
,
model_info
,
dtype
:
str
):
with
hf_runner
(
model_info
.
name
,
dtype
=
dtype
,
is_sentence_transformer
=
True
)
as
hf_model
:
yield
hf_model
@
pytest
.
mark
.
asyncio
async
def
test_matryoshka
(
model_info
:
EmbedModelInfo
,
server
:
RemoteOpenAIServer
,
hf_model
:
HfRunner
):
client
=
server
.
get_async_client
()
async
def
make_request_and_correctness_test
(
dimensions
):
prompts
=
input_texts
*
3
embedding_response
=
await
client
.
embeddings
.
create
(
model
=
model_info
.
name
,
input
=
prompts
,
dimensions
=
dimensions
,
encoding_format
=
"float"
,
)
embeddings
=
EmbeddingResponse
.
model_validate
(
embedding_response
.
model_dump
(
mode
=
"json"
))
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
[
0
].
embedding
)
>
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
>
0
assert
embeddings
.
usage
.
total_tokens
>
0
if
dimensions
is
not
None
:
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
dimensions
vllm_outputs
=
[
d
.
embedding
for
d
in
embeddings
.
data
]
correctness_test
(
hf_model
,
prompts
,
vllm_outputs
,
dimensions
)
if
model_info
.
is_matryoshka
:
valid_dimensions
:
list
[
Optional
[
int
]]
=
[
None
]
if
model_info
.
matryoshka_dimensions
is
not
None
:
valid_dimensions
+=
model_info
.
matryoshka_dimensions
[:
2
]
for
dimensions
in
valid_dimensions
:
await
make_request_and_correctness_test
(
dimensions
)
invalid_dimensions
:
list
[
Optional
[
int
]]
=
[
-
1
]
if
model_info
.
matryoshka_dimensions
is
not
None
:
assert
5
not
in
model_info
.
matryoshka_dimensions
invalid_dimensions
.
append
(
5
)
for
dimensions
in
invalid_dimensions
:
with
pytest
.
raises
(
openai
.
BadRequestError
):
with
pytest
.
raises
(
openai
.
BadRequestError
):
for
dimensions
in
[
-
1
]:
await
make_request_and_correctness_test
(
dimensions
)
await
make_request
(
dimensions
)
else
:
else
:
for
dimensions
in
[
None
]:
for
dimensions
in
[
None
]:
await
make_request
(
dimensions
)
await
make_request
_and_correctness_test
(
dimensions
)
for
dimensions
in
[
-
1
,
16
]:
with
pytest
.
raises
(
openai
.
BadRequestError
):
with
pytest
.
raises
(
openai
.
BadRequestError
):
for
dimensions
in
[
-
1
,
16
]:
await
make_request_and_correctness_test
(
dimensions
)
await
make_request
(
dimensions
)
Prev
1
…
7
8
9
10
11
12
13
14
15
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment