Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d2b52805
Commit
d2b52805
authored
Sep 07, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori
parents
9a521c23
5438967f
Changes
511
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
526 additions
and
371 deletions
+526
-371
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+7
-2
tests/distributed/test_pp_cudagraph.py
tests/distributed/test_pp_cudagraph.py
+0
-1
tests/distributed/test_sequence_parallel.py
tests/distributed/test_sequence_parallel.py
+1
-1
tests/distributed/test_symm_mem_allreduce.py
tests/distributed/test_symm_mem_allreduce.py
+108
-0
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+6
-9
tests/entrypoints/llm/test_classify.py
tests/entrypoints/llm/test_classify.py
+8
-11
tests/entrypoints/llm/test_embedding.py
tests/entrypoints/llm/test_embedding.py
+2
-3
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+3
-57
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+3
-40
tests/entrypoints/llm/test_generate_multiple_loras.py
tests/entrypoints/llm/test_generate_multiple_loras.py
+0
-81
tests/entrypoints/llm/test_reward.py
tests/entrypoints/llm/test_reward.py
+2
-11
tests/entrypoints/llm/test_score.py
tests/entrypoints/llm/test_score.py
+2
-11
tests/entrypoints/offline_mode/test_offline_mode.py
tests/entrypoints/offline_mode/test_offline_mode.py
+45
-9
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
.../openai/correctness/test_transcription_api_correctness.py
+7
-4
tests/entrypoints/openai/test_classification.py
tests/entrypoints/openai/test_classification.py
+30
-0
tests/entrypoints/openai/test_cli_args.py
tests/entrypoints/openai/test_cli_args.py
+22
-0
tests/entrypoints/openai/test_collective_rpc.py
tests/entrypoints/openai/test_collective_rpc.py
+88
-0
tests/entrypoints/openai/test_completion_with_function_calling.py
...trypoints/openai/test_completion_with_function_calling.py
+191
-123
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+0
-8
tests/entrypoints/openai/test_lora_resolvers.py
tests/entrypoints/openai/test_lora_resolvers.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
511 of 511+
files are displayed.
Plain diff
Email patch
tests/distributed/test_pipeline_parallel.py
View file @
d2b52805
...
...
@@ -118,6 +118,8 @@ class PPTestSettings:
multi_node_only
:
bool
=
False
,
load_format
:
Optional
[
str
]
=
None
,
):
vllm_major_versions
=
[
"1"
]
if
runner
==
"pooling"
else
[
"0"
]
return
PPTestSettings
(
parallel_setups
=
[
ParallelSetup
(
tp_size
=
tp_base
,
...
...
@@ -126,7 +128,7 @@ class PPTestSettings:
chunked_prefill
=
False
),
],
distributed_backends
=
[
"mp"
],
vllm_major_versions
=
[
"0"
]
,
vllm_major_versions
=
vllm_major_versions
,
runner
=
runner
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
load_format
=
load_format
),
...
...
@@ -213,7 +215,9 @@ TEXT_GENERATION_MODELS = {
EMBEDDING_MODELS
=
{
# type: ignore[var-annotated]
# [Text-only]
"intfloat/e5-mistral-7b-instruct"
:
PPTestSettings
.
fast
(
runner
=
"pooling"
),
"BAAI/bge-multilingual-gemma2"
:
PPTestSettings
.
fast
(
runner
=
"pooling"
),
# TODO: re-enable when https://github.com/vllm-project/vllm/issues/23883
# is fixed
#"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
"Qwen/Qwen2.5-Math-RM-72B"
:
PPTestSettings
.
fast
(
load_format
=
"dummy"
,
runner
=
"pooling"
),
...
...
@@ -233,6 +237,7 @@ MULTIMODAL_MODELS = {
"openbmb/MiniCPM-Llama3-V-2_5"
:
PPTestSettings
.
fast
(),
"allenai/Molmo-7B-D-0924"
:
PPTestSettings
.
fast
(),
"AIDC-AI/Ovis2-1B"
:
PPTestSettings
.
fast
(),
"AIDC-AI/Ovis2.5-2B"
:
PPTestSettings
.
fast
(),
"microsoft/Phi-3.5-vision-instruct"
:
PPTestSettings
.
fast
(),
"mistralai/Pixtral-12B-2409"
:
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
"Qwen/Qwen-VL-Chat"
:
PPTestSettings
.
fast
(),
...
...
tests/distributed/test_pp_cudagraph.py
View file @
d2b52805
...
...
@@ -17,7 +17,6 @@ if TYPE_CHECKING:
])
@
pytest
.
mark
.
parametrize
(
"ATTN_BACKEND"
,
[
"FLASH_ATTN"
,
"FLASHINFER"
,
])
@
create_new_process_for_each_test
()
def
test_pp_cudagraph
(
...
...
tests/distributed/test_sequence_parallel.py
View file @
d2b52805
...
...
@@ -292,7 +292,7 @@ SP_TEST_MODELS = [
# TODO support other models
# [LANGUAGE GENERATION]
"meta-llama/Llama-3.2-1B-Instruct"
,
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
,
]
...
...
tests/distributed/test_symm_mem_allreduce.py
0 → 100644
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
random
import
typing
import
pytest
import
torch
import
torch.distributed
as
dist
import
torch.multiprocessing
as
mp
import
vllm.envs
as
envs
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed.communication_op
import
tensor_model_parallel_all_reduce
from
vllm.distributed.device_communicators.cuda_communicator
import
(
CudaCommunicator
)
from
vllm.distributed.parallel_state
import
(
get_tensor_model_parallel_group
,
get_tp_group
,
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.platforms
import
current_platform
from
vllm.utils
import
update_environment_variables
torch
.
manual_seed
(
42
)
random
.
seed
(
44
)
test_size_elements
=
4
*
1024
*
1024
def symm_mem_allreduce_worker(local_rank: int, world_size: int):
    """Run the symmetric-memory all-reduce checks on a single rank.

    Binds this process to ``cuda:{local_rank}``, initializes the distributed
    environment with a tensor-parallel group spanning ``world_size`` ranks,
    and compares two all-reduce paths against ``dist.all_reduce``:

    1. ``symm_mem_comm.all_reduce`` called directly, and
    2. ``tensor_model_parallel_all_reduce``.

    Skips when the symmetric-memory communicator is missing, disabled, or
    reports that it would not be used for the test tensor.

    Args:
        local_rank: Rank of this worker; also used as the CUDA device index.
        world_size: Number of ranks; used as the tensor-parallel size.
    """
    with pytest.MonkeyPatch().context() as m:
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)

        # Bind this process to its own GPU and make bf16 the default dtype.
        dtype = torch.bfloat16
        device = torch.device(f"cuda:{local_rank}")
        torch.cuda.set_device(device)
        torch.set_default_device(device)
        torch.set_default_dtype(dtype)

        rank_str = str(local_rank)
        update_environment_variables({
            'RANK': rank_str,
            'LOCAL_RANK': rank_str,
            'WORLD_SIZE': str(world_size),
            'MASTER_ADDR': 'localhost',
            'MASTER_PORT': '12345',
        })

        init_distributed_environment()
        initialize_model_parallel(tensor_model_parallel_size=world_size)

        communicator = typing.cast(CudaCommunicator,
                                   get_tp_group().device_communicator)
        symm_mem_comm = communicator.symm_mem_comm
        if symm_mem_comm is None or symm_mem_comm.disabled:
            pytest.skip("SymmMemCommunicator is not available or disabled.")

        shape = (test_size_elements, )
        tolerances = {"atol": 2.5, "rtol": 0.1}

        # Direct call into the symmetric-memory communicator.
        direct_input = torch.randint(1, 23, shape, dtype=dtype, device=device)
        if not symm_mem_comm.should_use_symm_mem(direct_input):
            pytest.skip(
                "SymmMemCommunicator isn't used for this world and input size."
            )

        direct_reference = direct_input.clone()
        direct_output = symm_mem_comm.all_reduce(direct_input)
        assert direct_output is not None

        # Reference result: all-reduce the clone over the TP device group.
        group = get_tensor_model_parallel_group().device_group
        dist.all_reduce(direct_reference, group=group)
        torch.testing.assert_close(direct_output, direct_reference,
                                   **tolerances)

        # Test tensor_model_parallel_all_reduce which should use symm_mem
        tp_input = torch.randint(-23, 1, shape, dtype=dtype, device=device)
        tp_reference = tp_input.clone()
        tp_output = tensor_model_parallel_all_reduce(tp_input)
        dist.all_reduce(tp_reference, group=group)
        torch.testing.assert_close(tp_output, tp_reference, **tolerances)
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="SymmMemAllreduce is only available for CUDA platforms.")
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
                    reason="Only test on CUDA")
def test_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
                            pipeline_parallel_size):
    """Spawn one worker per rank and run the symm-mem all-reduce checks.

    The number of ranks is ``tp_size * pipeline_parallel_size``; the test is
    skipped when fewer CUDA devices than that are visible.
    """
    num_ranks = tp_size * pipeline_parallel_size
    if torch.cuda.device_count() < num_ranks:
        pytest.skip("Not enough GPUs to run the test.")

    # Enable SymmMemCommunicator
    monkeypatch.setenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1")

    # mp.spawn passes the process index as the first positional argument,
    # so only world_size has to be supplied here.
    mp.spawn(symm_mem_allreduce_worker,
             args=(num_ranks, ),
             nprocs=num_ranks)
    cleanup_dist_env_and_memory()
tests/entrypoints/llm/test_chat.py
View file @
d2b52805
...
...
@@ -18,7 +18,6 @@ def text_llm():
enforce_eager
=
True
,
seed
=
0
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
...
...
@@ -88,7 +87,6 @@ def vision_llm():
seed
=
0
,
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
...
...
@@ -158,7 +156,6 @@ def thinking_llm():
seed
=
0
,
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
...
...
tests/entrypoints/llm/test_classify.py
View file @
d2b52805
...
...
@@ -16,14 +16,6 @@ MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
prompts
=
[
"The chef prepared a delicious meal."
]
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
...
...
@@ -35,7 +27,6 @@ def llm():
enforce_eager
=
True
,
seed
=
0
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
...
...
@@ -71,3 +62,9 @@ def test_encode_api(llm: LLM):
err_msg
=
"pooling_task must be one of.+"
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
llm
.
encode
(
prompts
,
use_tqdm
=
False
)
def test_score_api(llm: LLM):
    """``llm.score`` must raise ``ValueError`` for this classification model."""
    expected_error = pytest.raises(
        ValueError,
        match="Score API is only enabled for num_labels == 1.",
    )
    with expected_error:
        llm.score("ping", "pong", use_tqdm=False)
tests/entrypoints/llm/test_embedding.py
View file @
d2b52805
...
...
@@ -26,7 +26,6 @@ def llm():
enforce_eager
=
True
,
seed
=
0
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
...
...
tests/entrypoints/llm/test_encode.py
View file @
d2b52805
...
...
@@ -5,11 +5,9 @@ import weakref
import
pytest
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm
import
LLM
,
PoolingParams
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...models.utils
import
check_embeddings_close
MODEL_NAME
=
"intfloat/multilingual-e5-small"
PROMPTS
=
[
...
...
@@ -29,14 +27,6 @@ TOKEN_IDS = [
]
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
...
...
@@ -48,7 +38,6 @@ def llm():
enforce_eager
=
True
,
seed
=
0
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
...
...
@@ -56,49 +45,6 @@ def llm():
cleanup_dist_env_and_memory
()
def
assert_outputs_match
(
o1
:
list
[
PoolingRequestOutput
],
o2
:
list
[
PoolingRequestOutput
]):
check_embeddings_close
(
embeddings_0_lst
=
[
o
.
outputs
.
data
for
o
in
o1
],
embeddings_1_lst
=
[
o
.
outputs
.
data
for
o
in
o2
],
name_0
=
"hf"
,
name_1
=
"vllm"
,
tol
=
1e-2
,
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
prompt_token_ids
):
pooling_params
=
PoolingParams
()
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompt_token_ids'"
):
v1_output
=
llm
.
encode
(
prompt_token_ids
=
prompt_token_ids
,
pooling_params
=
pooling_params
)
v2_output
=
llm
.
encode
({
"prompt_token_ids"
:
prompt_token_ids
},
pooling_params
=
pooling_params
)
assert_outputs_match
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
pooling_params
=
PoolingParams
()
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompt_token_ids'"
):
v1_output
=
llm
.
encode
(
prompt_token_ids
=
TOKEN_IDS
,
pooling_params
=
pooling_params
)
v2_output
=
llm
.
encode
(
[{
"prompt_token_ids"
:
p
}
for
p
in
TOKEN_IDS
],
pooling_params
=
pooling_params
,
)
assert_outputs_match
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_multiple_pooling_params
(
llm
:
LLM
):
pooling_params
=
[
...
...
tests/entrypoints/llm/test_generate.py
View file @
d2b52805
...
...
@@ -5,7 +5,7 @@ import weakref
import
pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.distributed
import
cleanup_dist_env_and_memory
MODEL_NAME
=
"distilbert/distilgpt2"
...
...
@@ -41,7 +41,6 @@ def llm():
gpu_memory_utilization
=
0.10
,
enforce_eager
=
True
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
...
...
@@ -49,42 +48,6 @@ def llm():
cleanup_dist_env_and_memory
()
def
assert_outputs_equal
(
o1
:
list
[
RequestOutput
],
o2
:
list
[
RequestOutput
]):
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
prompt_token_ids
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompt_token_ids'"
):
v1_output
=
llm
.
generate
(
prompt_token_ids
=
prompt_token_ids
,
sampling_params
=
sampling_params
)
v2_output
=
llm
.
generate
({
"prompt_token_ids"
:
prompt_token_ids
},
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompt_token_ids'"
):
v1_output
=
llm
.
generate
(
prompt_token_ids
=
TOKEN_IDS
,
sampling_params
=
sampling_params
)
v2_output
=
llm
.
generate
(
[{
"prompt_token_ids"
:
p
}
for
p
in
TOKEN_IDS
],
sampling_params
=
sampling_params
,
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_multiple_sampling_params
(
llm
:
LLM
):
sampling_params
=
[
...
...
tests/entrypoints/llm/test_generate_multiple_loras.py
deleted
100644 → 0
View file @
9a521c23
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
weakref
import
pytest
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
vllm
import
LLM
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.lora.request
import
LoRARequest
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
PROMPTS
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
@
pytest
.
fixture
(
scope
=
"module"
)
def
monkeypatch_module
():
from
_pytest.monkeypatch
import
MonkeyPatch
mpatch
=
MonkeyPatch
()
yield
mpatch
mpatch
.
undo
()
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
False
,
True
])
def
llm
(
request
,
monkeypatch_module
):
use_v1
=
request
.
param
monkeypatch_module
.
setenv
(
'VLLM_USE_V1'
,
'1'
if
use_v1
else
'0'
)
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
tensor_parallel_size
=
1
,
max_model_len
=
8192
,
enable_lora
=
True
,
max_loras
=
4
,
max_lora_rank
=
64
,
max_num_seqs
=
128
,
enforce_eager
=
True
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
cleanup_dist_env_and_memory
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_multiple_lora_requests
(
llm
:
LLM
,
zephyr_lora_files
):
lora_request
=
[
LoRARequest
(
LORA_NAME
+
str
(
idx
),
idx
+
1
,
zephyr_lora_files
)
for
idx
in
range
(
len
(
PROMPTS
))
]
# Multiple SamplingParams should be matched with each prompt
outputs
=
llm
.
generate
(
PROMPTS
,
lora_request
=
lora_request
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
# Exception raised, if the size of params does not match the size of prompts
with
pytest
.
raises
(
ValueError
):
outputs
=
llm
.
generate
(
PROMPTS
,
lora_request
=
lora_request
[:
1
])
# Single LoRARequest should be applied to every prompt
single_lora_request
=
lora_request
[
0
]
outputs
=
llm
.
generate
(
PROMPTS
,
lora_request
=
single_lora_request
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
tests/entrypoints/llm/test_reward.py
View file @
d2b52805
...
...
@@ -16,14 +16,6 @@ MODEL_NAME = "internlm/internlm2-1_8b-reward"
prompts
=
[
"The chef prepared a delicious meal."
]
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
...
...
@@ -36,7 +28,6 @@ def llm():
trust_remote_code
=
True
,
seed
=
0
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
...
...
tests/entrypoints/llm/test_score.py
View file @
d2b52805
...
...
@@ -14,14 +14,6 @@ from ...models.utils import softmax
MODEL_NAME
=
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
...
...
@@ -33,7 +25,6 @@ def llm():
enforce_eager
=
True
,
seed
=
0
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
...
...
tests/entrypoints/offline_mode/test_offline_mode.py
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for HF_HUB_OFFLINE mode"""
import
dataclasses
import
importlib
import
sys
...
...
@@ -9,6 +10,7 @@ import urllib3
from
vllm
import
LLM
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.engine.arg_utils
import
EngineArgs
MODEL_CONFIGS
=
[
{
...
...
@@ -30,15 +32,16 @@ MODEL_CONFIGS = [
"tensor_parallel_size"
:
1
,
"tokenizer_mode"
:
"mistral"
,
},
{
"model"
:
"sentence-transformers/all-MiniLM-L12-v2"
,
"enforce_eager"
:
True
,
"gpu_memory_utilization"
:
0.20
,
"max_model_len"
:
64
,
"max_num_batched_tokens"
:
64
,
"max_num_seqs"
:
64
,
"tensor_parallel_size"
:
1
,
},
# TODO: re-enable once these tests are run with V1
# {
# "model": "sentence-transformers/all-MiniLM-L12-v2",
# "enforce_eager": True,
# "gpu_memory_utilization": 0.20,
# "max_model_len": 64,
# "max_num_batched_tokens": 64,
# "max_num_seqs": 64,
# "tensor_parallel_size": 1,
# },
]
...
...
@@ -108,3 +111,36 @@ def _re_import_modules():
# Error this test if reloading a module failed
if
reload_exception
is
not
None
:
raise
reload_exception
@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
    """Check that an ``LLM`` can be constructed with HF_HUB_OFFLINE set.

    All urllib3 HTTP(S) connections are patched to raise, so any attempt to
    reach the network while building the engine fails the test.

    The ``try/finally`` wraps the ``monkeypatch.context()`` block rather than
    sitting inside it: the final ``_re_import_modules()`` is meant to reset
    the modules for later tests, so it has to run *after* the environment
    variables and the patched ``connect`` methods have been restored.
    """

    def disable_connect(*args, **kwargs):
        raise RuntimeError("No http calls allowed")

    try:
        # Set HF to offline mode and ensure we can still construct an LLM
        with monkeypatch.context() as m:
            m.setenv("HF_HUB_OFFLINE", "1")
            m.setenv("VLLM_NO_USAGE_STATS", "1")

            m.setattr(
                urllib3.connection.HTTPConnection,
                "connect",
                disable_connect,
            )
            m.setattr(
                urllib3.connection.HTTPSConnection,
                "connect",
                disable_connect,
            )

            # Need to re-import huggingface_hub
            # and friends to setup offline mode
            _re_import_modules()
            engine_args = EngineArgs(model="facebook/opt-125m")
            LLM(**dataclasses.asdict(engine_args))
    finally:
        # Reset the environment after the test
        # NB: Assuming tests are run in online mode
        # NOTE(review): presumably the re-imported modules read
        # HF_HUB_OFFLINE at import time, which is why this must happen
        # once the monkeypatch context has been undone - confirm against
        # _re_import_modules.
        _re_import_modules()
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
View file @
d2b52805
...
...
@@ -49,8 +49,7 @@ async def transcribe_audio(client, tokenizer, y, sr):
return
latency
,
num_output_tokens
,
transcription
.
text
async
def
bound_transcribe
(
model_name
,
sem
,
client
,
audio
,
reference
):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
async
def
bound_transcribe
(
sem
,
client
,
tokenizer
,
audio
,
reference
):
# Use semaphore to limit concurrent requests.
async
with
sem
:
result
=
await
transcribe_audio
(
client
,
tokenizer
,
*
audio
)
...
...
@@ -63,15 +62,19 @@ async def bound_transcribe(model_name, sem, client, audio, reference):
async
def
process_dataset
(
model
,
client
,
data
,
concurrent_request
):
sem
=
asyncio
.
Semaphore
(
concurrent_request
)
# Load tokenizer once outside the loop
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
)
# Warmup call as the first `librosa.load` server-side is quite slow.
audio
,
sr
=
data
[
0
][
"audio"
][
"array"
],
data
[
0
][
"audio"
][
"sampling_rate"
]
_
=
await
bound_transcribe
(
model
,
sem
,
client
,
(
audio
,
sr
),
""
)
_
=
await
bound_transcribe
(
sem
,
client
,
tokenizer
,
(
audio
,
sr
),
""
)
tasks
:
list
[
asyncio
.
Task
]
=
[]
for
sample
in
data
:
audio
,
sr
=
sample
[
"audio"
][
"array"
],
sample
[
"audio"
][
"sampling_rate"
]
task
=
asyncio
.
create_task
(
bound_transcribe
(
model
,
sem
,
client
,
(
audio
,
sr
),
sample
[
"text"
]))
bound_transcribe
(
sem
,
client
,
tokenizer
,
(
audio
,
sr
),
sample
[
"text"
]))
tasks
.
append
(
task
)
return
await
asyncio
.
gather
(
*
tasks
)
...
...
tests/entrypoints/openai/test_classification.py
View file @
d2b52805
...
...
@@ -226,3 +226,33 @@ def test_pooling(server: RemoteOpenAIServer, model_name: str):
},
)
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_score(server: RemoteOpenAIServer, model_name: str):
    """POSTing to the score endpoint must yield a ``BadRequestError``."""
    # score api is only enabled for num_labels == 1.
    payload = {
        "model": model_name,
        "text_1": "ping",
        "text_2": "pong",
    }
    response = requests.post(server.url_for("score"), json=payload)

    error = response.json()["error"]
    assert error["type"] == "BadRequestError"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank(server: RemoteOpenAIServer, model_name: str):
    """POSTing to the rerank endpoint must yield a ``BadRequestError``."""
    # rerank api is only enabled for num_labels == 1.
    payload = {
        "model": model_name,
        "query": "ping",
        "documents": ["pong"],
    }
    response = requests.post(server.url_for("rerank"), json=payload)

    error = response.json()["error"]
    assert error["type"] == "BadRequestError"
tests/entrypoints/openai/test_cli_args.py
View file @
d2b52805
...
...
@@ -27,6 +27,28 @@ def serve_parser():
return
make_arg_parser
(
parser
)
### Test config parsing
def test_config_arg_parsing(serve_parser, cli_config_file):
    """Check how ``--port`` resolves with and without ``--config``.

    Covers: no arguments, config file only, and an explicit ``--port`` given
    either after or before ``--config``.
    """
    # (argv, expected port) pairs, checked in this order.
    cases = [
        ([], 8000),
        (['--config', cli_config_file], 12312),
        (['--config', cli_config_file, '--port', '9000'], 9000),
        (['--port', '9000', '--config', cli_config_file], 9000),
    ]
    for argv, expected_port in cases:
        args = serve_parser.parse_args(argv)
        assert args.port == expected_port
### Tests for LoRA module parsing
def
test_valid_key_value_format
(
serve_parser
):
# Test old format: name=path
...
...
tests/entrypoints/openai/test_collective_rpc.py
0 → 100644
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Any
import
pytest
import
requests
from
tests.utils
import
RemoteOpenAIServer
MODEL_NAME
=
"Qwen/Qwen3-0.6B"
class TestWorkerExtension:
    """Worker extension whose methods are invoked by the tests in this file
    through the server's ``collective_rpc`` endpoint."""

    def get_model_name(self) -> str:
        """Test non-pydantic return type."""
        return MODEL_NAME

    def echo_args_kwargs(self, *args, **kwargs) -> dict[str, Any]:
        """Echo back both args and kwargs."""
        item_count = len(args) + len(kwargs)
        return {
            "args": list(args),
            "kwargs": kwargs,
            "total_items": item_count,
        }

    def return_none(self, *args, **kwargs) -> None:
        """Test method that does not return anything"""
        return None
@pytest.fixture(scope="module")
def server():
    """Module-scoped server started with ``TestWorkerExtension`` as the
    worker extension class and ``VLLM_SERVER_DEV_MODE=1`` in its
    environment."""
    extension_cls = (
        "tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension")
    cli_args = [
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--worker-extension-cls",
        extension_cls,
    ]
    server_env = {
        "VLLM_SERVER_DEV_MODE": "1",
        "CUDA_VISIBLE_DEVICES": "0",
    }

    with RemoteOpenAIServer(MODEL_NAME, cli_args,
                            env_dict=server_env) as remote_server:
        yield remote_server
def test_get_model_name(server):
    """Calling ``get_model_name`` returns a one-element list with the name."""
    endpoint = server.url_for("collective_rpc")
    response = requests.post(endpoint, json={"method": "get_model_name"})
    assert response.status_code == 200

    body = response.json()
    assert "results" in body
    assert body["results"] == [MODEL_NAME]
def test_return_none(server):
    """Calling ``return_none`` returns a one-element list holding ``None``."""
    endpoint = server.url_for("collective_rpc")
    response = requests.post(endpoint, json={"method": "return_none"})
    assert response.status_code == 200

    body = response.json()
    assert body["results"] == [None]
def test_echo_args_kwargs(server):
    """Positional and keyword arguments are echoed back in a dict result."""
    args = ["arg1", "arg2"]
    kwargs = {"key1": "value1", "key2": "value2"}
    payload = {
        "method": "echo_args_kwargs",
        "args": args,
        "kwargs": kwargs,
    }

    response = requests.post(server.url_for("collective_rpc"), json=payload)
    assert response.status_code == 200

    # Only the first entry of the "results" list is inspected.
    echoed = response.json()["results"][0]
    assert echoed["args"] == args
    assert echoed["kwargs"] == kwargs
    assert echoed["total_items"] == len(args) + len(kwargs)
tests/entrypoints/openai/test_completion_with_function_calling.py
View file @
d2b52805
...
...
@@ -13,48 +13,7 @@ from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME
=
"Qwen/Qwen3-0.6B"
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
# noqa: F811
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"half"
,
"--enable-auto-tool-choice"
,
"--guided-decoding-backend"
,
"xgrammar"
,
"--tool-call-parser"
,
"hermes"
,
"--reasoning-parser"
,
"qwen3"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest_asyncio
.
fixture
async
def
client
(
server
):
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"stream"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"tool_choice"
,
[
"auto"
,
"required"
,
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
}
}
])
@
pytest
.
mark
.
parametrize
(
"enable_thinking"
,
[
True
,
False
])
async
def
test_function_tool_use
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
stream
:
bool
,
tool_choice
:
Union
[
str
,
dict
],
enable_thinking
:
bool
):
tools
=
[
tools
=
[
{
"type"
:
"function"
,
"function"
:
{
...
...
@@ -77,14 +36,12 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
},
"unit"
:
{
"type"
:
"string"
,
"description"
:
"The unit to fetch the temperature in"
,
"description"
:
"The unit to fetch the temperature in"
,
"enum"
:
[
"celsius"
,
"fahrenheit"
],
},
"options"
:
{
"$ref"
:
"#/$defs/WeatherOptions"
,
"description"
:
"Optional parameters for weather query"
,
"description"
:
"Optional parameters for weather query"
,
},
},
"required"
:
[
"country"
,
"unit"
],
...
...
@@ -149,8 +106,7 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
},
"unit"
:
{
"type"
:
"string"
,
"description"
:
"The unit to fetch the temperature in"
,
"description"
:
"The unit to fetch the temperature in"
,
"enum"
:
[
"celsius"
,
"fahrenheit"
],
},
},
...
...
@@ -158,9 +114,9 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
},
},
},
]
]
messages
=
[
messages
=
[
{
"role"
:
"user"
,
"content"
:
"Hi! How are you doing today?"
...
...
@@ -176,7 +132,51 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
"Can you tell me what the current weather is in Berlin and the "
\
"forecast for the next 5 days, in fahrenheit?"
,
},
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
# noqa: F811
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"half"
,
"--enable-auto-tool-choice"
,
"--guided-decoding-backend"
,
"xgrammar"
,
"--tool-call-parser"
,
"hermes"
,
"--reasoning-parser"
,
"qwen3"
,
"--gpu-memory-utilization"
,
"0.4"
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest_asyncio
.
fixture
async
def
client
(
server
):
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"stream"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"tool_choice"
,
[
"auto"
,
"required"
,
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
}
}
])
@
pytest
.
mark
.
parametrize
(
"enable_thinking"
,
[
True
,
False
])
async
def
test_function_tool_use
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
stream
:
bool
,
tool_choice
:
Union
[
str
,
dict
],
enable_thinking
:
bool
):
if
not
stream
:
# Non-streaming test
chat_completion
=
await
client
.
chat
.
completions
.
create
(
...
...
@@ -216,3 +216,71 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
output
.
extend
(
chunk
.
choices
[
0
].
delta
.
tool_calls
)
assert
len
(
output
)
>
0
@pytest.fixture(scope="module")
def k2_server():  # noqa: F811
    """Module-scoped server for ``MODEL_NAME`` with its HF config overridden
    so that ``model_type`` is ``kimi_k2``."""
    cli_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "half",
        "--enable-auto-tool-choice",
        "--guided-decoding-backend",
        "xgrammar",
        "--tool-call-parser",
        "hermes",
        "--reasoning-parser",
        "qwen3",
        "--gpu-memory-utilization",
        "0.4",
    ]
    # hack to test kimi_k2 tool use tool_id format.
    # avoid error in is_deepseek_mla check by setting kv_lora_rank=null
    hf_overrides = {"model_type": 'kimi_k2', 'kv_lora_rank': None}

    with RemoteOpenAIServer(
            MODEL_NAME, cli_args,
            override_hf_configs=hf_overrides) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def k2_client(k2_server):
    """Yield an async client obtained from the ``k2_server`` fixture."""
    client_cm = k2_server.get_async_client()
    async with client_cm as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("tool_choice", ["required"])
async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str,
                               stream: bool, tool_choice: str):
    """Check the tool-call id format produced by the ``k2_server`` setup.

    Non-streaming: the first tool call's id must equal
    ``functions.get_current_weather:0``. Streaming: every collected tool-call
    delta must carry either no id or that same id.
    """
    expected_id = 'functions.get_current_weather:0'
    create = k2_client.chat.completions.create

    if stream:
        # Streaming test
        output_stream = await create(messages=messages,
                                     model=model_name,
                                     tools=tools,
                                     tool_choice=tool_choice,
                                     stream=True)

        collected = []
        async for chunk in output_stream:
            if not chunk.choices:
                continue
            delta_calls = chunk.choices[0].delta.tool_calls
            if delta_calls:
                collected.extend(delta_calls)

        for delta_call in collected:
            assert delta_call.id is None or delta_call.id == expected_id
        return

    # Non-streaming test
    chat_completion = await create(messages=messages,
                                   model=model_name,
                                   tools=tools,
                                   tool_choice=tool_choice)
    tool_calls = chat_completion.choices[0].message.tool_calls
    assert tool_calls is not None
    assert len(tool_calls) > 0
    assert tool_calls[0].id == expected_id
tests/entrypoints/openai/test_embedding.py
View file @
d2b52805
...
...
@@ -24,14 +24,6 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
DTYPE
=
"bfloat16"
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
...
...
tests/entrypoints/openai/test_lora_resolvers.py
View file @
d2b52805
...
...
@@ -47,6 +47,7 @@ class MockModelConfig:
allowed_local_media_path
:
str
=
""
encoder_config
=
None
generation_config
:
str
=
"auto"
skip_tokenizer_init
:
bool
=
False
def
get_diff_sampling_param
(
self
):
return
self
.
diff_sampling_param
or
{}
...
...
Prev
1
…
6
7
8
9
10
11
12
13
14
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment