Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4eabe123
Commit
4eabe123
authored
May 28, 2025
by
zhuwenwen
Browse files
Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori
parents
45840cd2
58738772
Changes
670
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
618 additions
and
92 deletions
+618
-92
requirements/test.in
requirements/test.in
+1
-0
requirements/test.txt
requirements/test.txt
+21
-1
requirements/tpu.txt
requirements/tpu.txt
+5
-5
setup.py
setup.py
+1
-2
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+65
-10
tests/compile/backend.py
tests/compile/backend.py
+18
-0
tests/compile/test_async_tp.py
tests/compile/test_async_tp.py
+248
-0
tests/compile/test_fusion.py
tests/compile/test_fusion.py
+17
-19
tests/compile/test_sequence_parallelism.py
tests/compile/test_sequence_parallelism.py
+18
-29
tests/conftest.py
tests/conftest.py
+9
-0
tests/distributed/test_events.py
tests/distributed/test_events.py
+4
-5
tests/distributed/test_shm_broadcast.py
tests/distributed/test_shm_broadcast.py
+5
-5
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+1
-1
tests/entrypoints/openai/correctness/test_mteb.py
tests/entrypoints/openai/correctness/test_mteb.py
+41
-0
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+1
-1
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+1
-2
tests/entrypoints/openai/test_openai_schema.py
tests/entrypoints/openai/test_openai_schema.py
+56
-1
tests/entrypoints/openai/test_prompt_validation.py
tests/entrypoints/openai/test_prompt_validation.py
+6
-6
tests/entrypoints/openai/test_score.py
tests/entrypoints/openai/test_score.py
+3
-5
tests/entrypoints/openai/test_tensorizer_entrypoint.py
tests/entrypoints/openai/test_tensorizer_entrypoint.py
+97
-0
No files found.
requirements/test.in
View file @
4eabe123
...
...
@@ -33,6 +33,7 @@ num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
transformers==4.51.3
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
...
...
requirements/test.txt
View file @
4eabe123
...
...
@@ -99,6 +99,7 @@ datasets==3.0.2
# via
# evaluate
# lm-eval
# mteb
decorator==5.1.1
# via librosa
dill==0.3.8
...
...
@@ -124,6 +125,8 @@ email-validator==2.2.0
# via pydantic
encodec==0.1.1
# via vocos
eval-type-backport==0.2.2
# via mteb
evaluate==0.4.3
# via lm-eval
fastparquet==2024.11.0
...
...
@@ -291,6 +294,8 @@ msgpack==1.1.0
# via
# librosa
# ray
mteb==1.38.11
# via -r requirements/test.in
multidict==6.1.0
# via
# aiohttp
...
...
@@ -331,6 +336,7 @@ numpy==1.26.4
# librosa
# matplotlib
# mistral-common
# mteb
# numba
# numexpr
# opencv-python-headless
...
...
@@ -443,6 +449,8 @@ plotly==5.24.1
# via genai-perf
pluggy==1.5.0
# via pytest
polars==1.29.0
# via mteb
pooch==1.8.2
# via librosa
portalocker==2.10.1
...
...
@@ -476,6 +484,7 @@ pydantic==2.9.2
# via
# datamodel-code-generator
# mistral-common
# mteb
pydantic-core==2.23.4
# via pydantic
pygments==2.18.0
...
...
@@ -522,6 +531,8 @@ python-dateutil==2.9.0.post0
# typepy
python-rapidjson==1.20
# via tritonclient
pytrec-eval-terrier==0.5.7
# via mteb
pytz==2024.2
# via
# pandas
...
...
@@ -564,6 +575,7 @@ requests==2.32.3
# huggingface-hub
# lm-eval
# mistral-common
# mteb
# pooch
# ray
# responses
...
...
@@ -580,6 +592,7 @@ rfc3987==1.3.8
rich==13.9.4
# via
# genai-perf
# mteb
# typer
rouge-score==0.1.2
# via lm-eval
...
...
@@ -607,16 +620,20 @@ scikit-learn==1.5.2
# via
# librosa
# lm-eval
# mteb
# sentence-transformers
scipy==1.13.1
# via
# librosa
# mteb
# scikit-learn
# sentence-transformers
# statsmodels
# vocos
sentence-transformers==3.2.1
# via -r requirements/test.in
# via
# -r requirements/test.in
# mteb
sentencepiece==0.2.0
# via mistral-common
setuptools==77.0.3
...
...
@@ -696,6 +713,7 @@ torch==2.7.0+cu128
# fastsafetensors
# lm-eval
# mamba-ssm
# mteb
# peft
# runai-model-streamer
# sentence-transformers
...
...
@@ -720,6 +738,7 @@ tqdm==4.66.6
# evaluate
# huggingface-hub
# lm-eval
# mteb
# nltk
# peft
# pqdm
...
...
@@ -759,6 +778,7 @@ typing-extensions==4.12.2
# huggingface-hub
# librosa
# mistral-common
# mteb
# pqdm
# pydantic
# pydantic-core
...
...
requirements/tpu.txt
View file @
4eabe123
...
...
@@ -18,9 +18,9 @@ setuptools==78.1.0
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch==2.8.0.dev20250
430
torchvision==0.22.0.dev20250
430
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250
430
-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250
430
-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250
430
-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch==2.8.0.dev20250
518
torchvision==0.22.0.dev20250
518
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250
518
-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250
518
-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250
518
-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
setup.py
100755 → 100644
View file @
4eabe123
...
...
@@ -5,12 +5,12 @@ import importlib.util
import
json
import
logging
import
os
import
re
import
subprocess
import
sys
from
pathlib
import
Path
from
shutil
import
which
import
regex
as
re
import
torch
from
packaging.version
import
Version
,
parse
from
setuptools
import
Extension
,
setup
...
...
@@ -389,7 +389,6 @@ class repackage_wheel(build_ext):
# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
import
re
compiled_regex
=
re
.
compile
(
r
"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members
+=
list
(
...
...
tests/basic_correctness/test_basic_correctness.py
View file @
4eabe123
...
...
@@ -8,12 +8,13 @@ import weakref
from
unittest.mock
import
Mock
import
pytest
import
torch
from
vllm
import
LLM
from
vllm
import
LLM
,
envs
from
vllm.platforms
import
current_platform
from
vllm.v1.engine.llm_engine
import
LLMEngine
as
LLMEngineV1
from
..conftest
import
VllmRunner
from
..conftest
import
HfRunner
,
VllmRunner
from
..models.utils
import
check_outputs_equal
from
..utils
import
multi_gpu_test
...
...
@@ -43,11 +44,26 @@ def test_vllm_gc_ed():
assert
weak_llm
()
is
None
def _fix_prompt_embed_outputs(
        vllm_outputs: list[tuple[list[int], str]], hf_model: HfRunner,
        example_prompts: list[str]) -> list[tuple[list[int], str]]:
    """Rebuild vLLM prompt-embedding outputs as (prompt + completion) pairs.

    For every prompt, the leading ``len(hf_input_ids)`` entries of the vLLM
    token list are replaced by the token ids the HF runner produces for that
    prompt, and the prompt text is prepended to the vLLM output text.

    NOTE(review): presumably needed because generation from prompt embeddings
    does not echo the real prompt tokens/text -- confirm against VllmRunner.

    Args:
        vllm_outputs: one ``(token_ids, text)`` pair per prompt.
        hf_model: runner whose ``get_inputs`` tokenizes the prompts.
        example_prompts: the prompt strings, in the same order.

    Returns:
        A new list of ``(token_ids, text)`` pairs, one per zipped triple.
    """

    def _rebuild(output, hf_input, prompt):
        # First row of the tokenized prompt, as a plain Python list.
        prompt_ids = hf_input["input_ids"].tolist()[0]
        token_ids, text = output[0], output[1]
        return (prompt_ids + token_ids[len(prompt_ids):], prompt + text)

    hf_inputs = hf_model.get_inputs(example_prompts)
    return [
        _rebuild(output, hf_input, prompt)
        for output, hf_input, prompt in zip(vllm_outputs, hf_inputs,
                                            example_prompts)
    ]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"enable_prompt_embeds"
,
[
True
,
False
])
def
test_models
(
monkeypatch
:
pytest
.
MonkeyPatch
,
hf_runner
,
...
...
@@ -56,8 +72,13 @@ def test_models(
dtype
:
str
,
max_tokens
:
int
,
enforce_eager
:
bool
,
enable_prompt_embeds
:
bool
,
)
->
None
:
if
enable_prompt_embeds
and
envs
.
is_set
(
"VLLM_USE_V1"
)
and
envs
.
VLLM_USE_V1
:
pytest
.
skip
(
"enable_prompt_embeds is not supported in v1."
)
if
backend
==
"FLASHINFER"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
"Flashinfer does not support ROCm/HIP."
)
...
...
@@ -78,14 +99,25 @@ def test_models(
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
if
enable_prompt_embeds
:
with
torch
.
no_grad
():
prompt_embeds
=
hf_model
.
get_prompt_embeddings
(
example_prompts
)
with
VllmRunner
(
model
,
max_model_len
=
8192
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
enable_prompt_embeds
=
enable_prompt_embeds
,
gpu_memory_utilization
=
0.7
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
if
enable_prompt_embeds
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompt_embeds
,
max_tokens
)
vllm_outputs
=
_fix_prompt_embed_outputs
(
vllm_outputs
,
hf_model
,
example_prompts
)
else
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
...
...
@@ -108,6 +140,7 @@ def test_models(
(
"distilbert/distilgpt2"
,
"mp"
,
"FLASHINFER"
,
"A100"
),
(
"meta-llama/Meta-Llama-3-8B"
,
"ray"
,
"FLASHINFER"
,
"A100"
),
])
@
pytest
.
mark
.
parametrize
(
"enable_prompt_embeds"
,
[
True
,
False
])
def
test_models_distributed
(
monkeypatch
:
pytest
.
MonkeyPatch
,
hf_runner
,
...
...
@@ -117,14 +150,22 @@ def test_models_distributed(
distributed_executor_backend
:
str
,
attention_backend
:
str
,
test_suite
:
str
,
enable_prompt_embeds
:
bool
,
)
->
None
:
if
enable_prompt_embeds
and
envs
.
is_set
(
"VLLM_USE_V1"
)
and
envs
.
VLLM_USE_V1
:
pytest
.
skip
(
"enable_prompt_embeds is not supported in v1."
)
if
test_suite
!=
TARGET_TEST_SUITE
:
pytest
.
skip
(
f
"Skip test for
{
test_suite
}
"
)
with
monkeypatch
.
context
()
as
monkeypatch_context
:
if
model
==
"meta-llama/Llama-3.2-1B-Instruct"
and
distributed_executor_backend
==
"ray"
and
attention_backend
==
""
and
test_suite
==
"L4"
:
# noqa
# test Ray Compiled Graph
if
enable_prompt_embeds
:
pytest
.
skip
(
"enable_prompt_embeds does not work with ray compiled dag."
)
monkeypatch_context
.
setenv
(
"VLLM_USE_RAY_SPMD_WORKER"
,
"1"
)
monkeypatch_context
.
setenv
(
"VLLM_USE_RAY_COMPILED_DAG"
,
"1"
)
...
...
@@ -147,12 +188,26 @@ def test_models_distributed(
dtype
=
dtype
,
tensor_parallel_size
=
2
,
distributed_executor_backend
=
distributed_executor_backend
,
enable_prompt_embeds
=
enable_prompt_embeds
,
gpu_memory_utilization
=
0.7
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
if
enable_prompt_embeds
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
torch
.
no_grad
():
prompt_embeds
=
hf_model
.
get_prompt_embeddings
(
example_prompts
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompt_embeds
,
max_tokens
)
vllm_outputs
=
_fix_prompt_embed_outputs
(
vllm_outputs
,
hf_model
,
example_prompts
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
else
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
...
...
tests/compile/backend.py
View file @
4eabe123
...
...
@@ -5,6 +5,8 @@ from typing import Callable, Union
from
torch
import
fx
from
vllm.compilation.fx_utils
import
(
find_specified_fn
,
find_specified_fn_maybe
)
from
vllm.compilation.inductor_pass
import
InductorPass
from
vllm.config
import
get_current_vllm_config
...
...
@@ -44,3 +46,19 @@ class TestBackend:
self
.
graph_post_pass
=
deepcopy
(
graph
)
# assign by reference, will reflect the final state of the graph
self
.
final_graph
=
graph
def check_before_ops(self,
                     ops,
                     find_fn=find_specified_fn,
                     find_fn_maybe=find_specified_fn_maybe,
                     ops_fully_replaced=True):
    """Check the ops expected in the graph captured before the pass.

    Args:
        ops: iterable of ops to look up.
        find_fn: lookup called as ``find_fn(nodes, op)`` on the pre-pass
            graph nodes (presumably raises when the op is missing -- TODO
            confirm in fx_utils).
        find_fn_maybe: lookup whose result must be ``None`` on the post-pass
            graph nodes when ``ops_fully_replaced`` is set.
        ops_fully_replaced: when true, additionally assert that each op is
            no longer found in the post-pass graph.
    """
    for target in ops:
        find_fn(self.graph_pre_pass.nodes, target)
        if not ops_fully_replaced:
            continue
        leftover = find_fn_maybe(self.graph_post_pass.nodes, target)
        assert leftover is None
def check_after_ops(self,
                    ops,
                    find_fn=find_specified_fn,
                    find_fn_maybe=find_specified_fn_maybe):
    """Check the ops expected in the graph captured after the pass.

    Each op is looked up with ``find_fn`` in the post-pass graph nodes and
    must yield ``None`` from ``find_fn_maybe`` on the pre-pass graph nodes,
    i.e. it has to be introduced by the pass rather than already present.

    Args:
        ops: iterable of ops to look up.
        find_fn: lookup called as ``find_fn(nodes, op)``.
        find_fn_maybe: lookup whose result must be ``None`` on the pre-pass
            graph nodes.
    """
    for target in ops:
        find_fn(self.graph_post_pass.nodes, target)
        already_there = find_fn_maybe(self.graph_pre_pass.nodes, target)
        assert already_there is None
tests/compile/test_async_tp.py
0 → 100644
View file @
4eabe123
# SPDX-License-Identifier: Apache-2.0
import
json
import
pytest
import
torch
import
vllm.envs
as
envs
from
vllm.compilation.collective_fusion
import
AsyncTPPass
from
vllm.config
import
(
CompilationConfig
,
DeviceConfig
,
ModelConfig
,
PassConfig
,
VllmConfig
)
from
vllm.distributed
import
(
tensor_model_parallel_all_gather
,
tensor_model_parallel_reduce_scatter
)
from
vllm.distributed.parallel_state
import
(
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.platforms
import
current_platform
from
vllm.utils
import
update_environment_variables
from
..models.registry
import
HF_EXAMPLE_MODELS
from
..utils
import
(
compare_two_settings
,
create_new_process_for_each_test
,
multi_gpu_test
)
from
.backend
import
TestBackend
# Module-level list of sample text prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
class TestMMRSModel(torch.nn.Module):
    """Toy module whose forward is a matmul followed by a reduce-scatter."""

    def __init__(self, hidden_size=16):
        super().__init__()
        self.hidden_size = hidden_size
        # Frozen (2 * hidden_size, hidden_size) weight, filled below.
        raw_weight = torch.empty((self.hidden_size * 2, hidden_size))
        self.gate_proj = torch.nn.Parameter(raw_weight, requires_grad=False)
        # Initialize weights
        torch.nn.init.normal_(self.gate_proj, std=0.02)

    def forward(self, hidden_states):
        """
        Forward pass implementing the mm + reduce scatter in the FX graph
        """
        # Flatten the input to rows of width hidden_size.
        flat = hidden_states.reshape(-1, self.hidden_size)

        # Multiply by the transposed weight, then reduce-scatter along dim 0.
        weight_t = self.gate_proj.permute(1, 0)
        product = torch.mm(flat, weight_t)
        return tensor_model_parallel_reduce_scatter(product, dim=0)

    def ops_in_model_before(self):
        """Ops expected in the graph before the pass runs."""
        return [torch.ops.vllm.reduce_scatter.default]

    def ops_in_model_after(self):
        """Ops expected in the graph after the pass runs."""
        return [torch.ops.symm_mem.fused_matmul_reduce_scatter.default]
class TestAGMMModel(torch.nn.Module):
    """Toy module whose forward is an all-gather followed by a matmul."""

    def __init__(self, hidden_size=16):
        super().__init__()
        self.hidden_size = hidden_size
        # Frozen square (hidden_size, hidden_size) weight, filled below.
        raw_weight = torch.empty((hidden_size, hidden_size))
        self.weight = torch.nn.Parameter(raw_weight, requires_grad=False)
        # Initialize weights
        torch.nn.init.normal_(self.weight, std=0.02)

    def forward(self, hidden_states):
        """
        Forward pass implementing the mm + all gather in the FX graph
        """
        # Flatten the input to rows of width hidden_size.
        flat = hidden_states.reshape(-1, self.hidden_size)
        # Gather along dim 0, then multiply by the transposed weight.
        gathered = tensor_model_parallel_all_gather(flat, dim=0)
        weight_t = self.weight.permute(1, 0)
        return torch.mm(gathered, weight_t)

    def ops_in_model_before(self):
        """Ops expected in the graph before the pass runs."""
        return [torch.ops.vllm.all_gather.default]

    def ops_in_model_after(self):
        """Ops expected in the graph after the pass runs."""
        return [torch.ops.symm_mem.fused_all_gather_matmul.default]
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("test_model", [TestMMRSModel, TestAGMMModel])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seq_len", [16])
@pytest.mark.parametrize("hidden_size", [16])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
                    reason="Only test on CUDA")
def test_async_tp_pass_replace(test_model: str, batch_size: int, seq_len: int,
                               hidden_size: int, dtype: torch.dtype):
    """Run ``async_tp_pass_on_test_model`` in two spawned worker processes.

    NOTE(review): ``test_model`` is annotated ``str`` but is parametrized
    with the model classes themselves.
    """
    num_processes = 2
    # Arguments appended after the process index that spawn passes first.
    worker_args = (num_processes, test_model, batch_size, seq_len,
                   hidden_size, dtype)

    # need to use torch.mp.spawn otherwise will have problems with
    # torch.distributed and cuda
    torch.multiprocessing.spawn(async_tp_pass_on_test_model,
                                args=worker_args,
                                nprocs=num_processes)
def async_tp_pass_on_test_model(local_rank: int, world_size: int,
                                test_model_cls: torch.nn.Module,
                                batch_size: int, seq_len: int,
                                hidden_size: int, dtype: torch.dtype):
    """Per-process body: compile a toy model with AsyncTPPass and check ops.

    Args:
        local_rank: index of this process; also selects ``cuda:{local_rank}``.
        world_size: number of processes, used as the tensor-parallel size.
        test_model_cls: model class, instantiated with ``hidden_size``.
        batch_size, seq_len: their product is the number of input rows.
        hidden_size: width of the random input and of the model.
        dtype: default dtype and dtype of the random input.
    """
    current_platform.seed_everything(0)

    # Bind this process to its own GPU and make it the default target.
    rank_device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(rank_device)
    torch.set_default_device(rank_device)
    torch.set_default_dtype(dtype)

    rank_text = str(local_rank)
    update_environment_variables({
        'RANK': rank_text,
        'LOCAL_RANK': rank_text,
        'WORLD_SIZE': str(world_size),
        'MASTER_ADDR': 'localhost',
        'MASTER_PORT': '12345',
    })

    # initialize distributed
    init_distributed_environment()
    initialize_model_parallel(tensor_model_parallel_size=world_size)

    # configure vllm config for AsyncTPPass
    pass_config = PassConfig(enable_async_tp=True)
    vllm_config = VllmConfig()
    vllm_config.compilation_config = CompilationConfig(
        pass_config=pass_config)
    vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))

    # this is a fake model name to construct the model config
    # in the vllm_config, it's not really used.
    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
    vllm_config.model_config = ModelConfig(model=model_name,
                                           task="auto",
                                           tokenizer=model_name,
                                           tokenizer_mode="auto",
                                           trust_remote_code=True,
                                           dtype=dtype,
                                           seed=42)

    backend = TestBackend(AsyncTPPass(vllm_config))

    model = test_model_cls(hidden_size)
    num_rows = batch_size * seq_len
    hidden_states = torch.randn((num_rows, hidden_size),
                                dtype=dtype,
                                requires_grad=False)

    # Running the compiled model once drives the pass through the backend.
    compiled_model = torch.compile(model, backend=backend)
    compiled_model(hidden_states)

    # The un-fused collective must be found in the pre-pass graph; with
    # ops_fully_replaced=False its absence afterwards is not asserted.
    backend.check_before_ops(model.ops_in_model_before(),
                             ops_fully_replaced=False)

    # The fused op must be found in the post-pass graph and must not be
    # found in the pre-pass graph.
    backend.check_after_ops(model.ops_in_model_after())
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B-Instruct"])
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("async_tp_enabled", [True])
@pytest.mark.parametrize("distributed_backend", ["mp"])
@pytest.mark.parametrize("eager_mode", [False, True])
def test_async_tp_pass_correctness(
    model_id: str,
    tp_size: int,
    async_tp_enabled: bool,
    distributed_backend: str,
    eager_mode: bool,
    num_gpus_available: int,
):
    """Compare a TP run with the async-TP compilation config to plain TP."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")
    model_info.check_available_online(on_fail="skip")

    pp_size = 1
    if num_gpus_available < tp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")

    common_args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if eager_mode:
        common_args.append("--enforce-eager")

    compilation_config = {
        'level': 3,
        'compile_sizes': [2, 4, 8],
        'splitting_ops': [],
        'pass_config': {
            'enable_async_tp': async_tp_enabled
        },
    }

    # Both runs share one and the same environment dict.
    shared_env = {
        "VLLM_USE_V1": "1",
    }

    # Flags common to both runs; only the executor backend and the
    # compilation config differ below.
    tp_prefix = [*common_args, "--tensor-parallel-size", str(tp_size)]

    async_tp_args = [
        *tp_prefix,
        "--distributed-executor-backend",
        distributed_backend,
        "--compilation_config",
        json.dumps(compilation_config),
    ]
    tp_args = [
        *tp_prefix,
        "--distributed-executor-backend",
        "mp",
    ]

    compare_two_settings(model_id,
                         async_tp_args,
                         tp_args,
                         shared_env,
                         shared_env,
                         method="generate")
tests/compile/test_fusion.py
View file @
4eabe123
...
...
@@ -29,6 +29,10 @@ class TestModel(torch.nn.Module):
self
.
cutlass_fp8_enabled
=
cutlass_fp8_enabled
self
.
norm
=
[
RMSNorm
(
hidden_size
,
eps
)
for
_
in
range
(
3
)]
self
.
wscale
=
[
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
for
_
in
range
(
2
)]
self
.
key
=
QuantKey
(
dtype
=
FP8_DTYPE
,
static
=
static
,
per_tensor
=
static
,
symmetric
=
True
)
if
static
:
self
.
scale
=
[
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
for
_
in
range
(
2
)]
else
:
...
...
@@ -59,6 +63,15 @@ class TestModel(torch.nn.Module):
y3
,
resid
=
self
.
norm
[
2
](
x3
,
resid
)
# use resid here
return
y3
def ops_in_model_before(self):
    """Return the quant op registered in ``QUANT_OPS`` for ``self.key``."""
    quant_op = QUANT_OPS[self.key]
    return [quant_op]
def ops_in_model_after(self):
    """Return the two ``FUSED_OPS`` entries for ``self.key``.

    The entries are looked up with ``FusedRMSQuantKey(self.key, False)``
    and ``FusedRMSQuantKey(self.key, True)``, in that order.
    """
    return [
        FUSED_OPS[FusedRMSQuantKey(self.key, second_arg)]
        for second_arg in (False, True)
    ]
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float16
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
64
,
3392
,
4096
])
...
...
@@ -107,25 +120,10 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
torch
.
testing
.
assert_close
(
result
,
result2
,
atol
=
ATOL
,
rtol
=
RTOL
)
# Check substitution worked
pre_nodes
=
backend
.
graph_pre_pass
.
nodes
post_nodes
=
backend
.
graph_post_pass
.
nodes
# static is per-tensor, dynamic is per-token
key
=
QuantKey
(
dtype
=
FP8_DTYPE
,
static
=
static
,
per_tensor
=
static
,
symmetric
=
True
)
rms_quant
=
FUSED_OPS
[
FusedRMSQuantKey
(
key
,
False
)]
add_rms_quant
=
FUSED_OPS
[
FusedRMSQuantKey
(
key
,
True
)]
fp8_quant
=
QUANT_OPS
[
key
]
# In pre-nodes, fp8 quant should be there and fused kernels should not
assert
find_auto_fn_maybe
(
pre_nodes
,
rms_quant
)
is
None
assert
find_auto_fn_maybe
(
pre_nodes
,
add_rms_quant
)
is
None
find_auto_fn
(
pre_nodes
,
fp8_quant
)
backend
.
check_before_ops
(
model
.
ops_in_model_before
(),
find_auto_fn
,
find_auto_fn_maybe
)
# In post-nodes, fused kernels should be there and fp8 quant should not
find_auto_fn
(
post_nodes
,
rms_quant
)
find_auto_fn
(
post_nodes
,
add_rms_quant
)
assert
find_auto_fn_maybe
(
post_nodes
,
fp8_quant
)
is
None
backend
.
check_after_ops
(
model
.
ops_in_model_after
(),
find_auto_fn
,
find_auto_fn_maybe
)
tests/compile/test_sequence_parallelism.py
View file @
4eabe123
...
...
@@ -5,9 +5,7 @@ import torch
import
vllm.envs
as
envs
from
vllm.compilation.fix_functionalization
import
FixFunctionalizationPass
from
vllm.compilation.fx_utils
import
(
find_auto_fn
,
find_auto_fn_maybe
,
find_specified_fn
,
find_specified_fn_maybe
,
is_func
)
from
vllm.compilation.fx_utils
import
find_auto_fn
,
find_auto_fn_maybe
,
is_func
from
vllm.compilation.sequence_parallelism
import
SequenceParallelismPass
from
vllm.config
import
(
CompilationConfig
,
DeviceConfig
,
ModelConfig
,
PassConfig
,
VllmConfig
)
...
...
@@ -21,17 +19,6 @@ from vllm.utils import update_environment_variables
from
..utils
import
multi_gpu_test
from
.backend
import
TestBackend
OPS_IN_MODEL_BEFORE
=
[
torch
.
ops
.
vllm
.
all_reduce
.
default
,
]
OPS_IN_MODEL_AFTER
=
[
torch
.
ops
.
vllm
.
reduce_scatter
.
default
,
torch
.
ops
.
vllm
.
all_gather
.
default
,
]
OPS_IN_MODEL
=
[
torch
.
ops
.
_C
.
fused_add_rms_norm
.
default
]
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
...
...
@@ -78,6 +65,18 @@ class TestModel(torch.nn.Module):
return
norm_output
,
residual_output
def ops_in_model_before(self):
    """Ops expected in the graph before the pass: the vllm all-reduce."""
    all_reduce = torch.ops.vllm.all_reduce.default
    return [all_reduce]
def ops_in_model_after(self):
    """Ops expected after the pass: reduce-scatter, then all-gather."""
    vllm_ops = torch.ops.vllm
    return [vllm_ops.reduce_scatter.default, vllm_ops.all_gather.default]
def ops_in_model(self):
    """Return the ``_C.fused_add_rms_norm`` op as a one-element list."""
    fused_add_rms_norm = torch.ops._C.fused_add_rms_norm.default
    return [fused_add_rms_norm]
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
...
...
@@ -156,26 +155,16 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
compiled_model_func
=
torch
.
compile
(
model
,
backend
=
backend_func
)
compiled_model_func
(
hidden_states
,
residual
)
# Check substitution worked
pre_nodes
=
backend_no_func
.
graph_pre_pass
.
nodes
post_nodes
=
backend_no_func
.
graph_post_pass
.
nodes
# In pre-nodes, all reduce should be there,
# reduce scatter and all gather should not
for
op
in
OPS_IN_MODEL_BEFORE
:
find_specified_fn
(
pre_nodes
,
op
)
for
op
in
OPS_IN_MODEL_AFTER
:
assert
find_specified_fn_maybe
(
pre_nodes
,
op
)
is
None
backend_no_func
.
check_before_ops
(
model
.
ops_in_model_before
())
# In post-nodes, reduce scatter and all gather should be there,
# all reduce should not
for
op
in
OPS_IN_MODEL_AFTER
:
find_specified_fn
(
post_nodes
,
op
)
for
op
in
OPS_IN_MODEL_BEFORE
:
assert
find_specified_fn_maybe
(
post_nodes
,
op
)
is
None
backend_no_func
.
check_after_ops
(
model
.
ops_in_model_after
())
# check if the functionalization pass is applied
for
op
in
OPS_IN_MODEL
:
for
op
in
model
.
ops_in_model
()
:
find_auto_fn
(
backend_no_func
.
graph_post_pass
.
nodes
,
op
)
assert
find_auto_fn_maybe
(
backend_func
.
graph_post_pass
.
nodes
,
op
)
is
None
# noqa: E501
...
...
@@ -183,7 +172,7 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
# make sure the ops were all de-functionalized
found
=
dict
()
for
node
in
backend_func
.
graph_post_pass
.
nodes
:
for
op
in
OPS_IN_MODEL
:
for
op
in
model
.
ops_in_model
()
:
if
is_func
(
node
,
op
):
found
[
op
]
=
True
assert
all
(
found
[
op
]
for
op
in
OPS_IN_MODEL
)
assert
all
(
found
[
op
]
for
op
in
model
.
ops_in_model
()
)
tests/conftest.py
View file @
4eabe123
...
...
@@ -430,6 +430,15 @@ class HfRunner:
return
all_inputs
def get_prompt_embeddings(self, prompts: list[str]) -> list[torch.Tensor]:
    """Return one input-embedding tensor per prompt.

    Each prompt's inputs (from ``self.get_inputs``) are passed through
    ``self.wrap_device``, their ``"input_ids"`` are fed to the model's input
    embedding layer, and dimension 0 of the result is squeezed.
    """
    return [
        self.model.get_input_embeddings()(
            self.wrap_device(prompt_inputs)["input_ids"]).squeeze(0)
        for prompt_inputs in self.get_inputs(prompts)
    ]
def
classify
(
self
,
prompts
:
list
[
str
])
->
list
[
str
]:
# output is final logits
all_inputs
=
self
.
get_inputs
(
prompts
)
...
...
tests/distributed/test_events.py
View file @
4eabe123
...
...
@@ -119,13 +119,12 @@ def test_topic_filtering(publisher_config):
"""
publisher_config
.
replay_endpoint
=
None
cfg
=
publisher_config
.
model_copy
()
cfg
.
topic
=
"foo"
pub
=
EventPublisherFactory
.
create
(
cfg
)
publisher_config
.
topic
=
"foo"
pub
=
EventPublisherFactory
.
create
(
publisher_config
)
from
.conftest
import
MockSubscriber
sub_foo
=
MockSubscriber
(
cf
g
.
endpoint
,
None
,
"foo"
)
sub_bar
=
MockSubscriber
(
cf
g
.
endpoint
,
None
,
"bar"
)
sub_foo
=
MockSubscriber
(
publisher_confi
g
.
endpoint
,
None
,
"foo"
)
sub_bar
=
MockSubscriber
(
publisher_confi
g
.
endpoint
,
None
,
"bar"
)
try
:
time
.
sleep
(
0.1
)
...
...
tests/distributed/test_shm_broadcast.py
View file @
4eabe123
...
...
@@ -9,7 +9,7 @@ import torch.distributed as dist
from
vllm.distributed.device_communicators.shm_broadcast
import
MessageQueue
from
vllm.distributed.utils
import
StatelessProcessGroup
from
vllm.utils
import
get_ip
,
get_open_port
,
update_environment_variables
from
vllm.utils
import
get_open_port
,
update_environment_variables
def
get_arrays
(
n
:
int
,
seed
:
int
=
0
)
->
list
[
np
.
ndarray
]:
...
...
@@ -60,12 +60,12 @@ def worker_fn():
rank
=
dist
.
get_rank
()
if
rank
==
0
:
port
=
get_open_port
()
ip
=
get_ip
()
ip
=
'127.0.0.1'
dist
.
broadcast_object_list
([
ip
,
port
],
src
=
0
)
else
:
recv
=
[
None
,
None
]
dist
.
broadcast_object_list
(
recv
,
src
=
0
)
ip
,
port
=
recv
ip
,
port
=
recv
# type: ignore
stateless_pg
=
StatelessProcessGroup
.
create
(
ip
,
port
,
rank
,
dist
.
get_world_size
())
...
...
@@ -107,10 +107,10 @@ def worker_fn():
if
pg
==
dist
.
group
.
WORLD
:
dist
.
barrier
()
print
(
"torch distributed passed the test!"
)
print
(
f
"torch distributed passed the test!
Rank
{
rank
}
"
)
else
:
pg
.
barrier
()
print
(
"StatelessProcessGroup passed the test!"
)
print
(
f
"StatelessProcessGroup passed the test!
Rank
{
rank
}
"
)
def
test_shm_broadcast
():
...
...
tests/entrypoints/llm/test_guided_generate.py
View file @
4eabe123
# SPDX-License-Identifier: Apache-2.0
import
json
import
re
import
weakref
from
enum
import
Enum
import
jsonschema
import
pytest
import
regex
as
re
from
pydantic
import
BaseModel
from
vllm.distributed
import
cleanup_dist_env_and_memory
...
...
tests/entrypoints/openai/correctness/test_mteb.py
0 → 100644
View file @
4eabe123
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
from
tests.models.language.pooling.mteb_utils
import
(
MTEB_EMBED_TASKS
,
OpenAIClientMtebEncoder
,
run_mteb_embed_task
,
run_mteb_embed_task_st
)
from
tests.utils
import
RemoteOpenAIServer
# Set the VLLM_LOGGING_LEVEL environment variable to WARNING at import time.
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

# Model served by the `server` fixture and wrapped by the MTEB encoder.
MODEL_NAME = "BAAI/bge-m3"
# Value passed to the server's --dtype flag.
DTYPE = "float16"
# Reference score: because it is truthy, test_mteb uses it instead of calling
# run_mteb_embed_task_st.
MAIN_SCORE = 0.7873427091972599
@pytest.fixture(scope="module")
def server():
    """Module-scoped fixture yielding a running ``RemoteOpenAIServer``."""
    cli_flags = {
        "--task": "embed",
        "--dtype": DTYPE,
    }
    # Flatten the flag/value pairs, then append the remaining options.
    args = [token for pair in cli_flags.items() for token in pair]
    args += ["--enforce-eager", "--max-model-len", "512"]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
def test_mteb(server):
    """Check the served model's MTEB score against the reference score."""
    encoder = OpenAIClientMtebEncoder(MODEL_NAME, server.get_client())
    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)

    # Use the recorded score when it is truthy; otherwise compute it.
    if MAIN_SCORE:
        st_main_score = MAIN_SCORE
    else:
        st_main_score = run_mteb_embed_task_st(MODEL_NAME, MTEB_EMBED_TASKS)

    difference = st_main_score - vllm_main_score
    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", difference)

    assert st_main_score == pytest.approx(vllm_main_score, rel=1e-4)
tests/entrypoints/openai/test_chat.py
View file @
4eabe123
...
...
@@ -2,13 +2,13 @@
# imports for guided decoding tests
import
json
import
re
from
typing
import
Optional
import
jsonschema
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
import
regex
as
re
import
requests
import
torch
from
openai
import
BadRequestError
,
OpenAI
...
...
tests/entrypoints/openai/test_completion.py
View file @
4eabe123
# SPDX-License-Identifier: Apache-2.0
# imports for guided decoding tests
import
json
import
re
import
shutil
from
tempfile
import
TemporaryDirectory
from
typing
import
Optional
...
...
@@ -11,6 +9,7 @@ import jsonschema
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
import
regex
as
re
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
...
...
tests/entrypoints/openai/test_openai_schema.py
View file @
4eabe123
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Final
import
pytest
import
schemathesis
from
hypothesis
import
settings
from
schemathesis
import
GenerationConfig
from
...utils
import
RemoteOpenAIServer
...
...
@@ -9,6 +12,8 @@ schemathesis.experimental.OPEN_API_3_1.enable()
MODEL_NAME
=
"HuggingFaceTB/SmolVLM-256M-Instruct"
MAXIMUM_IMAGES
=
2
DEFAULT_TIMEOUT_SECONDS
:
Final
[
int
]
=
10
LONG_TIMEOUT_SECONDS
:
Final
[
int
]
=
60
@
pytest
.
fixture
(
scope
=
"module"
)
...
...
@@ -42,8 +47,58 @@ def get_schema(server):
schema
=
schemathesis
.
from_pytest_fixture
(
"get_schema"
)
@schemathesis.hook
def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
    """Drop generated cases that the server is known not to support.

    Only `POST /tokenize` is filtered; for every other operation the
    strategy is returned unchanged.

    Args:
        context: Hook context; its `operation` must be set.
        strategy: The case-generation strategy for the current operation.

    Returns:
        The strategy, filtered through `no_file_type` for `POST /tokenize`.
    """
    op = context.operation
    assert op is not None

    # The operation is fixed for the whole strategy, so decide once whether
    # a filter is needed instead of re-checking it for every generated case.
    if op.method.lower() != "post" or op.path != "/tokenize":
        return strategy

    def no_file_type(case: schemathesis.models.Case):
        """
        This filter skips test cases for the `POST /tokenize` endpoint where the
        HTTP request body uses `"type": "file"` in any message's content.
        We expect these cases to fail because that type isn't implemented here
        https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095

        Example test cases that are skipped:
        curl -X POST -H 'Content-Type: application/json' \
            -d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
            http://localhost:8000/tokenize

        curl -X POST -H 'Content-Type: application/json' \
            -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
            http://localhost:8000/tokenize
        """  # noqa: E501
        body = getattr(case, "body", None)
        if not isinstance(body, dict):
            return True
        messages = body.get("messages")
        if not isinstance(messages, list):
            return True

        for message in messages:
            if not isinstance(message, dict):
                continue
            content = message.get("content", [])
            if not isinstance(content, list):
                continue
            # Content parts are not guaranteed to be objects; guard before
            # calling `.get` so a non-dict part cannot raise AttributeError.
            if any(
                    isinstance(item, dict) and item.get("type") == "file"
                    for item in content):
                return False
        return True

    return strategy.filter(no_file_type)
@schema.parametrize()
@schema.override(headers={"Content-Type": "application/json"})
@settings(deadline=LONG_TIMEOUT_SECONDS * 1000)
def test_openapi_stateless(case: schemathesis.Case):
    """Call one generated API case and validate the response against the schema.

    The request timeout is chosen per (METHOD, path); operations without an
    entry use `DEFAULT_TIMEOUT_SECONDS`. The hypothesis deadline is
    `LONG_TIMEOUT_SECONDS` expressed in milliseconds.
    """
    key = (
        case.operation.method.upper(),
        case.operation.path,
    )
    timeout = {
        # requires a longer timeout
        ("POST", "/v1/chat/completions"): LONG_TIMEOUT_SECONDS,
    }.get(key, DEFAULT_TIMEOUT_SECONDS)

    # No need to verify SSL certificate for localhost.
    # Issue the request exactly once, with the computed timeout; a second,
    # un-timed call would duplicate every request and ignore the timeout.
    case.call_and_validate(verify=False, timeout=timeout)
tests/entrypoints/openai/test_prompt_validation.py
View file @
4eabe123
# SPDX-License-Identifier: Apache-2.0
# imports for guided decoding tests
import
re
import
openai
import
pytest
import
regex
as
re
from
...utils
import
RemoteOpenAIServer
...
...
@@ -32,7 +31,7 @@ async def test_out_of_vocab_token_ids():
client
=
remote_server
.
get_async_client
()
with
pytest
.
raises
(
openai
.
BadRequestError
,
match
=
re
.
compile
(
'.*out of vocabulary.*'
)):
match
=
re
.
compile
(
'.*out of vocabulary.*'
)
.
pattern
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
999999
],
max_tokens
=
5
,
...
...
@@ -46,9 +45,10 @@ async def test_reject_multistep_with_guided_decoding():
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
with
pytest
.
raises
(
openai
.
BadRequestError
,
with
pytest
.
raises
(
openai
.
BadRequestError
,
match
=
re
.
compile
(
'.*Guided decoding .* multi-step decoding.*'
)):
'.*Guided decoding .* multi-step decoding.*'
)
.
pattern
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello"
,
...
...
tests/entrypoints/openai/test_score.py
View file @
4eabe123
# SPDX-License-Identifier: Apache-2.0
import
math
from
typing
import
Any
import
pytest
...
...
@@ -92,7 +90,7 @@ class TestModel:
hf_outputs
=
run_transformers
(
runner
,
model
,
text_pairs
)
for
i
in
range
(
len
(
vllm_outputs
)):
assert
math
.
isclose
(
hf_outputs
[
i
],
vllm_outputs
[
i
],
rel
_tol
=
0.01
)
assert
hf_outputs
[
i
]
==
pytest
.
approx
(
vllm_outputs
[
i
],
rel
=
0.01
)
def
test_text_1_list_text_2_list
(
self
,
server
:
RemoteOpenAIServer
,
model
:
dict
[
str
,
Any
],
runner
):
...
...
@@ -124,7 +122,7 @@ class TestModel:
hf_outputs
=
run_transformers
(
runner
,
model
,
text_pairs
)
for
i
in
range
(
len
(
vllm_outputs
)):
assert
math
.
isclose
(
hf_outputs
[
i
],
vllm_outputs
[
i
],
rel
_tol
=
0.01
)
assert
hf_outputs
[
i
]
==
pytest
.
approx
(
vllm_outputs
[
i
],
rel
=
0.01
)
def
test_text_1_str_text_2_str
(
self
,
server
:
RemoteOpenAIServer
,
model
:
dict
[
str
,
Any
],
runner
):
...
...
@@ -150,7 +148,7 @@ class TestModel:
hf_outputs
=
run_transformers
(
runner
,
model
,
text_pairs
)
for
i
in
range
(
len
(
vllm_outputs
)):
assert
math
.
isclose
(
hf_outputs
[
i
],
vllm_outputs
[
i
],
rel
_tol
=
0.01
)
assert
hf_outputs
[
i
]
==
pytest
.
approx
(
vllm_outputs
[
i
],
rel
=
0.01
)
def
test_score_max_model_len
(
self
,
server
:
RemoteOpenAIServer
,
model
:
dict
[
str
,
Any
]):
...
...
tests/entrypoints/openai/test_tensorizer_entrypoint.py
0 → 100644
View file @
4eabe123
# SPDX-License-Identifier: Apache-2.0
import
gc
import
json
import
tempfile
import
openai
import
pytest
import
pytest_asyncio
import
torch.cuda
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
tensorize_lora_adapter
,
tensorize_vllm_model
)
from
...utils
import
RemoteOpenAIServer
# Base model: serialized by the tensorize fixture and served by the server
# fixture below.
MODEL_NAME = "unsloth/llama-3.2-1b-Instruct"
# LoRA adapter passed to tensorize_lora_adapter in the tensorize fixture.
LORA_PATH = "davzoku/finqa_adapter_1b"
def
_cleanup
():
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
@pytest.fixture(autouse=True)
def cleanup():
    """Autouse fixture that runs `_cleanup()` for every test in this module."""
    _cleanup()
@pytest.fixture(scope='module')
def tmp_dir():
    """Yield the path of a module-scoped temporary directory.

    The directory is created by `tempfile.TemporaryDirectory` and lives
    until the context manager exits at fixture teardown.
    """
    with tempfile.TemporaryDirectory() as scratch_path:
        yield scratch_path
@pytest.fixture(scope='module')
def model_uri(tmp_dir):
    """Yield the path of the serialized model file inside `tmp_dir`."""
    serialized_path = f"{tmp_dir}/model.tensors"
    yield serialized_path
@pytest.fixture(scope="module")
def tensorize_model_and_lora(tmp_dir, model_uri):
    """Serialize the LoRA adapter and the base model once per module.

    The LoRA adapter is tensorized with `tmp_dir` as `lora_dir`, and the
    model is tensorized to `model_uri`. The fixture yields nothing; it is
    requested only for this side effect.
    """
    config = TensorizerConfig(tensorizer_uri=model_uri, lora_dir=tmp_dir)
    engine_args = EngineArgs(model=MODEL_NAME, device="cuda")

    tensorize_lora_adapter(LORA_PATH, config)
    tensorize_vllm_model(engine_args, config)

    # Manually invoke a _cleanup() here, as the cleanup()
    # fixture won't be guaranteed to be called after this
    # when this fixture is used for a test
    _cleanup()

    yield
@pytest.fixture(scope="module")
def server(model_uri, tensorize_model_and_lora):
    """Start a module-scoped OpenAI API server using the tensorizer load format.

    Requests `tensorize_model_and_lora` so the serialized artifacts are
    produced before the server is launched.
    """
    extra_config_json = json.dumps({
        "tensorizer_uri": model_uri,
    })

    ## Start OpenAI API server
    cli_args = [
        "--load-format",
        "tensorizer",
        "--device",
        "cuda",
        "--model-loader-extra-config",
        extra_config_json,
        "--enable-lora",
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the running `server` fixture."""
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    """Request one short greedy completion and check its shape and usage."""
    _cleanup()

    request = {
        "model": model_name,
        "prompt": "Hello, my name is",
        "max_tokens": 5,
        "temperature": 0.0,
    }
    completion = await client.completions.create(**request)

    # Basic response metadata.
    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 1
    assert completion.model == MODEL_NAME
    assert len(completion.choices) == 1

    # The single choice must have been cut off by the max_tokens limit.
    only_choice = completion.choices[0]
    assert len(only_choice.text) >= 5
    assert only_choice.finish_reason == "length"

    # Token accounting: 6 prompt tokens plus the 5 generated ones.
    expected_usage = openai.types.CompletionUsage(completion_tokens=5,
                                                  prompt_tokens=6,
                                                  total_tokens=11)
    assert completion.usage == expected_usage
Prev
1
…
14
15
16
17
18
19
20
21
22
…
34
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment