Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
469e903b
Commit
469e903b
authored
Mar 28, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.2' into v0.8.2-dev
parents
389ebcf7
25f560a6
Changes
535
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
250 additions
and
147 deletions
+250
-147
tests/distributed/test_pipeline_partition.py
tests/distributed/test_pipeline_partition.py
+57
-27
tests/distributed/test_pp_cudagraph.py
tests/distributed/test_pp_cudagraph.py
+29
-18
tests/distributed/test_pynccl.py
tests/distributed/test_pynccl.py
+2
-3
tests/distributed/test_shm_broadcast.py
tests/distributed/test_shm_broadcast.py
+1
-2
tests/distributed/test_torchrun_example.py
tests/distributed/test_torchrun_example.py
+10
-1
tests/encoder_decoder/test_e2e_correctness.py
tests/encoder_decoder/test_e2e_correctness.py
+11
-3
tests/engine/conftest.py
tests/engine/conftest.py
+11
-0
tests/engine/test_computed_prefix_blocks.py
tests/engine/test_computed_prefix_blocks.py
+1
-6
tests/engine/test_executor.py
tests/engine/test_executor.py
+7
-20
tests/engine/test_multi_step_output_processor.py
tests/engine/test_multi_step_output_processor.py
+1
-1
tests/engine/test_multiproc_workers.py
tests/engine/test_multiproc_workers.py
+3
-3
tests/engine/test_skip_tokenizer_init.py
tests/engine/test_skip_tokenizer_init.py
+5
-8
tests/entrypoints/llm/test_accuracy.py
tests/entrypoints/llm/test_accuracy.py
+10
-2
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+4
-14
tests/entrypoints/llm/test_collective_rpc.py
tests/entrypoints/llm/test_collective_rpc.py
+5
-14
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+5
-7
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+2
-5
tests/entrypoints/llm/test_generate_multiple_loras.py
tests/entrypoints/llm/test_generate_multiple_loras.py
+1
-3
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+74
-5
tests/entrypoints/llm/test_lazy_outlines.py
tests/entrypoints/llm/test_lazy_outlines.py
+11
-5
No files found.
Too many changes to show.
To preserve performance only
535 of 535+
files are displayed.
Plain diff
Email patch
tests/distributed/test_pipeline_partition.py
View file @
469e903b
...
...
@@ -7,30 +7,60 @@ import pytest
from
vllm.distributed.utils
import
get_pp_indices
def
test_custom_layer_partition
():
def
_verify
(
partition_str
,
num_layers
,
pp_size
,
goldens
):
bak
=
os
.
environ
.
get
(
"VLLM_PP_LAYER_PARTITION"
,
None
)
os
.
environ
[
"VLLM_PP_LAYER_PARTITION"
]
=
partition_str
for
pp_rank
,
golden
in
enumerate
(
goldens
):
assert
get_pp_indices
(
num_layers
,
pp_rank
,
pp_size
)
==
golden
if
bak
is
not
None
:
os
.
environ
[
"VLLM_PP_LAYER_PARTITION"
]
=
bak
# Even partition
_verify
(
"5,5,5,5"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
# Balanced partition
_verify
(
"4,6,6,4"
,
20
,
4
,
[(
0
,
4
),
(
4
,
10
),
(
10
,
16
),
(
16
,
20
)])
# Put reminder somewhere
_verify
(
"5,6,5,6"
,
22
,
4
,
[(
0
,
5
),
(
5
,
11
),
(
11
,
16
),
(
16
,
22
)])
# Invalid partition strings
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,5,5,5,"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,5,5,a"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
# Wrong number of partitions
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,5,5"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
# Wrong number of layers
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,5,5,5"
,
21
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
def
test_custom_layer_partition
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
def
_verify
(
partition_str
,
num_layers
,
pp_size
,
goldens
):
bak
=
os
.
environ
.
get
(
"VLLM_PP_LAYER_PARTITION"
,
None
)
m
.
setenv
(
"VLLM_PP_LAYER_PARTITION"
,
partition_str
)
for
pp_rank
,
golden
in
enumerate
(
goldens
):
assert
get_pp_indices
(
num_layers
,
pp_rank
,
pp_size
)
==
golden
if
bak
is
not
None
:
m
.
setenv
(
"VLLM_PP_LAYER_PARTITION"
,
bak
)
# Even partition
_verify
(
"5,5,5,5"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
# Balanced partition
_verify
(
"4,6,6,4"
,
20
,
4
,
[(
0
,
4
),
(
4
,
10
),
(
10
,
16
),
(
16
,
20
)])
# Put reminder somewhere
_verify
(
"5,6,5,6"
,
22
,
4
,
[(
0
,
5
),
(
5
,
11
),
(
11
,
16
),
(
16
,
22
)])
# Invalid partition strings
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,5,5,5,"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,5,5,a"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
# Wrong number of partitions
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,5,5"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
# Wrong number of layers
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,5,5,5"
,
21
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
@
pytest
.
mark
.
parametrize
(
"num_hidden_layers,pp_size,pp_rank,indices"
,
[
# pp_size 2
(
2
,
2
,
0
,
(
0
,
1
)),
(
2
,
2
,
1
,
(
1
,
2
)),
(
3
,
2
,
0
,
(
0
,
2
)),
(
3
,
2
,
1
,
(
2
,
3
)),
# pp_size 3
(
3
,
3
,
0
,
(
0
,
1
)),
(
3
,
3
,
1
,
(
1
,
2
)),
(
3
,
3
,
2
,
(
2
,
3
)),
(
4
,
3
,
0
,
(
0
,
1
)),
(
4
,
3
,
1
,
(
1
,
3
)),
(
4
,
3
,
2
,
(
3
,
4
)),
(
5
,
3
,
0
,
(
0
,
2
)),
(
5
,
3
,
1
,
(
2
,
4
)),
(
5
,
3
,
2
,
(
4
,
5
)),
])
def
test_uneven_auto_partition
(
num_hidden_layers
:
int
,
pp_size
:
int
,
pp_rank
:
int
,
indices
:
tuple
[
int
,
int
],
):
assert
indices
==
get_pp_indices
(
num_hidden_layers
,
pp_rank
,
pp_size
)
tests/distributed/test_pp_cudagraph.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
from
__future__
import
annotations
import
os
from
typing
import
TYPE_CHECKING
import
os
import
pytest
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
,
models_path_prefix
from
..utils
import
compare_two_settings
,
create_new_process_for_each_test
,
models_path_prefix
if
TYPE_CHECKING
:
from
typing_extensions
import
LiteralString
@
pytest
.
mark
.
parametrize
(
"PP_SIZE, MODEL_NAME"
,
[
...
...
@@ -14,19 +19,25 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test, models
"FLASH_ATTN"
,
# "FLASHINFER",
])
@
fork_new_process_for_each_test
def
test_pp_cudagraph
(
PP_SIZE
,
MODEL_NAME
,
ATTN_BACKEND
):
cudagraph_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--pipeline-parallel-size"
,
str
(
PP_SIZE
),
"--distributed-executor-backend"
,
"mp"
,
]
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
ATTN_BACKEND
eager_args
=
cudagraph_args
+
[
"--enforce-eager"
]
compare_two_settings
(
MODEL_NAME
,
eager_args
,
cudagraph_args
)
@
create_new_process_for_each_test
()
def
test_pp_cudagraph
(
monkeypatch
:
pytest
.
MonkeyPatch
,
PP_SIZE
:
int
,
MODEL_NAME
:
str
,
ATTN_BACKEND
:
LiteralString
,
):
with
monkeypatch
.
context
()
as
m
:
cudagraph_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--pipeline-parallel-size"
,
str
(
PP_SIZE
),
"--distributed-executor-backend"
,
"mp"
,
]
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
ATTN_BACKEND
)
eager_args
=
cudagraph_args
+
[
"--enforce-eager"
]
compare_two_settings
(
MODEL_NAME
,
eager_args
,
cudagraph_args
)
tests/distributed/test_pynccl.py
View file @
469e903b
...
...
@@ -2,7 +2,6 @@
import
multiprocessing
import
os
from
typing
import
Dict
,
List
import
pytest
import
torch
...
...
@@ -20,9 +19,9 @@ from vllm.utils import update_environment_variables
def
distributed_run
(
fn
,
world_size
):
number_of_processes
=
world_size
processes
:
L
ist
[
multiprocessing
.
Process
]
=
[]
processes
:
l
ist
[
multiprocessing
.
Process
]
=
[]
for
i
in
range
(
number_of_processes
):
env
:
D
ict
[
str
,
str
]
=
{}
env
:
d
ict
[
str
,
str
]
=
{}
env
[
'RANK'
]
=
str
(
i
)
env
[
'LOCAL_RANK'
]
=
str
(
i
)
env
[
'WORLD_SIZE'
]
=
str
(
number_of_processes
)
...
...
tests/distributed/test_shm_broadcast.py
View file @
469e903b
...
...
@@ -3,7 +3,6 @@
import
multiprocessing
import
random
import
time
from
typing
import
List
import
numpy
as
np
import
torch.distributed
as
dist
...
...
@@ -13,7 +12,7 @@ from vllm.distributed.utils import StatelessProcessGroup
from
vllm.utils
import
get_ip
,
get_open_port
,
update_environment_variables
def
get_arrays
(
n
:
int
,
seed
:
int
=
0
)
->
L
ist
[
np
.
ndarray
]:
def
get_arrays
(
n
:
int
,
seed
:
int
=
0
)
->
l
ist
[
np
.
ndarray
]:
np
.
random
.
seed
(
seed
)
sizes
=
np
.
random
.
randint
(
1
,
10_000
,
n
)
# on average, each array will have 5k elements
...
...
tests/distributed/test_torchrun_example.py
View file @
469e903b
...
...
@@ -9,6 +9,8 @@ import torch.distributed as dist
from
vllm
import
LLM
,
SamplingParams
from
vllm.distributed.parallel_state
import
get_world_group
dist
.
init_process_group
(
backend
=
"gloo"
)
# Create prompts
prompts
=
[
"Hello, my name is"
,
...
...
@@ -25,7 +27,8 @@ llm = LLM(model="facebook/opt-125m",
tensor_parallel_size
=
2
,
distributed_executor_backend
=
"external_launcher"
,
gpu_memory_utilization
=
random
.
uniform
(
0.7
,
0.9
),
swap_space
=
random
.
randint
(
1
,
4
))
swap_space
=
random
.
randint
(
1
,
4
),
seed
=
0
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
...
...
@@ -48,6 +51,12 @@ test_consistent_across_ranks(
test_consistent_across_ranks
(
llm
.
llm_engine
.
vllm_config
.
cache_config
.
num_gpu_blocks
)
# make sure we can access the model parameters from the calling process
# of the `LLM` instance.
params
=
list
(
llm
.
llm_engine
.
model_executor
.
driver_worker
.
worker
.
model_runner
.
model
.
parameters
())
test_consistent_across_ranks
(
len
(
params
))
# all ranks should have the same outputs
for
output
in
outputs
:
prompt
=
output
.
prompt
...
...
tests/encoder_decoder/test_e2e_correctness.py
View file @
469e903b
...
...
@@ -3,7 +3,7 @@
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
"""
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
import
pytest
import
os
...
...
@@ -17,7 +17,6 @@ from vllm.sequence import SampleLogprobs
from
..conftest
import
DecoderPromptType
from
..models.utils
import
check_logprobs_close
from
..utils
import
models_path_prefix
from
vllm.utils
import
is_hip
from
vllm.attention.backends.utils
import
STR_NOT_IMPL_ENC_DEC_ROCM_HIP
LIST_ENC_DEC_SUPPORTED_BACKENDS
=
[
...
...
@@ -25,8 +24,17 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [
]
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
def
vllm_to_hf_output
(
vllm_output
:
T
uple
[
L
ist
[
int
],
str
,
Optional
[
SampleLogprobs
]],
vllm_output
:
t
uple
[
l
ist
[
int
],
str
,
Optional
[
SampleLogprobs
]],
decoder_prompt_type
:
DecoderPromptType
,
):
"""Sanitize vllm output to be comparable with hf output."""
...
...
tests/engine/conftest.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
pytest
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
tests/engine/test_computed_prefix_blocks.py
View file @
469e903b
...
...
@@ -2,16 +2,12 @@
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
def
test_computed_prefix_blocks
(
model
:
str
,
block_size
:
int
):
# This test checks if we are able to run the engine to completion
...
...
@@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int):
"decoration."
)
engine_args
=
EngineArgs
(
model
=
model
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
block_size
=
block_size
,
enable_prefix_caching
=
True
)
...
...
tests/engine/test_executor.py
View file @
469e903b
...
...
@@ -2,11 +2,10 @@
import
asyncio
import
os
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Union
from
typing
import
Any
,
Callable
,
Optional
,
Union
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
...
...
@@ -15,10 +14,6 @@ from vllm.sampling_params import SamplingParams
import
os
from
..utils
import
models_path_prefix
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
RUNAI_STREAMER_LOAD_FORMAT
=
LoadFormat
.
RUNAI_STREAMER
class
Mock
:
...
...
...
@@ -29,8 +24,8 @@ class CustomUniExecutor(UniProcExecutor):
def
collective_rpc
(
self
,
method
:
Union
[
str
,
Callable
],
timeout
:
Optional
[
float
]
=
None
,
args
:
T
uple
=
(),
kwargs
:
Optional
[
D
ict
]
=
None
)
->
L
ist
[
Any
]:
args
:
t
uple
=
(),
kwargs
:
Optional
[
d
ict
]
=
None
)
->
l
ist
[
Any
]:
# Drop marker to show that this was ran
with
open
(
".marker"
,
"w"
):
...
...
...
@@ -39,12 +34,10 @@ class CustomUniExecutor(UniProcExecutor):
CustomUniExecutorAsync
=
CustomUniExecutor
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)])
def
test_custom_executor_type_checking
(
model
):
with
pytest
.
raises
(
ValueError
):
engine_args
=
EngineArgs
(
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
Mock
)
LLMEngine
.
from_engine_args
(
engine_args
)
with
pytest
.
raises
(
ValueError
):
...
...
@@ -53,8 +46,7 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)])
def
test_custom_executor
(
model
,
tmp_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp_path
)
...
...
@@ -63,7 +55,6 @@ def test_custom_executor(model, tmp_path):
engine_args
=
EngineArgs
(
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
CustomUniExecutor
,
enforce_eager
=
True
,
# reduce test time
)
...
...
@@ -78,8 +69,7 @@ def test_custom_executor(model, tmp_path):
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
def
test_custom_executor_async
(
model
,
tmp_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp_path
)
...
...
@@ -88,7 +78,6 @@ def test_custom_executor_async(model, tmp_path):
engine_args
=
AsyncEngineArgs
(
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
CustomUniExecutorAsync
,
enforce_eager
=
True
,
# reduce test time
)
...
...
@@ -107,8 +96,7 @@ def test_custom_executor_async(model, tmp_path):
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
def
test_respect_ray
(
model
):
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
...
...
@@ -117,7 +105,6 @@ def test_respect_ray(model):
engine_args
=
EngineArgs
(
model
=
model
,
distributed_executor_backend
=
"ray"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
enforce_eager
=
True
,
# reduce test time
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
...
...
tests/engine/output_processor
/test_multi_step
.py
→
tests/engine/
test_multi_step_
output_processor.py
View file @
469e903b
...
...
@@ -15,7 +15,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
from
vllm.transformers_utils.detokenizer
import
Detokenizer
from
vllm.utils
import
Counter
from
..
.
core.utils
import
create_seq_group
from
..core.utils
import
create_seq_group
@
pytest
.
mark
.
parametrize
(
"seq_output_len"
,
[
128
])
...
...
tests/engine/test_multiproc_workers.py
View file @
469e903b
...
...
@@ -4,7 +4,7 @@ import asyncio
from
concurrent.futures
import
ThreadPoolExecutor
from
functools
import
partial
from
time
import
sleep
from
typing
import
Any
,
List
,
Tuple
from
typing
import
Any
import
pytest
...
...
@@ -17,7 +17,7 @@ from vllm.worker.worker_base import WorkerWrapperBase
class
DummyWorkerWrapper
(
WorkerWrapperBase
):
"""Dummy version of vllm.worker.worker.Worker"""
def
worker_method
(
self
,
worker_input
:
Any
)
->
T
uple
[
int
,
Any
]:
def
worker_method
(
self
,
worker_input
:
Any
)
->
t
uple
[
int
,
Any
]:
sleep
(
0.05
)
if
isinstance
(
worker_input
,
Exception
):
...
...
@@ -27,7 +27,7 @@ class DummyWorkerWrapper(WorkerWrapperBase):
return
self
.
rpc_rank
,
input
def
_start_workers
()
->
T
uple
[
L
ist
[
ProcessWorkerWrapper
],
WorkerMonitor
]:
def
_start_workers
()
->
t
uple
[
l
ist
[
ProcessWorkerWrapper
],
WorkerMonitor
]:
result_handler
=
ResultHandler
()
vllm_config
=
VllmConfig
()
workers
=
[
...
...
tests/engine/test_skip_tokenizer_init.py
View file @
469e903b
...
...
@@ -2,22 +2,19 @@
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.entrypoints.llm
import
LLM
from
vllm.sampling_params
import
SamplingParams
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
def
test_skip_tokenizer_initialization
(
model
:
str
):
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
llm
=
LLM
(
model
=
model
,
skip_tokenizer_init
=
True
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
)
llm
=
LLM
(
model
=
model
,
skip_tokenizer_init
=
True
,
)
sampling_params
=
SamplingParams
(
prompt_logprobs
=
True
,
detokenize
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
"cannot pass text prompts when"
):
...
...
tests/entrypoints/llm/test_accuracy.py
View file @
469e903b
...
...
@@ -44,10 +44,14 @@ def run_test(more_args=None):
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
# TODO: [AlexM] Fix it with new CI/CD tests
TPU_TP_TEST_STR
=
""
#"tensor_parallel_size=4"
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
()
and
not
current_platform
.
is_tpu
(),
reason
=
"V1 is currently only supported on CUDA and TPU"
)
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
):
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Run with the V1 Engine."""
with
monkeypatch
.
context
()
as
m
:
...
...
@@ -58,10 +62,14 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
# Limit compilation time for TPU V1
more_args
=
"max_num_seqs=64"
# Add TP test (if provided)
if
TPU_TP_TEST_STR
:
more_args
+=
",{}"
.
format
(
TPU_TP_TEST_STR
)
run_test
(
more_args
)
def
test_lm_eval_accuracy_v0_engine
(
monkeypatch
):
def
test_lm_eval_accuracy_v0_engine
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Run with the V0 Engine."""
with
monkeypatch
.
context
()
as
m
:
...
...
tests/entrypoints/llm/test_chat.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
import
os
import
pytest
from
vllm
import
LLM
from
vllm.config
import
LoadFormat
from
...conftest
import
MODEL_WEIGHTS_S3_BUCKET
from
..openai.test_vision
import
TEST_IMAGE_URLS
from
...utils
import
models_path_prefix
RUNAI_STREAMER_LOAD_FORMAT
=
LoadFormat
.
RUNAI_STREAMER
def
test_chat
():
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Llama-3.2-1B-Instruct"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
))
prompt1
=
"Explain the concept of entropy."
messages
=
[
...
...
@@ -35,8 +28,7 @@ def test_chat():
def
test_multi_chat
():
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Llama-3.2-1B-Instruct"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
))
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
...
...
@@ -71,11 +63,9 @@ def test_multi_chat():
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
L
ist
[
str
]):
def
test_chat_multi_image
(
image_urls
:
l
ist
[
str
]):
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Phi-3.5-vision-instruct"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
dtype
=
"bfloat16"
,
model
=
"microsoft/Phi-3.5-vision-instruct"
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
enforce_eager
=
True
,
...
...
tests/entrypoints/llm/test_collective_rpc.py
View file @
469e903b
...
...
@@ -4,12 +4,12 @@ import pytest
from
vllm
import
LLM
from
...utils
import
fork
_new_process_for_each_test
from
...utils
import
create
_new_process_for_each_test
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"mp"
,
"ray"
])
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
def
test_collective_rpc
(
tp_size
,
backend
):
if
tp_size
==
1
and
backend
==
"ray"
:
pytest
.
skip
(
"Skip duplicate test case"
)
...
...
@@ -21,18 +21,9 @@ def test_collective_rpc(tp_size, backend):
def
echo_rank
(
self
):
return
self
.
rank
from
vllm.worker.worker
import
Worker
class
MyWorker
(
Worker
):
def
echo_rank
(
self
):
return
self
.
rank
llm
=
LLM
(
model
=
"s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct"
,
llm
=
LLM
(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
,
enforce_eager
=
True
,
load_format
=
"dummy"
,
tensor_parallel_size
=
tp_size
,
distributed_executor_backend
=
backend
,
worker_cls
=
MyWorker
)
for
method
in
[
"echo_rank"
,
echo_rank
]:
assert
llm
.
collective_rpc
(
method
)
==
list
(
range
(
tp_size
))
distributed_executor_backend
=
backend
)
assert
llm
.
collective_rpc
(
echo_rank
)
==
list
(
range
(
tp_size
))
tests/entrypoints/llm/test_encode.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
weakref
from
typing
import
List
import
pytest
import
os
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
e5-mistral-7b-instruct
"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
intfloat/multilingual-e5-small
"
)
PROMPTS
=
[
"Hello, my name is"
,
...
...
@@ -35,11 +33,11 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_num_batched_tokens
=
32768
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.75
,
enforce_eager
=
True
)
enforce_eager
=
True
,
seed
=
0
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
...
...
@@ -49,8 +47,8 @@ def llm():
cleanup_dist_env_and_memory
()
def
assert_outputs_equal
(
o1
:
L
ist
[
PoolingRequestOutput
],
o2
:
L
ist
[
PoolingRequestOutput
]):
def
assert_outputs_equal
(
o1
:
l
ist
[
PoolingRequestOutput
],
o2
:
l
ist
[
PoolingRequestOutput
]):
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
...
...
tests/entrypoints/llm/test_generate.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
weakref
from
typing
import
List
import
os
import
pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"distilgpt2"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
distilbert/
distilgpt2"
)
PROMPTS
=
[
"Hello, my name is"
,
...
...
@@ -33,7 +31,6 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.10
,
...
...
@@ -47,7 +44,7 @@ def llm():
cleanup_dist_env_and_memory
()
def
assert_outputs_equal
(
o1
:
L
ist
[
RequestOutput
],
o2
:
L
ist
[
RequestOutput
]):
def
assert_outputs_equal
(
o1
:
l
ist
[
RequestOutput
],
o2
:
l
ist
[
RequestOutput
]):
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
...
...
tests/entrypoints/llm/test_generate_multiple_loras.py
View file @
469e903b
...
...
@@ -8,12 +8,11 @@ import os
from
huggingface_hub
import
snapshot_download
from
vllm
import
LLM
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.lora.request
import
LoRARequest
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"zephyr-7b-beta"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
HuggingFaceH4/
zephyr-7b-beta"
)
PROMPTS
=
[
"Hello, my name is"
,
...
...
@@ -30,7 +29,6 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
tensor_parallel_size
=
1
,
max_model_len
=
8192
,
enable_lora
=
True
,
...
...
tests/entrypoints/llm/test_guided_generate.py
View file @
469e903b
...
...
@@ -7,8 +7,8 @@ import weakref
import
jsonschema
import
pytest
import
os
from
pydantic
import
BaseModel
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.entrypoints.llm
import
LLM
from
vllm.outputs
import
RequestOutput
...
...
@@ -17,16 +17,16 @@ from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen2.5-1.5B-Instruct"
)
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"lm-format-enforcer"
,
"xgrammar"
]
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"lm-format-enforcer"
,
"xgrammar"
,
"guidance"
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_model_len
=
1024
)
llm
=
LLM
(
model
=
MODEL_NAME
,
max_model_len
=
1024
,
seed
=
0
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
...
...
@@ -283,6 +283,22 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm):
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
@
pytest
.
mark
.
skip_global_cleanup
def
test_disable_guided_decoding_fallback
(
sample_regex
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
guided_decoding
=
GuidedDecodingParams
(
regex
=
sample_regex
,
backend
=
"xgrammar:no-fallback"
))
with
pytest
.
raises
(
ValueError
,
match
=
"xgrammar does not support regex guided decoding"
):
llm
.
generate
(
prompts
=
"This should fail"
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
def
test_guided_json_object
(
llm
,
guided_decoding_backend
:
str
):
...
...
@@ -312,3 +328,56 @@ def test_guided_json_object(llm, guided_decoding_backend: str):
# Parse to verify it is valid JSON
parsed_json
=
json
.
loads
(
generated_text
)
assert
isinstance
(
parsed_json
,
dict
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_json_with_any_whitespace_disabled
(
llm
):
class
ResponseSchema
(
BaseModel
):
clarifying_question
:
str
cost_per_serving
:
str
calories
:
str
type_dish_ids
:
str
type_meal_ids
:
str
product_ids
:
list
[
str
]
exclude_product_ids
:
list
[
str
]
allergen_ids
:
list
[
str
]
total_cooking_time
:
str
kitchen_ids
:
str
holiday_ids
:
str
# Note: Without this setting, the response is sometimes full of `\n`
# for some models. This option prevents that.
guided_decoding_backend
=
'xgrammar:disable-any-whitespace'
schema
=
ResponseSchema
.
model_json_schema
()
guided_params
=
GuidedDecodingParams
(
json
=
schema
,
backend
=
\
guided_decoding_backend
)
sampling_params
=
SamplingParams
(
max_tokens
=
2000
,
frequency_penalty
=
0
,
presence_penalty
=-
1.1
,
repetition_penalty
=
1.3
,
guided_decoding
=
guided_params
)
prompt
=
(
"<|im_start|>system
\n
You are Qwen, created by Alibaba Cloud. You"
"are a helpful assistant.<|im_end|>
\n
<|im_start|>user
\n
I want a "
"quick launch fast with $10.<|im_end|>
\n
<|im_start|>assistant
\n
"
)
outputs
=
llm
.
generate
(
prompts
=
prompt
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
assert
"
\n
"
not
in
generated_text
# Parse to verify it is valid JSON
parsed_json
=
json
.
loads
(
generated_text
)
assert
isinstance
(
parsed_json
,
dict
)
jsonschema
.
validate
(
instance
=
parsed_json
,
schema
=
schema
)
tests/entrypoints/llm/test_lazy_outlines.py
View file @
469e903b
...
...
@@ -4,14 +4,22 @@ import sys
import
os
from
contextlib
import
nullcontext
import
pytest
from
vllm_test_utils
import
BlameResult
,
blame
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
V1 only supports xgrammar so this is irrelevant.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
def
run_normal_opt125m
():
prompts
=
[
"Hello, my name is"
,
...
...
@@ -46,8 +54,7 @@ def run_normal():
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Create an LLM without guided decoding as a baseline.
llm
=
LLM
(
model
=
"s3://vllm-ci-model-weights/distilgpt2"
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.3
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
...
...
@@ -63,8 +70,7 @@ def run_normal():
def
run_lmfe
(
sample_regex
):
# Create an LLM with guided decoding enabled.
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilgpt2"
),
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
enforce_eager
=
True
,
guided_decoding_backend
=
"lm-format-enforcer"
,
gpu_memory_utilization
=
0.3
)
...
...
Prev
1
…
13
14
15
16
17
18
19
20
21
…
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment