Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
469e903b
Commit
469e903b
authored
Mar 28, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.2' into v0.8.2-dev
parents
389ebcf7
25f560a6
Changes
535
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
250 additions
and
147 deletions
+250
-147
tests/distributed/test_pipeline_partition.py
tests/distributed/test_pipeline_partition.py
+57
-27
tests/distributed/test_pp_cudagraph.py
tests/distributed/test_pp_cudagraph.py
+29
-18
tests/distributed/test_pynccl.py
tests/distributed/test_pynccl.py
+2
-3
tests/distributed/test_shm_broadcast.py
tests/distributed/test_shm_broadcast.py
+1
-2
tests/distributed/test_torchrun_example.py
tests/distributed/test_torchrun_example.py
+10
-1
tests/encoder_decoder/test_e2e_correctness.py
tests/encoder_decoder/test_e2e_correctness.py
+11
-3
tests/engine/conftest.py
tests/engine/conftest.py
+11
-0
tests/engine/test_computed_prefix_blocks.py
tests/engine/test_computed_prefix_blocks.py
+1
-6
tests/engine/test_executor.py
tests/engine/test_executor.py
+7
-20
tests/engine/test_multi_step_output_processor.py
tests/engine/test_multi_step_output_processor.py
+1
-1
tests/engine/test_multiproc_workers.py
tests/engine/test_multiproc_workers.py
+3
-3
tests/engine/test_skip_tokenizer_init.py
tests/engine/test_skip_tokenizer_init.py
+5
-8
tests/entrypoints/llm/test_accuracy.py
tests/entrypoints/llm/test_accuracy.py
+10
-2
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+4
-14
tests/entrypoints/llm/test_collective_rpc.py
tests/entrypoints/llm/test_collective_rpc.py
+5
-14
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+5
-7
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+2
-5
tests/entrypoints/llm/test_generate_multiple_loras.py
tests/entrypoints/llm/test_generate_multiple_loras.py
+1
-3
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+74
-5
tests/entrypoints/llm/test_lazy_outlines.py
tests/entrypoints/llm/test_lazy_outlines.py
+11
-5
No files found.
Too many changes to show.
To preserve performance only
535 of 535+
files are displayed.
Plain diff
Email patch
tests/distributed/test_pipeline_partition.py
View file @
469e903b
...
@@ -7,30 +7,60 @@ import pytest
...
@@ -7,30 +7,60 @@ import pytest
from
vllm.distributed.utils
import
get_pp_indices
from
vllm.distributed.utils
import
get_pp_indices
def
test_custom_layer_partition
():
def
test_custom_layer_partition
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
_verify
(
partition_str
,
num_layers
,
pp_size
,
goldens
):
with
monkeypatch
.
context
()
as
m
:
bak
=
os
.
environ
.
get
(
"VLLM_PP_LAYER_PARTITION"
,
None
)
os
.
environ
[
"VLLM_PP_LAYER_PARTITION"
]
=
partition_str
def
_verify
(
partition_str
,
num_layers
,
pp_size
,
goldens
):
for
pp_rank
,
golden
in
enumerate
(
goldens
):
bak
=
os
.
environ
.
get
(
"VLLM_PP_LAYER_PARTITION"
,
None
)
assert
get_pp_indices
(
num_layers
,
pp_rank
,
pp_size
)
==
golden
m
.
setenv
(
"VLLM_PP_LAYER_PARTITION"
,
partition_str
)
if
bak
is
not
None
:
for
pp_rank
,
golden
in
enumerate
(
goldens
):
os
.
environ
[
"VLLM_PP_LAYER_PARTITION"
]
=
bak
assert
get_pp_indices
(
num_layers
,
pp_rank
,
pp_size
)
==
golden
if
bak
is
not
None
:
# Even partition
m
.
setenv
(
"VLLM_PP_LAYER_PARTITION"
,
bak
)
_verify
(
"5,5,5,5"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
# Balanced partition
# Even partition
_verify
(
"4,6,6,4"
,
20
,
4
,
[(
0
,
4
),
(
4
,
10
),
(
10
,
16
),
(
16
,
20
)])
_verify
(
"5,5,5,5"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
# Put reminder somewhere
# Balanced partition
_verify
(
"5,6,5,6"
,
22
,
4
,
[(
0
,
5
),
(
5
,
11
),
(
11
,
16
),
(
16
,
22
)])
_verify
(
"4,6,6,4"
,
20
,
4
,
[(
0
,
4
),
(
4
,
10
),
(
10
,
16
),
(
16
,
20
)])
# Invalid partition strings
# Put reminder somewhere
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,6,5,6"
,
22
,
4
,
[(
0
,
5
),
(
5
,
11
),
(
11
,
16
),
(
16
,
22
)])
_verify
(
"5,5,5,5,"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
# Invalid partition strings
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,5,5,a"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
_verify
(
"5,5,5,5,"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
# Wrong number of partitions
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,5,5,a"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
_verify
(
"5,5,5"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
# Wrong number of partitions
# Wrong number of layers
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,5,5"
,
20
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
_verify
(
"5,5,5,5"
,
21
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
# Wrong number of layers
with
pytest
.
raises
(
ValueError
):
_verify
(
"5,5,5,5"
,
21
,
4
,
[(
0
,
5
),
(
5
,
10
),
(
10
,
15
),
(
15
,
20
)])
@
pytest
.
mark
.
parametrize
(
"num_hidden_layers,pp_size,pp_rank,indices"
,
[
# pp_size 2
(
2
,
2
,
0
,
(
0
,
1
)),
(
2
,
2
,
1
,
(
1
,
2
)),
(
3
,
2
,
0
,
(
0
,
2
)),
(
3
,
2
,
1
,
(
2
,
3
)),
# pp_size 3
(
3
,
3
,
0
,
(
0
,
1
)),
(
3
,
3
,
1
,
(
1
,
2
)),
(
3
,
3
,
2
,
(
2
,
3
)),
(
4
,
3
,
0
,
(
0
,
1
)),
(
4
,
3
,
1
,
(
1
,
3
)),
(
4
,
3
,
2
,
(
3
,
4
)),
(
5
,
3
,
0
,
(
0
,
2
)),
(
5
,
3
,
1
,
(
2
,
4
)),
(
5
,
3
,
2
,
(
4
,
5
)),
])
def
test_uneven_auto_partition
(
num_hidden_layers
:
int
,
pp_size
:
int
,
pp_rank
:
int
,
indices
:
tuple
[
int
,
int
],
):
assert
indices
==
get_pp_indices
(
num_hidden_layers
,
pp_rank
,
pp_size
)
tests/distributed/test_pp_cudagraph.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
__future__
import
annotations
import
os
from
typing
import
TYPE_CHECKING
import
os
import
pytest
import
pytest
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
,
models_path_prefix
from
..utils
import
compare_two_settings
,
create_new_process_for_each_test
,
models_path_prefix
if
TYPE_CHECKING
:
from
typing_extensions
import
LiteralString
@
pytest
.
mark
.
parametrize
(
"PP_SIZE, MODEL_NAME"
,
[
@
pytest
.
mark
.
parametrize
(
"PP_SIZE, MODEL_NAME"
,
[
...
@@ -14,19 +19,25 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test, models
...
@@ -14,19 +19,25 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test, models
"FLASH_ATTN"
,
"FLASH_ATTN"
,
# "FLASHINFER",
# "FLASHINFER",
])
])
@
fork_new_process_for_each_test
@
create_new_process_for_each_test
()
def
test_pp_cudagraph
(
PP_SIZE
,
MODEL_NAME
,
ATTN_BACKEND
):
def
test_pp_cudagraph
(
cudagraph_args
=
[
monkeypatch
:
pytest
.
MonkeyPatch
,
# use half precision for speed and memory savings in CI environment
PP_SIZE
:
int
,
"--dtype"
,
MODEL_NAME
:
str
,
"float16"
,
ATTN_BACKEND
:
LiteralString
,
"--pipeline-parallel-size"
,
):
str
(
PP_SIZE
),
with
monkeypatch
.
context
()
as
m
:
"--distributed-executor-backend"
,
cudagraph_args
=
[
"mp"
,
# use half precision for speed and memory savings in CI environment
]
"--dtype"
,
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
ATTN_BACKEND
"float16"
,
"--pipeline-parallel-size"
,
eager_args
=
cudagraph_args
+
[
"--enforce-eager"
]
str
(
PP_SIZE
),
"--distributed-executor-backend"
,
compare_two_settings
(
MODEL_NAME
,
eager_args
,
cudagraph_args
)
"mp"
,
]
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
ATTN_BACKEND
)
eager_args
=
cudagraph_args
+
[
"--enforce-eager"
]
compare_two_settings
(
MODEL_NAME
,
eager_args
,
cudagraph_args
)
tests/distributed/test_pynccl.py
View file @
469e903b
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
import
multiprocessing
import
multiprocessing
import
os
import
os
from
typing
import
Dict
,
List
import
pytest
import
pytest
import
torch
import
torch
...
@@ -20,9 +19,9 @@ from vllm.utils import update_environment_variables
...
@@ -20,9 +19,9 @@ from vllm.utils import update_environment_variables
def
distributed_run
(
fn
,
world_size
):
def
distributed_run
(
fn
,
world_size
):
number_of_processes
=
world_size
number_of_processes
=
world_size
processes
:
L
ist
[
multiprocessing
.
Process
]
=
[]
processes
:
l
ist
[
multiprocessing
.
Process
]
=
[]
for
i
in
range
(
number_of_processes
):
for
i
in
range
(
number_of_processes
):
env
:
D
ict
[
str
,
str
]
=
{}
env
:
d
ict
[
str
,
str
]
=
{}
env
[
'RANK'
]
=
str
(
i
)
env
[
'RANK'
]
=
str
(
i
)
env
[
'LOCAL_RANK'
]
=
str
(
i
)
env
[
'LOCAL_RANK'
]
=
str
(
i
)
env
[
'WORLD_SIZE'
]
=
str
(
number_of_processes
)
env
[
'WORLD_SIZE'
]
=
str
(
number_of_processes
)
...
...
tests/distributed/test_shm_broadcast.py
View file @
469e903b
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
import
multiprocessing
import
multiprocessing
import
random
import
random
import
time
import
time
from
typing
import
List
import
numpy
as
np
import
numpy
as
np
import
torch.distributed
as
dist
import
torch.distributed
as
dist
...
@@ -13,7 +12,7 @@ from vllm.distributed.utils import StatelessProcessGroup
...
@@ -13,7 +12,7 @@ from vllm.distributed.utils import StatelessProcessGroup
from
vllm.utils
import
get_ip
,
get_open_port
,
update_environment_variables
from
vllm.utils
import
get_ip
,
get_open_port
,
update_environment_variables
def
get_arrays
(
n
:
int
,
seed
:
int
=
0
)
->
L
ist
[
np
.
ndarray
]:
def
get_arrays
(
n
:
int
,
seed
:
int
=
0
)
->
l
ist
[
np
.
ndarray
]:
np
.
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
sizes
=
np
.
random
.
randint
(
1
,
10_000
,
n
)
sizes
=
np
.
random
.
randint
(
1
,
10_000
,
n
)
# on average, each array will have 5k elements
# on average, each array will have 5k elements
...
...
tests/distributed/test_torchrun_example.py
View file @
469e903b
...
@@ -9,6 +9,8 @@ import torch.distributed as dist
...
@@ -9,6 +9,8 @@ import torch.distributed as dist
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.distributed.parallel_state
import
get_world_group
from
vllm.distributed.parallel_state
import
get_world_group
dist
.
init_process_group
(
backend
=
"gloo"
)
# Create prompts
# Create prompts
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -25,7 +27,8 @@ llm = LLM(model="facebook/opt-125m",
...
@@ -25,7 +27,8 @@ llm = LLM(model="facebook/opt-125m",
tensor_parallel_size
=
2
,
tensor_parallel_size
=
2
,
distributed_executor_backend
=
"external_launcher"
,
distributed_executor_backend
=
"external_launcher"
,
gpu_memory_utilization
=
random
.
uniform
(
0.7
,
0.9
),
gpu_memory_utilization
=
random
.
uniform
(
0.7
,
0.9
),
swap_space
=
random
.
randint
(
1
,
4
))
swap_space
=
random
.
randint
(
1
,
4
),
seed
=
0
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
...
@@ -48,6 +51,12 @@ test_consistent_across_ranks(
...
@@ -48,6 +51,12 @@ test_consistent_across_ranks(
test_consistent_across_ranks
(
test_consistent_across_ranks
(
llm
.
llm_engine
.
vllm_config
.
cache_config
.
num_gpu_blocks
)
llm
.
llm_engine
.
vllm_config
.
cache_config
.
num_gpu_blocks
)
# make sure we can access the model parameters from the calling process
# of the `LLM` instance.
params
=
list
(
llm
.
llm_engine
.
model_executor
.
driver_worker
.
worker
.
model_runner
.
model
.
parameters
())
test_consistent_across_ranks
(
len
(
params
))
# all ranks should have the same outputs
# all ranks should have the same outputs
for
output
in
outputs
:
for
output
in
outputs
:
prompt
=
output
.
prompt
prompt
=
output
.
prompt
...
...
tests/encoder_decoder/test_e2e_correctness.py
View file @
469e903b
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
"""
"""
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
import
pytest
import
pytest
import
os
import
os
...
@@ -17,7 +17,6 @@ from vllm.sequence import SampleLogprobs
...
@@ -17,7 +17,6 @@ from vllm.sequence import SampleLogprobs
from
..conftest
import
DecoderPromptType
from
..conftest
import
DecoderPromptType
from
..models.utils
import
check_logprobs_close
from
..models.utils
import
check_logprobs_close
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.utils
import
is_hip
from
vllm.attention.backends.utils
import
STR_NOT_IMPL_ENC_DEC_ROCM_HIP
from
vllm.attention.backends.utils
import
STR_NOT_IMPL_ENC_DEC_ROCM_HIP
LIST_ENC_DEC_SUPPORTED_BACKENDS
=
[
LIST_ENC_DEC_SUPPORTED_BACKENDS
=
[
...
@@ -25,8 +24,17 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [
...
@@ -25,8 +24,17 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [
]
]
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
def
vllm_to_hf_output
(
def
vllm_to_hf_output
(
vllm_output
:
T
uple
[
L
ist
[
int
],
str
,
Optional
[
SampleLogprobs
]],
vllm_output
:
t
uple
[
l
ist
[
int
],
str
,
Optional
[
SampleLogprobs
]],
decoder_prompt_type
:
DecoderPromptType
,
decoder_prompt_type
:
DecoderPromptType
,
):
):
"""Sanitize vllm output to be comparable with hf output."""
"""Sanitize vllm output to be comparable with hf output."""
...
...
tests/engine/conftest.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
pytest
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
tests/engine/test_computed_prefix_blocks.py
View file @
469e903b
...
@@ -2,16 +2,12 @@
...
@@ -2,16 +2,12 @@
import
pytest
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
def
test_computed_prefix_blocks
(
model
:
str
,
block_size
:
int
):
def
test_computed_prefix_blocks
(
model
:
str
,
block_size
:
int
):
# This test checks if we are able to run the engine to completion
# This test checks if we are able to run the engine to completion
...
@@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int):
...
@@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int):
"decoration."
)
"decoration."
)
engine_args
=
EngineArgs
(
model
=
model
,
engine_args
=
EngineArgs
(
model
=
model
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
block_size
=
block_size
,
block_size
=
block_size
,
enable_prefix_caching
=
True
)
enable_prefix_caching
=
True
)
...
...
tests/engine/test_executor.py
View file @
469e903b
...
@@ -2,11 +2,10 @@
...
@@ -2,11 +2,10 @@
import
asyncio
import
asyncio
import
os
import
os
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Union
from
typing
import
Any
,
Callable
,
Optional
,
Union
import
pytest
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
...
@@ -15,10 +14,6 @@ from vllm.sampling_params import SamplingParams
...
@@ -15,10 +14,6 @@ from vllm.sampling_params import SamplingParams
import
os
import
os
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
RUNAI_STREAMER_LOAD_FORMAT
=
LoadFormat
.
RUNAI_STREAMER
class
Mock
:
class
Mock
:
...
...
...
@@ -29,8 +24,8 @@ class CustomUniExecutor(UniProcExecutor):
...
@@ -29,8 +24,8 @@ class CustomUniExecutor(UniProcExecutor):
def
collective_rpc
(
self
,
def
collective_rpc
(
self
,
method
:
Union
[
str
,
Callable
],
method
:
Union
[
str
,
Callable
],
timeout
:
Optional
[
float
]
=
None
,
timeout
:
Optional
[
float
]
=
None
,
args
:
T
uple
=
(),
args
:
t
uple
=
(),
kwargs
:
Optional
[
D
ict
]
=
None
)
->
L
ist
[
Any
]:
kwargs
:
Optional
[
d
ict
]
=
None
)
->
l
ist
[
Any
]:
# Drop marker to show that this was ran
# Drop marker to show that this was ran
with
open
(
".marker"
,
"w"
):
with
open
(
".marker"
,
"w"
):
...
...
...
@@ -39,12 +34,10 @@ class CustomUniExecutor(UniProcExecutor):
...
@@ -39,12 +34,10 @@ class CustomUniExecutor(UniProcExecutor):
CustomUniExecutorAsync
=
CustomUniExecutor
CustomUniExecutorAsync
=
CustomUniExecutor
@
pytest
.
mark
.
parametrize
(
"model"
,
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)])
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_custom_executor_type_checking
(
model
):
def
test_custom_executor_type_checking
(
model
):
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
engine_args
=
EngineArgs
(
model
=
model
,
engine_args
=
EngineArgs
(
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
Mock
)
distributed_executor_backend
=
Mock
)
LLMEngine
.
from_engine_args
(
engine_args
)
LLMEngine
.
from_engine_args
(
engine_args
)
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
...
@@ -53,8 +46,7 @@ def test_custom_executor_type_checking(model):
...
@@ -53,8 +46,7 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
@
pytest
.
mark
.
parametrize
(
"model"
,
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)])
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_custom_executor
(
model
,
tmp_path
):
def
test_custom_executor
(
model
,
tmp_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp_path
)
os
.
chdir
(
tmp_path
)
...
@@ -63,7 +55,6 @@ def test_custom_executor(model, tmp_path):
...
@@ -63,7 +55,6 @@ def test_custom_executor(model, tmp_path):
engine_args
=
EngineArgs
(
engine_args
=
EngineArgs
(
model
=
model
,
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
CustomUniExecutor
,
distributed_executor_backend
=
CustomUniExecutor
,
enforce_eager
=
True
,
# reduce test time
enforce_eager
=
True
,
# reduce test time
)
)
...
@@ -78,8 +69,7 @@ def test_custom_executor(model, tmp_path):
...
@@ -78,8 +69,7 @@ def test_custom_executor(model, tmp_path):
os
.
chdir
(
cwd
)
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_custom_executor_async
(
model
,
tmp_path
):
def
test_custom_executor_async
(
model
,
tmp_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp_path
)
os
.
chdir
(
tmp_path
)
...
@@ -88,7 +78,6 @@ def test_custom_executor_async(model, tmp_path):
...
@@ -88,7 +78,6 @@ def test_custom_executor_async(model, tmp_path):
engine_args
=
AsyncEngineArgs
(
engine_args
=
AsyncEngineArgs
(
model
=
model
,
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
CustomUniExecutorAsync
,
distributed_executor_backend
=
CustomUniExecutorAsync
,
enforce_eager
=
True
,
# reduce test time
enforce_eager
=
True
,
# reduce test time
)
)
...
@@ -107,8 +96,7 @@ def test_custom_executor_async(model, tmp_path):
...
@@ -107,8 +96,7 @@ def test_custom_executor_async(model, tmp_path):
os
.
chdir
(
cwd
)
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_respect_ray
(
model
):
def
test_respect_ray
(
model
):
# even for TP=1 and PP=1,
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
# if users specify ray, we should use ray.
...
@@ -117,7 +105,6 @@ def test_respect_ray(model):
...
@@ -117,7 +105,6 @@ def test_respect_ray(model):
engine_args
=
EngineArgs
(
engine_args
=
EngineArgs
(
model
=
model
,
model
=
model
,
distributed_executor_backend
=
"ray"
,
distributed_executor_backend
=
"ray"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
enforce_eager
=
True
,
# reduce test time
enforce_eager
=
True
,
# reduce test time
)
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
...
...
tests/engine/output_processor
/test_multi_step
.py
→
tests/engine/
test_multi_step_
output_processor.py
View file @
469e903b
...
@@ -15,7 +15,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
...
@@ -15,7 +15,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
from
vllm.transformers_utils.detokenizer
import
Detokenizer
from
vllm.transformers_utils.detokenizer
import
Detokenizer
from
vllm.utils
import
Counter
from
vllm.utils
import
Counter
from
..
.
core.utils
import
create_seq_group
from
..core.utils
import
create_seq_group
@
pytest
.
mark
.
parametrize
(
"seq_output_len"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"seq_output_len"
,
[
128
])
...
...
tests/engine/test_multiproc_workers.py
View file @
469e903b
...
@@ -4,7 +4,7 @@ import asyncio
...
@@ -4,7 +4,7 @@ import asyncio
from
concurrent.futures
import
ThreadPoolExecutor
from
concurrent.futures
import
ThreadPoolExecutor
from
functools
import
partial
from
functools
import
partial
from
time
import
sleep
from
time
import
sleep
from
typing
import
Any
,
List
,
Tuple
from
typing
import
Any
import
pytest
import
pytest
...
@@ -17,7 +17,7 @@ from vllm.worker.worker_base import WorkerWrapperBase
...
@@ -17,7 +17,7 @@ from vllm.worker.worker_base import WorkerWrapperBase
class
DummyWorkerWrapper
(
WorkerWrapperBase
):
class
DummyWorkerWrapper
(
WorkerWrapperBase
):
"""Dummy version of vllm.worker.worker.Worker"""
"""Dummy version of vllm.worker.worker.Worker"""
def
worker_method
(
self
,
worker_input
:
Any
)
->
T
uple
[
int
,
Any
]:
def
worker_method
(
self
,
worker_input
:
Any
)
->
t
uple
[
int
,
Any
]:
sleep
(
0.05
)
sleep
(
0.05
)
if
isinstance
(
worker_input
,
Exception
):
if
isinstance
(
worker_input
,
Exception
):
...
@@ -27,7 +27,7 @@ class DummyWorkerWrapper(WorkerWrapperBase):
...
@@ -27,7 +27,7 @@ class DummyWorkerWrapper(WorkerWrapperBase):
return
self
.
rpc_rank
,
input
return
self
.
rpc_rank
,
input
def
_start_workers
()
->
T
uple
[
L
ist
[
ProcessWorkerWrapper
],
WorkerMonitor
]:
def
_start_workers
()
->
t
uple
[
l
ist
[
ProcessWorkerWrapper
],
WorkerMonitor
]:
result_handler
=
ResultHandler
()
result_handler
=
ResultHandler
()
vllm_config
=
VllmConfig
()
vllm_config
=
VllmConfig
()
workers
=
[
workers
=
[
...
...
tests/engine/test_skip_tokenizer_init.py
View file @
469e903b
...
@@ -2,22 +2,19 @@
...
@@ -2,22 +2,19 @@
import
pytest
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_skip_tokenizer_initialization
(
model
:
str
):
def
test_skip_tokenizer_initialization
(
model
:
str
):
# This test checks if the flag skip_tokenizer_init skips the initialization
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
# token ids.
llm
=
LLM
(
model
=
model
,
llm
=
LLM
(
skip_tokenizer_init
=
True
,
model
=
model
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
)
skip_tokenizer_init
=
True
,
)
sampling_params
=
SamplingParams
(
prompt_logprobs
=
True
,
detokenize
=
True
)
sampling_params
=
SamplingParams
(
prompt_logprobs
=
True
,
detokenize
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
"cannot pass text prompts when"
):
with
pytest
.
raises
(
ValueError
,
match
=
"cannot pass text prompts when"
):
...
...
tests/entrypoints/llm/test_accuracy.py
View file @
469e903b
...
@@ -44,10 +44,14 @@ def run_test(more_args=None):
...
@@ -44,10 +44,14 @@ def run_test(more_args=None):
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
# TODO: [AlexM] Fix it with new CI/CD tests
TPU_TP_TEST_STR
=
""
#"tensor_parallel_size=4"
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
()
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
()
and
not
current_platform
.
is_tpu
(),
and
not
current_platform
.
is_tpu
(),
reason
=
"V1 is currently only supported on CUDA and TPU"
)
reason
=
"V1 is currently only supported on CUDA and TPU"
)
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
):
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Run with the V1 Engine."""
"""Run with the V1 Engine."""
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
...
@@ -58,10 +62,14 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
...
@@ -58,10 +62,14 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
# Limit compilation time for TPU V1
# Limit compilation time for TPU V1
more_args
=
"max_num_seqs=64"
more_args
=
"max_num_seqs=64"
# Add TP test (if provided)
if
TPU_TP_TEST_STR
:
more_args
+=
",{}"
.
format
(
TPU_TP_TEST_STR
)
run_test
(
more_args
)
run_test
(
more_args
)
def
test_lm_eval_accuracy_v0_engine
(
monkeypatch
):
def
test_lm_eval_accuracy_v0_engine
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Run with the V0 Engine."""
"""Run with the V0 Engine."""
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
...
...
tests/entrypoints/llm/test_chat.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
import
os
import
os
import
pytest
import
pytest
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.config
import
LoadFormat
from
...conftest
import
MODEL_WEIGHTS_S3_BUCKET
from
..openai.test_vision
import
TEST_IMAGE_URLS
from
..openai.test_vision
import
TEST_IMAGE_URLS
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
RUNAI_STREAMER_LOAD_FORMAT
=
LoadFormat
.
RUNAI_STREAMER
def
test_chat
():
def
test_chat
():
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Llama-3.2-1B-Instruct"
,
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
))
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
prompt1
=
"Explain the concept of entropy."
prompt1
=
"Explain the concept of entropy."
messages
=
[
messages
=
[
...
@@ -35,8 +28,7 @@ def test_chat():
...
@@ -35,8 +28,7 @@ def test_chat():
def
test_multi_chat
():
def
test_multi_chat
():
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Llama-3.2-1B-Instruct"
,
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
))
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
prompt1
=
"Explain the concept of entropy."
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
prompt2
=
"Explain what among us is."
...
@@ -71,11 +63,9 @@ def test_multi_chat():
...
@@ -71,11 +63,9 @@ def test_multi_chat():
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
L
ist
[
str
]):
def
test_chat_multi_image
(
image_urls
:
l
ist
[
str
]):
llm
=
LLM
(
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Phi-3.5-vision-instruct"
,
model
=
"microsoft/Phi-3.5-vision-instruct"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
dtype
=
"bfloat16"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
max_num_seqs
=
5
,
enforce_eager
=
True
,
enforce_eager
=
True
,
...
...
tests/entrypoints/llm/test_collective_rpc.py
View file @
469e903b
...
@@ -4,12 +4,12 @@ import pytest
...
@@ -4,12 +4,12 @@ import pytest
from
vllm
import
LLM
from
vllm
import
LLM
from
...utils
import
fork
_new_process_for_each_test
from
...utils
import
create
_new_process_for_each_test
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"mp"
,
"ray"
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"mp"
,
"ray"
])
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
def
test_collective_rpc
(
tp_size
,
backend
):
def
test_collective_rpc
(
tp_size
,
backend
):
if
tp_size
==
1
and
backend
==
"ray"
:
if
tp_size
==
1
and
backend
==
"ray"
:
pytest
.
skip
(
"Skip duplicate test case"
)
pytest
.
skip
(
"Skip duplicate test case"
)
...
@@ -21,18 +21,9 @@ def test_collective_rpc(tp_size, backend):
...
@@ -21,18 +21,9 @@ def test_collective_rpc(tp_size, backend):
def
echo_rank
(
self
):
def
echo_rank
(
self
):
return
self
.
rank
return
self
.
rank
from
vllm.worker.worker
import
Worker
llm
=
LLM
(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
,
class
MyWorker
(
Worker
):
def
echo_rank
(
self
):
return
self
.
rank
llm
=
LLM
(
model
=
"s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct"
,
enforce_eager
=
True
,
enforce_eager
=
True
,
load_format
=
"dummy"
,
load_format
=
"dummy"
,
tensor_parallel_size
=
tp_size
,
tensor_parallel_size
=
tp_size
,
distributed_executor_backend
=
backend
,
distributed_executor_backend
=
backend
)
worker_cls
=
MyWorker
)
assert
llm
.
collective_rpc
(
echo_rank
)
==
list
(
range
(
tp_size
))
for
method
in
[
"echo_rank"
,
echo_rank
]:
assert
llm
.
collective_rpc
(
method
)
==
list
(
range
(
tp_size
))
tests/entrypoints/llm/test_encode.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
weakref
import
weakref
from
typing
import
List
import
pytest
import
pytest
import
os
import
os
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
e5-mistral-7b-instruct
"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
intfloat/multilingual-e5-small
"
)
PROMPTS
=
[
PROMPTS
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -35,11 +33,11 @@ def llm():
...
@@ -35,11 +33,11 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_num_batched_tokens
=
32768
,
max_num_batched_tokens
=
32768
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.75
,
gpu_memory_utilization
=
0.75
,
enforce_eager
=
True
)
enforce_eager
=
True
,
seed
=
0
)
with
llm
.
deprecate_legacy_api
():
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
yield
weakref
.
proxy
(
llm
)
...
@@ -49,8 +47,8 @@ def llm():
...
@@ -49,8 +47,8 @@ def llm():
cleanup_dist_env_and_memory
()
cleanup_dist_env_and_memory
()
def
assert_outputs_equal
(
o1
:
L
ist
[
PoolingRequestOutput
],
def
assert_outputs_equal
(
o1
:
l
ist
[
PoolingRequestOutput
],
o2
:
L
ist
[
PoolingRequestOutput
]):
o2
:
l
ist
[
PoolingRequestOutput
]):
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
...
...
tests/entrypoints/llm/test_generate.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
weakref
import
weakref
from
typing
import
List
import
os
import
os
import
pytest
import
pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"distilgpt2"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
distilbert/
distilgpt2"
)
PROMPTS
=
[
PROMPTS
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -33,7 +31,6 @@ def llm():
...
@@ -33,7 +31,6 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_num_batched_tokens
=
4096
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.10
,
gpu_memory_utilization
=
0.10
,
...
@@ -47,7 +44,7 @@ def llm():
...
@@ -47,7 +44,7 @@ def llm():
cleanup_dist_env_and_memory
()
cleanup_dist_env_and_memory
()
def
assert_outputs_equal
(
o1
:
L
ist
[
RequestOutput
],
o2
:
L
ist
[
RequestOutput
]):
def
assert_outputs_equal
(
o1
:
l
ist
[
RequestOutput
],
o2
:
l
ist
[
RequestOutput
]):
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
...
...
tests/entrypoints/llm/test_generate_multiple_loras.py
View file @
469e903b
...
@@ -8,12 +8,11 @@ import os
...
@@ -8,12 +8,11 @@ import os
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"zephyr-7b-beta"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
HuggingFaceH4/
zephyr-7b-beta"
)
PROMPTS
=
[
PROMPTS
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -30,7 +29,6 @@ def llm():
...
@@ -30,7 +29,6 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
max_model_len
=
8192
,
max_model_len
=
8192
,
enable_lora
=
True
,
enable_lora
=
True
,
...
...
tests/entrypoints/llm/test_guided_generate.py
View file @
469e903b
...
@@ -7,8 +7,8 @@ import weakref
...
@@ -7,8 +7,8 @@ import weakref
import
jsonschema
import
jsonschema
import
pytest
import
pytest
import
os
import
os
from
pydantic
import
BaseModel
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
...
@@ -17,16 +17,16 @@ from vllm.sampling_params import GuidedDecodingParams, SamplingParams
...
@@ -17,16 +17,16 @@ from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen2.5-1.5B-Instruct"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen2.5-1.5B-Instruct"
)
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"lm-format-enforcer"
,
"xgrammar"
]
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"lm-format-enforcer"
,
"xgrammar"
,
"guidance"
]
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
llm
=
LLM
(
model
=
MODEL_NAME
,
max_model_len
=
1024
,
seed
=
0
)
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_model_len
=
1024
)
with
llm
.
deprecate_legacy_api
():
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
yield
weakref
.
proxy
(
llm
)
...
@@ -283,6 +283,22 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm):
...
@@ -283,6 +283,22 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm):
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
@
pytest
.
mark
.
skip_global_cleanup
def
test_disable_guided_decoding_fallback
(
sample_regex
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
guided_decoding
=
GuidedDecodingParams
(
regex
=
sample_regex
,
backend
=
"xgrammar:no-fallback"
))
with
pytest
.
raises
(
ValueError
,
match
=
"xgrammar does not support regex guided decoding"
):
llm
.
generate
(
prompts
=
"This should fail"
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
def
test_guided_json_object
(
llm
,
guided_decoding_backend
:
str
):
def
test_guided_json_object
(
llm
,
guided_decoding_backend
:
str
):
...
@@ -312,3 +328,56 @@ def test_guided_json_object(llm, guided_decoding_backend: str):
...
@@ -312,3 +328,56 @@ def test_guided_json_object(llm, guided_decoding_backend: str):
# Parse to verify it is valid JSON
# Parse to verify it is valid JSON
parsed_json
=
json
.
loads
(
generated_text
)
parsed_json
=
json
.
loads
(
generated_text
)
assert
isinstance
(
parsed_json
,
dict
)
assert
isinstance
(
parsed_json
,
dict
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_json_with_any_whitespace_disabled
(
llm
):
class
ResponseSchema
(
BaseModel
):
clarifying_question
:
str
cost_per_serving
:
str
calories
:
str
type_dish_ids
:
str
type_meal_ids
:
str
product_ids
:
list
[
str
]
exclude_product_ids
:
list
[
str
]
allergen_ids
:
list
[
str
]
total_cooking_time
:
str
kitchen_ids
:
str
holiday_ids
:
str
# Note: Without this setting, the response is sometimes full of `\n`
# for some models. This option prevents that.
guided_decoding_backend
=
'xgrammar:disable-any-whitespace'
schema
=
ResponseSchema
.
model_json_schema
()
guided_params
=
GuidedDecodingParams
(
json
=
schema
,
backend
=
\
guided_decoding_backend
)
sampling_params
=
SamplingParams
(
max_tokens
=
2000
,
frequency_penalty
=
0
,
presence_penalty
=-
1.1
,
repetition_penalty
=
1.3
,
guided_decoding
=
guided_params
)
prompt
=
(
"<|im_start|>system
\n
You are Qwen, created by Alibaba Cloud. You"
"are a helpful assistant.<|im_end|>
\n
<|im_start|>user
\n
I want a "
"quick launch fast with $10.<|im_end|>
\n
<|im_start|>assistant
\n
"
)
outputs
=
llm
.
generate
(
prompts
=
prompt
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
assert
"
\n
"
not
in
generated_text
# Parse to verify it is valid JSON
parsed_json
=
json
.
loads
(
generated_text
)
assert
isinstance
(
parsed_json
,
dict
)
jsonschema
.
validate
(
instance
=
parsed_json
,
schema
=
schema
)
tests/entrypoints/llm/test_lazy_outlines.py
View file @
469e903b
...
@@ -4,14 +4,22 @@ import sys
...
@@ -4,14 +4,22 @@ import sys
import
os
import
os
from
contextlib
import
nullcontext
from
contextlib
import
nullcontext
import
pytest
from
vllm_test_utils
import
BlameResult
,
blame
from
vllm_test_utils
import
BlameResult
,
blame
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
V1 only supports xgrammar so this is irrelevant.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
def
run_normal_opt125m
():
def
run_normal_opt125m
():
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -46,8 +54,7 @@ def run_normal():
...
@@ -46,8 +54,7 @@ def run_normal():
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Create an LLM without guided decoding as a baseline.
# Create an LLM without guided decoding as a baseline.
llm
=
LLM
(
model
=
"s3://vllm-ci-model-weights/distilgpt2"
,
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.3
)
gpu_memory_utilization
=
0.3
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
...
@@ -63,8 +70,7 @@ def run_normal():
...
@@ -63,8 +70,7 @@ def run_normal():
def
run_lmfe
(
sample_regex
):
def
run_lmfe
(
sample_regex
):
# Create an LLM with guided decoding enabled.
# Create an LLM with guided decoding enabled.
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilgpt2"
),
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
,
enforce_eager
=
True
,
guided_decoding_backend
=
"lm-format-enforcer"
,
guided_decoding_backend
=
"lm-format-enforcer"
,
gpu_memory_utilization
=
0.3
)
gpu_memory_utilization
=
0.3
)
...
...
Prev
1
…
13
14
15
16
17
18
19
20
21
…
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment