Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ec5e299c
Commit
ec5e299c
authored
Feb 21, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.3' into v0.7.3-dev
parents
47bd229c
ed6e9075
Changes
521
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
733 additions
and
139 deletions
+733
-139
tests/conftest.py
tests/conftest.py
+73
-1
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+314
-2
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+5
-5
tests/distributed/test_custom_all_reduce.py
tests/distributed/test_custom_all_reduce.py
+2
-2
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+124
-94
tests/engine/test_computed_prefix_blocks.py
tests/engine/test_computed_prefix_blocks.py
+6
-3
tests/engine/test_detokenization.py
tests/engine/test_detokenization.py
+6
-4
tests/engine/test_executor.py
tests/engine/test_executor.py
+36
-5
tests/engine/test_skip_tokenizer_init.py
tests/engine/test_skip_tokenizer_init.py
+8
-4
tests/engine/test_stop_reason.py
tests/engine/test_stop_reason.py
+1
-1
tests/entrypoints/conftest.py
tests/entrypoints/conftest.py
+41
-0
tests/entrypoints/llm/test_accuracy.py
tests/entrypoints/llm/test_accuracy.py
+15
-5
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+10
-3
tests/entrypoints/llm/test_collective_rpc.py
tests/entrypoints/llm/test_collective_rpc.py
+1
-1
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+3
-1
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+3
-1
tests/entrypoints/llm/test_generate_multiple_loras.py
tests/entrypoints/llm/test_generate_multiple_loras.py
+3
-1
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+46
-2
tests/entrypoints/llm/test_lazy_outlines.py
tests/entrypoints/llm/test_lazy_outlines.py
+29
-2
tests/entrypoints/llm/test_prompt_validation.py
tests/entrypoints/llm/test_prompt_validation.py
+7
-2
No files found.
Too many changes to show.
To preserve performance only
521 of 521+
files are displayed.
Plain diff
Email patch
tests/conftest.py
View file @
ec5e299c
...
...
@@ -26,7 +26,7 @@ from tests.models.utils import (TokensTextLogprobs,
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
TaskOption
,
TokenizerPoolConfig
from
vllm.config
import
LoadFormat
,
TaskOption
,
TokenizerPoolConfig
from
vllm.connections
import
global_http_connection
from
vllm.distributed
import
(
cleanup_dist_env_and_memory
,
init_distributed_environment
,
...
...
@@ -49,6 +49,71 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
_SYS_MSG
=
os
.
path
.
join
(
_TEST_DIR
,
"system_messages"
,
"sonnet3.5_nov2024.txt"
)
_M
=
TypeVar
(
"_M"
)
# Hugging Face model ids that the test runner treats as available from the
# S3 weights bucket. The list is only used for membership checks in the
# visible code, so each id appears exactly once (the original listed two
# ids twice); first-occurrence order is preserved.
MODELS_ON_S3 = [
    "distilbert/distilgpt2",
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "openai-community/gpt2",
    "ArthurZ/Ilama-3.2-1B",
    "llava-hf/llava-1.5-7b-hf",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "ai21labs/Jamba-tiny-random",
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
    "nm-testing/Phi-3-mini-128k-instruct-FP8",
    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
    "AMead10/Llama-3.2-1B-Instruct-AWQ",
    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
]
MODEL_WEIGHTS_S3_BUCKET
=
models_path_prefix
_PromptMultiModalInput
=
Union
[
List
[
_M
],
List
[
List
[
_M
]]]
PromptImageInput
=
_PromptMultiModalInput
[
Image
.
Image
]
...
...
@@ -680,8 +745,14 @@ class VllmRunner:
enable_chunked_prefill
:
bool
=
False
,
swap_space
:
int
=
4
,
enforce_eager
:
Optional
[
bool
]
=
False
,
load_format
:
Optional
[
LoadFormat
]
=
None
,
**
kwargs
,
)
->
None
:
if
model_name
in
MODELS_ON_S3
and
not
load_format
:
model_name
=
(
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/
{
model_name
}
"
)
load_format
=
LoadFormat
.
RUNAI_STREAMER
if
not
load_format
:
load_format
=
LoadFormat
.
AUTO
self
.
model
=
LLM
(
model
=
model_name
,
task
=
task
,
...
...
@@ -696,6 +767,7 @@ class VllmRunner:
max_model_len
=
max_model_len
,
block_size
=
block_size
,
enable_chunked_prefill
=
enable_chunked_prefill
,
load_format
=
load_format
,
**
kwargs
,
)
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
ec5e299c
...
...
@@ -7,6 +7,9 @@ import pytest # noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.core.scheduler
import
Scheduler
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
Logprob
,
SequenceGroup
from
.utils
import
create_dummy_prompt
...
...
@@ -16,7 +19,7 @@ def get_sequence_groups(scheduler_output):
return
[
s
.
seq_group
for
s
in
scheduler_output
.
scheduled_seq_groups
]
def
append_new_token
(
seq_group
,
token_id
:
int
):
def
append_new_token
(
seq_group
:
SequenceGroup
,
token_id
:
int
):
for
seq
in
seq_group
.
get_seqs
():
seq
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
token_id
)})
...
...
@@ -123,6 +126,232 @@ def test_chunk():
assert
out
.
num_batched_tokens
==
57
def test_concurrent_chunking():
    """Check that prefills are chunked correctly when
    --max-num-partial-prefills is > 1."""
    kv_block_size = 4
    token_budget = 64
    sched_cfg = SchedulerConfig(
        "generate",
        token_budget,
        60,  # max_seqs
        2000,  # max_model_len
        enable_chunked_prefill=True,
        max_num_partial_prefills=2,  # at most two partial prefills at once
    )
    kv_cfg = CacheConfig(kv_block_size, 1.0, 1, "auto")
    kv_cfg.num_cpu_blocks = 32
    kv_cfg.num_gpu_blocks = 32
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # Queue two 60-token prompts.
    running: List[SequenceGroup] = []
    for request_id in ("0", "1"):
        _, group = create_dummy_prompt(request_id,
                                       prompt_length=60,
                                       block_size=kv_block_size)
        scheduler.add_seq_group(group)
        running.append(group)

    # Step 1: each request receives half of the 64-token budget (32).
    # Step 2: each request has 60 - 32 = 28 prompt tokens remaining.
    for per_request_chunk, batched_total in ((32, 64), (28, 56)):
        metas, out = schedule_and_update_computed_tokens(scheduler)
        assert set(get_sequence_groups(out)) == set(running)
        assert metas[0].token_chunk_size == per_request_chunk
        assert metas[1].token_chunk_size == per_request_chunk
        assert out.num_prefill_groups == 2
        assert out.num_batched_tokens == batched_total
def test_concurrent_chunking_large_requests():
    """Check that large prefill requests are processed one at a time."""
    kv_block_size = 4
    token_budget = 64
    sched_cfg = SchedulerConfig(
        "generate",
        token_budget,
        60,  # max_seqs
        2000,  # max_model_len
        enable_chunked_prefill=True,
        max_num_partial_prefills=2,  # at most two partial prefills at once
    )
    kv_cfg = CacheConfig(kv_block_size, 1.0, 1, "auto")
    # Give the KV cache enough blocks to hold the large prompts.
    kv_cfg.num_cpu_blocks = 3200
    kv_cfg.num_gpu_blocks = 3200
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # Queue two very large (1200-token) prompts.
    for request_id in ("0", "1"):
        _, group = create_dummy_prompt(request_id,
                                       prompt_length=1200,
                                       block_size=kv_block_size)
        scheduler.add_seq_group(group)

    # Exactly one request is scheduled and it takes the full 64-token budget.
    metas, out = schedule_and_update_computed_tokens(scheduler)
    scheduled_groups = get_sequence_groups(out)
    assert len(scheduled_groups) == 1
    assert metas[0].token_chunk_size == 64
    assert out.num_prefill_groups == 1
    assert out.num_batched_tokens == 64
def test_short_prompts_jump_long_prompts_in_queue():
    """Check that a large prefill request is held behind smaller ones
    while another large prefill request is already running."""
    kv_block_size = 4
    token_budget = 64
    sched_cfg = SchedulerConfig(
        "generate",
        token_budget,
        60,  # max_seqs
        2000,  # max_model_len
        enable_chunked_prefill=True,
        max_num_partial_prefills=2,  # at most two partial prefills at once
    )
    kv_cfg = CacheConfig(kv_block_size, 1.0, 1, "auto")
    # Give the KV cache enough blocks to hold the large prompts.
    kv_cfg.num_cpu_blocks = 3200
    kv_cfg.num_gpu_blocks = 3200
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    def computed(group):
        # Number of tokens already computed for the group's first sequence.
        return group.first_seq.get_num_computed_tokens()

    long_seqs: List[SequenceGroup] = []
    short_seqs: List[SequenceGroup] = []

    # Two very large prompts go in first ...
    for idx in range(2):
        _, group = create_dummy_prompt(str(idx),
                                       prompt_length=1200,
                                       block_size=kv_block_size)
        scheduler.add_seq_group(group)
        long_seqs.append(group)
        assert group.is_prefill()

    # ... followed by two very small prompts queued behind them.
    for idx in range(2, 4):
        _, group = create_dummy_prompt(str(idx),
                                       prompt_length=40,
                                       block_size=kv_block_size)
        scheduler.add_seq_group(group)
        short_seqs.append(group)
        assert group.is_prefill()

    # ---- Iteration 1: one large and one small request are chunked ----
    metas, out = schedule_and_update_computed_tokens(scheduler)
    assert metas[0].token_chunk_size == 32  # large request: 32 tokens
    assert metas[1].token_chunk_size == 32  # small request: 32 tokens

    # All four requests are still in the prefill phase.
    assert long_seqs[0].is_prefill()
    assert long_seqs[1].is_prefill()
    assert short_seqs[0].is_prefill()
    assert short_seqs[1].is_prefill()

    # Only the first long and the first short request made progress.
    assert computed(long_seqs[0]) == 32
    assert computed(long_seqs[1]) == 0
    assert computed(short_seqs[0]) == 32
    assert computed(short_seqs[1]) == 0

    assert out.num_prefill_groups == 2
    assert out.num_batched_tokens == 64

    # ---- Iteration 2 ----
    # The first small request had only 8 tokens left, so it finishes its
    # prefill; the second small request is scheduled as well.
    metas, out = schedule_and_update_computed_tokens(scheduler)
    # The newly scheduled small request receives 64 - (32 + 8) tokens.
    assert metas[0].token_chunk_size == 24
    assert metas[1].token_chunk_size == 32  # large request still gets 32
    # The first small request only needed its remaining 40 - 32 tokens.
    assert metas[2].token_chunk_size == 8

    # The first small request has now moved on to decode.
    assert long_seqs[0].is_prefill()
    assert long_seqs[1].is_prefill()
    assert not short_seqs[0].is_prefill()
    assert short_seqs[1].is_prefill()

    # Both small requests started ahead of the second long request.
    assert computed(long_seqs[0]) == 64
    assert computed(long_seqs[1]) == 0
    assert computed(short_seqs[0]) == 40
    assert computed(short_seqs[1]) == 24

    assert out.num_prefill_groups == 3
    assert out.num_batched_tokens == 64

    # Simulate a sampled token for the first small request.
    append_new_token(short_seqs[0], 1)

    # ---- Iteration 3 ----
    # The first small request is decoding; the second small request has
    # 16 prompt tokens left and finishes its prefill here.
    metas, out = schedule_and_update_computed_tokens(scheduler)
    assert metas[0].token_chunk_size == 32  # large request still gets 32
    # The second small request prefills its remaining 40 - 24 = 16 tokens.
    assert metas[1].token_chunk_size == 16
    assert metas[2].token_chunk_size == 1  # decode
    assert out.num_prefill_groups == 2
    assert out.num_batched_tokens == 49  # 32 + 16 + 1 decode

    # Both small requests have reached decode.
    assert long_seqs[0].is_prefill()
    assert long_seqs[1].is_prefill()
    assert not short_seqs[0].is_prefill()
    assert not short_seqs[1].is_prefill()

    assert computed(long_seqs[0]) == 96
    assert computed(long_seqs[1]) == 0
    assert computed(short_seqs[0]) == 41
    assert computed(short_seqs[1]) == 40

    # Simulate a sampled token for each small request.
    append_new_token(short_seqs[0], 1)
    append_new_token(short_seqs[1], 1)

    # ---- Iteration 4 ----
    # Both small requests are decoding, so the large request takes the
    # rest of the budget: 62 tokens (64 minus 2 decode tokens).
    metas, out = schedule_and_update_computed_tokens(scheduler)
    assert metas[0].token_chunk_size == 62
    assert metas[1].token_chunk_size == 1  # decode
    assert metas[2].token_chunk_size == 1  # decode
    assert out.num_prefill_groups == 1
    assert out.num_batched_tokens == 64

    assert computed(long_seqs[0]) == 158

    # NOTE(review): the checks below were left disabled in the original
    # test; they are kept as-is and remain disabled.
    # assert long_seqs[0].is_prefill()
    # assert long_seqs[1].is_prefill()
    # assert not short_seqs[0].is_prefill()
    # assert not short_seqs[1].is_prefill()

    # # both the small seq groups have a new token appended
    # append_new_token(short_seqs[0], 1)
    # append_new_token(short_seqs[1], 1)

    # # in the fifth iteration, large request gets all the budget
    # # while both small requests are decoding
    # seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
    # assert seq_group_meta[0].token_chunk_size == 62
    # assert seq_group_meta[1].token_chunk_size == 1  # decode
    # assert seq_group_meta[2].token_chunk_size == 1  # decode
    # assert out.num_prefill_groups == 1
    # assert out.num_batched_tokens == 64
def
test_complex
():
block_size
=
4
max_seqs
=
60
...
...
@@ -508,7 +737,7 @@ def test_chunked_prefill_max_seqs():
assert
not
running
[
1
].
is_prefill
()
def
test_p
e
rfix_caching
():
def
test_pr
e
fix_caching
():
"""Verify allocating full blocks when prefix caching is enabled."""
block_size
=
4
max_seqs
=
10
...
...
@@ -548,3 +777,86 @@ def test_perfix_caching():
assert
seq_group_meta
[
1
].
token_chunk_size
==
12
assert
out
.
num_prefill_groups
==
2
assert
out
.
num_batched_tokens
==
62
def test_prefix_caching_with_concurrent_partial_prefills():
    """Check that full blocks are allocated when prefix caching is enabled
    together with --max-num-partial-prefills > 1."""
    kv_block_size = 4
    # Two partial-prefill slots share this budget, i.e. 30 tokens per slot.
    token_budget = 60
    sched_cfg = SchedulerConfig(
        "generate",
        token_budget,
        10,  # max_seqs
        8000,  # max_model_len
        enable_chunked_prefill=True,
        max_num_partial_prefills=2,
    )
    kv_cfg = CacheConfig(kv_block_size,
                         1.0,
                         1,
                         "auto",
                         enable_prefix_caching=True)
    kv_cfg.num_cpu_blocks = 0
    kv_cfg.num_gpu_blocks = 32
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # Queue two 50-token prompts.
    running: List[SequenceGroup] = []
    for request_id in ("0", "1"):
        _, group = create_dummy_prompt(request_id,
                                       block_size=kv_block_size,
                                       prompt_length=50)
        scheduler.add_seq_group(group)
        running.append(group)

    # First step: each sequence could take up to 30 tokens, but the chunk
    # is expected to be 28, the next lower multiple of the block size (4).
    metas, out = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(out)) == set(running)
    assert metas[0].token_chunk_size == 28
    assert metas[1].token_chunk_size == 28
    assert out.num_prefill_groups == 2
    assert out.num_batched_tokens == 56

    # Second step: both sequences complete their prefill. Each has
    # 50 - 28 = 22 tokens left; that is not a multiple of the block size,
    # which is acceptable because the final partial block of a prefix
    # sequence is not cached.
    metas, out = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(out)) == set(running)
    assert metas[0].token_chunk_size == 22
    assert metas[1].token_chunk_size == 22
    assert out.num_prefill_groups == 2
    assert out.num_batched_tokens == 44
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8])
def test_chunked_prefill_with_actual_engine(model: str,
                                            max_num_partial_prefills: int):
    """Smoke-test that a real engine can run a step with several
    concurrent partial prefills.
    """
    # Build the engine with a 40-token batch budget.
    args = EngineArgs(
        model=model,
        max_num_partial_prefills=max_num_partial_prefills,
        max_num_batched_tokens=40,
        max_num_seqs=8,
        enable_chunked_prefill=True,
        gpu_memory_utilization=0.8,
    )
    engine = LLMEngine.from_engine_args(args)

    # Submit one identical request per partial-prefill slot.
    prompt = "hello" * 40
    greedy = SamplingParams(temperature=0)
    for req_num in range(max_num_partial_prefills):
        engine.add_request(f"{req_num}", prompt, greedy)

    # After the first step no request produces output yet, meaning all of
    # them are still prefilling, and every request is in the running queue.
    first_step_outputs = engine.step()
    assert len(first_step_outputs) == 0
    assert len(engine.scheduler[0].running) == max_num_partial_prefills
tests/distributed/test_comm_ops.py
View file @
ec5e299c
...
...
@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
...
@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
...
@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
...
@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
send_recv_tensor_dict_test_worker
(
tp_size
:
int
,
pp_size
:
int
,
rank
:
int
,
distributed_init_port
:
str
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
...
@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
send_recv_test_worker
(
tp_size
:
int
,
pp_size
:
int
,
rank
:
int
,
distributed_init_port
:
str
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
...
tests/distributed/test_custom_all_reduce.py
View file @
ec5e299c
...
...
@@ -24,7 +24,7 @@ for i, v in enumerate(test_sizes):
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
graph_allreduce
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
...
@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
eager_allreduce
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
...
tests/distributed/test_pipeline_parallel.py
View file @
ec5e299c
...
...
@@ -6,6 +6,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
all workers in a node other than the head node, which can cause the test
to fail.
"""
import
json
import
os
from
dataclasses
import
dataclass
from
typing
import
List
,
Literal
,
NamedTuple
,
Optional
...
...
@@ -15,6 +16,7 @@ import pytest
from
vllm.config
import
TaskOption
from
vllm.logger
import
init_logger
from
..models.registry
import
HF_EXAMPLE_MODELS
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
,
models_path_prefix
logger
=
init_logger
(
"test_pipeline_parallel"
)
...
...
@@ -31,19 +33,29 @@ class ParallelSetup(NamedTuple):
class
PPTestOptions
(
NamedTuple
):
multi_node_only
:
bool
trust_remote_code
:
bool
tokenizer_mode
:
Optional
[
str
]
load_format
:
Optional
[
str
]
=
None
hf_overrides
:
Optional
[
str
]
=
None
@
dataclass
class
PPTestSettings
:
parallel_setups
:
List
[
ParallelSetup
]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends
:
List
[
str
]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions
:
List
[
str
]
task
:
TaskOption
test_options
:
PPTestOptions
def
__post_init__
(
self
):
if
len
(
self
.
distributed_backends
)
!=
len
(
self
.
vllm_major_versions
):
raise
ValueError
(
f
"Length mismatch: distributed_backends "
f
"(
{
len
(
self
.
distributed_backends
)
}
) != "
f
"vllm_major_versions (
{
len
(
self
.
vllm_major_versions
)
}
)"
)
@
staticmethod
def
detailed
(
*
,
...
...
@@ -51,10 +63,7 @@ class PPTestSettings:
pp_base
:
int
=
2
,
multi_node_only
:
bool
=
False
,
task
:
TaskOption
=
"auto"
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
load_format
:
Optional
[
str
]
=
None
,
hf_overrides
:
Optional
[
str
]
=
None
,
):
return
PPTestSettings
(
parallel_setups
=
[
...
...
@@ -79,13 +88,12 @@ class PPTestSettings:
eager_mode
=
True
,
chunked_prefill
=
False
),
],
distributed_backends
=
[
"mp"
,
"ray"
],
# only ray is supported for V1
distributed_backends
=
[
"mp"
,
"ray"
,
"ray"
],
vllm_major_versions
=
[
"0"
,
"0"
,
"1"
],
task
=
task
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
trust_remote_code
=
trust_remote_code
,
tokenizer_mode
=
tokenizer_mode
,
load_format
=
load_format
,
hf_overrides
=
hf_overrides
),
load_format
=
load_format
),
)
@
staticmethod
...
...
@@ -95,10 +103,7 @@ class PPTestSettings:
pp_base
:
int
=
2
,
task
:
TaskOption
=
"auto"
,
multi_node_only
:
bool
=
False
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
load_format
:
Optional
[
str
]
=
None
,
hf_overrides
:
Optional
[
str
]
=
None
,
):
return
PPTestSettings
(
parallel_setups
=
[
...
...
@@ -108,20 +113,19 @@ class PPTestSettings:
chunked_prefill
=
False
),
],
distributed_backends
=
[
"mp"
],
vllm_major_versions
=
[
"0"
],
task
=
task
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
trust_remote_code
=
trust_remote_code
,
tokenizer_mode
=
tokenizer_mode
,
load_format
=
load_format
,
hf_overrides
=
hf_overrides
),
load_format
=
load_format
),
)
def
iter_params
(
self
,
model_
name
:
str
):
def
iter_params
(
self
,
model_
id
:
str
):
opts
=
self
.
test_options
for
parallel_setup
in
self
.
parallel_setups
:
for
distributed_backend
in
self
.
distributed_backends
:
yield
(
model_name
,
parallel_setup
,
distributed_backend
,
for
backend
,
vllm_major_version
in
zip
(
self
.
distributed_backends
,
self
.
vllm_major_versions
):
yield
(
model_id
,
parallel_setup
,
backend
,
vllm_major_version
,
self
.
task
,
opts
)
...
...
@@ -133,16 +137,16 @@ TEXT_GENERATION_MODELS = {
# [Decoder-only]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
os
.
path
.
join
(
models_path_prefix
,
"Snowflake/snowflake-arctic-instruct"
):
PPTestSettings
.
fast
(
tp_base
=
8
,
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"baichuan-inc/Baichuan-7B"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"baichuan-inc/Baichuan2-13B-Chat"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"Snowflake/snowflake-arctic-instruct"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"baichuan-inc/Baichuan-7B"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"baichuan-inc/Baichuan2-13B-Chat"
):
PPTestSettings
.
fast
(
),
os
.
path
.
join
(
models_path_prefix
,
"bigscience/bloomz-1b1"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"THUDM/chatglm3-6b"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"CohereForAI/c4ai-command-r-v01"
):
PPTestSettings
.
fast
(
tp_base
=
2
,
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"databricks/dbrx-instruct"
):
PPTestSettings
.
fast
(
tp_base
=
8
),
os
.
path
.
join
(
models_path_prefix
,
"Deci/DeciLM-7B-instruct"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"THUDM/chatglm3-6b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"CohereForAI/c4ai-command-r-v01"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
os
.
path
.
join
(
models_path_prefix
,
"databricks/dbrx-instruct"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
os
.
path
.
join
(
models_path_prefix
,
"Deci/DeciLM-7B-instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/deepseek-llm-7b-chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/DeepSeek-V2-Lite-Chat"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/DeepSeek-V2-Lite-Chat"
):
PPTestSettings
.
fast
(
),
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2b"
):
PPTestSettings
.
fast
(),
...
...
@@ -155,36 +159,36 @@ TEXT_GENERATION_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerMoE-3b"
):
PPTestSettings
.
fast
(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm2-chat-7b"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm2-chat-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"inceptionai/jais-13b-chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/
Meta-
Llama-3
-8B
"
):
PPTestSettings
.
detailed
(),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-2B-sft-bf16"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM3-4B"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3
.2-1B-Instruct
"
):
PPTestSettings
.
detailed
(),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-2B-sft-bf16"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM3-4B"
):
PPTestSettings
.
fast
(),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
os
.
path
.
join
(
models_path_prefix
,
"state-spaces/mamba-130m-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Mixtral-8x7B-Instruct-v0.1"
):
PPTestSettings
.
fast
(
tp_base
=
4
),
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Mixtral-8x7B-Instruct-v0.1"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"mosaicml/mpt-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"nvidia/Minitron-8B-Base"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"allenai/OLMo-1B-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"shanearora/OLMo-7B-1124-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"allenai/OLMoE-1B-7B-0924-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-iml-max-1.3b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"OrionStarAI/Orion-14B-Chat"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"OrionStarAI/Orion-14B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"adept/persimmon-8b-chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/phi-2"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-small-8k-instruct"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
):
PPTestSettings
.
detailed
(
trust_remote_code
=
True
,
multi_node_only
=
True
,
load_format
=
"dummy"
,
hf_overrides
=
'{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-7B-Chat"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-small-8k-instruct"
):
PPTestSettings
.
fast
(
),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
):
PPTestSettings
.
detailed
(
multi_node_only
=
True
,
load_format
=
"dummy"
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-7B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-7B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"stabilityai/stablelm-3b-4e1t"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"upstage/solar-pro-preview-instruct"
):
PPTestSettings
.
fast
(
tp_base
=
2
),
os
.
path
.
join
(
models_path_prefix
,
"upstage/solar-pro-preview-instruct"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
# noqa: E501
# FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(
trust_remote_code=True
),
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
# [Encoder-only]
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
...
...
@@ -192,9 +196,9 @@ TEXT_GENERATION_MODELS = {
EMBEDDING_MODELS
=
{
# type: ignore[var-annotated]
# [Text-only]
"intfloat/e5-mistral-7b-instruct"
:
PPTestSettings
.
fast
(),
"BAAI/bge-multilingual-gemma2"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen2.5-Math-RM-72B"
:
PPTestSettings
.
fast
(
tp_base
=
4
,
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
)
:
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-multilingual-gemma2"
)
:
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Math-RM-72B"
)
:
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
}
MULTIMODAL_MODELS
=
{
...
...
@@ -202,20 +206,20 @@ MULTIMODAL_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"Salesforce/blip2-opt-2.7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"facebook/chameleon-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"adept/fuyu-8b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"THUDM/glm-4v-9b"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-1B"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"THUDM/glm-4v-9b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-1B"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-Llama3-V-2_5"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"allenai/Molmo-7B-D-0924"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-vision-128k-instruct"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Pixtral-12B-2409"
):
PPTestSettings
.
fast
(
tp_base
=
2
,
tokenizer_mode
=
"mistral"
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL-Chat"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-Llama3-V-2_5"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"allenai/Molmo-7B-D-0924"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-vision-128k-instruct"
):
PPTestSettings
.
fast
(
),
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Pixtral-12B-2409"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-Audio-7B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
3
"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
5-llama-3_2-1b
"
):
PPTestSettings
.
fast
(),
# [Encoder-decoder]
# TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
...
...
@@ -226,7 +230,7 @@ MULTIMODAL_MODELS = {
TEST_MODELS
=
[
# [LANGUAGE GENERATION]
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/
Meta-
Llama-3
-8B
"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3
.2-1B-Instruct
"
),
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerLM-3b"
),
# [LANGUAGE EMBEDDING]
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
),
...
...
@@ -234,21 +238,23 @@ TEST_MODELS = [
# [MULTIMODAL GENERATION]
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-1B"
),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-vision-128k-instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
3
"
),
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
5-llama-3_2-1b
"
),
# [LANGUAGE GENERATION - HYBRID ARCH]
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
),
]
def
_compare_tp
(
model_
name
:
str
,
model_
id
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
task
:
TaskOption
,
test_options
:
PPTestOptions
,
num_gpus_available
:
int
,
*
,
method
:
Literal
[
"generate"
,
"encode"
],
is_multimodal
:
bool
,
):
(
tp_size
,
...
...
@@ -256,13 +262,32 @@ def _compare_tp(
eager_mode
,
chunked_prefill
,
)
=
parallel_setup
(
multi_node_only
,
trust_remote_code
,
tokenizer_mode
,
load_format
,
hf_overrides
,
)
=
test_options
multi_node_only
,
load_format
=
test_options
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
trust_remote_code
=
model_info
.
trust_remote_code
tokenizer_mode
=
model_info
.
tokenizer_mode
hf_overrides
=
model_info
.
hf_overrides
if
load_format
==
"dummy"
:
# Avoid OOM
text_overrides
=
{
"num_hidden_layers"
:
4
,
"hidden_size"
:
512
,
"intermediate_size"
:
800
,
"num_attention_heads"
:
4
,
"num_key_value_heads"
:
1
,
}
if
is_multimodal
:
hf_overrides
.
update
({
"text_config"
:
text_overrides
})
else
:
hf_overrides
.
update
(
text_overrides
)
else
:
model_info
.
check_available_online
(
on_fail
=
"skip"
)
if
num_gpus_available
<
tp_size
*
pp_size
:
pytest
.
skip
(
f
"Need at least
{
tp_size
}
x
{
pp_size
}
GPUs"
)
...
...
@@ -294,12 +319,15 @@ def _compare_tp(
if
load_format
:
common_args
.
extend
([
"--load-format"
,
load_format
])
if
hf_overrides
:
common_args
.
extend
([
"--hf-overrides"
,
hf_overrides
])
common_args
.
extend
([
"--hf-overrides"
,
json
.
dumps
(
hf_overrides
)
])
if
(
distributed_backend
==
"ray"
and
tp_size
==
2
and
pp_size
==
2
and
chunked_prefill
):
# Test Ray ADAG for a subset of the tests
specific_case
=
tp_size
==
2
and
pp_size
==
2
and
chunked_prefill
if
distributed_backend
==
"ray"
and
(
vllm_major_version
==
"1"
or
specific_case
):
# For V1, test Ray ADAG for all the tests
# For V0, test Ray ADAG for a subset of the tests
pp_env
=
{
"VLLM_USE_V1"
:
vllm_major_version
,
"VLLM_USE_RAY_COMPILED_DAG"
:
"1"
,
"VLLM_USE_RAY_SPMD_WORKER"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
"1"
,
...
...
@@ -334,11 +362,7 @@ def _compare_tp(
]
try
:
compare_two_settings
(
model_name
,
pp_args
,
tp_args
,
pp_env
,
method
=
method
)
compare_two_settings
(
model_id
,
pp_args
,
tp_args
,
pp_env
,
method
=
method
)
except
Exception
:
if
pp_env
is
None
:
raise
...
...
@@ -348,81 +372,87 @@ def _compare_tp(
@
pytest
.
mark
.
parametrize
(
(
"model_
name
"
,
"parallel_setup"
,
"distributed_backend"
,
"
task
"
,
"test_options"
),
(
"model_
id
"
,
"parallel_setup"
,
"distributed_backend"
,
"
vllm_major_version
"
,
"task"
,
"test_options"
),
[
params
for
model_name
,
settings
in
TEXT_GENERATION_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
if
model_name
in
TEST_MODELS
params
for
model_id
,
settings
in
TEXT_GENERATION_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_id
)
if
model_id
in
TEST_MODELS
],
)
@
fork_new_process_for_each_test
def
test_tp_language_generation
(
model_
name
:
str
,
model_
id
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
task
:
TaskOption
,
test_options
:
PPTestOptions
,
num_gpus_available
,
):
_compare_tp
(
model_
name
,
_compare_tp
(
model_
id
,
parallel_setup
,
distributed_backend
,
vllm_major_version
,
task
,
test_options
,
num_gpus_available
,
method
=
"generate"
)
method
=
"generate"
,
is_multimodal
=
False
)
@
pytest
.
mark
.
parametrize
(
(
"model_
name
"
,
"parallel_setup"
,
"distributed_backend"
,
"
task
"
,
"test_options"
),
(
"model_
id
"
,
"parallel_setup"
,
"distributed_backend"
,
"
vllm_major_version
"
,
"task"
,
"test_options"
),
[
params
for
model_name
,
settings
in
EMBEDDING_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
if
model_name
in
TEST_MODELS
params
for
model_id
,
settings
in
EMBEDDING_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_id
)
if
model_id
in
TEST_MODELS
],
)
@
fork_new_process_for_each_test
def
test_tp_language_embedding
(
model_
name
:
str
,
model_
id
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
task
:
TaskOption
,
test_options
:
PPTestOptions
,
num_gpus_available
,
):
_compare_tp
(
model_
name
,
_compare_tp
(
model_
id
,
parallel_setup
,
distributed_backend
,
vllm_major_version
,
task
,
test_options
,
num_gpus_available
,
method
=
"encode"
)
method
=
"encode"
,
is_multimodal
=
False
)
@
pytest
.
mark
.
parametrize
(
(
"model_
name
"
,
"parallel_setup"
,
"distributed_backend"
,
"
task
"
,
"test_options"
),
(
"model_
id
"
,
"parallel_setup"
,
"distributed_backend"
,
"
vllm_major_version
"
,
"task"
,
"test_options"
),
[
params
for
model_name
,
settings
in
MULTIMODAL_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
if
model_name
in
TEST_MODELS
params
for
model_id
,
settings
in
MULTIMODAL_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_id
)
if
model_id
in
TEST_MODELS
],
)
@
fork_new_process_for_each_test
def
test_tp_multimodal_generation
(
model_
name
:
str
,
model_
id
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
task
:
TaskOption
,
test_options
:
PPTestOptions
,
num_gpus_available
,
):
_compare_tp
(
model_
name
,
_compare_tp
(
model_
id
,
parallel_setup
,
distributed_backend
,
vllm_major_version
,
task
,
test_options
,
num_gpus_available
,
method
=
"generate"
)
method
=
"generate"
,
is_multimodal
=
True
)
tests/engine/test_computed_prefix_blocks.py
View file @
ec5e299c
...
...
@@ -2,14 +2,16 @@
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
import
os
from
..utils
import
models_path_prefix
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
def
test_computed_prefix_blocks
(
model
:
str
,
block_size
:
int
):
# This test checks if we are able to run the engine to completion
...
...
@@ -26,6 +28,7 @@ def test_computed_prefix_blocks(model: str, block_size: int):
"decoration."
)
engine_args
=
EngineArgs
(
model
=
model
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
block_size
=
block_size
,
enable_prefix_caching
=
True
)
...
...
tests/engine/test_detokenization.py
View file @
ec5e299c
...
...
@@ -2,13 +2,15 @@
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.entrypoints.llm
import
LLM
from
vllm.sampling_params
import
SamplingParams
import
os
from
..utils
import
models_path_prefix
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_computed_prefix_blocks
(
model
:
str
):
# This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text
...
...
@@ -19,7 +21,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"online for free?"
)
llm
=
LLM
(
model
=
model
)
llm
=
LLM
(
model
=
model
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
)
sampling_params
=
SamplingParams
(
max_tokens
=
10
,
temperature
=
0.0
,
detokenize
=
False
)
...
...
tests/engine/test_
custom_
executor.py
→
tests/engine/test_executor.py
View file @
ec5e299c
...
...
@@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
...
...
@@ -14,6 +15,10 @@ from vllm.sampling_params import SamplingParams
import
os
from
..utils
import
models_path_prefix
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
RUNAI_STREAMER_LOAD_FORMAT
=
LoadFormat
.
RUNAI_STREAMER
class
Mock
:
...
...
...
@@ -34,11 +39,12 @@ class CustomUniExecutor(UniProcExecutor):
CustomUniExecutorAsync
=
CustomUniExecutor
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_custom_executor_type_checking
(
model
):
with
pytest
.
raises
(
ValueError
):
engine_args
=
EngineArgs
(
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
Mock
)
LLMEngine
.
from_engine_args
(
engine_args
)
with
pytest
.
raises
(
ValueError
):
...
...
@@ -47,7 +53,8 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_custom_executor
(
model
,
tmp_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp_path
)
...
...
@@ -56,7 +63,9 @@ def test_custom_executor(model, tmp_path):
engine_args
=
EngineArgs
(
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
CustomUniExecutor
,
enforce_eager
=
True
,
# reduce test time
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
...
...
@@ -69,7 +78,8 @@ def test_custom_executor(model, tmp_path):
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_custom_executor_async
(
model
,
tmp_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp_path
)
...
...
@@ -77,7 +87,11 @@ def test_custom_executor_async(model, tmp_path):
assert
not
os
.
path
.
exists
(
".marker"
)
engine_args
=
AsyncEngineArgs
(
model
=
model
,
distributed_executor_backend
=
CustomUniExecutorAsync
)
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
CustomUniExecutorAsync
,
enforce_eager
=
True
,
# reduce test time
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
...
...
@@ -91,3 +105,20 @@ def test_custom_executor_async(model, tmp_path):
assert
os
.
path
.
exists
(
".marker"
)
finally
:
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_respect_ray
(
model
):
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
# users might do this if they want to manage the
# resources using ray.
engine_args
=
EngineArgs
(
model
=
model
,
distributed_executor_backend
=
"ray"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
enforce_eager
=
True
,
# reduce test time
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
assert
engine
.
model_executor
.
uses_ray
tests/engine/test_skip_tokenizer_init.py
View file @
ec5e299c
...
...
@@ -2,18 +2,22 @@
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.entrypoints.llm
import
LLM
from
vllm.sampling_params
import
SamplingParams
import
os
from
..utils
import
models_path_prefix
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_skip_tokenizer_initialization
(
model
:
str
):
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
llm
=
LLM
(
model
=
model
,
skip_tokenizer_init
=
True
)
llm
=
LLM
(
model
=
model
,
skip_tokenizer_init
=
True
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
)
sampling_params
=
SamplingParams
(
prompt_logprobs
=
True
,
detokenize
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
"cannot pass text prompts when"
):
...
...
tests/engine/test_stop_reason.py
View file @
ec5e299c
...
...
@@ -14,7 +14,7 @@ import transformers
from
vllm
import
SamplingParams
from
..utils
import
models_path_prefix
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"
facebook/opt-350m
"
)
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"
distilbert/distilgpt2
"
)
STOP_STR
=
"."
SEED
=
42
MAX_TOKENS
=
1024
...
...
tests/entrypoints/conftest.py
View file @
ec5e299c
...
...
@@ -141,6 +141,47 @@ def sample_definition_json_schema():
}
@
pytest
.
fixture
def
sample_enum_json_schema
():
return
{
"type"
:
"object"
,
"properties"
:
{
"status"
:
{
"type"
:
"string"
,
"enum"
:
[
"active"
,
"inactive"
,
"pending"
]
# Literal values using enum
},
"priority"
:
{
"type"
:
"string"
,
"enum"
:
[
"low"
,
"medium"
,
"high"
,
"critical"
]
},
"category"
:
{
"type"
:
"object"
,
"properties"
:
{
"type"
:
{
"type"
:
"string"
,
"enum"
:
[
"bug"
,
"feature"
,
"improvement"
]
},
"severity"
:
{
"type"
:
"integer"
,
"enum"
:
[
1
,
2
,
3
,
4
,
5
]
# Enum can also contain numbers
}
},
"required"
:
[
"type"
,
"severity"
]
},
"flags"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"string"
,
"enum"
:
[
"urgent"
,
"blocked"
,
"needs_review"
,
"approved"
]
}
}
},
"required"
:
[
"status"
,
"priority"
,
"category"
,
"flags"
]
}
@
pytest
.
fixture
def
sample_guided_choice
():
return
[
...
...
tests/entrypoints/llm/test_accuracy.py
View file @
ec5e299c
...
...
@@ -23,10 +23,13 @@ RTOL = 0.03
EXPECTED_VALUE
=
0.58
def
run_test
():
def
run_test
(
more_args
=
None
):
"""Run the end to end accuracy test."""
model_args
=
f
"pretrained=
{
MODEL_NAME
}
,max_model_len=2048"
model_args
=
f
"pretrained=
{
MODEL_NAME
}
,max_model_len=4096"
if
more_args
is
not
None
:
model_args
=
"{},{}"
.
format
(
model_args
,
more_args
)
results
=
lm_eval
.
simple_evaluate
(
model
=
"vllm"
,
...
...
@@ -41,14 +44,21 @@ def run_test():
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"V1 is currently only supported on CUDA."
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
()
and
not
current_platform
.
is_tpu
(),
reason
=
"V1 is currently only supported on CUDA and TPU"
)
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
):
"""Run with the V1 Engine."""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
run_test
()
more_args
=
None
if
current_platform
.
is_tpu
():
# Limit compilation time for TPU V1
more_args
=
"max_num_seqs=64"
run_test
(
more_args
)
def
test_lm_eval_accuracy_v0_engine
(
monkeypatch
):
...
...
tests/entrypoints/llm/test_chat.py
View file @
ec5e299c
...
...
@@ -6,13 +6,18 @@ import os
import
pytest
from
vllm
import
LLM
from
vllm.config
import
LoadFormat
from
...conftest
import
MODEL_WEIGHTS_S3_BUCKET
from
..openai.test_vision
import
TEST_IMAGE_URLS
from
...utils
import
models_path_prefix
RUNAI_STREAMER_LOAD_FORMAT
=
LoadFormat
.
RUNAI_STREAMER
def
test_chat
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
))
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Llama-3.2-1B-Instruct"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
prompt1
=
"Explain the concept of entropy."
messages
=
[
...
...
@@ -30,7 +35,8 @@ def test_chat():
def
test_multi_chat
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
))
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Llama-3.2-1B-Instruct"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
...
...
@@ -67,7 +73,8 @@ def test_multi_chat():
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
),
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Phi-3.5-vision-instruct"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
dtype
=
"bfloat16"
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
...
...
tests/entrypoints/llm/test_collective_rpc.py
View file @
ec5e299c
...
...
@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
def
echo_rank
(
self
):
return
self
.
rank
llm
=
LLM
(
model
=
"
meta-llama
/Llama-3.2-1B-Instruct"
,
llm
=
LLM
(
model
=
"
s3://vllm-ci-model-weights
/Llama-3.2-1B-Instruct"
,
enforce_eager
=
True
,
load_format
=
"dummy"
,
tensor_parallel_size
=
tp_size
,
...
...
tests/entrypoints/llm/test_encode.py
View file @
ec5e299c
...
...
@@ -7,10 +7,11 @@ import pytest
import
os
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
intfloat/
e5-mistral-7b-instruct"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"e5-mistral-7b-instruct"
)
PROMPTS
=
[
"Hello, my name is"
,
...
...
@@ -34,6 +35,7 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_num_batched_tokens
=
32768
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.75
,
...
...
tests/entrypoints/llm/test_generate.py
View file @
ec5e299c
...
...
@@ -7,10 +7,11 @@ import os
import
pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
facebook/opt-125m
"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
distilgpt2
"
)
PROMPTS
=
[
"Hello, my name is"
,
...
...
@@ -32,6 +33,7 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.10
,
...
...
tests/entrypoints/llm/test_generate_multiple_loras.py
View file @
ec5e299c
...
...
@@ -8,11 +8,12 @@ import os
from
huggingface_hub
import
snapshot_download
from
vllm
import
LLM
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.lora.request
import
LoRARequest
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
HuggingFaceH4/
zephyr-7b-beta"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"zephyr-7b-beta"
)
PROMPTS
=
[
"Hello, my name is"
,
...
...
@@ -29,6 +30,7 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
tensor_parallel_size
=
1
,
max_model_len
=
8192
,
enable_lora
=
True
,
...
...
tests/entrypoints/llm/test_guided_generate.py
View file @
ec5e299c
...
...
@@ -8,6 +8,7 @@ import jsonschema
import
pytest
import
os
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.entrypoints.llm
import
LLM
from
vllm.outputs
import
RequestOutput
...
...
@@ -15,7 +16,7 @@ from vllm.outputs import RequestOutput
from
vllm.sampling_params
import
GuidedDecodingParams
,
SamplingParams
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
Qwen/
Qwen2.5-
7
B-Instruct"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen2.5-
1.5
B-Instruct"
)
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"lm-format-enforcer"
,
"xgrammar"
]
...
...
@@ -23,7 +24,9 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
max_model_len
=
1024
)
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_model_len
=
1024
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
...
...
@@ -149,6 +152,47 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm,
schema
=
sample_definition_json_schema
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
def
test_guided_enum_json_completion
(
sample_enum_json_schema
,
llm
,
guided_decoding_backend
:
str
):
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
guided_decoding
=
GuidedDecodingParams
(
json
=
sample_enum_json_schema
,
backend
=
guided_decoding_backend
))
outputs
=
llm
.
generate
(
prompts
=
[
"Create a bug report JSON that fits this schema: "
f
"
{
sample_enum_json_schema
}
. Make it for a high priority critical bug."
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
sample_enum_json_schema
)
# Additional assertions to verify enum values
assert
output_json
[
"status"
]
in
[
"active"
,
"inactive"
,
"pending"
]
assert
output_json
[
"priority"
]
in
[
"low"
,
"medium"
,
"high"
,
"critical"
]
assert
output_json
[
"category"
][
"type"
]
in
[
"bug"
,
"feature"
,
"improvement"
]
assert
output_json
[
"category"
][
"severity"
]
in
[
1
,
2
,
3
,
4
,
5
]
for
flag
in
output_json
[
"flags"
]:
assert
flag
in
[
"urgent"
,
"blocked"
,
"needs_review"
,
"approved"
]
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
def
test_guided_choice_completion
(
sample_guided_choice
,
llm
,
...
...
tests/entrypoints/llm/test_lazy_outlines.py
View file @
ec5e299c
...
...
@@ -7,11 +7,12 @@ from contextlib import nullcontext
from
vllm_test_utils
import
BlameResult
,
blame
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
def
run_normal
():
def
run_normal
_opt125m
():
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
...
...
@@ -35,9 +36,35 @@ def run_normal():
cleanup_dist_env_and_memory
()
def run_normal():
    """Generate completions with a plain (non-guided) LLM as a baseline.

    Loads distilgpt2 from the CI S3 bucket through the Run:ai streamer
    load format, generates for four fixed prompts, prints each
    prompt/completion pair, then drops the engine and calls
    ``cleanup_dist_env_and_memory``.
    """
    baseline_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling = SamplingParams(temperature=0.8, top_p=0.95)

    # No guided decoding backend is passed here: this is the baseline run.
    baseline_llm = LLM(
        model="s3://vllm-ci-model-weights/distilgpt2",
        load_format=LoadFormat.RUNAI_STREAMER,
        enforce_eager=True,
        gpu_memory_utilization=0.3,
    )

    for result in baseline_llm.generate(baseline_prompts, sampling):
        print(f"Prompt: {result.prompt!r}, "
              f"Generated text: {result.outputs[0].text!r}")

    # Drop the only reference to the engine before the cleanup call.
    del baseline_llm
    cleanup_dist_env_and_memory()
def
run_lmfe
(
sample_regex
):
# Create an LLM with guided decoding enabled.
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilgpt2"
),
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
,
guided_decoding_backend
=
"lm-format-enforcer"
,
gpu_memory_utilization
=
0.3
)
...
...
tests/entrypoints/llm/test_prompt_validation.py
View file @
ec5e299c
...
...
@@ -5,6 +5,7 @@ import os
from
vllm
import
LLM
from
...utils
import
models_path_prefix
from
vllm.config
import
LoadFormat
@
pytest
.
fixture
(
autouse
=
True
)
...
...
@@ -16,13 +17,17 @@ def v1(run_with_both_engines):
def test_empty_prompt():
    """Generating from an empty string prompt must raise ``ValueError``.

    The engine is built exactly once; the error message is expected to
    match 'Prompt cannot be empty'.
    """
    # Construct the model a single time. Building it twice and discarding
    # the first instance only costs load time and memory.
    llm = LLM(model=os.path.join(models_path_prefix, "gpt2"),
              load_format=LoadFormat.RUNAI_STREAMER,
              enforce_eager=True)
    with pytest.raises(ValueError, match='Prompt cannot be empty'):
        llm.generate([""])
@pytest.mark.skip_v1
def test_out_of_vocab_token():
    """A prompt token id outside the vocabulary must raise ``ValueError``.

    The engine is built exactly once; the error message is expected to
    match 'out of vocabulary'.
    """
    # Construct the model a single time. Building it twice and discarding
    # the first instance only costs load time and memory.
    llm = LLM(model=os.path.join(models_path_prefix, "gpt2"),
              load_format=LoadFormat.RUNAI_STREAMER,
              enforce_eager=True)
    with pytest.raises(ValueError, match='out of vocabulary'):
        llm.generate({"prompt_token_ids": [999999]})
Prev
1
…
4
5
6
7
8
9
10
11
12
…
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment