Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ec5e299c
Commit
ec5e299c
authored
Feb 21, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.3' into v0.7.3-dev
parents
47bd229c
ed6e9075
Changes
521
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
733 additions
and
139 deletions
+733
-139
tests/conftest.py
tests/conftest.py
+73
-1
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+314
-2
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+5
-5
tests/distributed/test_custom_all_reduce.py
tests/distributed/test_custom_all_reduce.py
+2
-2
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+124
-94
tests/engine/test_computed_prefix_blocks.py
tests/engine/test_computed_prefix_blocks.py
+6
-3
tests/engine/test_detokenization.py
tests/engine/test_detokenization.py
+6
-4
tests/engine/test_executor.py
tests/engine/test_executor.py
+36
-5
tests/engine/test_skip_tokenizer_init.py
tests/engine/test_skip_tokenizer_init.py
+8
-4
tests/engine/test_stop_reason.py
tests/engine/test_stop_reason.py
+1
-1
tests/entrypoints/conftest.py
tests/entrypoints/conftest.py
+41
-0
tests/entrypoints/llm/test_accuracy.py
tests/entrypoints/llm/test_accuracy.py
+15
-5
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+10
-3
tests/entrypoints/llm/test_collective_rpc.py
tests/entrypoints/llm/test_collective_rpc.py
+1
-1
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+3
-1
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+3
-1
tests/entrypoints/llm/test_generate_multiple_loras.py
tests/entrypoints/llm/test_generate_multiple_loras.py
+3
-1
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+46
-2
tests/entrypoints/llm/test_lazy_outlines.py
tests/entrypoints/llm/test_lazy_outlines.py
+29
-2
tests/entrypoints/llm/test_prompt_validation.py
tests/entrypoints/llm/test_prompt_validation.py
+7
-2
No files found.
Too many changes to show.
To preserve performance only
521 of 521+
files are displayed.
Plain diff
Email patch
tests/conftest.py
View file @
ec5e299c
...
@@ -26,7 +26,7 @@ from tests.models.utils import (TokensTextLogprobs,
...
@@ -26,7 +26,7 @@ from tests.models.utils import (TokensTextLogprobs,
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
TaskOption
,
TokenizerPoolConfig
from
vllm.config
import
LoadFormat
,
TaskOption
,
TokenizerPoolConfig
from
vllm.connections
import
global_http_connection
from
vllm.connections
import
global_http_connection
from
vllm.distributed
import
(
cleanup_dist_env_and_memory
,
from
vllm.distributed
import
(
cleanup_dist_env_and_memory
,
init_distributed_environment
,
init_distributed_environment
,
...
@@ -49,6 +49,71 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
...
@@ -49,6 +49,71 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
_SYS_MSG
=
os
.
path
.
join
(
_TEST_DIR
,
"system_messages"
,
"sonnet3.5_nov2024.txt"
)
_SYS_MSG
=
os
.
path
.
join
(
_TEST_DIR
,
"system_messages"
,
"sonnet3.5_nov2024.txt"
)
_M
=
TypeVar
(
"_M"
)
_M
=
TypeVar
(
"_M"
)
# Hugging Face model ids that the test suite treats as mirrored in the
# model-weights bucket. VllmRunner checks membership in this list to decide
# whether to rewrite the model name with the bucket prefix and switch the
# load format to the Run:ai streamer. Each id is listed exactly once; the
# list is only used for membership tests, so duplicates add nothing.
MODELS_ON_S3 = [
    "distilbert/distilgpt2",
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "openai-community/gpt2",
    "ArthurZ/Ilama-3.2-1B",
    "llava-hf/llava-1.5-7b-hf",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "ai21labs/Jamba-tiny-random",
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
    "nm-testing/Phi-3-mini-128k-instruct-FP8",
    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
    "AMead10/Llama-3.2-1B-Instruct-AWQ",
    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
]
MODEL_WEIGHTS_S3_BUCKET
=
models_path_prefix
_PromptMultiModalInput
=
Union
[
List
[
_M
],
List
[
List
[
_M
]]]
_PromptMultiModalInput
=
Union
[
List
[
_M
],
List
[
List
[
_M
]]]
PromptImageInput
=
_PromptMultiModalInput
[
Image
.
Image
]
PromptImageInput
=
_PromptMultiModalInput
[
Image
.
Image
]
...
@@ -680,8 +745,14 @@ class VllmRunner:
...
@@ -680,8 +745,14 @@ class VllmRunner:
enable_chunked_prefill
:
bool
=
False
,
enable_chunked_prefill
:
bool
=
False
,
swap_space
:
int
=
4
,
swap_space
:
int
=
4
,
enforce_eager
:
Optional
[
bool
]
=
False
,
enforce_eager
:
Optional
[
bool
]
=
False
,
load_format
:
Optional
[
LoadFormat
]
=
None
,
**
kwargs
,
**
kwargs
,
)
->
None
:
)
->
None
:
if
model_name
in
MODELS_ON_S3
and
not
load_format
:
model_name
=
(
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/
{
model_name
}
"
)
load_format
=
LoadFormat
.
RUNAI_STREAMER
if
not
load_format
:
load_format
=
LoadFormat
.
AUTO
self
.
model
=
LLM
(
self
.
model
=
LLM
(
model
=
model_name
,
model
=
model_name
,
task
=
task
,
task
=
task
,
...
@@ -696,6 +767,7 @@ class VllmRunner:
...
@@ -696,6 +767,7 @@ class VllmRunner:
max_model_len
=
max_model_len
,
max_model_len
=
max_model_len
,
block_size
=
block_size
,
block_size
=
block_size
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
load_format
=
load_format
,
**
kwargs
,
**
kwargs
,
)
)
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
ec5e299c
...
@@ -7,6 +7,9 @@ import pytest # noqa
...
@@ -7,6 +7,9 @@ import pytest # noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.core.scheduler
import
Scheduler
from
vllm.core.scheduler
import
Scheduler
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
Logprob
,
SequenceGroup
from
vllm.sequence
import
Logprob
,
SequenceGroup
from
.utils
import
create_dummy_prompt
from
.utils
import
create_dummy_prompt
...
@@ -16,7 +19,7 @@ def get_sequence_groups(scheduler_output):
...
@@ -16,7 +19,7 @@ def get_sequence_groups(scheduler_output):
return
[
s
.
seq_group
for
s
in
scheduler_output
.
scheduled_seq_groups
]
return
[
s
.
seq_group
for
s
in
scheduler_output
.
scheduled_seq_groups
]
def
append_new_token
(
seq_group
,
token_id
:
int
):
def
append_new_token
(
seq_group
:
SequenceGroup
,
token_id
:
int
):
for
seq
in
seq_group
.
get_seqs
():
for
seq
in
seq_group
.
get_seqs
():
seq
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
token_id
)})
seq
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
token_id
)})
...
@@ -123,6 +126,232 @@ def test_chunk():
...
@@ -123,6 +126,232 @@ def test_chunk():
assert
out
.
num_batched_tokens
==
57
assert
out
.
num_batched_tokens
==
57
def test_concurrent_chunking():
    """Two equal prompts must be prefilled side by side, each receiving half
    of the token budget, when --max-num-partial-prefills is 2."""
    block_size = 4
    seq_limit = 60
    model_len_limit = 2000
    token_budget = 64

    sched_cfg = SchedulerConfig(
        "generate",
        token_budget,
        seq_limit,
        model_len_limit,
        enable_chunked_prefill=True,
        # Allow at most two prompts to be partially prefilled together.
        max_num_partial_prefills=2,
    )
    kv_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    kv_cfg.num_cpu_blocks = 32
    kv_cfg.num_gpu_blocks = 32
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # Queue two 60-token prompts and remember their sequence groups.
    running: List[SequenceGroup] = []
    for request_idx in range(2):
        _, group = create_dummy_prompt(str(request_idx),
                                       prompt_length=60,
                                       block_size=block_size)
        scheduler.add_seq_group(group)
        running.append(group)

    # Step 1: each request gets half of the 64-token budget.
    metas, step_out = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(step_out)) == set(running)
    for slot in (0, 1):
        assert metas[slot].token_chunk_size == 32
    assert step_out.num_prefill_groups == 2
    assert step_out.num_batched_tokens == 64

    # Step 2: 60 - 32 = 28 tokens remain for each request.
    metas, step_out = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(step_out)) == set(running)
    for slot in (0, 1):
        assert metas[slot].token_chunk_size == 28
    assert step_out.num_prefill_groups == 2
    assert step_out.num_batched_tokens == 56
def test_concurrent_chunking_large_requests():
    """Very large prompts must be prefilled one at a time even though two
    partial prefills are allowed."""
    block_size = 4
    seq_limit = 60
    model_len_limit = 2000
    token_budget = 64

    sched_cfg = SchedulerConfig(
        "generate",
        token_budget,
        seq_limit,
        model_len_limit,
        enable_chunked_prefill=True,
        # Allow at most two prompts to be partially prefilled together.
        max_num_partial_prefills=2,
    )
    kv_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    # A large KV cache so the big prompts are not limited by block count.
    kv_cfg.num_cpu_blocks = 3200
    kv_cfg.num_gpu_blocks = 3200
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # Queue two very large (1200-token) prompts.
    for request_idx in range(2):
        _, group = create_dummy_prompt(str(request_idx),
                                       prompt_length=1200,
                                       block_size=block_size)
        scheduler.add_seq_group(group)

    # Only one request is scheduled, and it takes the whole 64-token budget.
    metas, step_out = schedule_and_update_computed_tokens(scheduler)
    scheduled_groups = get_sequence_groups(step_out)
    assert len(scheduled_groups) == 1
    assert metas[0].token_chunk_size == 64
    assert step_out.num_prefill_groups == 1
    assert step_out.num_batched_tokens == 64
def test_short_prompts_jump_long_prompts_in_queue():
    """While one large prefill is in flight, a second large prefill must wait
    and shorter prompts queued behind it are scheduled first."""
    block_size = 4
    seq_limit = 60
    model_len_limit = 2000
    token_budget = 64

    sched_cfg = SchedulerConfig(
        "generate",
        token_budget,
        seq_limit,
        model_len_limit,
        enable_chunked_prefill=True,
        # Allow at most two prompts to be partially prefilled together.
        max_num_partial_prefills=2,
    )
    kv_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    # A large KV cache so the big prompts are not limited by block count.
    kv_cfg.num_cpu_blocks = 3200
    kv_cfg.num_gpu_blocks = 3200
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    def computed(group):
        # Number of tokens already computed for the group's first sequence.
        return group.first_seq.get_num_computed_tokens()

    def step():
        # Run one scheduling iteration and update computed-token counters.
        return schedule_and_update_computed_tokens(scheduler)

    long_seqs: List[SequenceGroup] = []
    short_seqs: List[SequenceGroup] = []

    # Two very large prompts go in first ...
    for request_idx in range(2):
        _, group = create_dummy_prompt(str(request_idx),
                                       prompt_length=1200,
                                       block_size=block_size)
        scheduler.add_seq_group(group)
        long_seqs.append(group)
        assert group.is_prefill()

    # ... followed by two very small prompts queued behind them.
    for request_idx in range(2):
        _, group = create_dummy_prompt(str(request_idx + 2),
                                       prompt_length=40,
                                       block_size=block_size)
        scheduler.add_seq_group(group)
        short_seqs.append(group)
        assert group.is_prefill()

    # Iteration 1: one large and one small request share the budget.
    metas, step_out = step()
    assert metas[0].token_chunk_size == 32  # large request
    assert metas[1].token_chunk_size == 32  # small request

    # All four requests are still in the prefill phase.
    assert long_seqs[0].is_prefill()
    assert long_seqs[1].is_prefill()
    assert short_seqs[0].is_prefill()
    assert short_seqs[1].is_prefill()
    # Only the first long and the first short request made progress.
    assert computed(long_seqs[0]) == 32
    assert computed(long_seqs[1]) == 0
    assert computed(short_seqs[0]) == 32
    assert computed(short_seqs[1]) == 0

    assert step_out.num_prefill_groups == 2
    assert step_out.num_batched_tokens == 64

    # Iteration 2: the first small request needs only its last 8 tokens,
    # so the second small request is scheduled as well.
    metas, step_out = step()
    # The newly scheduled small request receives 64 - (32 + 8) tokens.
    assert metas[0].token_chunk_size == 24
    assert metas[1].token_chunk_size == 32  # large request still gets 32
    # The first small request finishes its prompt: 40 - 32.
    assert metas[2].token_chunk_size == 8

    # The first small request has left the prefill phase.
    assert long_seqs[0].is_prefill()
    assert long_seqs[1].is_prefill()
    assert not short_seqs[0].is_prefill()
    assert short_seqs[1].is_prefill()
    # Both small requests started ahead of the second long request.
    assert computed(long_seqs[0]) == 64
    assert computed(long_seqs[1]) == 0
    assert computed(short_seqs[0]) == 40
    assert computed(short_seqs[1]) == 24

    assert step_out.num_prefill_groups == 3
    assert step_out.num_batched_tokens == 64

    # The first small request produced a token.
    append_new_token(short_seqs[0], 1)

    # Iteration 3: the first small request decodes, the second one finishes
    # its remaining 40 - 24 = 16 prompt tokens.
    metas, step_out = step()
    assert metas[0].token_chunk_size == 32  # large request still gets 32
    assert metas[1].token_chunk_size == 16
    assert metas[2].token_chunk_size == 1  # decode
    assert step_out.num_prefill_groups == 2
    assert step_out.num_batched_tokens == 49  # 32 + 16 + 1 decode

    # Both small requests have now reached decode.
    assert long_seqs[0].is_prefill()
    assert long_seqs[1].is_prefill()
    assert not short_seqs[0].is_prefill()
    assert not short_seqs[1].is_prefill()
    assert computed(long_seqs[0]) == 96
    assert computed(long_seqs[1]) == 0
    assert computed(short_seqs[0]) == 41
    assert computed(short_seqs[1]) == 40

    # Both small requests produced a token.
    append_new_token(short_seqs[0], 1)
    append_new_token(short_seqs[1], 1)

    # Iteration 4: both small requests decode, so the large request gets
    # the rest of the budget (64 minus 2 decode tokens).
    metas, step_out = step()
    assert metas[0].token_chunk_size == 62
    assert metas[1].token_chunk_size == 1  # decode
    assert metas[2].token_chunk_size == 1  # decode
    assert step_out.num_prefill_groups == 1
    assert step_out.num_batched_tokens == 64

    assert computed(long_seqs[0]) == 158
# assert long_seqs[0].is_prefill()
# assert long_seqs[1].is_prefill()
# assert not short_seqs[0].is_prefill()
# assert not short_seqs[1].is_prefill()
# # both the small seq groups have a new token appended
# append_new_token(short_seqs[0], 1)
# append_new_token(short_seqs[1], 1)
# # in the fifth iteration, large request gets all the budget
# # while both small requests are decoding
# seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
# assert seq_group_meta[0].token_chunk_size == 62
# assert seq_group_meta[1].token_chunk_size == 1 # decode
# assert seq_group_meta[2].token_chunk_size == 1 # decode
# assert out.num_prefill_groups == 1
# assert out.num_batched_tokens == 64
def
test_complex
():
def
test_complex
():
block_size
=
4
block_size
=
4
max_seqs
=
60
max_seqs
=
60
...
@@ -508,7 +737,7 @@ def test_chunked_prefill_max_seqs():
...
@@ -508,7 +737,7 @@ def test_chunked_prefill_max_seqs():
assert
not
running
[
1
].
is_prefill
()
assert
not
running
[
1
].
is_prefill
()
def
test_p
e
rfix_caching
():
def
test_pr
e
fix_caching
():
"""Verify allocating full blocks when prefix caching is enabled."""
"""Verify allocating full blocks when prefix caching is enabled."""
block_size
=
4
block_size
=
4
max_seqs
=
10
max_seqs
=
10
...
@@ -548,3 +777,86 @@ def test_perfix_caching():
...
@@ -548,3 +777,86 @@ def test_perfix_caching():
assert
seq_group_meta
[
1
].
token_chunk_size
==
12
assert
seq_group_meta
[
1
].
token_chunk_size
==
12
assert
out
.
num_prefill_groups
==
2
assert
out
.
num_prefill_groups
==
2
assert
out
.
num_batched_tokens
==
62
assert
out
.
num_batched_tokens
==
62
def test_prefix_caching_with_concurrent_partial_prefills():
    """With prefix caching enabled and --max-num-partial-prefills > 1,
    partial prefill chunks must be rounded down to whole blocks."""
    block_size = 4
    seq_limit = 10
    model_len_limit = 8000
    # Two prefill slots share this budget, i.e. 30 tokens per slot.
    token_budget = 60

    sched_cfg = SchedulerConfig(
        "generate",
        token_budget,
        seq_limit,
        model_len_limit,
        enable_chunked_prefill=True,
        max_num_partial_prefills=2,
    )
    kv_cfg = CacheConfig(block_size,
                         1.0,
                         1,
                         "auto",
                         enable_prefix_caching=True)
    kv_cfg.num_cpu_blocks = 0
    kv_cfg.num_gpu_blocks = 32
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # Queue two 50-token prompts and remember their sequence groups.
    running: List[SequenceGroup] = []
    for request_idx in range(2):
        _, group = create_dummy_prompt(str(request_idx),
                                       block_size=block_size,
                                       prompt_length=50)
        scheduler.add_seq_group(group)
        running.append(group)

    metas, step_out = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(step_out)) == set(running)
    # Each slot could take up to 30 tokens, but the next lowest multiple of
    # the block size (4) is 28.
    for slot in (0, 1):
        assert metas[slot].token_chunk_size == 28
    assert step_out.num_prefill_groups == 2
    assert step_out.num_batched_tokens == 56

    # The next iteration finishes the prefill of both sequences.
    metas, step_out = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(step_out)) == set(running)
    # 50 - 28 = 22 tokens remain for each sequence. That is not a multiple
    # of the block size, which is fine because the final partial block of a
    # prefix sequence is not cached.
    for slot in (0, 1):
        assert metas[slot].token_chunk_size == 22
    assert step_out.num_prefill_groups == 2
    assert step_out.num_batched_tokens == 44
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8])
def test_chunked_prefill_with_actual_engine(model: str,
                                            max_num_partial_prefills: int):
    """Smoke-test a real engine: it must be able to step while several
    prompts are being partially prefilled at the same time.
    """
    engine = LLMEngine.from_engine_args(
        EngineArgs(
            model=model,
            max_num_partial_prefills=max_num_partial_prefills,
            max_num_batched_tokens=40,
            max_num_seqs=8,
            enable_chunked_prefill=True,
            gpu_memory_utilization=0.8,
        ))
    greedy = SamplingParams(temperature=0)
    repeated_prompt = "hello" * 40

    # Submit one request per allowed partial-prefill slot.
    for req_num in range(max_num_partial_prefills):
        engine.add_request(f"{req_num}", repeated_prompt, greedy)

    # Run the first engine step.
    first_step_outputs = engine.step()

    # No outputs yet means every request is still prefilling.
    assert len(first_step_outputs) == 0
    running_groups = engine.scheduler[0].running
    assert len(running_groups) == max_num_partial_prefills
tests/distributed/test_comm_ops.py
View file @
ec5e299c
...
@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
...
@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
# they will be able to set the device to the correct GPU
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
...
@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
# they will be able to set the device to the correct GPU
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
...
@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
# they will be able to set the device to the correct GPU
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
...
@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
send_recv_tensor_dict_test_worker
(
tp_size
:
int
,
pp_size
:
int
,
rank
:
int
,
def
send_recv_tensor_dict_test_worker
(
tp_size
:
int
,
pp_size
:
int
,
rank
:
int
,
distributed_init_port
:
str
):
distributed_init_port
:
str
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
...
@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
send_recv_test_worker
(
tp_size
:
int
,
pp_size
:
int
,
rank
:
int
,
def
send_recv_test_worker
(
tp_size
:
int
,
pp_size
:
int
,
rank
:
int
,
distributed_init_port
:
str
):
distributed_init_port
:
str
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
...
tests/distributed/test_custom_all_reduce.py
View file @
ec5e299c
...
@@ -24,7 +24,7 @@ for i, v in enumerate(test_sizes):
...
@@ -24,7 +24,7 @@ for i, v in enumerate(test_sizes):
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
graph_allreduce
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
):
def
graph_allreduce
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
...
@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
eager_allreduce
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
):
def
eager_allreduce
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
...
tests/distributed/test_pipeline_parallel.py
View file @
ec5e299c
...
@@ -6,6 +6,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
...
@@ -6,6 +6,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
all workers in a node other than the head node, which can cause the test
all workers in a node other than the head node, which can cause the test
to fail.
to fail.
"""
"""
import
json
import
os
import
os
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
List
,
Literal
,
NamedTuple
,
Optional
from
typing
import
List
,
Literal
,
NamedTuple
,
Optional
...
@@ -15,6 +16,7 @@ import pytest
...
@@ -15,6 +16,7 @@ import pytest
from
vllm.config
import
TaskOption
from
vllm.config
import
TaskOption
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
..models.registry
import
HF_EXAMPLE_MODELS
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
,
models_path_prefix
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
,
models_path_prefix
logger
=
init_logger
(
"test_pipeline_parallel"
)
logger
=
init_logger
(
"test_pipeline_parallel"
)
...
@@ -31,19 +33,29 @@ class ParallelSetup(NamedTuple):
...
@@ -31,19 +33,29 @@ class ParallelSetup(NamedTuple):
class
PPTestOptions
(
NamedTuple
):
class
PPTestOptions
(
NamedTuple
):
multi_node_only
:
bool
multi_node_only
:
bool
trust_remote_code
:
bool
tokenizer_mode
:
Optional
[
str
]
load_format
:
Optional
[
str
]
=
None
load_format
:
Optional
[
str
]
=
None
hf_overrides
:
Optional
[
str
]
=
None
@
dataclass
@
dataclass
class
PPTestSettings
:
class
PPTestSettings
:
parallel_setups
:
List
[
ParallelSetup
]
parallel_setups
:
List
[
ParallelSetup
]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends
:
List
[
str
]
distributed_backends
:
List
[
str
]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions
:
List
[
str
]
task
:
TaskOption
task
:
TaskOption
test_options
:
PPTestOptions
test_options
:
PPTestOptions
def
__post_init__
(
self
):
if
len
(
self
.
distributed_backends
)
!=
len
(
self
.
vllm_major_versions
):
raise
ValueError
(
f
"Length mismatch: distributed_backends "
f
"(
{
len
(
self
.
distributed_backends
)
}
) != "
f
"vllm_major_versions (
{
len
(
self
.
vllm_major_versions
)
}
)"
)
@
staticmethod
@
staticmethod
def
detailed
(
def
detailed
(
*
,
*
,
...
@@ -51,10 +63,7 @@ class PPTestSettings:
...
@@ -51,10 +63,7 @@ class PPTestSettings:
pp_base
:
int
=
2
,
pp_base
:
int
=
2
,
multi_node_only
:
bool
=
False
,
multi_node_only
:
bool
=
False
,
task
:
TaskOption
=
"auto"
,
task
:
TaskOption
=
"auto"
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
load_format
:
Optional
[
str
]
=
None
,
load_format
:
Optional
[
str
]
=
None
,
hf_overrides
:
Optional
[
str
]
=
None
,
):
):
return
PPTestSettings
(
return
PPTestSettings
(
parallel_setups
=
[
parallel_setups
=
[
...
@@ -79,13 +88,12 @@ class PPTestSettings:
...
@@ -79,13 +88,12 @@ class PPTestSettings:
eager_mode
=
True
,
eager_mode
=
True
,
chunked_prefill
=
False
),
chunked_prefill
=
False
),
],
],
distributed_backends
=
[
"mp"
,
"ray"
],
# only ray is supported for V1
distributed_backends
=
[
"mp"
,
"ray"
,
"ray"
],
vllm_major_versions
=
[
"0"
,
"0"
,
"1"
],
task
=
task
,
task
=
task
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
trust_remote_code
=
trust_remote_code
,
load_format
=
load_format
),
tokenizer_mode
=
tokenizer_mode
,
load_format
=
load_format
,
hf_overrides
=
hf_overrides
),
)
)
@
staticmethod
@
staticmethod
...
@@ -95,10 +103,7 @@ class PPTestSettings:
...
@@ -95,10 +103,7 @@ class PPTestSettings:
pp_base
:
int
=
2
,
pp_base
:
int
=
2
,
task
:
TaskOption
=
"auto"
,
task
:
TaskOption
=
"auto"
,
multi_node_only
:
bool
=
False
,
multi_node_only
:
bool
=
False
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
load_format
:
Optional
[
str
]
=
None
,
load_format
:
Optional
[
str
]
=
None
,
hf_overrides
:
Optional
[
str
]
=
None
,
):
):
return
PPTestSettings
(
return
PPTestSettings
(
parallel_setups
=
[
parallel_setups
=
[
...
@@ -108,20 +113,19 @@ class PPTestSettings:
...
@@ -108,20 +113,19 @@ class PPTestSettings:
chunked_prefill
=
False
),
chunked_prefill
=
False
),
],
],
distributed_backends
=
[
"mp"
],
distributed_backends
=
[
"mp"
],
vllm_major_versions
=
[
"0"
],
task
=
task
,
task
=
task
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
trust_remote_code
=
trust_remote_code
,
load_format
=
load_format
),
tokenizer_mode
=
tokenizer_mode
,
load_format
=
load_format
,
hf_overrides
=
hf_overrides
),
)
)
def
iter_params
(
self
,
model_
name
:
str
):
def
iter_params
(
self
,
model_
id
:
str
):
opts
=
self
.
test_options
opts
=
self
.
test_options
for
parallel_setup
in
self
.
parallel_setups
:
for
parallel_setup
in
self
.
parallel_setups
:
for
distributed_backend
in
self
.
distributed_backends
:
for
backend
,
vllm_major_version
in
zip
(
self
.
distributed_backends
,
yield
(
model_name
,
parallel_setup
,
distributed_backend
,
self
.
vllm_major_versions
):
yield
(
model_id
,
parallel_setup
,
backend
,
vllm_major_version
,
self
.
task
,
opts
)
self
.
task
,
opts
)
...
@@ -133,16 +137,16 @@ TEXT_GENERATION_MODELS = {
...
@@ -133,16 +137,16 @@ TEXT_GENERATION_MODELS = {
# [Decoder-only]
# [Decoder-only]
# Uses Llama
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
os
.
path
.
join
(
models_path_prefix
,
"Snowflake/snowflake-arctic-instruct"
):
PPTestSettings
.
fast
(
tp_base
=
8
,
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"Snowflake/snowflake-arctic-instruct"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"baichuan-inc/Baichuan-7B"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"baichuan-inc/Baichuan-7B"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"baichuan-inc/Baichuan2-13B-Chat"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"baichuan-inc/Baichuan2-13B-Chat"
):
PPTestSettings
.
fast
(
),
os
.
path
.
join
(
models_path_prefix
,
"bigscience/bloomz-1b1"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"bigscience/bloomz-1b1"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"THUDM/chatglm3-6b"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"THUDM/chatglm3-6b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"CohereForAI/c4ai-command-r-v01"
):
PPTestSettings
.
fast
(
tp_base
=
2
,
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"CohereForAI/c4ai-command-r-v01"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
os
.
path
.
join
(
models_path_prefix
,
"databricks/dbrx-instruct"
):
PPTestSettings
.
fast
(
tp_base
=
8
),
os
.
path
.
join
(
models_path_prefix
,
"databricks/dbrx-instruct"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
os
.
path
.
join
(
models_path_prefix
,
"Deci/DeciLM-7B-instruct"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"Deci/DeciLM-7B-instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/deepseek-llm-7b-chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/deepseek-llm-7b-chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/DeepSeek-V2-Lite-Chat"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/DeepSeek-V2-Lite-Chat"
):
PPTestSettings
.
fast
(
),
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2b"
):
PPTestSettings
.
fast
(),
...
@@ -155,36 +159,36 @@ TEXT_GENERATION_MODELS = {
...
@@ -155,36 +159,36 @@ TEXT_GENERATION_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerMoE-3b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerMoE-3b"
):
PPTestSettings
.
fast
(),
# Uses Llama
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm2-chat-7b"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm2-chat-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"inceptionai/jais-13b-chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"inceptionai/jais-13b-chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/
Meta-
Llama-3
-8B
"
):
PPTestSettings
.
detailed
(),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3
.2-1B-Instruct
"
):
PPTestSettings
.
detailed
(),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-2B-sft-bf16"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-2B-sft-bf16"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM3-4B"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM3-4B"
):
PPTestSettings
.
fast
(),
# Uses Llama
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
os
.
path
.
join
(
models_path_prefix
,
"state-spaces/mamba-130m-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"state-spaces/mamba-130m-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Mixtral-8x7B-Instruct-v0.1"
):
PPTestSettings
.
fast
(
tp_base
=
4
),
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Mixtral-8x7B-Instruct-v0.1"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"mosaicml/mpt-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"mosaicml/mpt-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"nvidia/Minitron-8B-Base"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"nvidia/Minitron-8B-Base"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"allenai/OLMo-1B-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"allenai/OLMo-1B-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"shanearora/OLMo-7B-1124-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"shanearora/OLMo-7B-1124-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"allenai/OLMoE-1B-7B-0924-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"allenai/OLMoE-1B-7B-0924-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-iml-max-1.3b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-iml-max-1.3b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"OrionStarAI/Orion-14B-Chat"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"OrionStarAI/Orion-14B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"adept/persimmon-8b-chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"adept/persimmon-8b-chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/phi-2"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/phi-2"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-small-8k-instruct"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-small-8k-instruct"
):
PPTestSettings
.
fast
(
),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
):
PPTestSettings
.
detailed
(
trust_remote_code
=
True
,
multi_node_only
=
True
,
load_format
=
"dummy"
,
hf_overrides
=
'{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
):
PPTestSettings
.
detailed
(
multi_node_only
=
True
,
load_format
=
"dummy"
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-7B-Chat"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-7B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-7B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-7B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"stabilityai/stablelm-3b-4e1t"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"stabilityai/stablelm-3b-4e1t"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"upstage/solar-pro-preview-instruct"
):
PPTestSettings
.
fast
(
tp_base
=
2
),
os
.
path
.
join
(
models_path_prefix
,
"upstage/solar-pro-preview-instruct"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
# noqa: E501
# FIXME: Cannot load tokenizer in latest transformers version.
# FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(
trust_remote_code=True
),
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
# [Encoder-only]
# [Encoder-only]
# TODO: Implement PP
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
# "facebook/bart-base": PPTestSettings.fast(),
...
@@ -192,9 +196,9 @@ TEXT_GENERATION_MODELS = {
...
@@ -192,9 +196,9 @@ TEXT_GENERATION_MODELS = {
EMBEDDING_MODELS
=
{
# type: ignore[var-annotated]
EMBEDDING_MODELS
=
{
# type: ignore[var-annotated]
# [Text-only]
# [Text-only]
"intfloat/e5-mistral-7b-instruct"
:
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
)
:
PPTestSettings
.
fast
(),
"BAAI/bge-multilingual-gemma2"
:
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-multilingual-gemma2"
)
:
PPTestSettings
.
fast
(),
"Qwen/Qwen2.5-Math-RM-72B"
:
PPTestSettings
.
fast
(
tp_base
=
4
,
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Math-RM-72B"
)
:
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
}
}
MULTIMODAL_MODELS
=
{
MULTIMODAL_MODELS
=
{
...
@@ -202,20 +206,20 @@ MULTIMODAL_MODELS = {
...
@@ -202,20 +206,20 @@ MULTIMODAL_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"Salesforce/blip2-opt-2.7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Salesforce/blip2-opt-2.7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"facebook/chameleon-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"facebook/chameleon-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"adept/fuyu-8b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"adept/fuyu-8b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"THUDM/glm-4v-9b"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"THUDM/glm-4v-9b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-1B"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-1B"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-Llama3-V-2_5"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-Llama3-V-2_5"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"allenai/Molmo-7B-D-0924"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"allenai/Molmo-7B-D-0924"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-vision-128k-instruct"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-vision-128k-instruct"
):
PPTestSettings
.
fast
(
),
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Pixtral-12B-2409"
):
PPTestSettings
.
fast
(
tp_base
=
2
,
tokenizer_mode
=
"mistral"
),
# noqa: E501
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Pixtral-12B-2409"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL-Chat"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-Audio-7B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-Audio-7B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
3
"
):
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
5-llama-3_2-1b
"
):
PPTestSettings
.
fast
(),
# [Encoder-decoder]
# [Encoder-decoder]
# TODO: Implement PP
# TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
...
@@ -226,7 +230,7 @@ MULTIMODAL_MODELS = {
...
@@ -226,7 +230,7 @@ MULTIMODAL_MODELS = {
TEST_MODELS
=
[
TEST_MODELS
=
[
# [LANGUAGE GENERATION]
# [LANGUAGE GENERATION]
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/
Meta-
Llama-3
-8B
"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3
.2-1B-Instruct
"
),
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerLM-3b"
),
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerLM-3b"
),
# [LANGUAGE EMBEDDING]
# [LANGUAGE EMBEDDING]
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
),
...
@@ -234,21 +238,23 @@ TEST_MODELS = [
...
@@ -234,21 +238,23 @@ TEST_MODELS = [
# [MULTIMODAL GENERATION]
# [MULTIMODAL GENERATION]
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-1B"
),
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-1B"
),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-vision-128k-instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-vision-128k-instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
3
"
),
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
5-llama-3_2-1b
"
),
# [LANGUAGE GENERATION - HYBRID ARCH]
# [LANGUAGE GENERATION - HYBRID ARCH]
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
),
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
),
]
]
def
_compare_tp
(
def
_compare_tp
(
model_
name
:
str
,
model_
id
:
str
,
parallel_setup
:
ParallelSetup
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
task
:
TaskOption
,
task
:
TaskOption
,
test_options
:
PPTestOptions
,
test_options
:
PPTestOptions
,
num_gpus_available
:
int
,
num_gpus_available
:
int
,
*
,
*
,
method
:
Literal
[
"generate"
,
"encode"
],
method
:
Literal
[
"generate"
,
"encode"
],
is_multimodal
:
bool
,
):
):
(
(
tp_size
,
tp_size
,
...
@@ -256,13 +262,32 @@ def _compare_tp(
...
@@ -256,13 +262,32 @@ def _compare_tp(
eager_mode
,
eager_mode
,
chunked_prefill
,
chunked_prefill
,
)
=
parallel_setup
)
=
parallel_setup
(
multi_node_only
,
multi_node_only
,
load_format
=
test_options
trust_remote_code
,
tokenizer_mode
,
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
load_format
,
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
hf_overrides
,
)
=
test_options
trust_remote_code
=
model_info
.
trust_remote_code
tokenizer_mode
=
model_info
.
tokenizer_mode
hf_overrides
=
model_info
.
hf_overrides
if
load_format
==
"dummy"
:
# Avoid OOM
text_overrides
=
{
"num_hidden_layers"
:
4
,
"hidden_size"
:
512
,
"intermediate_size"
:
800
,
"num_attention_heads"
:
4
,
"num_key_value_heads"
:
1
,
}
if
is_multimodal
:
hf_overrides
.
update
({
"text_config"
:
text_overrides
})
else
:
hf_overrides
.
update
(
text_overrides
)
else
:
model_info
.
check_available_online
(
on_fail
=
"skip"
)
if
num_gpus_available
<
tp_size
*
pp_size
:
if
num_gpus_available
<
tp_size
*
pp_size
:
pytest
.
skip
(
f
"Need at least
{
tp_size
}
x
{
pp_size
}
GPUs"
)
pytest
.
skip
(
f
"Need at least
{
tp_size
}
x
{
pp_size
}
GPUs"
)
...
@@ -294,12 +319,15 @@ def _compare_tp(
...
@@ -294,12 +319,15 @@ def _compare_tp(
if
load_format
:
if
load_format
:
common_args
.
extend
([
"--load-format"
,
load_format
])
common_args
.
extend
([
"--load-format"
,
load_format
])
if
hf_overrides
:
if
hf_overrides
:
common_args
.
extend
([
"--hf-overrides"
,
hf_overrides
])
common_args
.
extend
([
"--hf-overrides"
,
json
.
dumps
(
hf_overrides
)
])
if
(
distributed_backend
==
"ray"
and
tp_size
==
2
and
pp_size
==
2
specific_case
=
tp_size
==
2
and
pp_size
==
2
and
chunked_prefill
and
chunked_prefill
):
if
distributed_backend
==
"ray"
and
(
vllm_major_version
==
"1"
# Test Ray ADAG for a subset of the tests
or
specific_case
):
# For V1, test Ray ADAG for all the tests
# For V0, test Ray ADAG for a subset of the tests
pp_env
=
{
pp_env
=
{
"VLLM_USE_V1"
:
vllm_major_version
,
"VLLM_USE_RAY_COMPILED_DAG"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG"
:
"1"
,
"VLLM_USE_RAY_SPMD_WORKER"
:
"1"
,
"VLLM_USE_RAY_SPMD_WORKER"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
"1"
,
...
@@ -334,11 +362,7 @@ def _compare_tp(
...
@@ -334,11 +362,7 @@ def _compare_tp(
]
]
try
:
try
:
compare_two_settings
(
model_name
,
compare_two_settings
(
model_id
,
pp_args
,
tp_args
,
pp_env
,
method
=
method
)
pp_args
,
tp_args
,
pp_env
,
method
=
method
)
except
Exception
:
except
Exception
:
if
pp_env
is
None
:
if
pp_env
is
None
:
raise
raise
...
@@ -348,81 +372,87 @@ def _compare_tp(
...
@@ -348,81 +372,87 @@ def _compare_tp(
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"model_
name
"
,
"parallel_setup"
,
"distributed_backend"
,
"
task
"
,
(
"model_
id
"
,
"parallel_setup"
,
"distributed_backend"
,
"
vllm_major_version
"
,
"test_options"
),
"task"
,
"test_options"
),
[
[
params
for
model_name
,
settings
in
TEXT_GENERATION_MODELS
.
items
()
params
for
model_id
,
settings
in
TEXT_GENERATION_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
for
params
in
settings
.
iter_params
(
model_id
)
if
model_id
in
TEST_MODELS
if
model_name
in
TEST_MODELS
],
],
)
)
@
fork_new_process_for_each_test
@
fork_new_process_for_each_test
def
test_tp_language_generation
(
def
test_tp_language_generation
(
model_
name
:
str
,
model_
id
:
str
,
parallel_setup
:
ParallelSetup
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
task
:
TaskOption
,
task
:
TaskOption
,
test_options
:
PPTestOptions
,
test_options
:
PPTestOptions
,
num_gpus_available
,
num_gpus_available
,
):
):
_compare_tp
(
model_
name
,
_compare_tp
(
model_
id
,
parallel_setup
,
parallel_setup
,
distributed_backend
,
distributed_backend
,
vllm_major_version
,
task
,
task
,
test_options
,
test_options
,
num_gpus_available
,
num_gpus_available
,
method
=
"generate"
)
method
=
"generate"
,
is_multimodal
=
False
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"model_
name
"
,
"parallel_setup"
,
"distributed_backend"
,
"
task
"
,
(
"model_
id
"
,
"parallel_setup"
,
"distributed_backend"
,
"
vllm_major_version
"
,
"test_options"
),
"task"
,
"test_options"
),
[
[
params
for
model_name
,
settings
in
EMBEDDING_MODELS
.
items
()
params
for
model_id
,
settings
in
EMBEDDING_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
for
params
in
settings
.
iter_params
(
model_id
)
if
model_id
in
TEST_MODELS
if
model_name
in
TEST_MODELS
],
],
)
)
@
fork_new_process_for_each_test
@
fork_new_process_for_each_test
def
test_tp_language_embedding
(
def
test_tp_language_embedding
(
model_
name
:
str
,
model_
id
:
str
,
parallel_setup
:
ParallelSetup
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
task
:
TaskOption
,
task
:
TaskOption
,
test_options
:
PPTestOptions
,
test_options
:
PPTestOptions
,
num_gpus_available
,
num_gpus_available
,
):
):
_compare_tp
(
model_
name
,
_compare_tp
(
model_
id
,
parallel_setup
,
parallel_setup
,
distributed_backend
,
distributed_backend
,
vllm_major_version
,
task
,
task
,
test_options
,
test_options
,
num_gpus_available
,
num_gpus_available
,
method
=
"encode"
)
method
=
"encode"
,
is_multimodal
=
False
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"model_
name
"
,
"parallel_setup"
,
"distributed_backend"
,
"
task
"
,
(
"model_
id
"
,
"parallel_setup"
,
"distributed_backend"
,
"
vllm_major_version
"
,
"test_options"
),
"task"
,
"test_options"
),
[
[
params
for
model_name
,
settings
in
MULTIMODAL_MODELS
.
items
()
params
for
model_id
,
settings
in
MULTIMODAL_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
for
params
in
settings
.
iter_params
(
model_id
)
if
model_id
in
TEST_MODELS
if
model_name
in
TEST_MODELS
],
],
)
)
@
fork_new_process_for_each_test
@
fork_new_process_for_each_test
def
test_tp_multimodal_generation
(
def
test_tp_multimodal_generation
(
model_
name
:
str
,
model_
id
:
str
,
parallel_setup
:
ParallelSetup
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
task
:
TaskOption
,
task
:
TaskOption
,
test_options
:
PPTestOptions
,
test_options
:
PPTestOptions
,
num_gpus_available
,
num_gpus_available
,
):
):
_compare_tp
(
model_
name
,
_compare_tp
(
model_
id
,
parallel_setup
,
parallel_setup
,
distributed_backend
,
distributed_backend
,
vllm_major_version
,
task
,
task
,
test_options
,
test_options
,
num_gpus_available
,
num_gpus_available
,
method
=
"generate"
)
method
=
"generate"
,
is_multimodal
=
True
)
tests/engine/test_computed_prefix_blocks.py
View file @
ec5e299c
...
@@ -2,14 +2,16 @@
...
@@ -2,14 +2,16 @@
import
pytest
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
import
os
from
..utils
import
models_path_prefix
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
def
test_computed_prefix_blocks
(
model
:
str
,
block_size
:
int
):
def
test_computed_prefix_blocks
(
model
:
str
,
block_size
:
int
):
# This test checks if we are able to run the engine to completion
# This test checks if we are able to run the engine to completion
...
@@ -26,6 +28,7 @@ def test_computed_prefix_blocks(model: str, block_size: int):
...
@@ -26,6 +28,7 @@ def test_computed_prefix_blocks(model: str, block_size: int):
"decoration."
)
"decoration."
)
engine_args
=
EngineArgs
(
model
=
model
,
engine_args
=
EngineArgs
(
model
=
model
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
block_size
=
block_size
,
block_size
=
block_size
,
enable_prefix_caching
=
True
)
enable_prefix_caching
=
True
)
...
...
tests/engine/test_detokenization.py
View file @
ec5e299c
...
@@ -2,13 +2,15 @@
...
@@ -2,13 +2,15 @@
import
pytest
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
import
os
from
..utils
import
models_path_prefix
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_computed_prefix_blocks
(
model
:
str
):
def
test_computed_prefix_blocks
(
model
:
str
):
# This test checks if the engine generates completions both with and
# This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text
# without optional detokenization, that detokenization includes text
...
@@ -19,7 +21,7 @@ def test_computed_prefix_blocks(model: str):
...
@@ -19,7 +21,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"paper clips? Is there an easy to follow video tutorial available "
"online for free?"
)
"online for free?"
)
llm
=
LLM
(
model
=
model
)
llm
=
LLM
(
model
=
model
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
)
sampling_params
=
SamplingParams
(
max_tokens
=
10
,
sampling_params
=
SamplingParams
(
max_tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
detokenize
=
False
)
detokenize
=
False
)
...
...
tests/engine/test_
custom_
executor.py
→
tests/engine/test_executor.py
View file @
ec5e299c
...
@@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
...
@@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import
pytest
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
...
@@ -14,6 +15,10 @@ from vllm.sampling_params import SamplingParams
...
@@ -14,6 +15,10 @@ from vllm.sampling_params import SamplingParams
import
os
import
os
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
RUNAI_STREAMER_LOAD_FORMAT
=
LoadFormat
.
RUNAI_STREAMER
class
Mock
:
class
Mock
:
...
...
...
@@ -34,11 +39,12 @@ class CustomUniExecutor(UniProcExecutor):
...
@@ -34,11 +39,12 @@ class CustomUniExecutor(UniProcExecutor):
CustomUniExecutorAsync
=
CustomUniExecutor
CustomUniExecutorAsync
=
CustomUniExecutor
@
pytest
.
mark
.
parametrize
(
"model"
,
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
])
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_custom_executor_type_checking
(
model
):
def
test_custom_executor_type_checking
(
model
):
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
engine_args
=
EngineArgs
(
model
=
model
,
engine_args
=
EngineArgs
(
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
Mock
)
distributed_executor_backend
=
Mock
)
LLMEngine
.
from_engine_args
(
engine_args
)
LLMEngine
.
from_engine_args
(
engine_args
)
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
...
@@ -47,7 +53,8 @@ def test_custom_executor_type_checking(model):
...
@@ -47,7 +53,8 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_custom_executor
(
model
,
tmp_path
):
def
test_custom_executor
(
model
,
tmp_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp_path
)
os
.
chdir
(
tmp_path
)
...
@@ -56,7 +63,9 @@ def test_custom_executor(model, tmp_path):
...
@@ -56,7 +63,9 @@ def test_custom_executor(model, tmp_path):
engine_args
=
EngineArgs
(
engine_args
=
EngineArgs
(
model
=
model
,
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
CustomUniExecutor
,
distributed_executor_backend
=
CustomUniExecutor
,
enforce_eager
=
True
,
# reduce test time
)
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
...
@@ -69,7 +78,8 @@ def test_custom_executor(model, tmp_path):
...
@@ -69,7 +78,8 @@ def test_custom_executor(model, tmp_path):
os
.
chdir
(
cwd
)
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_custom_executor_async
(
model
,
tmp_path
):
def
test_custom_executor_async
(
model
,
tmp_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp_path
)
os
.
chdir
(
tmp_path
)
...
@@ -77,7 +87,11 @@ def test_custom_executor_async(model, tmp_path):
...
@@ -77,7 +87,11 @@ def test_custom_executor_async(model, tmp_path):
assert
not
os
.
path
.
exists
(
".marker"
)
assert
not
os
.
path
.
exists
(
".marker"
)
engine_args
=
AsyncEngineArgs
(
engine_args
=
AsyncEngineArgs
(
model
=
model
,
distributed_executor_backend
=
CustomUniExecutorAsync
)
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
CustomUniExecutorAsync
,
enforce_eager
=
True
,
# reduce test time
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
...
@@ -91,3 +105,20 @@ def test_custom_executor_async(model, tmp_path):
...
@@ -91,3 +105,20 @@ def test_custom_executor_async(model, tmp_path):
assert
os
.
path
.
exists
(
".marker"
)
assert
os
.
path
.
exists
(
".marker"
)
finally
:
finally
:
os
.
chdir
(
cwd
)
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_respect_ray
(
model
):
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
# users might do this if they want to manage the
# resources using ray.
engine_args
=
EngineArgs
(
model
=
model
,
distributed_executor_backend
=
"ray"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
enforce_eager
=
True
,
# reduce test time
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
assert
engine
.
model_executor
.
uses_ray
tests/engine/test_skip_tokenizer_init.py
View file @
ec5e299c
...
@@ -2,18 +2,22 @@
...
@@ -2,18 +2,22 @@
import
pytest
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
import
os
from
..utils
import
models_path_prefix
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_skip_tokenizer_initialization
(
model
:
str
):
def
test_skip_tokenizer_initialization
(
model
:
str
):
# This test checks if the flag skip_tokenizer_init skips the initialization
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
# token ids.
llm
=
LLM
(
model
=
model
,
skip_tokenizer_init
=
True
)
llm
=
LLM
(
model
=
model
,
skip_tokenizer_init
=
True
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
)
sampling_params
=
SamplingParams
(
prompt_logprobs
=
True
,
detokenize
=
True
)
sampling_params
=
SamplingParams
(
prompt_logprobs
=
True
,
detokenize
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
"cannot pass text prompts when"
):
with
pytest
.
raises
(
ValueError
,
match
=
"cannot pass text prompts when"
):
...
...
tests/engine/test_stop_reason.py
View file @
ec5e299c
...
@@ -14,7 +14,7 @@ import transformers
...
@@ -14,7 +14,7 @@ import transformers
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"
facebook/opt-350m
"
)
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"
distilbert/distilgpt2
"
)
STOP_STR
=
"."
STOP_STR
=
"."
SEED
=
42
SEED
=
42
MAX_TOKENS
=
1024
MAX_TOKENS
=
1024
...
...
tests/entrypoints/conftest.py
View file @
ec5e299c
...
@@ -141,6 +141,47 @@ def sample_definition_json_schema():
...
@@ -141,6 +141,47 @@ def sample_definition_json_schema():
}
}
@
pytest
.
fixture
def
sample_enum_json_schema
():
return
{
"type"
:
"object"
,
"properties"
:
{
"status"
:
{
"type"
:
"string"
,
"enum"
:
[
"active"
,
"inactive"
,
"pending"
]
# Literal values using enum
},
"priority"
:
{
"type"
:
"string"
,
"enum"
:
[
"low"
,
"medium"
,
"high"
,
"critical"
]
},
"category"
:
{
"type"
:
"object"
,
"properties"
:
{
"type"
:
{
"type"
:
"string"
,
"enum"
:
[
"bug"
,
"feature"
,
"improvement"
]
},
"severity"
:
{
"type"
:
"integer"
,
"enum"
:
[
1
,
2
,
3
,
4
,
5
]
# Enum can also contain numbers
}
},
"required"
:
[
"type"
,
"severity"
]
},
"flags"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"string"
,
"enum"
:
[
"urgent"
,
"blocked"
,
"needs_review"
,
"approved"
]
}
}
},
"required"
:
[
"status"
,
"priority"
,
"category"
,
"flags"
]
}
@
pytest
.
fixture
@
pytest
.
fixture
def
sample_guided_choice
():
def
sample_guided_choice
():
return
[
return
[
...
...
tests/entrypoints/llm/test_accuracy.py
View file @
ec5e299c
...
@@ -23,10 +23,13 @@ RTOL = 0.03
...
@@ -23,10 +23,13 @@ RTOL = 0.03
EXPECTED_VALUE
=
0.58
EXPECTED_VALUE
=
0.58
def
run_test
():
def
run_test
(
more_args
=
None
):
"""Run the end to end accuracy test."""
"""Run the end to end accuracy test."""
model_args
=
f
"pretrained=
{
MODEL_NAME
}
,max_model_len=2048"
model_args
=
f
"pretrained=
{
MODEL_NAME
}
,max_model_len=4096"
if
more_args
is
not
None
:
model_args
=
"{},{}"
.
format
(
model_args
,
more_args
)
results
=
lm_eval
.
simple_evaluate
(
results
=
lm_eval
.
simple_evaluate
(
model
=
"vllm"
,
model
=
"vllm"
,
...
@@ -41,14 +44,21 @@ def run_test():
...
@@ -41,14 +44,21 @@ def run_test():
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
()
reason
=
"V1 is currently only supported on CUDA."
)
and
not
current_platform
.
is_tpu
(),
reason
=
"V1 is currently only supported on CUDA and TPU"
)
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
):
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
):
"""Run with the V1 Engine."""
"""Run with the V1 Engine."""
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
run_test
()
more_args
=
None
if
current_platform
.
is_tpu
():
# Limit compilation time for TPU V1
more_args
=
"max_num_seqs=64"
run_test
(
more_args
)
def
test_lm_eval_accuracy_v0_engine
(
monkeypatch
):
def
test_lm_eval_accuracy_v0_engine
(
monkeypatch
):
...
...
tests/entrypoints/llm/test_chat.py
View file @
ec5e299c
...
@@ -6,13 +6,18 @@ import os
...
@@ -6,13 +6,18 @@ import os
import
pytest
import
pytest
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.config
import
LoadFormat
from
...conftest
import
MODEL_WEIGHTS_S3_BUCKET
from
..openai.test_vision
import
TEST_IMAGE_URLS
from
..openai.test_vision
import
TEST_IMAGE_URLS
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
RUNAI_STREAMER_LOAD_FORMAT
=
LoadFormat
.
RUNAI_STREAMER
def
test_chat
():
def
test_chat
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
))
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Llama-3.2-1B-Instruct"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
prompt1
=
"Explain the concept of entropy."
prompt1
=
"Explain the concept of entropy."
messages
=
[
messages
=
[
...
@@ -30,7 +35,8 @@ def test_chat():
...
@@ -30,7 +35,8 @@ def test_chat():
def
test_multi_chat
():
def
test_multi_chat
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
))
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Llama-3.2-1B-Instruct"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
prompt1
=
"Explain the concept of entropy."
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
prompt2
=
"Explain what among us is."
...
@@ -67,7 +73,8 @@ def test_multi_chat():
...
@@ -67,7 +73,8 @@ def test_multi_chat():
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
llm
=
LLM
(
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
),
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Phi-3.5-vision-instruct"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
dtype
=
"bfloat16"
,
dtype
=
"bfloat16"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
max_num_seqs
=
5
,
...
...
tests/entrypoints/llm/test_collective_rpc.py
View file @
ec5e299c
...
@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
...
@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
def
echo_rank
(
self
):
def
echo_rank
(
self
):
return
self
.
rank
return
self
.
rank
llm
=
LLM
(
model
=
"
meta-llama
/Llama-3.2-1B-Instruct"
,
llm
=
LLM
(
model
=
"
s3://vllm-ci-model-weights
/Llama-3.2-1B-Instruct"
,
enforce_eager
=
True
,
enforce_eager
=
True
,
load_format
=
"dummy"
,
load_format
=
"dummy"
,
tensor_parallel_size
=
tp_size
,
tensor_parallel_size
=
tp_size
,
...
...
tests/entrypoints/llm/test_encode.py
View file @
ec5e299c
...
@@ -7,10 +7,11 @@ import pytest
...
@@ -7,10 +7,11 @@ import pytest
import
os
import
os
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
intfloat/
e5-mistral-7b-instruct"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"e5-mistral-7b-instruct"
)
PROMPTS
=
[
PROMPTS
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -34,6 +35,7 @@ def llm():
...
@@ -34,6 +35,7 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_num_batched_tokens
=
32768
,
max_num_batched_tokens
=
32768
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.75
,
gpu_memory_utilization
=
0.75
,
...
...
tests/entrypoints/llm/test_generate.py
View file @
ec5e299c
...
@@ -7,10 +7,11 @@ import os
...
@@ -7,10 +7,11 @@ import os
import
pytest
import
pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
facebook/opt-125m
"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
distilgpt2
"
)
PROMPTS
=
[
PROMPTS
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -32,6 +33,7 @@ def llm():
...
@@ -32,6 +33,7 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_num_batched_tokens
=
4096
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.10
,
gpu_memory_utilization
=
0.10
,
...
...
tests/entrypoints/llm/test_generate_multiple_loras.py
View file @
ec5e299c
...
@@ -8,11 +8,12 @@ import os
...
@@ -8,11 +8,12 @@ import os
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
HuggingFaceH4/
zephyr-7b-beta"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"zephyr-7b-beta"
)
PROMPTS
=
[
PROMPTS
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -29,6 +30,7 @@ def llm():
...
@@ -29,6 +30,7 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
max_model_len
=
8192
,
max_model_len
=
8192
,
enable_lora
=
True
,
enable_lora
=
True
,
...
...
tests/entrypoints/llm/test_guided_generate.py
View file @
ec5e299c
...
@@ -8,6 +8,7 @@ import jsonschema
...
@@ -8,6 +8,7 @@ import jsonschema
import
pytest
import
pytest
import
os
import
os
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
...
@@ -15,7 +16,7 @@ from vllm.outputs import RequestOutput
...
@@ -15,7 +16,7 @@ from vllm.outputs import RequestOutput
from
vllm.sampling_params
import
GuidedDecodingParams
,
SamplingParams
from
vllm.sampling_params
import
GuidedDecodingParams
,
SamplingParams
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
Qwen/
Qwen2.5-
7
B-Instruct"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen2.5-
1.5
B-Instruct"
)
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"lm-format-enforcer"
,
"xgrammar"
]
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"lm-format-enforcer"
,
"xgrammar"
]
...
@@ -23,7 +24,9 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
...
@@ -23,7 +24,9 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
def
llm
():
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
max_model_len
=
1024
)
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_model_len
=
1024
)
with
llm
.
deprecate_legacy_api
():
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
yield
weakref
.
proxy
(
llm
)
...
@@ -149,6 +152,47 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm,
...
@@ -149,6 +152,47 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm,
schema
=
sample_definition_json_schema
)
schema
=
sample_definition_json_schema
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
def
test_guided_enum_json_completion
(
sample_enum_json_schema
,
llm
,
guided_decoding_backend
:
str
):
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
guided_decoding
=
GuidedDecodingParams
(
json
=
sample_enum_json_schema
,
backend
=
guided_decoding_backend
))
outputs
=
llm
.
generate
(
prompts
=
[
"Create a bug report JSON that fits this schema: "
f
"
{
sample_enum_json_schema
}
. Make it for a high priority critical bug."
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
sample_enum_json_schema
)
# Additional assertions to verify enum values
assert
output_json
[
"status"
]
in
[
"active"
,
"inactive"
,
"pending"
]
assert
output_json
[
"priority"
]
in
[
"low"
,
"medium"
,
"high"
,
"critical"
]
assert
output_json
[
"category"
][
"type"
]
in
[
"bug"
,
"feature"
,
"improvement"
]
assert
output_json
[
"category"
][
"severity"
]
in
[
1
,
2
,
3
,
4
,
5
]
for
flag
in
output_json
[
"flags"
]:
assert
flag
in
[
"urgent"
,
"blocked"
,
"needs_review"
,
"approved"
]
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
def
test_guided_choice_completion
(
sample_guided_choice
,
llm
,
def
test_guided_choice_completion
(
sample_guided_choice
,
llm
,
...
...
tests/entrypoints/llm/test_lazy_outlines.py
View file @
ec5e299c
...
@@ -7,11 +7,12 @@ from contextlib import nullcontext
...
@@ -7,11 +7,12 @@ from contextlib import nullcontext
from
vllm_test_utils
import
BlameResult
,
blame
from
vllm_test_utils
import
BlameResult
,
blame
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
def
run_normal
():
def
run_normal
_opt125m
():
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
"The president of the United States is"
,
"The president of the United States is"
,
...
@@ -35,9 +36,35 @@ def run_normal():
...
@@ -35,9 +36,35 @@ def run_normal():
cleanup_dist_env_and_memory
()
cleanup_dist_env_and_memory
()
def
run_normal
():
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Create an LLM without guided decoding as a baseline.
llm
=
LLM
(
model
=
"s3://vllm-ci-model-weights/distilgpt2"
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.3
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
# Destroy the LLM object and free up the GPU memory.
del
llm
cleanup_dist_env_and_memory
()
def
run_lmfe
(
sample_regex
):
def
run_lmfe
(
sample_regex
):
# Create an LLM with guided decoding enabled.
# Create an LLM with guided decoding enabled.
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilgpt2"
),
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
,
enforce_eager
=
True
,
guided_decoding_backend
=
"lm-format-enforcer"
,
guided_decoding_backend
=
"lm-format-enforcer"
,
gpu_memory_utilization
=
0.3
)
gpu_memory_utilization
=
0.3
)
...
...
tests/entrypoints/llm/test_prompt_validation.py
View file @
ec5e299c
...
@@ -5,6 +5,7 @@ import os
...
@@ -5,6 +5,7 @@ import os
from
vllm
import
LLM
from
vllm
import
LLM
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
from
vllm.config
import
LoadFormat
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
...
@@ -16,13 +17,17 @@ def v1(run_with_both_engines):
...
@@ -16,13 +17,17 @@ def v1(run_with_both_engines):
def
test_empty_prompt
():
def
test_empty_prompt
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
),
enforce_eager
=
True
)
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
),
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
'Prompt cannot be empty'
):
with
pytest
.
raises
(
ValueError
,
match
=
'Prompt cannot be empty'
):
llm
.
generate
([
""
])
llm
.
generate
([
""
])
@
pytest
.
mark
.
skip_v1
@
pytest
.
mark
.
skip_v1
def
test_out_of_vocab_token
():
def
test_out_of_vocab_token
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
),
enforce_eager
=
True
)
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
),
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
'out of vocabulary'
):
with
pytest
.
raises
(
ValueError
,
match
=
'out of vocabulary'
):
llm
.
generate
({
"prompt_token_ids"
:
[
999999
]})
llm
.
generate
({
"prompt_token_ids"
:
[
999999
]})
Prev
1
…
4
5
6
7
8
9
10
11
12
…
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment