Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6d2051cc
Commit
6d2051cc
authored
Oct 21, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev
parents
2c7f740a
a2c71c54
Changes
457
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
967 additions
and
397 deletions
+967
-397
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+24
-37
tests/core/test_num_computed_tokens_update.py
tests/core/test_num_computed_tokens_update.py
+80
-0
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+55
-88
tests/core/utils.py
tests/core/utils.py
+6
-7
tests/data/test_config.yaml
tests/data/test_config.yaml
+1
-0
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+327
-67
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+34
-14
tests/engine/test_custom_executor.py
tests/engine/test_custom_executor.py
+4
-4
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+0
-34
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+0
-37
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+43
-23
tests/entrypoints/openai/test_audio.py
tests/entrypoints/openai/test_audio.py
+3
-1
tests/entrypoints/openai/test_basic.py
tests/entrypoints/openai/test_basic.py
+57
-1
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+12
-2
tests/entrypoints/openai/test_chat_template.py
tests/entrypoints/openai/test_chat_template.py
+23
-7
tests/entrypoints/openai/test_chunked_prompt.py
tests/entrypoints/openai/test_chunked_prompt.py
+126
-0
tests/entrypoints/openai/test_cli_args.py
tests/entrypoints/openai/test_cli_args.py
+109
-69
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+2
-2
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+61
-0
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+0
-4
No files found.
Too many changes to show.
To preserve performance only
457 of 457+
files are displayed.
Plain diff
Email patch
tests/core/test_chunked_prefill_scheduler.py
View file @
6d2051cc
...
@@ -27,19 +27,16 @@ def schedule_and_update_computed_tokens(scheduler):
...
@@ -27,19 +27,16 @@ def schedule_and_update_computed_tokens(scheduler):
return
metas
,
out
return
metas
,
out
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_simple
():
def
test_simple
(
use_v2_block_manager
:
bool
):
"""Verify basic scheduling works."""
"""Verify basic scheduling works."""
block_size
=
4
block_size
=
4
num_seq_group
=
4
num_seq_group
=
4
max_model_len
=
16
max_model_len
=
16
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
max_num_batched_tokens
,
num_seq_group
,
num_seq_group
,
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
)
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -74,8 +71,7 @@ def test_simple(use_v2_block_manager: bool):
...
@@ -74,8 +71,7 @@ def test_simple(use_v2_block_manager: bool):
assert
len
(
seq_group_meta
)
==
num_seq_group
assert
len
(
seq_group_meta
)
==
num_seq_group
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_chunk
():
def
test_chunk
(
use_v2_block_manager
:
bool
):
"""Verify prefills are chunked properly."""
"""Verify prefills are chunked properly."""
block_size
=
4
block_size
=
4
max_seqs
=
60
max_seqs
=
60
...
@@ -86,7 +82,7 @@ def test_chunk(use_v2_block_manager: bool):
...
@@ -86,7 +82,7 @@ def test_chunk(use_v2_block_manager: bool):
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
32
...
@@ -124,8 +120,7 @@ def test_chunk(use_v2_block_manager: bool):
...
@@ -124,8 +120,7 @@ def test_chunk(use_v2_block_manager: bool):
assert
out
.
num_batched_tokens
==
57
assert
out
.
num_batched_tokens
==
57
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_complex
():
def
test_complex
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
max_seqs
=
60
max_seqs
=
60
max_model_len
=
80
max_model_len
=
80
...
@@ -135,7 +130,7 @@ def test_complex(use_v2_block_manager: bool):
...
@@ -135,7 +130,7 @@ def test_complex(use_v2_block_manager: bool):
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
64
cache_config
.
num_cpu_blocks
=
64
cache_config
.
num_gpu_blocks
=
64
cache_config
.
num_gpu_blocks
=
64
...
@@ -194,8 +189,7 @@ def test_complex(use_v2_block_manager: bool):
...
@@ -194,8 +189,7 @@ def test_complex(use_v2_block_manager: bool):
assert
running
[
2
].
is_prefill
()
assert
running
[
2
].
is_prefill
()
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_maximal_decoding
():
def
test_maximal_decoding
(
use_v2_block_manager
:
bool
):
"""Verify decoding requests are prioritized."""
"""Verify decoding requests are prioritized."""
block_size
=
4
block_size
=
4
max_seqs
=
2
max_seqs
=
2
...
@@ -206,7 +200,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
...
@@ -206,7 +200,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -288,8 +282,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
...
@@ -288,8 +282,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
assert
out
.
num_batched_tokens
==
2
assert
out
.
num_batched_tokens
==
2
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prompt_limit
():
def
test_prompt_limit
(
use_v2_block_manager
:
bool
):
"""Verify max_num_batched_tokens < max_model_len is possible."""
"""Verify max_num_batched_tokens < max_model_len is possible."""
block_size
=
4
block_size
=
4
max_seqs
=
32
max_seqs
=
32
...
@@ -300,7 +293,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
...
@@ -300,7 +293,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
...
@@ -323,8 +316,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
...
@@ -323,8 +316,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
assert
out
.
num_batched_tokens
==
32
assert
out
.
num_batched_tokens
==
32
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prompt_limit_exceed
():
def
test_prompt_limit_exceed
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
max_seqs
=
64
max_seqs
=
64
max_model_len
=
32
max_model_len
=
32
...
@@ -349,8 +341,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
...
@@ -349,8 +341,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
assert
out
.
ignored_seq_groups
[
0
]
==
seq_group
assert
out
.
ignored_seq_groups
[
0
]
==
seq_group
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_swap
():
def
test_swap
(
use_v2_block_manager
:
bool
):
"""Verify swapping works with chunked prefill requests"""
"""Verify swapping works with chunked prefill requests"""
block_size
=
4
block_size
=
4
max_seqs
=
30
max_seqs
=
30
...
@@ -361,7 +352,7 @@ def test_swap(use_v2_block_manager: bool):
...
@@ -361,7 +352,7 @@ def test_swap(use_v2_block_manager: bool):
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
...
@@ -407,8 +398,7 @@ def test_swap(use_v2_block_manager: bool):
...
@@ -407,8 +398,7 @@ def test_swap(use_v2_block_manager: bool):
assert
out
.
blocks_to_swap_out
==
[]
assert
out
.
blocks_to_swap_out
==
[]
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_running_prefill_prioritized_over_swap
():
def
test_running_prefill_prioritized_over_swap
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
max_seqs
=
30
max_seqs
=
30
max_model_len
=
200
max_model_len
=
200
...
@@ -418,7 +408,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
...
@@ -418,7 +408,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
32
...
@@ -501,8 +491,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
...
@@ -501,8 +491,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
assert
out
.
blocks_to_swap_out
==
[]
assert
out
.
blocks_to_swap_out
==
[]
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_chunked_prefill_preempt
():
def
test_chunked_prefill_preempt
(
use_v2_block_manager
:
bool
):
"""Verify preempt works with chunked prefill requests"""
"""Verify preempt works with chunked prefill requests"""
block_size
=
4
block_size
=
4
max_seqs
=
30
max_seqs
=
30
...
@@ -513,7 +502,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
...
@@ -513,7 +502,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
...
@@ -568,8 +557,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
...
@@ -568,8 +557,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
assert
out
.
num_batched_tokens
==
max_num_batched_tokens
assert
out
.
num_batched_tokens
==
max_num_batched_tokens
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_chunked_prefill_max_seqs
():
def
test_chunked_prefill_max_seqs
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
max_seqs
=
2
max_seqs
=
2
max_model_len
=
80
max_model_len
=
80
...
@@ -579,7 +567,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
...
@@ -579,7 +567,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
128
cache_config
.
num_cpu_blocks
=
128
cache_config
.
num_gpu_blocks
=
128
cache_config
.
num_gpu_blocks
=
128
...
@@ -622,8 +610,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
...
@@ -622,8 +610,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
assert
not
running
[
1
].
is_prefill
()
assert
not
running
[
1
].
is_prefill
()
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_perfix_caching
():
def
test_perfix_caching
(
use_v2_block_manager
:
bool
):
"""Verify allocating full blocks when prefix caching is enabled."""
"""Verify allocating full blocks when prefix caching is enabled."""
block_size
=
4
block_size
=
4
max_seqs
=
10
max_seqs
=
10
...
@@ -634,7 +621,7 @@ def test_perfix_caching(use_v2_block_manager: bool):
...
@@ -634,7 +621,7 @@ def test_perfix_caching(use_v2_block_manager: bool):
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1.0
,
1
,
1
,
...
...
tests/core/test_num_computed_tokens_update.py
0 → 100644
View file @
6d2051cc
import
pytest
from
tests.conftest
import
VllmRunner
from
tests.core.utils
import
create_dummy_prompt
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SequenceGroup
MODEL
=
"JackFram/llama-160m"
def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup):
    """Hand ``seq_group`` to the first scheduler held by ``engine``.

    Args:
        engine: Engine whose ``scheduler`` sequence is indexed at position 0.
        seq_group: Sequence group passed to that scheduler's
            ``add_seq_group``.
    """
    engine.scheduler[0].add_seq_group(seq_group)
@pytest.mark.parametrize("num_scheduler_steps", [1, 8])
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
@pytest.mark.parametrize("enforce_eager", [False, True])
def test_num_computed_tokens_update(num_scheduler_steps: int,
                                    enable_chunked_prefill: bool,
                                    enforce_eager: bool):
    """Check ``num_computed_tokens`` after prompt, decode and finish.

    An engine is built through ``VllmRunner`` for ``MODEL``. Six requests
    with a 10-token prompt and a fixed output length (``min_tokens`` equal
    to ``max_tokens``) are then submitted one after another; each request
    is stepped until it reports finished before the next one is added.

    Args:
        num_scheduler_steps: Forwarded to ``VllmRunner``; also the number
            of ``engine.step()`` calls issued per decode round.
        enable_chunked_prefill: Forwarded to ``VllmRunner``.
        enforce_eager: Forwarded to ``VllmRunner``.
    """
    chunked_multi_step = (num_scheduler_steps > 1
                          and enable_chunked_prefill)

    # This combination is skipped on ROCm (see the skip message below).
    if chunked_multi_step and current_platform.is_rocm():
        pytest.skip("Multi-step with Chunked-Prefill does not support "
                    "rocm_flash_attn backend")

    # Build the engine under test.
    vllm_runner = VllmRunner(model_name=MODEL,
                             gpu_memory_utilization=0.7,
                             num_scheduler_steps=num_scheduler_steps,
                             enable_chunked_prefill=enable_chunked_prefill,
                             enforce_eager=enforce_eager)
    engine: LLMEngine = vllm_runner.model.llm_engine

    # With multi-step + chunked-prefill there is no dedicated single prompt
    # step: whatever gets scheduled always runs for num_scheduler_steps.
    if chunked_multi_step:
        num_prompt_steps = num_scheduler_steps
    else:
        num_prompt_steps = 1

    prompt_len = 10

    # One request per requested output length.
    for request_index, num_output_tokens in enumerate((4, 8, 12, 15, 16, 17)):
        seq, seq_group = create_dummy_prompt(
            request_id=str(request_index),
            prompt_length=prompt_len,
            min_tokens=num_output_tokens,
            max_tokens=num_output_tokens,
        )
        add_seq_group_to_engine(engine, seq_group)

        # Nothing has been computed before the first step.
        assert seq.data.get_num_computed_tokens() == 0

        # Prompt phase.
        for _ in range(num_prompt_steps):
            engine.step()

        if not seq.is_finished():
            tokens_after_prompt = seq.data.get_num_computed_tokens()
            # num_computed_tokens right after the prompt steps.
            expected_after_prompt = prompt_len + num_prompt_steps - 1
            assert tokens_after_prompt == expected_after_prompt

            steps_decoded = 0
            while not seq.is_finished():
                # num_computed_tokens must track the decode steps taken so
                # far on top of the post-prompt value.
                expected_so_far = tokens_after_prompt + steps_decoded
                assert seq.data.get_num_computed_tokens() == expected_so_far

                # Decode phase: one round of num_scheduler_steps steps.
                for _ in range(num_scheduler_steps):
                    engine.step()
                    steps_decoded += 1

        # num_computed_tokens once the sequence has finished.
        expected_at_finish = prompt_len + num_output_tokens - 1
        assert seq.data.get_num_computed_tokens() == expected_at_finish
tests/core/test_scheduler.py
View file @
6d2051cc
...
@@ -3,7 +3,7 @@ from collections import deque
...
@@ -3,7 +3,7 @@ from collections import deque
from
typing
import
List
,
Set
,
Tuple
from
typing
import
List
,
Set
,
Tuple
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
import
pytest
import
pytest
# noqa
from
torch
import
Use
# noqa
from
torch
import
Use
# noqa
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
...
@@ -17,11 +17,13 @@ from .utils import (append_new_token, append_new_token_seq_group,
...
@@ -17,11 +17,13 @@ from .utils import (append_new_token, append_new_token_seq_group,
schedule_and_update_computed_tokens
)
schedule_and_update_computed_tokens
)
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_add_seq_group
():
def
test_scheduler_add_seq_group
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
,
use_v2_block_manager
=
use_v2_block_manager
)
100
,
64
,
1
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
cache_dtype
=
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
cache_dtype
=
"auto"
)
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
...
@@ -37,11 +39,13 @@ def test_scheduler_add_seq_group(use_v2_block_manager: bool):
...
@@ -37,11 +39,13 @@ def test_scheduler_add_seq_group(use_v2_block_manager: bool):
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
i
+
1
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
i
+
1
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_abort_seq_group
():
def
test_scheduler_abort_seq_group
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
,
use_v2_block_manager
=
use_v2_block_manager
)
100
,
64
,
1
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
...
@@ -61,8 +65,7 @@ def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
...
@@ -61,8 +65,7 @@ def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
0
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
0
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_schedule_simple
():
def
test_scheduler_schedule_simple
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
num_seq_group
=
4
num_seq_group
=
4
max_model_len
=
16
max_model_len
=
16
...
@@ -70,7 +73,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
...
@@ -70,7 +73,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
64
,
64
,
num_seq_group
,
num_seq_group
,
max_model_len
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -105,8 +108,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
...
@@ -105,8 +108,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
append_new_token
(
out
,
1
)
append_new_token
(
out
,
1
)
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_prefill_prioritized
():
def
test_scheduler_prefill_prioritized
(
use_v2_block_manager
:
bool
):
"""Verify running batched tokens are not applied to prefill requests."""
"""Verify running batched tokens are not applied to prefill requests."""
block_size
=
4
block_size
=
4
max_model_len
=
30
max_model_len
=
30
...
@@ -115,7 +117,7 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
...
@@ -115,7 +117,7 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
max_batched_num_tokens
,
max_batched_num_tokens
,
2
,
2
,
max_model_len
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
...
@@ -139,12 +141,14 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
...
@@ -139,12 +141,14 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
assert
get_sequence_groups
(
out
)
==
[
seq_group_b
]
assert
get_sequence_groups
(
out
)
==
[
seq_group_b
]
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_schedule_preempt_abort
():
def
test_scheduler_schedule_preempt_abort
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
64
,
2
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
64
,
2
,
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
2
cache_config
.
num_cpu_blocks
=
2
cache_config
.
num_gpu_blocks
=
2
cache_config
.
num_gpu_blocks
=
2
...
@@ -194,8 +198,7 @@ def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
...
@@ -194,8 +198,7 @@ def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
1
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
1
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_max_seqs
():
def
test_scheduler_max_seqs
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
num_seq_group
=
4
num_seq_group
=
4
max_seq_group
=
2
max_seq_group
=
2
...
@@ -204,7 +207,7 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
...
@@ -204,7 +207,7 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
64
,
64
,
max_seq_group
,
max_seq_group
,
max_model_len
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -242,15 +245,14 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
...
@@ -242,15 +245,14 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
assert
set
(
get_sequence_groups
(
out
))
==
set
([
all_seq_groups
[
1
]])
assert
set
(
get_sequence_groups
(
out
))
==
set
([
all_seq_groups
[
1
]])
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_delay_factor
():
def
test_scheduler_delay_factor
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
100
,
100
,
64
,
64
,
16
,
16
,
delay_factor
=
0.5
,
delay_factor
=
0.5
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -287,12 +289,10 @@ def test_scheduler_delay_factor(use_v2_block_manager: bool):
...
@@ -287,12 +289,10 @@ def test_scheduler_delay_factor(use_v2_block_manager: bool):
append_new_token
(
out
,
1
)
append_new_token
(
out
,
1
)
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_swapped_out_prioritized
():
def
test_swapped_out_prioritized
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
max_num_seqs
=
6
,
scheduler
=
initialize_scheduler
(
max_num_seqs
=
6
,
block_size
=
block_size
,
block_size
=
block_size
,
use_v2_block_manager
=
use_v2_block_manager
,
num_cpu_blocks
=
64
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
num_gpu_blocks
=
64
)
# best_of=2 * 3 == 6 sequences.
# best_of=2 * 3 == 6 sequences.
...
@@ -344,7 +344,6 @@ def initialize_scheduler(
...
@@ -344,7 +344,6 @@ def initialize_scheduler(
max_token_budget
=
1000
,
max_token_budget
=
1000
,
max_model_len
=
1000
,
max_model_len
=
1000
,
lora_config
=
None
,
lora_config
=
None
,
use_v2_block_manager
=
False
,
block_size
=
4
,
block_size
=
4
,
num_cpu_blocks
=
8
,
num_cpu_blocks
=
8
,
num_gpu_blocks
=
8
,
num_gpu_blocks
=
8
,
...
@@ -354,7 +353,7 @@ def initialize_scheduler(
...
@@ -354,7 +353,7 @@ def initialize_scheduler(
max_token_budget
,
max_token_budget
,
max_num_seqs
,
max_num_seqs
,
max_model_len
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
num_cpu_blocks
cache_config
.
num_cpu_blocks
=
num_cpu_blocks
cache_config
.
num_gpu_blocks
=
num_gpu_blocks
cache_config
.
num_gpu_blocks
=
num_gpu_blocks
...
@@ -379,15 +378,12 @@ def add_token_budget(budget: SchedulingBudget,
...
@@ -379,15 +378,12 @@ def add_token_budget(budget: SchedulingBudget,
budget
.
add_num_seqs
(
mock_seq_group
.
request_id
,
num_curr_seqs
)
budget
.
add_num_seqs
(
mock_seq_group
.
request_id
,
num_curr_seqs
)
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_max_prompt_len
():
def
test_prefill_schedule_max_prompt_len
(
use_v2_block_manager
:
bool
):
"""
"""
Test prompt longer than max_prompt_len is aborted.
Test prompt longer than max_prompt_len is aborted.
"""
"""
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
max_model_len
=
30
,
scheduler
=
initialize_scheduler
(
max_model_len
=
30
,
block_size
=
block_size
)
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
60
,
prompt_length
=
60
,
block_size
=
block_size
)
block_size
=
block_size
)
...
@@ -402,14 +398,12 @@ def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
...
@@ -402,14 +398,12 @@ def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
assert
len
(
remaining_waiting
)
==
0
assert
len
(
remaining_waiting
)
==
0
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_token_budget
():
def
test_prefill_schedule_token_budget
(
use_v2_block_manager
:
bool
):
"""
"""
Test token budget respected.
Test token budget respected.
"""
"""
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
token_budget
=
0
)
budget
=
create_token_budget
(
token_budget
=
0
)
...
@@ -439,8 +433,7 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
...
@@ -439,8 +433,7 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
assert
len
(
remaining_waiting
)
==
1
assert
len
(
remaining_waiting
)
==
1
# Test when current_batched_tokens respected.
# Test when current_batched_tokens respected.
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
num_cpu_blocks
=
16
,
num_cpu_blocks
=
16
,
num_gpu_blocks
=
16
)
num_gpu_blocks
=
16
)
budget
=
create_token_budget
(
token_budget
=
60
)
budget
=
create_token_budget
(
token_budget
=
60
)
...
@@ -467,14 +460,12 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
...
@@ -467,14 +460,12 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
assert
len
(
remaining_waiting
)
==
0
assert
len
(
remaining_waiting
)
==
0
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_max_seqs
():
def
test_prefill_schedule_max_seqs
(
use_v2_block_manager
:
bool
):
"""
"""
Test max seq respected.
Test max seq respected.
"""
"""
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
max_num_seqs
=
2
)
budget
=
create_token_budget
(
max_num_seqs
=
2
)
...
@@ -508,15 +499,13 @@ def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
...
@@ -508,15 +499,13 @@ def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
assert
len
(
remaining_waiting
)
==
1
assert
len
(
remaining_waiting
)
==
1
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_max_lora
():
def
test_prefill_schedule_max_lora
(
use_v2_block_manager
:
bool
):
"""
"""
Test max lora is respected and prioritized.
Test max lora is respected and prioritized.
"""
"""
block_size
=
4
block_size
=
4
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
num_gpu_blocks
=
64
)
...
@@ -563,14 +552,12 @@ def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
...
@@ -563,14 +552,12 @@ def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
assert
budget
.
num_batched_tokens
==
60
assert
budget
.
num_batched_tokens
==
60
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_no_block_manager_capacity
():
def
test_prefill_schedule_no_block_manager_capacity
(
use_v2_block_manager
):
"""
"""
Test sequence cannot be scheduled due to block manager has no capacity.
Test sequence cannot be scheduled due to block manager has no capacity.
"""
"""
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
num_gpu_blocks
=
128
,
num_gpu_blocks
=
128
,
num_cpu_blocks
=
128
)
num_cpu_blocks
=
128
)
budget
=
create_token_budget
()
budget
=
create_token_budget
()
...
@@ -607,14 +594,12 @@ def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
...
@@ -607,14 +594,12 @@ def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
assert
len
(
remaining_waiting
)
==
0
assert
len
(
remaining_waiting
)
==
0
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_decode_schedule_preempted
():
def
test_decode_schedule_preempted
(
use_v2_block_manager
:
bool
):
"""
"""
Test decodes cannot be scheduled and preempted.
Test decodes cannot be scheduled and preempted.
"""
"""
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
num_gpu_blocks
=
64
)
curr_loras
=
None
curr_loras
=
None
...
@@ -653,14 +638,12 @@ def test_decode_schedule_preempted(use_v2_block_manager: bool):
...
@@ -653,14 +638,12 @@ def test_decode_schedule_preempted(use_v2_block_manager: bool):
assert
output
.
blocks_to_copy
==
[]
assert
output
.
blocks_to_copy
==
[]
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_decode_swap_beam_search
():
def
test_decode_swap_beam_search
(
use_v2_block_manager
:
bool
):
"""
"""
Test best_of > 1 swap out blocks
Test best_of > 1 swap out blocks
"""
"""
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
num_gpu_blocks
=
64
,
num_gpu_blocks
=
64
,
num_cpu_blocks
=
64
)
num_cpu_blocks
=
64
)
curr_loras
=
None
curr_loras
=
None
...
@@ -709,14 +692,12 @@ def test_decode_swap_beam_search(use_v2_block_manager: bool):
...
@@ -709,14 +692,12 @@ def test_decode_swap_beam_search(use_v2_block_manager: bool):
assert
output
.
blocks_to_copy
==
[]
assert
output
.
blocks_to_copy
==
[]
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_schedule_decode_blocks_to_copy_update
():
def
test_schedule_decode_blocks_to_copy_update
(
use_v2_block_manager
:
bool
):
"""
"""
Verify blocks_to_copy is updated.
Verify blocks_to_copy is updated.
"""
"""
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
4
,
block_size
=
4
,
num_cpu_blocks
=
16
,
num_cpu_blocks
=
16
,
num_gpu_blocks
=
16
)
num_gpu_blocks
=
16
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
...
@@ -747,11 +728,9 @@ def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
...
@@ -747,11 +728,9 @@ def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
assert
output
.
blocks_to_copy
==
[(
2
,
3
)]
assert
output
.
blocks_to_copy
==
[(
2
,
3
)]
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_schedule_swapped_simple
():
def
test_schedule_swapped_simple
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
)
block_size
=
block_size
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
...
@@ -778,11 +757,9 @@ def test_schedule_swapped_simple(use_v2_block_manager: bool):
...
@@ -778,11 +757,9 @@ def test_schedule_swapped_simple(use_v2_block_manager: bool):
assert
blocks_to_swap_out
==
blocks_to_swap_in_reverse
assert
blocks_to_swap_out
==
blocks_to_swap_in_reverse
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_schedule_swapped_max_token_budget
():
def
test_schedule_swapped_max_token_budget
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
...
@@ -815,11 +792,9 @@ def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
...
@@ -815,11 +792,9 @@ def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_schedule_swapped_max_seqs
():
def
test_schedule_swapped_max_seqs
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
num_gpu_blocks
=
64
)
curr_loras
=
None
curr_loras
=
None
...
@@ -852,12 +827,10 @@ def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
...
@@ -852,12 +827,10 @@ def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_schedule_swapped_max_loras
():
def
test_schedule_swapped_max_loras
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
num_gpu_blocks
=
32
)
...
@@ -887,11 +860,9 @@ def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
...
@@ -887,11 +860,9 @@ def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
assert
len
(
curr_loras
)
==
1
assert
len
(
curr_loras
)
==
1
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_schedule_swapped_cannot_swap_in
():
def
test_schedule_swapped_cannot_swap_in
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
...
@@ -920,11 +891,9 @@ def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
...
@@ -920,11 +891,9 @@ def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_infeasible_swap
():
def
test_infeasible_swap
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
...
@@ -954,11 +923,9 @@ def test_infeasible_swap(use_v2_block_manager: bool):
...
@@ -954,11 +923,9 @@ def test_infeasible_swap(use_v2_block_manager: bool):
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_schedule_swapped_blocks_to_copy
():
def
test_schedule_swapped_blocks_to_copy
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
...
...
tests/core/utils.py
View file @
6d2051cc
...
@@ -13,9 +13,10 @@ def create_dummy_prompt(
...
@@ -13,9 +13,10 @@ def create_dummy_prompt(
prompt_length
:
int
,
prompt_length
:
int
,
block_size
:
Optional
[
int
]
=
None
,
block_size
:
Optional
[
int
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
use_beam_search
:
bool
=
False
,
best_of
:
int
=
1
,
best_of
:
int
=
1
,
prompt_tokens
:
Optional
[
List
[
int
]]
=
None
,
prompt_tokens
:
Optional
[
List
[
int
]]
=
None
,
min_tokens
:
int
=
0
,
max_tokens
:
int
=
16
,
)
->
Tuple
[
Sequence
,
SequenceGroup
]:
)
->
Tuple
[
Sequence
,
SequenceGroup
]:
if
not
block_size
:
if
not
block_size
:
block_size
=
prompt_length
block_size
=
prompt_length
...
@@ -35,8 +36,9 @@ def create_dummy_prompt(
...
@@ -35,8 +36,9 @@ def create_dummy_prompt(
seqs
=
[
prompt
],
seqs
=
[
prompt
],
arrival_time
=
time
.
time
(),
arrival_time
=
time
.
time
(),
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
use_beam_search
=
use_beam_search
,
best_of
=
best_of
,
best_of
=
best_of
),
max_tokens
=
max_tokens
,
min_tokens
=
min_tokens
),
lora_request
=
lora_request
)
lora_request
=
lora_request
)
return
prompt
,
seq_group
return
prompt
,
seq_group
...
@@ -48,7 +50,6 @@ def create_dummy_prompt_encoder_decoder(
...
@@ -48,7 +50,6 @@ def create_dummy_prompt_encoder_decoder(
encoder_prompt_length
:
int
,
encoder_prompt_length
:
int
,
block_size
:
Optional
[
int
]
=
None
,
block_size
:
Optional
[
int
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
use_beam_search
:
bool
=
False
,
best_of
:
int
=
1
,
best_of
:
int
=
1
,
)
->
Tuple
[
Sequence
,
Sequence
,
SequenceGroup
]:
)
->
Tuple
[
Sequence
,
Sequence
,
SequenceGroup
]:
if
not
block_size
:
if
not
block_size
:
...
@@ -81,9 +82,7 @@ def create_dummy_prompt_encoder_decoder(
...
@@ -81,9 +82,7 @@ def create_dummy_prompt_encoder_decoder(
from_decoder_prompt
=
False
)
from_decoder_prompt
=
False
)
seq_group
=
SequenceGroup
(
request_id
=
request_id
,
seq_group
=
SequenceGroup
(
request_id
=
request_id
,
seqs
=
[
decoder_prompt
],
seqs
=
[
decoder_prompt
],
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
best_of
=
best_of
),
use_beam_search
=
use_beam_search
,
best_of
=
best_of
),
arrival_time
=
time
.
time
(),
arrival_time
=
time
.
time
(),
lora_request
=
lora_request
,
lora_request
=
lora_request
,
encoder_seq
=
encoder_prompt
)
encoder_seq
=
encoder_prompt
)
...
...
tests/data/test_config.yaml
View file @
6d2051cc
port
:
12312
port
:
12312
served_model_name
:
mymodel
tensor_parallel_size
:
2
tensor_parallel_size
:
2
tests/distributed/test_pipeline_parallel.py
View file @
6d2051cc
...
@@ -6,10 +6,10 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
...
@@ -6,10 +6,10 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
to fail.
to fail.
"""
"""
import
os
import
os
from
dataclasses
import
dataclass
from
typing
import
List
,
Literal
,
NamedTuple
,
Optional
import
pytest
import
pytest
from
packaging
import
version
from
transformers
import
__version__
as
transformers_version
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -20,52 +20,253 @@ logger = init_logger("test_pipeline_parallel")
...
@@ -20,52 +20,253 @@ logger = init_logger("test_pipeline_parallel")
VLLM_MULTI_NODE
=
os
.
getenv
(
"VLLM_MULTI_NODE"
,
"0"
)
==
"1"
VLLM_MULTI_NODE
=
os
.
getenv
(
"VLLM_MULTI_NODE"
,
"0"
)
==
"1"
@
pytest
.
mark
.
parametrize
(
class
ParallelSetup
(
NamedTuple
):
(
"TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
tp_size
:
int
"MODEL_NAME, DIST_BACKEND"
),
pp_size
:
int
[
eager_mode
:
bool
(
2
,
2
,
0
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
chunked_prefill
:
bool
(
2
,
2
,
1
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
3
,
0
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
4
,
0
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
@
dataclass
(
1
,
4
,
1
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
class
PPTestSettings
:
(
1
,
3
,
0
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
parallel_setups
:
List
[
ParallelSetup
]
(
1
,
4
,
0
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
distributed_backends
:
List
[
str
]
(
1
,
4
,
1
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
trust_remote_code
:
bool
(
2
,
2
,
1
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
tokenizer_mode
:
Optional
[
str
]
(
2
,
2
,
0
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
# NOTE: InternVL2 multi-node tests are flaky,
@
staticmethod
# use mp backend to skip the multi-node tests
def
detailed
(
(
1
,
2
,
1
,
1
,
1
,
"OpenGVLab/InternVL2-1B"
,
"mp"
),
*
,
(
1
,
2
,
1
,
1
,
1
,
"OpenGVLab/InternVL2-2B"
,
"mp"
),
tp_base
:
int
=
1
,
(
1
,
2
,
1
,
0
,
1
,
"OpenGVLab/InternVL2-4B"
,
"mp"
),
pp_base
:
int
=
2
,
(
1
,
2
,
0
,
1
,
0
,
"Qwen/Qwen2-VL-2B-Instruct"
,
"mp"
)
trust_remote_code
:
bool
=
False
,
],
tokenizer_mode
:
Optional
[
str
]
=
None
,
)
):
@
fork_new_process_for_each_test
return
PPTestSettings
(
def
test_compare_tp
(
TP_SIZE
,
PP_SIZE
,
EAGER_MODE
,
CHUNKED_PREFILL
,
parallel_setups
=
[
TRUST_REMOTE_CODE
,
MODEL_NAME
,
DIST_BACKEND
):
ParallelSetup
(
tp_size
=
tp_base
,
if
VLLM_MULTI_NODE
and
DIST_BACKEND
==
"mp"
:
pp_size
=
pp_base
,
eager_mode
=
False
,
chunked_prefill
=
False
),
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
2
*
pp_base
,
eager_mode
=
False
,
chunked_prefill
=
True
),
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
2
*
pp_base
,
eager_mode
=
True
,
chunked_prefill
=
False
),
ParallelSetup
(
tp_size
=
2
*
tp_base
,
pp_size
=
pp_base
,
eager_mode
=
False
,
chunked_prefill
=
True
),
ParallelSetup
(
tp_size
=
2
*
tp_base
,
pp_size
=
pp_base
,
eager_mode
=
True
,
chunked_prefill
=
False
),
],
distributed_backends
=
[
"mp"
,
"ray"
],
trust_remote_code
=
trust_remote_code
,
tokenizer_mode
=
tokenizer_mode
,
)
@
staticmethod
def
fast
(
*
,
tp_base
:
int
=
1
,
pp_base
:
int
=
2
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
):
return
PPTestSettings
(
parallel_setups
=
[
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
pp_base
,
eager_mode
=
True
,
chunked_prefill
=
False
),
],
distributed_backends
=
[
"mp"
],
trust_remote_code
=
trust_remote_code
,
tokenizer_mode
=
tokenizer_mode
,
)
def
iter_params
(
self
,
model_name
:
str
):
for
parallel_setup
in
self
.
parallel_setups
:
for
distributed_backend
in
self
.
distributed_backends
:
yield
(
model_name
,
parallel_setup
,
distributed_backend
,
self
.
trust_remote_code
,
self
.
tokenizer_mode
)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model
# yapf: disable
GENERATION_MODEL_SETTINGS
=
{
# [DETAILED TESTS]
"meta-llama/Meta-Llama-3-8B"
:
PPTestSettings
.
detailed
(),
# [FAST TESTS]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct"
:
PPTestSettings
.
fast
(
tp_base
=
8
,
trust_remote_code
=
True
),
# noqa: E501
"baichuan-inc/Baichuan-7B"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"baichuan-inc/Baichuan2-13B-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"bigscience/bloomz-1b1"
:
PPTestSettings
.
fast
(),
"THUDM/chatglm3-6b"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"CohereForAI/c4ai-command-r-v01"
:
PPTestSettings
.
fast
(
tp_base
=
2
,
trust_remote_code
=
True
),
# noqa: E501
"databricks/dbrx-instruct"
:
PPTestSettings
.
fast
(
tp_base
=
8
),
"Deci/DeciLM-7B-instruct"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"deepseek-ai/deepseek-llm-7b-chat"
:
PPTestSettings
.
fast
(),
"deepseek-ai/DeepSeek-V2-Lite-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
:
PPTestSettings
.
fast
(),
"tiiuae/falcon-7b"
:
PPTestSettings
.
fast
(),
"google/gemma-2b"
:
PPTestSettings
.
fast
(),
"google/gemma-2-9b"
:
PPTestSettings
.
fast
(),
"gpt2"
:
PPTestSettings
.
fast
(),
"bigcode/starcoder"
:
PPTestSettings
.
fast
(),
"EleutherAI/gpt-j-6b"
:
PPTestSettings
.
fast
(),
"EleutherAI/pythia-12b"
:
PPTestSettings
.
fast
(),
"ibm/PowerLM-3b"
:
PPTestSettings
.
fast
(),
"ibm/PowerMoE-3b"
:
PPTestSettings
.
fast
(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
"internlm/internlm2-chat-7b"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"core42/jais-13b-chat"
:
PPTestSettings
.
fast
(),
# TODO: Implement PP
# "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
"openbmb/MiniCPM-2B-sft-bf16"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"openbmb/MiniCPM3-4B"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1"
:
PPTestSettings
.
fast
(
tp_base
=
4
),
"mosaicml/mpt-7b"
:
PPTestSettings
.
fast
(),
"nvidia/Minitron-8B-Base"
:
PPTestSettings
.
fast
(),
"allenai/OLMoE-1B-7B-0924-Instruct"
:
PPTestSettings
.
fast
(),
"allenai/OLMo-1B-hf"
:
PPTestSettings
.
fast
(),
"facebook/opt-iml-max-1.3b"
:
PPTestSettings
.
fast
(),
"OrionStarAI/Orion-14B-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"microsoft/phi-2"
:
PPTestSettings
.
fast
(),
"microsoft/Phi-3-mini-4k-instruct"
:
PPTestSettings
.
fast
(),
"microsoft/Phi-3-small-8k-instruct"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
# FIXME: https://github.com/vllm-project/vllm/issues/8553
# "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"adept/persimmon-8b-chat"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen-7B-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"Qwen/Qwen2-beta-7B-Chat"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
:
PPTestSettings
.
fast
(),
"stabilityai/stablelm-3b-4e1t"
:
PPTestSettings
.
fast
(),
"bigcode/starcoder2-3b"
:
PPTestSettings
.
fast
(),
"upstage/solar-pro-preview-instruct"
:
PPTestSettings
.
fast
(
tp_base
=
2
),
# FIXME: Cannot load tokenizer in latest transformers version
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
}
EMBEDDING_MODEL_SETTINGS
=
{
# type: ignore[var-annotated]
# [FAST TESTS]
"intfloat/e5-mistral-7b-instruct"
:
PPTestSettings
.
fast
(),
"BAAI/bge-multilingual-gemma2"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen2.5-Math-RM-72B"
:
PPTestSettings
.
fast
(
tp_base
=
4
,
trust_remote_code
=
True
),
# noqa: E501
}
MULTIMODAL_MODEL_SETTINGS
=
{
# [FAST TESTS]
"Salesforce/blip2-opt-2.7b"
:
PPTestSettings
.
fast
(),
"facebook/chameleon-7b"
:
PPTestSettings
.
fast
(),
"adept/fuyu-8b"
:
PPTestSettings
.
fast
(),
"OpenGVLab/InternVL2-1B"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"llava-hf/llava-1.5-7b-hf"
:
PPTestSettings
.
fast
(),
"llava-hf/llava-v1.6-mistral-7b-hf"
:
PPTestSettings
.
fast
(),
"llava-hf/LLaVA-NeXT-Video-7B-hf"
:
PPTestSettings
.
fast
(),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
:
PPTestSettings
.
fast
(),
"openbmb/MiniCPM-Llama3-V-2_5"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
"microsoft/Phi-3-vision-128k-instruct"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"mistralai/Pixtral-12B-2409"
:
PPTestSettings
.
fast
(
tp_base
=
2
,
tokenizer_mode
=
"mistral"
),
# noqa: E501
"Qwen/Qwen-VL-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"Qwen/Qwen2-VL-2B-Instruct"
:
PPTestSettings
.
fast
(),
"fixie-ai/ultravox-v0_3"
:
PPTestSettings
.
fast
(),
}
CONDITIONAL_GENERATION_MODEL_SETTINGS
=
{
# type: ignore[var-annotated]
# [FAST TESTS]
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
}
# yapf: enable
# NOTE: You can update this on your local machine to run specific tests
TEST_MODELS
=
[
# [LANGUAGE GENERATION]
"meta-llama/Meta-Llama-3-8B"
,
"ibm/PowerLM-3b"
,
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct"
,
"BAAI/bge-multilingual-gemma2"
,
# [MULTIMODAL GENERATION]
"OpenGVLab/InternVL2-1B"
,
"microsoft/Phi-3-vision-128k-instruct"
,
"fixie-ai/ultravox-v0_3"
,
]
def
_compare_tp
(
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
:
int
,
*
,
method
:
Literal
[
"generate"
,
"encode"
]
=
"encode"
,
):
tp_size
,
pp_size
,
eager_mode
,
chunked_prefill
=
parallel_setup
if
num_gpus_available
<
tp_size
*
pp_size
:
pytest
.
skip
(
f
"Need at least
{
tp_size
}
x
{
pp_size
}
GPUs"
)
if
VLLM_MULTI_NODE
and
distributed_backend
==
"mp"
:
pytest
.
skip
(
"Skipping multi-node pipeline parallel test for "
pytest
.
skip
(
"Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend"
)
"multiprocessing distributed backend"
)
# Skip tests that require transformers>=4.45.0
common_args
=
[
if
"Qwen2-VL"
in
MODEL_NAME
and
version
.
parse
(
transformers_version
)
<
version
.
parse
(
"4.45.0.dev0"
):
pytest
.
skip
(
"This test requires transformers>=4.45.0"
)
pp_args
=
[
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
"float16"
,
"float16"
,
"--max-model-len"
,
"--max-model-len"
,
"8192"
,
"2048"
,
"--max-num-seqs"
,
"8"
,
]
if
chunked_prefill
:
common_args
.
append
(
"--enable-chunked-prefill"
)
if
eager_mode
:
common_args
.
append
(
"--enforce-eager"
)
if
trust_remote_code
:
common_args
.
append
(
"--trust-remote-code"
)
if
tokenizer_mode
:
common_args
.
extend
([
"--tokenizer-mode"
,
tokenizer_mode
])
if
(
distributed_backend
==
"ray"
and
tp_size
==
2
and
pp_size
==
2
and
chunked_prefill
):
# Test Ray ADAG for a subset of the tests
pp_env
=
{
"VLLM_USE_RAY_COMPILED_DAG"
:
"1"
,
"VLLM_USE_RAY_SPMD_WORKER"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
"1"
,
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of aDAG issue.
common_args
.
append
(
"--disable-frontend-multiprocessing"
)
else
:
pp_env
=
None
pp_args
=
[
*
common_args
,
"--pipeline-parallel-size"
,
"--pipeline-parallel-size"
,
str
(
PP_SIZE
),
str
(
pp_size
),
"--tensor-parallel-size"
,
"--tensor-parallel-size"
,
str
(
TP_SIZE
),
str
(
tp_size
),
"--distributed-executor-backend"
,
"--distributed-executor-backend"
,
DIST_BACKEND
,
distributed_backend
,
]
]
# compare without pipeline parallelism
# compare without pipeline parallelism
...
@@ -74,44 +275,103 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
...
@@ -74,44 +275,103 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
# schedule all workers in a node other than the head node,
# schedule all workers in a node other than the head node,
# which can cause the test to fail.
# which can cause the test to fail.
tp_args
=
[
tp_args
=
[
# use half precision for speed and memory savings in CI environment
*
common_args
,
"--dtype"
,
"float16"
,
"--max-model-len"
,
"8192"
,
"--tensor-parallel-size"
,
"--tensor-parallel-size"
,
str
(
max
(
TP_SIZE
,
2
)),
# We only use 2 GPUs in the CI.
str
(
tp_size
),
"--distributed-executor-backend"
,
"--distributed-executor-backend"
,
"mp"
,
"mp"
,
]
]
if
CHUNKED_PREFILL
:
pp_args
.
append
(
"--enable-chunked-prefill"
)
tp_args
.
append
(
"--enable-chunked-prefill"
)
if
EAGER_MODE
:
pp_args
.
append
(
"--enforce-eager"
)
tp_args
.
append
(
"--enforce-eager"
)
if
TRUST_REMOTE_CODE
:
pp_args
.
append
(
"--trust-remote-code"
)
tp_args
.
append
(
"--trust-remote-code"
)
pp_env
=
None
if
(
DIST_BACKEND
==
"ray"
and
TP_SIZE
==
2
and
PP_SIZE
==
2
and
CHUNKED_PREFILL
):
# Test Ray ADAG for a subset of the tests
pp_env
=
{
"VLLM_USE_RAY_COMPILED_DAG"
:
"1"
,
"VLLM_USE_RAY_SPMD_WORKER"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
"1"
,
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of aDAG issue.
pp_args
.
append
(
"--disable-frontend-multiprocessing"
)
tp_args
.
append
(
"--disable-frontend-multiprocessing"
)
try
:
try
:
compare_two_settings
(
MODEL_NAME
,
pp_args
,
tp_args
,
pp_env
)
compare_two_settings
(
model_name
,
pp_args
,
tp_args
,
pp_env
,
method
=
method
)
except
Exception
:
except
Exception
:
if
pp_env
is
None
:
if
pp_env
is
None
:
raise
raise
else
:
else
:
# Ray ADAG tests are flaky, so we don't want to fail the test
# Ray ADAG tests are flaky, so we don't want to fail the test
logger
.
exception
(
"Ray ADAG tests failed"
)
logger
.
exception
(
"Ray ADAG tests failed"
)
@
pytest
.
mark
.
parametrize
(
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
"trust_remote_code"
,
"tokenizer_mode"
),
[
params
for
model_name
,
settings
in
GENERATION_MODEL_SETTINGS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
if
model_name
in
TEST_MODELS
],
)
@
fork_new_process_for_each_test
def
test_tp_language_generation
(
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
,
):
_compare_tp
(
model_name
,
parallel_setup
,
distributed_backend
,
trust_remote_code
,
tokenizer_mode
,
num_gpus_available
,
method
=
"generate"
)
@
pytest
.
mark
.
parametrize
(
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
"trust_remote_code"
,
"tokenizer_mode"
),
[
params
for
model_name
,
settings
in
EMBEDDING_MODEL_SETTINGS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
if
model_name
in
TEST_MODELS
],
)
@
fork_new_process_for_each_test
def
test_tp_language_embedding
(
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
,
):
_compare_tp
(
model_name
,
parallel_setup
,
distributed_backend
,
trust_remote_code
,
tokenizer_mode
,
num_gpus_available
,
method
=
"encode"
)
@
pytest
.
mark
.
parametrize
(
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
"trust_remote_code"
,
"tokenizer_mode"
),
[
params
for
model_name
,
settings
in
MULTIMODAL_MODEL_SETTINGS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
if
model_name
in
TEST_MODELS
],
)
@
fork_new_process_for_each_test
def
test_tp_multimodal_generation
(
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
,
):
_compare_tp
(
model_name
,
parallel_setup
,
distributed_backend
,
trust_remote_code
,
tokenizer_mode
,
num_gpus_available
,
method
=
"generate"
)
tests/engine/test_arg_utils.py
View file @
6d2051cc
...
@@ -42,22 +42,42 @@ def test_bad_nullable_kvs(arg):
...
@@ -42,22 +42,42 @@ def test_bad_nullable_kvs(arg):
nullable_kvs
(
arg
)
nullable_kvs
(
arg
)
@
pytest
.
mark
.
parametrize
((
"arg"
,
"expected"
),
[
# yapf: disable
(
None
,
None
),
@
pytest
.
mark
.
parametrize
((
"arg"
,
"expected"
,
"option"
),
[
(
"{}"
,
{}),
(
None
,
None
,
"mm-processor-kwargs"
),
(
'{"num_crops": 4}'
,
{
(
"{}"
,
{},
"mm-processor-kwargs"
),
"num_crops"
:
4
(
}),
'{"num_crops": 4}'
,
(
'{"foo": {"bar": "baz"}}'
,
{
{
"foo"
:
{
"num_crops"
:
4
"bar"
:
"baz"
},
}
"mm-processor-kwargs"
}),
),
(
'{"foo": {"bar": "baz"}}'
,
{
"foo"
:
{
"bar"
:
"baz"
}
},
"mm-processor-kwargs"
),
(
'{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}'
,
{
"cast_logits_dtype"
:
"bfloat16"
,
"sequence_parallel_norm"
:
True
,
"sequence_parallel_norm_threshold"
:
2048
,
},
"override-neuron-config"
),
])
])
def
test_mm_processor_kwargs_prompt_parser
(
arg
,
expected
):
# yapf: enable
def
test_composite_arg_parser
(
arg
,
expected
,
option
):
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
if
arg
is
None
:
if
arg
is
None
:
args
=
parser
.
parse_args
([])
args
=
parser
.
parse_args
([])
else
:
else
:
args
=
parser
.
parse_args
([
"--
mm-processor-kwargs
"
,
arg
])
args
=
parser
.
parse_args
([
f
"--
{
option
}
"
,
arg
])
assert
args
.
mm_processor_kwargs
==
expected
assert
getattr
(
args
,
option
.
replace
(
"-"
,
"_"
))
==
expected
tests/engine/test_custom_executor.py
View file @
6d2051cc
...
@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):
...
@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
def
test_custom_executor
(
model
,
tmp
dir
):
def
test_custom_executor
(
model
,
tmp
_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp
dir
)
os
.
chdir
(
tmp
_path
)
try
:
try
:
assert
not
os
.
path
.
exists
(
".marker"
)
assert
not
os
.
path
.
exists
(
".marker"
)
...
@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):
...
@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
def
test_custom_executor_async
(
model
,
tmp
dir
):
def
test_custom_executor_async
(
model
,
tmp
_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp
dir
)
os
.
chdir
(
tmp
_path
)
try
:
try
:
assert
not
os
.
path
.
exists
(
".marker"
)
assert
not
os
.
path
.
exists
(
".marker"
)
...
...
tests/entrypoints/llm/test_encode.py
View file @
6d2051cc
...
@@ -49,21 +49,6 @@ def assert_outputs_equal(o1: List[EmbeddingRequestOutput],
...
@@ -49,21 +49,6 @@ def assert_outputs_equal(o1: List[EmbeddingRequestOutput],
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt'
,
PROMPTS
)
def
test_v1_v2_api_consistency_single_prompt_string
(
llm
:
LLM
,
prompt
):
pooling_params
=
PoolingParams
()
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
encode
(
prompts
=
prompt
,
pooling_params
=
pooling_params
)
v2_output
=
llm
.
encode
(
prompt
,
pooling_params
=
pooling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
encode
({
"prompt"
:
prompt
},
pooling_params
=
pooling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
...
@@ -79,25 +64,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
...
@@ -79,25 +64,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal
(
v1_output
,
v2_output
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_string
(
llm
:
LLM
):
pooling_params
=
PoolingParams
()
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
encode
(
prompts
=
PROMPTS
,
pooling_params
=
pooling_params
)
v2_output
=
llm
.
encode
(
PROMPTS
,
pooling_params
=
pooling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
encode
(
[{
"prompt"
:
p
}
for
p
in
PROMPTS
],
pooling_params
=
pooling_params
,
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
pooling_params
=
PoolingParams
()
pooling_params
=
PoolingParams
()
...
...
tests/entrypoints/llm/test_generate.py
View file @
6d2051cc
...
@@ -47,23 +47,6 @@ def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
...
@@ -47,23 +47,6 @@ def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt'
,
PROMPTS
)
def
test_v1_v2_api_consistency_single_prompt_string
(
llm
:
LLM
,
prompt
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
generate
(
prompts
=
prompt
,
sampling_params
=
sampling_params
)
v2_output
=
llm
.
generate
(
prompt
,
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
generate
({
"prompt"
:
prompt
},
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
...
@@ -79,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
...
@@ -79,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal
(
v1_output
,
v2_output
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_string
(
llm
:
LLM
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
generate
(
prompts
=
PROMPTS
,
sampling_params
=
sampling_params
)
v2_output
=
llm
.
generate
(
PROMPTS
,
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
generate
(
[{
"prompt"
:
p
}
for
p
in
PROMPTS
],
sampling_params
=
sampling_params
,
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
...
...
tests/entrypoints/llm/test_guided_generate.py
View file @
6d2051cc
...
@@ -7,7 +7,7 @@ import pytest
...
@@ -7,7 +7,7 @@ import pytest
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
GuidedDecodingParams
,
SamplingParams
from
...conftest
import
cleanup
from
...conftest
import
cleanup
...
@@ -31,14 +31,12 @@ def test_guided_regex(sample_regex, llm):
...
@@ -31,14 +31,12 @@ def test_guided_regex(sample_regex, llm):
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
temperature
=
0.8
,
top_p
=
0.95
,
top_p
=
0.95
,
)
guided_decoding
=
GuidedDecodingParams
(
regex
=
sample_regex
))
outputs
=
llm
.
generate
(
outputs
=
llm
.
generate
(
prompts
=
[
prompts
=
[
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
]
*
2
,
]
*
2
,
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
assert
outputs
is
not
None
assert
outputs
is
not
None
for
output
in
outputs
:
for
output
in
outputs
:
...
@@ -57,15 +55,13 @@ def test_guided_json_completion(sample_json_schema, llm):
...
@@ -57,15 +55,13 @@ def test_guided_json_completion(sample_json_schema, llm):
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
temperature
=
1.0
,
max_tokens
=
1000
,
max_tokens
=
1000
,
)
guided_decoding
=
GuidedDecodingParams
(
json
=
sample_json_schema
))
outputs
=
llm
.
generate
(
outputs
=
llm
.
generate
(
prompts
=
[
prompts
=
[
f
"Give an example JSON for an employee profile "
f
"Give an example JSON for an employee profile "
f
"that fits this schema:
{
sample_json_schema
}
"
f
"that fits this schema:
{
sample_json_schema
}
"
]
*
2
,
]
*
2
,
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_json
=
sample_json_schema
))
assert
outputs
is
not
None
assert
outputs
is
not
None
...
@@ -86,12 +82,11 @@ def test_guided_choice_completion(sample_guided_choice, llm):
...
@@ -86,12 +82,11 @@ def test_guided_choice_completion(sample_guided_choice, llm):
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
temperature
=
0.8
,
top_p
=
0.95
,
top_p
=
0.95
,
)
guided_decoding
=
GuidedDecodingParams
(
choice
=
sample_guided_choice
)
)
outputs
=
llm
.
generate
(
outputs
=
llm
.
generate
(
prompts
=
"The best language for type-safe systems programming is "
,
prompts
=
"The best language for type-safe systems programming is "
,
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
use_tqdm
=
True
)
guided_options_request
=
dict
(
guided_choice
=
sample_guided_choice
))
assert
outputs
is
not
None
assert
outputs
is
not
None
for
output
in
outputs
:
for
output
in
outputs
:
...
@@ -112,13 +107,13 @@ def test_guided_grammar(sample_sql_statements, llm):
...
@@ -112,13 +107,13 @@ def test_guided_grammar(sample_sql_statements, llm):
temperature
=
0.8
,
temperature
=
0.8
,
top_p
=
0.95
,
top_p
=
0.95
,
max_tokens
=
1000
,
max_tokens
=
1000
,
)
guided_decoding
=
GuidedDecodingParams
(
grammar
=
sample_sql_statements
)
)
outputs
=
llm
.
generate
(
outputs
=
llm
.
generate
(
prompts
=
(
"Generate a sql state that select col_1 from "
prompts
=
(
"Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"
),
"table_1 where it is equals to 1"
),
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_grammar
=
sample_sql_statements
)
)
)
assert
outputs
is
not
None
assert
outputs
is
not
None
for
output
in
outputs
:
for
output
in
outputs
:
...
@@ -140,3 +135,28 @@ def test_guided_grammar(sample_sql_statements, llm):
...
@@ -140,3 +135,28 @@ def test_guided_grammar(sample_sql_statements, llm):
assert
generated_text
.
strip
()
==
ground_truth
assert
generated_text
.
strip
()
==
ground_truth
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_guided_options_request_deprecation_warning
(
sample_regex
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"guided_options_request"
):
llm
.
generate
(
prompts
=
"This should fail"
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
@
pytest
.
mark
.
skip_global_cleanup
def
test_validation_against_both_guided_decoding_options
(
sample_regex
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
guided_decoding
=
GuidedDecodingParams
(
regex
=
sample_regex
))
with
pytest
.
raises
(
ValueError
,
match
=
"Cannot set both"
):
llm
.
generate
(
prompts
=
"This should fail"
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
tests/entrypoints/openai/test_audio.py
View file @
6d2051cc
...
@@ -21,7 +21,9 @@ def server():
...
@@ -21,7 +21,9 @@ def server():
"--dtype"
,
"--dtype"
,
"bfloat16"
,
"bfloat16"
,
"--max-model-len"
,
"--max-model-len"
,
"4096"
,
"2048"
,
"--max-num-seqs"
,
"5"
,
"--enforce-eager"
,
"--enforce-eager"
,
]
]
...
...
tests/entrypoints/openai/test_basic.py
View file @
6d2051cc
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
typing
import
List
import
openai
import
openai
import
pytest
import
pytest
...
@@ -12,8 +13,44 @@ from ...utils import RemoteOpenAIServer
...
@@ -12,8 +13,44 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
@
pytest
.
fixture
(
scope
=
'module'
)
def
server_args
(
request
:
pytest
.
FixtureRequest
)
->
List
[
str
]:
""" Provide extra arguments to the server via indirect parametrization
Usage:
>>> @pytest.mark.parametrize(
>>> "server_args",
>>> [
>>> ["--disable-frontend-multiprocessing"],
>>> [
>>> "--model=NousResearch/Hermes-3-Llama-3.1-70B",
>>> "--enable-auto-tool-choice",
>>> ],
>>> ],
>>> indirect=True,
>>> )
>>> def test_foo(server, client):
>>> ...
This will run `test_foo` twice with servers with:
- `--disable-frontend-multiprocessing`
- `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
"""
if
not
hasattr
(
request
,
"param"
):
return
[]
val
=
request
.
param
if
isinstance
(
val
,
str
):
return
[
val
]
return
request
.
param
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
(
server_args
):
args
=
[
args
=
[
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
...
@@ -23,6 +60,7 @@ def server():
...
@@ -23,6 +60,7 @@ def server():
"--enforce-eager"
,
"--enforce-eager"
,
"--max-num-seqs"
,
"--max-num-seqs"
,
"128"
,
"128"
,
*
server_args
,
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
...
@@ -35,6 +73,15 @@ async def client(server):
...
@@ -35,6 +73,15 @@ async def client(server):
yield
async_client
yield
async_client
@
pytest
.
mark
.
parametrize
(
"server_args"
,
[
pytest
.
param
([],
id
=
"default-frontend-multiprocessing"
),
pytest
.
param
([
"--disable-frontend-multiprocessing"
],
id
=
"disable-frontend-multiprocessing"
)
],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_show_version
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_show_version
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
...
@@ -45,6 +92,15 @@ async def test_show_version(client: openai.AsyncOpenAI):
...
@@ -45,6 +92,15 @@ async def test_show_version(client: openai.AsyncOpenAI):
assert
response
.
json
()
==
{
"version"
:
VLLM_VERSION
}
assert
response
.
json
()
==
{
"version"
:
VLLM_VERSION
}
@
pytest
.
mark
.
parametrize
(
"server_args"
,
[
pytest
.
param
([],
id
=
"default-frontend-multiprocessing"
),
pytest
.
param
([
"--disable-frontend-multiprocessing"
],
id
=
"disable-frontend-multiprocessing"
)
],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_check_health
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_check_health
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
...
...
tests/entrypoints/openai/test_chat.py
View file @
6d2051cc
...
@@ -433,18 +433,28 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
...
@@ -433,18 +433,28 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_tokens
=
10
,
extra_body
=
dict
(
min_tokens
=
10
),
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
stream_options
=
{
stream_options
=
{
"include_usage"
:
True
,
"include_usage"
:
True
,
"continuous_usage_stats"
:
True
"continuous_usage_stats"
:
True
,
},
},
)
)
last_completion_tokens
=
0
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
assert
chunk
.
usage
.
prompt_tokens
>=
0
assert
chunk
.
usage
.
prompt_tokens
>=
0
assert
chunk
.
usage
.
completion_tokens
>=
0
assert
last_completion_tokens
==
0
or
\
chunk
.
usage
.
completion_tokens
>
last_completion_tokens
or
\
(
not
chunk
.
choices
and
chunk
.
usage
.
completion_tokens
==
last_completion_tokens
)
assert
chunk
.
usage
.
total_tokens
==
(
chunk
.
usage
.
prompt_tokens
+
assert
chunk
.
usage
.
total_tokens
==
(
chunk
.
usage
.
prompt_tokens
+
chunk
.
usage
.
completion_tokens
)
chunk
.
usage
.
completion_tokens
)
last_completion_tokens
=
chunk
.
usage
.
completion_tokens
assert
last_completion_tokens
==
10
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
...
...
tests/entrypoints/openai/test_chat_template.py
View file @
6d2051cc
...
@@ -12,7 +12,7 @@ assert chatml_jinja_path.exists()
...
@@ -12,7 +12,7 @@ assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT
=
[
MODEL_TEMPLATE_GENERATON_OUTPUT
=
[
(
"facebook/opt-125m"
,
chatml_jinja_path
,
True
,
"""<|im_start|>user
(
"facebook/opt-125m"
,
chatml_jinja_path
,
True
,
False
,
"""<|im_start|>user
Hello<|im_end|>
Hello<|im_end|>
<|im_start|>assistant
<|im_start|>assistant
Hi there!<|im_end|>
Hi there!<|im_end|>
...
@@ -20,12 +20,20 @@ Hi there!<|im_end|>
...
@@ -20,12 +20,20 @@ Hi there!<|im_end|>
What is the capital of<|im_end|>
What is the capital of<|im_end|>
<|im_start|>assistant
<|im_start|>assistant
"""
),
"""
),
(
"facebook/opt-125m"
,
chatml_jinja_path
,
False
,
"""<|im_start|>user
(
"facebook/opt-125m"
,
chatml_jinja_path
,
False
,
False
,
"""<|im_start|>user
Hello<|im_end|>
Hello<|im_end|>
<|im_start|>assistant
<|im_start|>assistant
Hi there!<|im_end|>
Hi there!<|im_end|>
<|im_start|>user
<|im_start|>user
What is the capital of"""
)
What is the capital of"""
),
(
"facebook/opt-125m"
,
chatml_jinja_path
,
False
,
True
,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of"""
),
]
]
TEST_MESSAGES
=
[
TEST_MESSAGES
=
[
...
@@ -42,6 +50,10 @@ TEST_MESSAGES = [
...
@@ -42,6 +50,10 @@ TEST_MESSAGES = [
'content'
:
'What is the capital of'
'content'
:
'What is the capital of'
},
},
]
]
ASSISTANT_MESSAGE_TO_CONTINUE
=
{
'role'
:
'assistant'
,
'content'
:
'The capital of'
}
def
test_load_chat_template
():
def
test_load_chat_template
():
...
@@ -73,10 +85,10 @@ def test_no_load_chat_template_literallike():
...
@@ -73,10 +85,10 @@ def test_no_load_chat_template_literallike():
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model,template,add_generation_prompt,expected_output"
,
"model,template,add_generation_prompt,
continue_final_message,
expected_output"
,
MODEL_TEMPLATE_GENERATON_OUTPUT
)
MODEL_TEMPLATE_GENERATON_OUTPUT
)
def
test_get_gen_prompt
(
model
,
template
,
add_generation_prompt
,
def
test_get_gen_prompt
(
model
,
template
,
add_generation_prompt
,
expected_output
):
continue_final_message
,
expected_output
):
# Initialize the tokenizer
# Initialize the tokenizer
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model
)
template_content
=
load_chat_template
(
chat_template
=
template
)
template_content
=
load_chat_template
(
chat_template
=
template
)
...
@@ -84,8 +96,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
...
@@ -84,8 +96,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
# Create a mock request object using keyword arguments
# Create a mock request object using keyword arguments
mock_request
=
ChatCompletionRequest
(
mock_request
=
ChatCompletionRequest
(
model
=
model
,
model
=
model
,
messages
=
TEST_MESSAGES
,
messages
=
TEST_MESSAGES
+
[
ASSISTANT_MESSAGE_TO_CONTINUE
]
add_generation_prompt
=
add_generation_prompt
)
if
continue_final_message
else
TEST_MESSAGES
,
add_generation_prompt
=
add_generation_prompt
,
continue_final_message
=
continue_final_message
,
)
# Call the function and get the result
# Call the function and get the result
result
=
apply_hf_chat_template
(
result
=
apply_hf_chat_template
(
...
@@ -93,6 +108,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
...
@@ -93,6 +108,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
conversation
=
mock_request
.
messages
,
conversation
=
mock_request
.
messages
,
chat_template
=
mock_request
.
chat_template
or
template_content
,
chat_template
=
mock_request
.
chat_template
or
template_content
,
add_generation_prompt
=
mock_request
.
add_generation_prompt
,
add_generation_prompt
=
mock_request
.
add_generation_prompt
,
continue_final_message
=
mock_request
.
continue_final_message
,
)
)
# Test assertion
# Test assertion
...
...
tests/entrypoints/openai/test_chunked_prompt.py
0 → 100644
View file @
6d2051cc
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
from
...utils
import
RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
# lora config below
"--max-num-seqs"
,
"128"
,
"--enable-chunked-prefill"
,
"--max-num-batched-tokens"
,
"1000"
,
# large prompts create a lot of output
"--disable-log-requests"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest_asyncio
.
fixture
async
def
client
(
server
):
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
async
def
test_completion_stream_options_and_logprobs_with_long_prompts
(
client
:
openai
.
AsyncOpenAI
):
# Test stream with long prompt
prompt
=
"What is the capital of France?"
*
400
stream
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
,
"continuous_usage_stats"
:
True
,
},
logprobs
=
5
,
)
tokens_received
=
0
finished
=
False
async
for
chunk
in
stream
:
assert
chunk
.
usage
.
prompt_tokens
>=
0
assert
chunk
.
usage
.
completion_tokens
>=
0
assert
chunk
.
usage
.
total_tokens
==
(
chunk
.
usage
.
prompt_tokens
+
chunk
.
usage
.
completion_tokens
)
if
not
finished
:
tokens_received
+=
1
assert
chunk
.
choices
[
0
].
text
if
chunk
.
choices
[
0
].
finish_reason
is
not
None
:
finished
=
True
if
finished
:
assert
chunk
.
usage
.
completion_tokens
==
tokens_received
@
pytest
.
mark
.
asyncio
async
def
test_chat_completion_stream_options_and_logprobs_with_long_prompts
(
client
:
openai
.
AsyncOpenAI
):
# Test stream with long prompt
messages
=
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
{
"role"
:
"user"
,
"content"
:
"What is the capital of France?"
*
400
}]
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
,
"continuous_usage_stats"
:
True
,
},
logprobs
=
True
,
top_logprobs
=
5
,
)
tokens_received
=
0
empty_chunks_received
=
0
finished
=
False
async
for
chunk
in
stream
:
assert
chunk
.
usage
.
prompt_tokens
>=
0
assert
chunk
.
usage
.
completion_tokens
>=
0
assert
chunk
.
usage
.
total_tokens
==
(
chunk
.
usage
.
prompt_tokens
+
chunk
.
usage
.
completion_tokens
)
if
not
finished
:
if
chunk
.
choices
[
0
].
delta
.
content
==
""
:
# when there is no tokens generated
assert
chunk
.
usage
.
completion_tokens
==
0
assert
chunk
.
choices
[
0
].
logprobs
is
None
empty_chunks_received
+=
1
else
:
tokens_received
+=
1
if
chunk
.
choices
[
0
].
finish_reason
is
not
None
:
finished
=
True
if
finished
:
assert
chunk
.
usage
.
completion_tokens
==
tokens_received
assert
empty_chunks_received
<=
1
tests/entrypoints/openai/test_cli_args.py
View file @
6d2051cc
import
json
import
json
import
unittest
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
import
pytest
from
vllm.entrypoints.openai.cli_args
import
(
make_arg_parser
,
validate_parsed_serve_args
)
from
vllm.entrypoints.openai.serving_engine
import
LoRAModulePath
from
vllm.entrypoints.openai.serving_engine
import
LoRAModulePath
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
from
...utils
import
VLLM_PATH
LORA_MODULE
=
{
LORA_MODULE
=
{
"name"
:
"module2"
,
"name"
:
"module2"
,
"path"
:
"/path/to/module2"
,
"path"
:
"/path/to/module2"
,
"base_model_name"
:
"llama"
"base_model_name"
:
"llama"
}
}
CHATML_JINJA_PATH
=
VLLM_PATH
/
"examples/template_chatml.jinja"
assert
CHATML_JINJA_PATH
.
exists
()
class
TestLoraParserAction
(
unittest
.
TestCase
):
@
pytest
.
fixture
def
serve_parser
():
parser
=
FlexibleArgumentParser
(
description
=
"vLLM's remote OpenAI server."
)
return
make_arg_parser
(
parser
)
def
setUp
(
self
):
# Setting up argparse parser for tests
parser
=
FlexibleArgumentParser
(
description
=
"vLLM's remote OpenAI server."
)
self
.
parser
=
make_arg_parser
(
parser
)
def
test_valid_key_value_format
(
self
):
### Tests for Lora module parsing
# Test old format: name=path
def
test_valid_key_value_format
(
serve_parser
):
args
=
self
.
parser
.
parse_args
([
# Test old format: name=path
'--lora-modules'
,
args
=
serve_parser
.
parse_args
([
'module1=/path/to/module1'
,
'--lora-modules'
,
'module1=/path/to/module1'
,
])
expected
=
[
LoRAModulePath
(
name
=
'module1'
,
path
=
'/path/to/module1'
)]
assert
args
.
lora_modules
==
expected
def
test_valid_json_format
(
serve_parser
):
# Test valid JSON format input
args
=
serve_parser
.
parse_args
([
'--lora-modules'
,
json
.
dumps
(
LORA_MODULE
),
])
expected
=
[
LoRAModulePath
(
name
=
'module2'
,
path
=
'/path/to/module2'
,
base_model_name
=
'llama'
)
]
assert
args
.
lora_modules
==
expected
def
test_invalid_json_format
(
serve_parser
):
# Test invalid JSON format input, missing closing brace
with
pytest
.
raises
(
SystemExit
):
serve_parser
.
parse_args
([
'--lora-modules'
,
'{"name": "module3", "path": "/path/to/module3"'
])
])
expected
=
[
LoRAModulePath
(
name
=
'module1'
,
path
=
'/path/to/module1'
)]
self
.
assertEqual
(
args
.
lora_modules
,
expected
)
def
test_valid_json_format
(
self
):
# Test valid JSON format input
def
test_invalid_type_error
(
serve_parser
):
args
=
self
.
parser
.
parse_args
([
# Test type error when values are not JSON or key=value
with
pytest
.
raises
(
SystemExit
):
serve_parser
.
parse_args
([
'--lora-modules'
,
'--lora-modules'
,
json
.
dumps
(
LORA_MODULE
),
'invalid_format'
# This is not JSON or key=value format
])
])
expected
=
[
LoRAModulePath
(
name
=
'module2'
,
path
=
'/path/to/module2'
,
def
test_invalid_json_field
(
serve_parser
):
base_model_name
=
'llama'
)
# Test valid JSON format but missing required fields
]
with
pytest
.
raises
(
SystemExit
):
self
.
assertEqual
(
args
.
lora_modules
,
expected
)
serve_parser
.
parse_args
([
def
test_invalid_json_format
(
self
):
# Test invalid JSON format input, missing closing brace
with
self
.
assertRaises
(
SystemExit
):
self
.
parser
.
parse_args
([
'--lora-modules'
,
'{"name": "module3", "path": "/path/to/module3"'
])
def
test_invalid_type_error
(
self
):
# Test type error when values are not JSON or key=value
with
self
.
assertRaises
(
SystemExit
):
self
.
parser
.
parse_args
([
'--lora-modules'
,
'invalid_format'
# This is not JSON or key=value format
])
def
test_invalid_json_field
(
self
):
# Test valid JSON format but missing required fields
with
self
.
assertRaises
(
SystemExit
):
self
.
parser
.
parse_args
([
'--lora-modules'
,
'{"name": "module4"}'
# Missing required 'path' field
])
def
test_empty_values
(
self
):
# Test when no LoRA modules are provided
args
=
self
.
parser
.
parse_args
([
'--lora-modules'
,
''
])
self
.
assertEqual
(
args
.
lora_modules
,
[])
def
test_multiple_valid_inputs
(
self
):
# Test multiple valid inputs (both old and JSON format)
args
=
self
.
parser
.
parse_args
([
'--lora-modules'
,
'--lora-modules'
,
'module1=/path/to/module1'
,
'{"name": "module4"}'
# Missing required 'path' field
json
.
dumps
(
LORA_MODULE
),
])
])
expected
=
[
LoRAModulePath
(
name
=
'module1'
,
path
=
'/path/to/module1'
),
LoRAModulePath
(
name
=
'module2'
,
path
=
'/path/to/module2'
,
base_model_name
=
'llama'
)
]
self
.
assertEqual
(
args
.
lora_modules
,
expected
)
if
__name__
==
'__main__'
:
def
test_empty_values
(
serve_parser
):
unittest
.
main
()
# Test when no LoRA modules are provided
args
=
serve_parser
.
parse_args
([
'--lora-modules'
,
''
])
assert
args
.
lora_modules
==
[]
def
test_multiple_valid_inputs
(
serve_parser
):
# Test multiple valid inputs (both old and JSON format)
args
=
serve_parser
.
parse_args
([
'--lora-modules'
,
'module1=/path/to/module1'
,
json
.
dumps
(
LORA_MODULE
),
])
expected
=
[
LoRAModulePath
(
name
=
'module1'
,
path
=
'/path/to/module1'
),
LoRAModulePath
(
name
=
'module2'
,
path
=
'/path/to/module2'
,
base_model_name
=
'llama'
)
]
assert
args
.
lora_modules
==
expected
### Tests for serve argument validation that run prior to loading
def
test_enable_auto_choice_passes_without_tool_call_parser
(
serve_parser
):
"""Ensure validation fails if tool choice is enabled with no call parser"""
# If we enable-auto-tool-choice, explode with no tool-call-parser
args
=
serve_parser
.
parse_args
(
args
=
[
"--enable-auto-tool-choice"
])
with
pytest
.
raises
(
TypeError
):
validate_parsed_serve_args
(
args
)
def
test_enable_auto_choice_passes_with_tool_call_parser
(
serve_parser
):
"""Ensure validation passes with tool choice enabled with a call parser"""
args
=
serve_parser
.
parse_args
(
args
=
[
"--enable-auto-tool-choice"
,
"--tool-call-parser"
,
"mistral"
,
])
validate_parsed_serve_args
(
args
)
def
test_chat_template_validation_for_happy_paths
(
serve_parser
):
"""Ensure validation passes if the chat template exists"""
args
=
serve_parser
.
parse_args
(
args
=
[
"--chat-template"
,
CHATML_JINJA_PATH
.
absolute
().
as_posix
()])
validate_parsed_serve_args
(
args
)
def
test_chat_template_validation_for_sad_paths
(
serve_parser
):
"""Ensure validation fails if the chat template doesn't exist"""
args
=
serve_parser
.
parse_args
(
args
=
[
"--chat-template"
,
"does/not/exist"
])
with
pytest
.
raises
(
ValueError
):
validate_parsed_serve_args
(
args
)
tests/entrypoints/openai/test_completion.py
View file @
6d2051cc
...
@@ -503,8 +503,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
...
@@ -503,8 +503,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
extra_body
=
dict
(
extra_body
=
dict
(
# NOTE: this has to be true for n > 1 in vLLM, but
not necessary
# NOTE: this has to be true for n > 1 in vLLM, but
# for official client.
#
not necessary
for official client.
use_beam_search
=
True
),
use_beam_search
=
True
),
)
)
assert
len
(
batch
.
choices
)
==
4
assert
len
(
batch
.
choices
)
==
4
...
...
tests/entrypoints/openai/test_embedding.py
View file @
6d2051cc
...
@@ -144,3 +144,64 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
...
@@ -144,3 +144,64 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
0
].
embedding
0
].
embedding
assert
responses_float
.
data
[
1
].
embedding
==
responses_default
.
data
[
assert
responses_float
.
data
[
1
].
embedding
==
responses_default
.
data
[
1
].
embedding
1
].
embedding
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
EMBEDDING_MODEL_NAME
],
)
async
def
test_single_embedding_truncation
(
embedding_client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
input_texts
=
[
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?"
,
]
# test single embedding
embeddings
=
await
embedding_client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_texts
,
extra_body
=
{
"truncate_prompt_tokens"
:
10
})
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
10
assert
embeddings
.
usage
.
total_tokens
==
10
input_tokens
=
[
1
,
24428
,
289
,
18341
,
26165
,
285
,
19323
,
283
,
289
,
26789
,
3871
,
28728
,
9901
,
340
,
2229
,
385
,
340
,
315
,
28741
,
28804
,
2
]
embeddings
=
await
embedding_client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_tokens
,
extra_body
=
{
"truncate_prompt_tokens"
:
10
})
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
10
assert
embeddings
.
usage
.
total_tokens
==
10
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
EMBEDDING_MODEL_NAME
],
)
async
def
test_single_embedding_truncation_invalid
(
embedding_client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
input_texts
=
[
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?"
,
]
with
pytest
.
raises
(
openai
.
BadRequestError
):
embeddings
=
await
embedding_client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_texts
,
extra_body
=
{
"truncate_prompt_tokens"
:
8193
})
assert
"error"
in
embeddings
.
object
assert
"truncate_prompt_tokens value is greater than max_model_len. "
\
"Please, select a smaller truncation size."
in
embeddings
.
message
tests/entrypoints/openai/test_metrics.py
View file @
6d2051cc
...
@@ -70,7 +70,6 @@ EXPECTED_VALUES = {
...
@@ -70,7 +70,6 @@ EXPECTED_VALUES = {
[(
"_sum"
,
_NUM_REQUESTS
*
_NUM_GENERATION_TOKENS_PER_REQUEST
),
[(
"_sum"
,
_NUM_REQUESTS
*
_NUM_GENERATION_TOKENS_PER_REQUEST
),
(
"_count"
,
_NUM_REQUESTS
)],
(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_params_n"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_params_n"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_params_best_of"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:prompt_tokens"
:
[(
"_total"
,
"vllm:prompt_tokens"
:
[(
"_total"
,
_NUM_REQUESTS
*
_NUM_PROMPT_TOKENS_PER_REQUEST
)],
_NUM_REQUESTS
*
_NUM_PROMPT_TOKENS_PER_REQUEST
)],
"vllm:generation_tokens"
:
"vllm:generation_tokens"
:
...
@@ -151,9 +150,6 @@ EXPECTED_METRICS = [
...
@@ -151,9 +150,6 @@ EXPECTED_METRICS = [
"vllm:request_params_n_sum"
,
"vllm:request_params_n_sum"
,
"vllm:request_params_n_bucket"
,
"vllm:request_params_n_bucket"
,
"vllm:request_params_n_count"
,
"vllm:request_params_n_count"
,
"vllm:request_params_best_of_sum"
,
"vllm:request_params_best_of_bucket"
,
"vllm:request_params_best_of_count"
,
"vllm:num_preemptions_total"
,
"vllm:num_preemptions_total"
,
"vllm:prompt_tokens_total"
,
"vllm:prompt_tokens_total"
,
"vllm:generation_tokens_total"
,
"vllm:generation_tokens_total"
,
...
...
Prev
1
…
5
6
7
8
9
10
11
12
13
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment