Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
99b471c2
Commit
99b471c2
authored
May 21, 2024
by
zhuwenwen
Browse files
merge v0.4.1
parents
1925d2e9
468d761b
Changes
336
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1875 additions
and
284 deletions
+1875
-284
requirements-dev.txt
requirements-dev.txt
+2
-1
requirements-neuron.txt
requirements-neuron.txt
+4
-9
requirements-rocm.txt
requirements-rocm.txt
+4
-16
rocm_patch/commonpy_xformers-0.0.23.rocm.patch
rocm_patch/commonpy_xformers-0.0.23.rocm.patch
+0
-13
rocm_patch/flashpy_xformers-0.0.23.rocm.patch
rocm_patch/flashpy_xformers-0.0.23.rocm.patch
+0
-152
setup.py
setup.py
+49
-13
tests/async_engine/test_api_server.py
tests/async_engine/test_api_server.py
+13
-4
tests/async_engine/test_chat_template.py
tests/async_engine/test_chat_template.py
+14
-5
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+66
-0
tests/conftest.py
tests/conftest.py
+18
-5
tests/core/block/conftest.py
tests/core/block/conftest.py
+12
-0
tests/core/block/e2e/conftest.py
tests/core/block/e2e/conftest.py
+1
-16
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+224
-1
tests/core/block/test_block_manager_v2.py
tests/core/block/test_block_manager_v2.py
+103
-0
tests/core/block/test_block_table.py
tests/core/block/test_block_table.py
+75
-0
tests/core/test_block_manager.py
tests/core/test_block_manager.py
+12
-12
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+563
-0
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+686
-23
tests/core/utils.py
tests/core/utils.py
+23
-13
tests/distributed/test_basic_distributed_correctness.py
tests/distributed/test_basic_distributed_correctness.py
+6
-1
No files found.
requirements-dev.txt
View file @
99b471c2
...
@@ -7,13 +7,14 @@ codespell==2.2.6
...
@@ -7,13 +7,14 @@ codespell==2.2.6
isort==5.13.2
isort==5.13.2
# type checking
# type checking
mypy==
0
.9
91
mypy==
1
.9
.0
types-PyYAML
types-PyYAML
types-requests
types-requests
types-setuptools
types-setuptools
# testing
# testing
pytest
pytest
tensorizer==2.9.0a0
pytest-forked
pytest-forked
pytest-asyncio
pytest-asyncio
pytest-rerunfailures
pytest-rerunfailures
...
...
requirements-neuron.txt
View file @
99b471c2
sentencepiece # Required for LLaMA tokenizer.
# Common dependencies
numpy
-r requirements-common.txt
# Dependencies for Neuron devices
transformers-neuronx >= 0.9.0
transformers-neuronx >= 0.9.0
torch-neuronx >= 2.1.0
torch-neuronx >= 2.1.0
neuronx-cc
neuronx-cc
fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
requests
psutil
py-cpuinfo
\ No newline at end of file
requirements-rocm.txt
View file @
99b471c2
cmake>=3.21
# Common dependencies
ninja # For faster builds.
-r requirements-common.txt
typing-extensions>=4.8.0
starlette
# Dependencies for AMD GPUs
requests
py-cpuinfo
psutil
ray == 2.9.1
ray == 2.9.1
sentencepiece # Required for LLaMA tokenizer.
numpy
tokenizers>=0.15.0
transformers >= 4.39.1 # Required for StarCoder2 & Llava.
fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
outlines == 0.0.34
rocm_patch/commonpy_xformers-0.0.23.rocm.patch
deleted
100644 → 0
View file @
1925d2e9
--- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/common.py 2023-11-29 03:17:03.930103539 +0000
+++ common.py 2023-11-28 16:14:19.846233146 +0000
@@ -298,8 +298,8 @@
dtype = d.query.dtype
if device_type not in cls.SUPPORTED_DEVICES:
reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})")
- if device_type == "cuda" and not _built_with_cuda:
- reasons.append("xFormers wasn't build with CUDA support")
+ #if device_type == "cuda" and not _built_with_cuda:
+ # reasons.append("xFormers wasn't build with CUDA support")
if device_type == "cuda":
device_capability = torch.cuda.get_device_capability(d.device)
if device_capability < cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
rocm_patch/flashpy_xformers-0.0.23.rocm.patch
deleted
100644 → 0
View file @
1925d2e9
--- flash_ori.py 2023-12-13 05:43:31.530752623 +0000
+++ flash_patch.py 2023-12-13 06:00:45.962403104 +0000
@@ -36,44 +36,44 @@
FLASH_VERSION = "0.0.0"
try:
- try:
- from ... import _C_flashattention # type: ignore[attr-defined]
- from ..._cpp_lib import _build_metadata
-
- if _build_metadata is not None:
- FLASH_VERSION = _build_metadata.flash_version
- except ImportError:
- import flash_attn
- from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
-
- FLASH_VERSION = flash_attn.__version__
- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
- if (
- flash_ver_parsed != (2, 3, 6)
- and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1"
- ):
- raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api")
+ #try:
+ # from ... import _C_flashattention # type: ignore[attr-defined]
+ # from ..._cpp_lib import _build_metadata
+
+ # if _build_metadata is not None:
+ # FLASH_VERSION = _build_metadata.flash_version
+ #except ImportError:
+ import flash_attn
+ from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
+
+ FLASH_VERSION = flash_attn.__version__
+ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
+ # if (
+ # flash_ver_parsed != (2, 3, 6)
+ # and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1"
+ # ):
+ # raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api")
# create library so that flash-attn goes through the PyTorch Dispatcher
- _flash_lib = torch.library.Library("xformers_flash", "DEF")
-
- _flash_lib.define(
- "flash_fwd(Tensor query, Tensor key, Tensor value, "
- "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, "
- "int max_seqlen_q, int max_seqlen_k, "
- "float p, float softmax_scale, "
- "bool is_causal, int window_left, "
- "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)"
- )
+ #_flash_lib = torch.library.Library("xformers_flash", "DEF")
- _flash_lib.define(
- "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
- "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
- "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
- "int max_seqlen_q, int max_seqlen_k, "
- "float p, float softmax_scale, bool is_causal, "
- "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
- )
+ #_flash_lib.define(
+ # "flash_fwd(Tensor query, Tensor key, Tensor value, "
+ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, "
+ # "int max_seqlen_q, int max_seqlen_k, "
+ # "float p, float softmax_scale, "
+ # "bool is_causal, int window_left, "
+ # "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)"
+ #)
+
+ #_flash_lib.define(
+ # "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
+ # "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
+ # "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
+ # "int max_seqlen_q, int max_seqlen_k, "
+ # "float p, float softmax_scale, bool is_causal, "
+ # "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
+ #)
def _flash_fwd(
query,
@@ -111,8 +111,8 @@
p,
softmax_scale,
is_causal,
- window_left, # window_size_left
- window_right, # window_size_right
+ # window_left, # window_size_left
+ # window_right, # window_size_right
return_softmax,
None, # rng
)
@@ -134,15 +134,15 @@
out,
cu_seq_lens_q,
cu_seq_lens_k,
- seqused_k,
+ # seqused_k,
max_seq_len_q,
max_seq_len_k,
p,
softmax_scale,
False,
is_causal,
- window_left,
- window_right,
+ # window_left,
+ # window_right,
return_softmax,
None,
)
@@ -184,8 +184,8 @@
p,
softmax_scale,
is_causal,
- window_left,
- window_right,
+ # window_left,
+ # window_right,
None,
rng_state,
)
@@ -208,15 +208,15 @@
softmax_scale,
False, # zero_tensors
is_causal,
- window_left,
- window_right,
+ # window_left,
+ # window_right,
None,
rng_state,
)
return dq, dk, dv
- _flash_lib.impl("flash_fwd", _flash_fwd, "CUDA")
- _flash_lib.impl("flash_bwd", _flash_bwd, "CUDA")
+ #_flash_lib.impl("flash_fwd", _flash_fwd, "CUDA")
+ #_flash_lib.impl("flash_bwd", _flash_bwd, "CUDA")
except ImportError:
pass
@@ -400,7 +400,7 @@
implementation.
"""
- OPERATOR = get_operator("xformers_flash", "flash_fwd")
+ OPERATOR = _flash_fwd # get_operator("xformers_flash", "flash_fwd")
SUPPORTED_DEVICES: Set[str] = {"cuda"}
CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0)
SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
setup.py
View file @
99b471c2
...
@@ -5,7 +5,7 @@ import re
...
@@ -5,7 +5,7 @@ import re
import
subprocess
import
subprocess
import
sys
import
sys
from
shutil
import
which
from
shutil
import
which
from
typing
import
List
from
typing
import
Dict
,
List
import
torch
import
torch
from
packaging.version
import
Version
,
parse
from
packaging.version
import
Version
,
parse
...
@@ -19,6 +19,8 @@ from pathlib import Path
...
@@ -19,6 +19,8 @@ from pathlib import Path
ROOT_DIR
=
os
.
path
.
dirname
(
__file__
)
ROOT_DIR
=
os
.
path
.
dirname
(
__file__
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
# Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
VLLM_TARGET_DEVICE
=
os
.
getenv
(
"VLLM_TARGET_DEVICE"
,
"cuda"
)
# vLLM only supports Linux platform
# vLLM only supports Linux platform
assert
sys
.
platform
.
startswith
(
assert
sys
.
platform
.
startswith
(
...
@@ -54,7 +56,7 @@ class CMakeExtension(Extension):
...
@@ -54,7 +56,7 @@ class CMakeExtension(Extension):
class
cmake_build_ext
(
build_ext
):
class
cmake_build_ext
(
build_ext
):
# A dict of extension directories that have been configured.
# A dict of extension directories that have been configured.
did_config
=
{}
did_config
:
Dict
[
str
,
bool
]
=
{}
#
#
# Determine number of compilation jobs and optionally nvcc compile threads.
# Determine number of compilation jobs and optionally nvcc compile threads.
...
@@ -116,6 +118,7 @@ class cmake_build_ext(build_ext):
...
@@ -116,6 +118,7 @@ class cmake_build_ext(build_ext):
'-DCMAKE_BUILD_TYPE={}'
.
format
(
cfg
),
'-DCMAKE_BUILD_TYPE={}'
.
format
(
cfg
),
'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'
.
format
(
outdir
),
'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'
.
format
(
outdir
),
'-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'
.
format
(
self
.
build_temp
),
'-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'
.
format
(
self
.
build_temp
),
'-DVLLM_TARGET_DEVICE={}'
.
format
(
VLLM_TARGET_DEVICE
),
]
]
verbose
=
bool
(
int
(
os
.
getenv
(
'VERBOSE'
,
'0'
)))
verbose
=
bool
(
int
(
os
.
getenv
(
'VERBOSE'
,
'0'
)))
...
@@ -189,11 +192,14 @@ class cmake_build_ext(build_ext):
...
@@ -189,11 +192,14 @@ class cmake_build_ext(build_ext):
def
_is_cuda
()
->
bool
:
def
_is_cuda
()
->
bool
:
return
torch
.
version
.
cuda
is
not
None
and
not
_is_neuron
()
return
VLLM_TARGET_DEVICE
==
"cuda"
\
and
torch
.
version
.
cuda
is
not
None
\
and
not
_is_neuron
()
def
_is_hip
()
->
bool
:
def
_is_hip
()
->
bool
:
return
torch
.
version
.
hip
is
not
None
return
(
VLLM_TARGET_DEVICE
==
"cuda"
or
VLLM_TARGET_DEVICE
==
"rocm"
)
and
torch
.
version
.
hip
is
not
None
def
_is_neuron
()
->
bool
:
def
_is_neuron
()
->
bool
:
...
@@ -202,7 +208,12 @@ def _is_neuron() -> bool:
...
@@ -202,7 +208,12 @@ def _is_neuron() -> bool:
subprocess
.
run
([
"neuron-ls"
],
capture_output
=
True
,
check
=
True
)
subprocess
.
run
([
"neuron-ls"
],
capture_output
=
True
,
check
=
True
)
except
(
FileNotFoundError
,
PermissionError
,
subprocess
.
CalledProcessError
):
except
(
FileNotFoundError
,
PermissionError
,
subprocess
.
CalledProcessError
):
torch_neuronx_installed
=
False
torch_neuronx_installed
=
False
return
torch_neuronx_installed
return
torch_neuronx_installed
or
os
.
environ
.
get
(
"VLLM_BUILD_WITH_NEURON"
,
False
)
def
_is_cpu
()
->
bool
:
return
VLLM_TARGET_DEVICE
==
"cpu"
def
_install_punica
()
->
bool
:
def
_install_punica
()
->
bool
:
...
@@ -255,6 +266,7 @@ def get_nvcc_cuda_version() -> Version:
...
@@ -255,6 +266,7 @@ def get_nvcc_cuda_version() -> Version:
Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
"""
"""
assert
CUDA_HOME
is
not
None
,
"CUDA_HOME is not set"
nvcc_output
=
subprocess
.
check_output
([
CUDA_HOME
+
"/bin/nvcc"
,
"-V"
],
nvcc_output
=
subprocess
.
check_output
([
CUDA_HOME
+
"/bin/nvcc"
,
"-V"
],
universal_newlines
=
True
)
universal_newlines
=
True
)
output
=
nvcc_output
.
split
()
output
=
nvcc_output
.
split
()
...
@@ -355,6 +367,8 @@ def get_vllm_version() -> str:
...
@@ -355,6 +367,8 @@ def get_vllm_version() -> str:
if
neuron_version
!=
MAIN_CUDA_VERSION
:
if
neuron_version
!=
MAIN_CUDA_VERSION
:
neuron_version_str
=
neuron_version
.
replace
(
"."
,
""
)[:
3
]
neuron_version_str
=
neuron_version
.
replace
(
"."
,
""
)[:
3
]
version
+=
f
"+neuron
{
neuron_version_str
}
"
version
+=
f
"+neuron
{
neuron_version_str
}
"
elif
_is_cpu
():
version
+=
"+cpu"
else
:
else
:
raise
RuntimeError
(
"Unknown runtime environment"
)
raise
RuntimeError
(
"Unknown runtime environment"
)
...
@@ -372,19 +386,38 @@ def read_readme() -> str:
...
@@ -372,19 +386,38 @@ def read_readme() -> str:
def
get_requirements
()
->
List
[
str
]:
def
get_requirements
()
->
List
[
str
]:
"""Get Python package dependencies from requirements.txt."""
"""Get Python package dependencies from requirements.txt."""
if
_is_cuda
():
with
open
(
get_path
(
"requirements.txt"
))
as
f
:
def
_read_requirements
(
filename
:
str
)
->
List
[
str
]:
with
open
(
get_path
(
filename
))
as
f
:
requirements
=
f
.
read
().
strip
().
split
(
"
\n
"
)
requirements
=
f
.
read
().
strip
().
split
(
"
\n
"
)
resolved_requirements
=
[]
for
line
in
requirements
:
if
line
.
startswith
(
"-r "
):
resolved_requirements
+=
_read_requirements
(
line
.
split
()[
1
])
else
:
resolved_requirements
.
append
(
line
)
return
resolved_requirements
if
_is_cuda
():
requirements
=
_read_requirements
(
"requirements-cuda.txt"
)
cuda_major
=
torch
.
version
.
cuda
.
split
(
"."
)[
0
]
modified_requirements
=
[]
for
req
in
requirements
:
if
"vllm-nccl-cu12"
in
req
:
modified_requirements
.
append
(
req
.
replace
(
"vllm-nccl-cu12"
,
f
"vllm-nccl-cu
{
cuda_major
}
"
))
else
:
modified_requirements
.
append
(
req
)
requirements
=
modified_requirements
elif
_is_hip
():
elif
_is_hip
():
with
open
(
get_path
(
"requirements-rocm.txt"
))
as
f
:
requirements
=
_read_requirements
(
"requirements-rocm.txt"
)
requirements
=
f
.
read
().
strip
().
split
(
"
\n
"
)
elif
_is_neuron
():
elif
_is_neuron
():
with
open
(
get_path
(
"requirements-neuron.txt"
))
as
f
:
requirements
=
_read_requirements
(
"requirements-neuron.txt"
)
requirements
=
f
.
read
().
strip
().
split
(
"
\n
"
)
elif
_is_cpu
():
requirements
=
_read_requirements
(
"requirements-cpu.txt"
)
else
:
else
:
raise
ValueError
(
raise
ValueError
(
"Unsupported platform, please use CUDA, ROCM or Neuron."
)
"Unsupported platform, please use CUDA, ROCm, Neuron, or CPU."
)
return
requirements
return
requirements
...
@@ -432,6 +465,9 @@ setup(
...
@@ -432,6 +465,9 @@ setup(
python_requires
=
">=3.8"
,
python_requires
=
">=3.8"
,
install_requires
=
get_requirements
(),
install_requires
=
get_requirements
(),
ext_modules
=
ext_modules
,
ext_modules
=
ext_modules
,
extras_require
=
{
"tensorizer"
:
[
"tensorizer==2.9.0a1"
],
},
cmdclass
=
{
"build_ext"
:
cmake_build_ext
}
if
not
_is_neuron
()
else
{},
cmdclass
=
{
"build_ext"
:
cmake_build_ext
}
if
not
_is_neuron
()
else
{},
package_data
=
package_data
,
package_data
=
package_data
,
)
)
tests/async_engine/test_api_server.py
View file @
99b471c2
...
@@ -25,21 +25,30 @@ def _query_server_long(prompt: str) -> dict:
...
@@ -25,21 +25,30 @@ def _query_server_long(prompt: str) -> dict:
@
pytest
.
fixture
@
pytest
.
fixture
def
api_server
(
tokenizer_pool_size
:
int
):
def
api_server
(
tokenizer_pool_size
:
int
,
engine_use_ray
:
bool
,
worker_use_ray
:
bool
):
script_path
=
Path
(
__file__
).
parent
.
joinpath
(
script_path
=
Path
(
__file__
).
parent
.
joinpath
(
"api_server_async_engine.py"
).
absolute
()
"api_server_async_engine.py"
).
absolute
()
uvicorn_process
=
subprocess
.
Popen
(
[
commands
=
[
sys
.
executable
,
"-u"
,
sys
.
executable
,
"-u"
,
str
(
script_path
),
"--model"
,
"facebook/opt-125m"
,
"--host"
,
str
(
script_path
),
"--model"
,
"facebook/opt-125m"
,
"--host"
,
"127.0.0.1"
,
"--tokenizer-pool-size"
,
"127.0.0.1"
,
"--tokenizer-pool-size"
,
str
(
tokenizer_pool_size
)
str
(
tokenizer_pool_size
)
])
]
if
engine_use_ray
:
commands
.
append
(
"--engine-use-ray"
)
if
worker_use_ray
:
commands
.
append
(
"--worker-use-ray"
)
uvicorn_process
=
subprocess
.
Popen
(
commands
)
yield
yield
uvicorn_process
.
terminate
()
uvicorn_process
.
terminate
()
@
pytest
.
mark
.
parametrize
(
"tokenizer_pool_size"
,
[
0
,
2
])
@
pytest
.
mark
.
parametrize
(
"tokenizer_pool_size"
,
[
0
,
2
])
def
test_api_server
(
api_server
,
tokenizer_pool_size
:
int
):
@
pytest
.
mark
.
parametrize
(
"worker_use_ray"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"engine_use_ray"
,
[
False
,
True
])
def
test_api_server
(
api_server
,
tokenizer_pool_size
:
int
,
worker_use_ray
:
bool
,
engine_use_ray
:
bool
):
"""
"""
Run the API server and test it.
Run the API server and test it.
...
...
tests/async_engine/test_chat_template.py
View file @
99b471c2
...
@@ -76,20 +76,29 @@ def test_load_chat_template():
...
@@ -76,20 +76,29 @@ def test_load_chat_template():
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant
\\
n' }}{% endif %}"""
# noqa: E501
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant
\\
n' }}{% endif %}"""
# noqa: E501
def
test_no_load_chat_template
():
def
test_no_load_chat_template
_filelike
():
# Testing chatml template
# Testing chatml template
template
=
"../../examples/does_not_exist"
template
=
"../../examples/does_not_exist"
tokenizer
=
MockTokenizer
()
tokenizer
=
MockTokenizer
()
mock_serving_chat
=
MockServingChat
(
tokenizer
)
with
pytest
.
raises
(
ValueError
,
match
=
"looks like a file path"
):
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
def
test_no_load_chat_template_literallike
():
# Testing chatml template
template
=
"{{ messages }}"
tokenizer
=
MockTokenizer
()
mock_serving_chat
=
MockServingChat
(
tokenizer
)
mock_serving_chat
=
MockServingChat
(
tokenizer
)
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
chat_template
=
template
)
template_content
=
tokenizer
.
chat_template
template_content
=
tokenizer
.
chat_template
# Test assertions
assert
template_content
==
template
assert
template_content
is
not
None
# Hard coded value for template_chatml.jinja
assert
template_content
==
"""../../examples/does_not_exist"""
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/basic_correctness/test_chunked_prefill.py
0 → 100644
View file @
99b471c2
"""Compare the outputs of HF and vLLM when using greedy sampling.
It tests chunked prefill. Chunked prefill can be enabled by
enable_chunked_prefill=True. If prefill size exceeds max_num_batched_tokens,
prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`.
"""
import
pytest
MODELS
=
[
"facebook/opt-125m"
,
"meta-llama/Llama-2-7b-hf"
,
]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"chunked_prefill_token_size"
,
[
1
,
4
,
16
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
False
,
True
])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
1
])
def
test_models
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
chunked_prefill_token_size
:
int
,
enforce_eager
:
bool
,
tensor_parallel_size
:
int
,
)
->
None
:
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
enable_chunked_prefill
=
False
max_num_batched_tokens
=
None
if
chunked_prefill_token_size
!=
-
1
:
enable_chunked_prefill
=
True
max_num_batched_tokens
=
chunked_prefill_token_size
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
enable_chunked_prefill
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
print
(
vllm_outputs
[
0
])
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
assert
hf_output_str
==
vllm_output_str
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
tests/conftest.py
View file @
99b471c2
...
@@ -11,8 +11,7 @@ from transformers import (AutoModelForCausalLM, AutoProcessor,
...
@@ -11,8 +11,7 @@ from transformers import (AutoModelForCausalLM, AutoProcessor,
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
TokenizerPoolConfig
,
VisionLanguageConfig
from
vllm.config
import
TokenizerPoolConfig
,
VisionLanguageConfig
from
vllm.model_executor.parallel_utils.parallel_state
import
(
from
vllm.distributed
import
destroy_model_parallel
destroy_model_parallel
)
from
vllm.sequence
import
MultiModalData
from
vllm.sequence
import
MultiModalData
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
...
@@ -55,10 +54,24 @@ def cleanup():
...
@@ -55,10 +54,24 @@ def cleanup():
torch
.
cuda
.
empty_cache
()
torch
.
cuda
.
empty_cache
()
@
pytest
.
fixture
()
def
should_do_global_cleanup_after_test
(
request
)
->
bool
:
"""Allow subdirectories to skip global cleanup by overriding this fixture.
This can provide a ~10x speedup for non-GPU unit tests since they don't need
to initialize torch.
"""
if
request
.
node
.
get_closest_marker
(
"skip_global_cleanup"
):
return
False
return
True
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
def
cleanup_fixture
():
def
cleanup_fixture
(
should_do_global_cleanup_after_test
:
bool
):
yield
yield
cleanup
()
if
should_do_global_cleanup_after_test
:
cleanup
()
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
...
@@ -388,7 +401,7 @@ class VllmRunner:
...
@@ -388,7 +401,7 @@ class VllmRunner:
cleanup
()
cleanup
()
@
pytest
.
fixture
@
pytest
.
fixture
(
scope
=
"session"
)
def
vllm_runner
():
def
vllm_runner
():
return
VllmRunner
return
VllmRunner
...
...
tests/core/block/conftest.py
0 → 100644
View file @
99b471c2
import
pytest
@
pytest
.
fixture
()
def
should_do_global_cleanup_after_test
()
->
bool
:
"""Disable the global cleanup fixture for tests in this directory. This
provides a ~10x speedup for unit tests that don't load a model to GPU.
This requires that tests in this directory clean up after themselves if they
use the GPU.
"""
return
False
tests/core/block/e2e/conftest.py
View file @
99b471c2
import
contextlib
import
gc
import
pytest
import
pytest
import
ray
import
torch
from
tests.conftest
import
cleanup
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.model_executor.parallel_utils.parallel_state
import
(
destroy_model_parallel
)
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.model_executor.utils
import
set_random_seed
def
cleanup
():
destroy_model_parallel
()
with
contextlib
.
suppress
(
AssertionError
):
torch
.
distributed
.
destroy_process_group
()
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
ray
.
shutdown
()
@
pytest
.
fixture
@
pytest
.
fixture
def
baseline_llm_generator
(
common_llm_kwargs
,
per_test_common_llm_kwargs
,
def
baseline_llm_generator
(
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
seed
):
baseline_llm_kwargs
,
seed
):
...
...
tests/core/block/e2e/test_correctness.py
View file @
99b471c2
...
@@ -16,7 +16,7 @@ from vllm import SamplingParams
...
@@ -16,7 +16,7 @@ from vllm import SamplingParams
# Allow only 5 sequences of ~1024 tokens in worst case.
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
16
,
"block_size"
:
16
,
"
forced_
num_gpu_blocks"
:
5
*
(
64
+
1
),
"num_gpu_blocks
_override
"
:
5
*
(
64
+
1
),
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
...
@@ -77,6 +77,229 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
...
@@ -77,6 +77,229 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
assert
baseline_token_ids
==
test_token_ids
assert
baseline_token_ids
==
test_token_ids
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Use a small model for a fast test.
"model"
:
"facebook/opt-125m"
,
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
# Use a large block size to trigger more copy-on-writes.
"block_size"
:
32
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"use_v2_block_manager"
:
False
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_v1_v2_greedy_equality_with_cow
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
"""Verify beam search equality with block manager v1 and v2.
This requires copy-on-writes; if the v1 and v2 output is the same, then
we have some confidence cow is working.
"""
output_len
=
128
temperature
=
0.0
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
prompts
=
[
prompt
for
prompt
,
_
in
zip
(
cycle
(
prompts
),
range
(
batch_size
))]
sampling_params
=
SamplingParams
(
max_tokens
=
output_len
,
ignore_eos
=
True
,
temperature
=
temperature
,
use_beam_search
=
True
,
best_of
=
2
,
)
print
(
'Getting token ids from block manager v1'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids from block manager v2'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
for
expected_token_ids
,
actual_token_ids
in
zip
(
baseline_token_ids
,
test_token_ids
):
assert
expected_token_ids
==
actual_token_ids
assert
baseline_token_ids
==
test_token_ids
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Use a small model for a fast test.
"model"
:
"facebook/opt-125m"
,
# Our prompts will generate 128 tokens; since the prompts themselves are
# small, we don't need much KV space beyond 128.
"max_model_len"
:
160
,
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
# Lookahead scheduling only supported in v2 block manager.
"use_v2_block_manager"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
"block_size"
:
16
,
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size
"num_gpu_blocks_override"
:
2
*
(
8
+
1
),
},
{
"block_size"
:
8
,
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 16 = 128/block_size
"num_gpu_blocks_override"
:
2
*
(
16
+
1
),
}
])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"num_lookahead_slots"
:
0
,
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
# We run one test with block_size < lookahead_slots, one test with
# block_size > lookahead_slots
"num_lookahead_slots"
:
10
,
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_lookahead_greedy_equality_with_preemption
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
"""Verify vLLM produces the same output with greedy sampling, when lookahead
scheduling is used vs. not.
Lookahead scheduling is not expected to modify the output, as it simply
allocates empty slots ahead of the known token ids in a sliding fashion.
This test constrains the total number of blocks to force preemption. It also
varies the block size so that the lookahead size is less than and greater
than the block size.
"""
output_len
=
128
temperature
=
0.0
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
prompts
=
[
prompt
for
prompt
,
_
in
zip
(
cycle
(
prompts
),
range
(
batch_size
))]
sampling_params
=
SamplingParams
(
max_tokens
=
output_len
,
ignore_eos
=
True
,
temperature
=
temperature
,
)
print
(
'Getting token ids without lookahead scheduling'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids with lookahead scheduling'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
for
expected_token_ids
,
actual_token_ids
in
zip
(
baseline_token_ids
,
test_token_ids
):
assert
expected_token_ids
==
actual_token_ids
assert
baseline_token_ids
==
test_token_ids
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[
{
# Use a small model for a fast test.
"model"
:
"facebook/opt-125m"
,
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
"enable_chunked_prefill"
:
True
,
"max_num_batched_tokens"
:
2
,
"max_num_seqs"
:
2
,
},
])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[
{
"use_v2_block_manager"
:
False
,
},
])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"use_v2_block_manager"
:
True
,
"num_lookahead_slots"
:
0
,
},
{
"use_v2_block_manager"
:
True
,
"num_lookahead_slots"
:
5
,
},
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_chunked_prefill_block_manager_v2
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
"""Verify that chunked prefill works with BlockManagerV2, with and without
lookahead scheduling.
"""
output_len
=
32
temperature
=
0.0
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
prompts
=
[
prompt
for
prompt
,
_
in
zip
(
cycle
(
prompts
),
range
(
batch_size
))]
sampling_params
=
SamplingParams
(
max_tokens
=
output_len
,
ignore_eos
=
True
,
temperature
=
temperature
,
)
print
(
'Getting token ids with BlockManagerV1'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids with BlockManagerV2'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
for
expected_token_ids
,
actual_token_ids
in
zip
(
baseline_token_ids
,
test_token_ids
):
assert
expected_token_ids
==
actual_token_ids
assert
baseline_token_ids
==
test_token_ids
def
get_token_ids_from_llm_generator
(
llm_generator
,
prompts
,
sampling_params
):
def
get_token_ids_from_llm_generator
(
llm_generator
,
prompts
,
sampling_params
):
for
llm
in
llm_generator
:
for
llm
in
llm_generator
:
outputs
=
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
...
...
tests/core/block/test_block_
space_
manager.py
→
tests/core/block/test_block_manager
_v2
.py
View file @
99b471c2
...
@@ -2,6 +2,8 @@ import pytest
...
@@ -2,6 +2,8 @@ import pytest
from
vllm.core.block_manager_v2
import
BlockSpaceManagerV2
from
vllm.core.block_manager_v2
import
BlockSpaceManagerV2
from
vllm.core.interfaces
import
AllocStatus
from
vllm.core.interfaces
import
AllocStatus
from
vllm.sequence
import
Logprob
,
SequenceStatus
from
vllm.utils
import
chunk_list
from
..utils
import
create_seq_group
from
..utils
import
create_seq_group
...
@@ -29,7 +31,7 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
...
@@ -29,7 +31,7 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
for
num_prompt_blocks
in
range
(
1
,
num_gpu_blocks
-
num_output_blocks
):
for
num_prompt_blocks
in
range
(
1
,
num_gpu_blocks
-
num_output_blocks
):
seq_group
=
create_seq_group
(
seq_group
=
create_seq_group
(
seq_prompt_len
s
=
block_size
*
num_prompt_blocks
,
seq_prompt_len
=
block_size
*
num_prompt_blocks
,
seq_output_lens
=
[
seq_output_lens
=
[
block_size
*
num_output_blocks_per_seq
block_size
*
num_output_blocks_per_seq
for
_
in
range
(
num_seqs_per_group
)
for
_
in
range
(
num_seqs_per_group
)
...
@@ -48,3 +50,54 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
...
@@ -48,3 +50,54 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
assert
can_allocate_result
==
AllocStatus
.
OK
assert
can_allocate_result
==
AllocStatus
.
OK
else
:
else
:
assert
can_allocate_result
==
AllocStatus
.
LATER
assert
can_allocate_result
==
AllocStatus
.
LATER
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("prompt_len", [1, 7, 8])
@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
@pytest.mark.parametrize("num_lookahead_slots", [0, 10])
def test_append_slots(block_size, prompt_len, num_slots_to_append,
                      num_lookahead_slots):
    """Verify append_slots consumes the correct number of blocks from the block
    table.

    A single-sequence group is allocated, ``num_slots_to_append`` tokens are
    appended to the sequence, and ``append_slots`` is then called with
    ``num_lookahead_slots``. The drop in free GPU blocks must equal the number
    of blocks needed for prompt + appended tokens + lookahead slots, minus the
    blocks the prompt alone already occupied.
    """

    num_gpu_blocks = 1024
    watermark = 0.1
    block_manager = BlockSpaceManagerV2(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        watermark=watermark,
    )

    seq_group = create_seq_group(
        seq_prompt_len=prompt_len,
        seq_output_lens=[0],
    )

    # Allocate the sequence group.
    # Compare against AllocStatus.OK explicitly (as the other test in this
    # file does): a bare `assert can_allocate(...)` passes for any enum
    # member because enum members are truthy.
    assert block_manager.can_allocate(seq_group) == AllocStatus.OK
    block_manager.allocate(seq_group)

    # Set seq to RUNNING.
    seq = seq_group.get_seqs()[0]
    seq.status = SequenceStatus.RUNNING

    # Append tokens to the sequence.
    for token_id in range(num_slots_to_append):
        seq.append_token_id(token_id, {token_id: Logprob(0.0)})

    # Append slots for new tokens and lookahead slots.
    free_blocks_before_append = block_manager.get_num_free_gpu_blocks()
    block_manager.append_slots(seq, num_lookahead_slots)
    num_consumed_blocks = (free_blocks_before_append -
                           block_manager.get_num_free_gpu_blocks())

    # Expect consumed blocks to be the new blocks required to support the new
    # slots: blocks for the full length minus blocks for the prompt alone.
    num_blocks_total = len(
        chunk_list(
            list(
                range(prompt_len + num_slots_to_append +
                      num_lookahead_slots)), block_size))
    num_blocks_prompt = len(chunk_list(list(range(prompt_len)), block_size))
    expected_consumed_blocks = num_blocks_total - num_blocks_prompt
    assert num_consumed_blocks == expected_consumed_blocks
tests/core/block/test_block_table.py
View file @
99b471c2
...
@@ -498,3 +498,78 @@ def test_cow_lookahead_simple(block_size: int, sequence_len: int,
...
@@ -498,3 +498,78 @@ def test_cow_lookahead_simple(block_size: int, sequence_len: int,
# After free, expect all blocks to be freed.
# After free, expect all blocks to be freed.
assert
allocator
.
get_num_free_blocks
(
Device
.
GPU
)
==
num_gpu_blocks
assert
allocator
.
get_num_free_blocks
(
Device
.
GPU
)
==
num_gpu_blocks
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_new_tokens", [1, 16, 129])
@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
                                            num_new_tokens: int,
                                            num_lookahead_slots: int,
                                            allocator_type: str):
    """Check `get_num_blocks_touched_by_append_slots` against real usage.

    The block table is forked so that its blocks are shared, then tokens are
    appended. The number of free GPU blocks consumed by the append is compared
    with the value `get_num_blocks_touched_by_append_slots` predicted
    beforehand.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    prompt_token_ids = list(range(sequence_len))
    new_token_ids = list(range(num_new_tokens))

    block_table = BlockTable(block_size=block_size, block_allocator=allocator)
    block_table.allocate(token_ids=prompt_token_ids, device=Device.GPU)

    # Reserve the lookahead slots before forking so that both sequences have
    # the same lookahead blocks.
    block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots)

    # Fork so that every block has refcount > 1. The fork is bound to a local
    # only to keep it referenced for the rest of the test.
    forked_table = block_table.fork()  # noqa: F841

    # Ask the block table how many blocks the append should touch.
    predicted_touched = block_table.get_num_blocks_touched_by_append_slots(
        token_ids=new_token_ids,
        num_lookahead_slots=num_lookahead_slots,
    )

    # Measure how many blocks the append really consumes. append_token_ids is
    # expected to copy-on-write every mutated block whose refcount is > 1.
    free_before = allocator.get_num_free_blocks(Device.GPU)
    block_table.append_token_ids(new_token_ids, num_lookahead_slots)
    free_after = allocator.get_num_free_blocks(Device.GPU)
    consumed = free_before - free_after

    # TODO(cade) ensure equality when num_lookahead_slots > 0.
    # Lookahead blocks are not copied eagerly; they are copied on first write,
    # so fewer blocks than predicted may be consumed. This will cause issues
    # for beam search + speculative decoding, which is acceptable for now as
    # combining the two is a large effort. A possible fix is to ensure single
    # sequence ownership of lookahead blocks by appending empty slots to each
    # block, which triggers the CoW.
    #
    # NOTE(review): the parametrization above never passes
    # num_lookahead_slots == 0, so the strict-equality branch is not
    # exercised by this test as written -- confirm whether that is intended.
    if num_lookahead_slots <= 0:
        assert consumed == predicted_touched
    else:
        assert consumed <= predicted_touched
tests/core/test_block_manager.py
View file @
99b471c2
...
@@ -103,9 +103,9 @@ def test_append_slot_single_seq():
...
@@ -103,9 +103,9 @@ def test_append_slot_single_seq():
block_manager
.
allocate
(
seq_group
)
block_manager
.
allocate
(
seq_group
)
# Nothing to append. Sequence has no new logical blocks.
# Nothing to append. Sequence has no new logical blocks.
assert
block_manager
.
can_append_slot
(
seq_group
)
assert
block_manager
.
can_append_slot
s
(
seq_group
)
before_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
before_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
not
block_manager
.
append_slot
(
prompt
)
assert
not
block_manager
.
append_slot
s
(
prompt
)
after_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
after_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
before_blocks
==
after_blocks
assert
before_blocks
==
after_blocks
...
@@ -114,9 +114,9 @@ def test_append_slot_single_seq():
...
@@ -114,9 +114,9 @@ def test_append_slot_single_seq():
token_id
=
i
+
5
token_id
=
i
+
5
prompt
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
prompt
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
assert
block_manager
.
can_append_slot
(
seq_group
)
assert
block_manager
.
can_append_slot
s
(
seq_group
)
before_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
before_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
not
block_manager
.
append_slot
(
prompt
)
assert
not
block_manager
.
append_slot
s
(
prompt
)
after_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
after_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
before_blocks
-
after_blocks
==
1
assert
before_blocks
-
after_blocks
==
1
...
@@ -150,13 +150,13 @@ def test_append_slot_cow():
...
@@ -150,13 +150,13 @@ def test_append_slot_cow():
child
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
child
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
block_manager
.
fork
(
prompt
,
child
)
block_manager
.
fork
(
prompt
,
child
)
assert
block_manager
.
can_append_slot
(
seq_group
)
assert
block_manager
.
can_append_slot
s
(
seq_group
)
before_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
before_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
maybe_src_dst_block
=
block_manager
.
append_slot
(
child
)
cows
=
block_manager
.
append_slot
s
(
child
)
assert
maybe_src_dst_block
is
not
None
assert
cows
src_block
,
dst_block
=
maybe_src_dst_block
for
src_block
,
dst_block
s
in
cows
.
items
():
assert
src_block
!=
dst_block
assert
src_block
not
in
dst_block
s
after_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
after_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
before_blocks
-
after_blocks
==
1
assert
before_blocks
-
after_blocks
==
1
...
@@ -184,7 +184,7 @@ def test_fork():
...
@@ -184,7 +184,7 @@ def test_fork():
token_id
=
4
token_id
=
4
# Append token to child. Block is shared so copy on write occurs.
# Append token to child. Block is shared so copy on write occurs.
child
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
child
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
block_manager
.
append_slot
(
child
)
block_manager
.
append_slot
s
(
child
)
assert
block_manager
.
get_block_table
(
assert
block_manager
.
get_block_table
(
prompt
)
!=
block_manager
.
get_block_table
(
child
)
prompt
)
!=
block_manager
.
get_block_table
(
child
)
...
@@ -325,7 +325,7 @@ def test_sliding_window_multi_seq():
...
@@ -325,7 +325,7 @@ def test_sliding_window_multi_seq():
token_id
=
4
token_id
=
4
# Append token to child. Block is shared so copy on write occurs.
# Append token to child. Block is shared so copy on write occurs.
child
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
child
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
block_manager
.
append_slot
(
child
)
block_manager
.
append_slot
s
(
child
)
# assert the number of blocks allocated is correct
# assert the number of blocks allocated is correct
# we will use now one block more. Each seq will use 2 blocks,
# we will use now one block more. Each seq will use 2 blocks,
...
@@ -335,7 +335,7 @@ def test_sliding_window_multi_seq():
...
@@ -335,7 +335,7 @@ def test_sliding_window_multi_seq():
token_id
=
5
token_id
=
5
parent
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
parent
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
block_manager
.
append_slot
(
parent
)
block_manager
.
append_slot
s
(
parent
)
# assert the number of blocks allocated is correct
# assert the number of blocks allocated is correct
# no change, because both sequences are still just sharing one block
# no change, because both sequences are still just sharing one block
...
...
tests/core/test_chunked_prefill_scheduler.py
0 → 100644
View file @
99b471c2
from
typing
import
List
from
unittest.mock
import
MagicMock
import
pytest
# noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.core.scheduler
import
Scheduler
from
vllm.sequence
import
Logprob
,
SequenceGroup
from
.utils
import
create_dummy_prompt
def get_sequence_groups(scheduler_output):
    """Return the `seq_group` of every entry in
    `scheduler_output.scheduled_seq_groups`, in order."""
    groups = []
    for scheduled in scheduler_output.scheduled_seq_groups:
        groups.append(scheduled.seq_group)
    return groups
def append_new_token(seq_group, token_id: int):
    """Append `token_id` to every sequence returned by
    `seq_group.get_seqs()`, using `Logprob(token_id)` as its logprob entry."""
    for sequence in seq_group.get_seqs():
        logprobs = {token_id: Logprob(token_id)}
        sequence.append_token_id(token_id, logprobs)
def schedule_and_update_computed_tokens(scheduler):
    """Run one `scheduler.schedule()` step and record its progress.

    Each scheduled entry is paired with its metadata, and the entry's
    sequence group is advanced by the metadata's `token_chunk_size`.

    Returns the `(metas, out)` pair produced by `scheduler.schedule()`.
    """
    metas, out = scheduler.schedule()
    pairs = zip(out.scheduled_seq_groups, metas)
    for scheduled, meta in pairs:
        group = scheduled.seq_group
        group.update_num_computed_tokens(meta.token_chunk_size)
    return metas, out
def test_simple():
    """Verify basic scheduling works."""
    block_size = 4
    num_seq_group = 4
    max_model_len = 16
    max_num_batched_tokens = 64
    sched_cfg = SchedulerConfig(
        max_num_batched_tokens,
        num_seq_group,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 8
    cache_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, cache_cfg, None)
    groups: List[SequenceGroup] = []

    # Register one block-sized prompt per sequence group.
    for idx in range(num_seq_group):
        _, group = create_dummy_prompt(str(idx), prompt_length=block_size)
        scheduler.add_seq_group(group)
        groups.append(group)

    # First step: all prompts are scheduled together.
    expected_prompt_tokens = block_size * num_seq_group
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(sched_out)) == set(groups)
    assert sched_out.num_batched_tokens == expected_prompt_tokens
    assert not (sched_out.blocks_to_copy or sched_out.blocks_to_swap_in
                or sched_out.blocks_to_swap_out)
    assert len(metas) == num_seq_group
    for group in groups:
        append_new_token(group, 1)

    # Second step: one generated token per sequence group.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(sched_out)) == set(groups)
    assert sched_out.num_batched_tokens == num_seq_group
    assert not (sched_out.blocks_to_copy or sched_out.blocks_to_swap_in
                or sched_out.blocks_to_swap_out)
    assert len(metas) == num_seq_group
def test_chunk():
    """Verify prefills are chunked properly."""
    block_size = 4
    max_seqs = 60
    max_model_len = 80
    max_num_batched_tokens = 64
    sched_cfg = SchedulerConfig(
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 8
    cache_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, cache_cfg, None)
    groups: List[SequenceGroup] = []

    # Two 60-token prompts: together they exceed the 64-token budget.
    for idx in range(2):
        _, group = create_dummy_prompt(str(idx), prompt_length=60)
        scheduler.add_seq_group(group)
        groups.append(group)

    # Step 1: the first prompt is scheduled whole, the second only gets the
    # 4 tokens left in the budget.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    chunk_sizes = [meta.token_chunk_size for meta in metas]
    assert set(get_sequence_groups(sched_out)) == set(groups)
    assert chunk_sizes[0] == 60
    assert chunk_sizes[1] == 4
    assert sched_out.num_prefill_groups == 2
    assert sched_out.num_batched_tokens == 64
    # Only the first seq group has a new token appended.
    append_new_token(groups[0], 1)

    # Step 2: one chunked prefill and one decode.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    chunk_sizes = [meta.token_chunk_size for meta in metas]
    assert set(get_sequence_groups(sched_out)) == set(groups)
    # The 56-token entry (remainder of the second prompt) comes first; the
    # test relies on the scheduler's output ordering.
    assert chunk_sizes[0] == 56
    # The 1-token entry is the decode of the first request.
    assert chunk_sizes[1] == 1
    assert sched_out.num_prefill_groups == 1
    assert sched_out.num_batched_tokens == 57
def test_complex():
    """Mix decode, a continuing chunked prefill and newly added prompts in
    one scheduling step."""
    block_size = 4
    max_seqs = 60
    max_model_len = 80
    max_num_batched_tokens = 64
    sched_cfg = SchedulerConfig(
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 8
    cache_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, cache_cfg, None)
    groups: List[SequenceGroup] = []

    # Start with two 60-token prompts.
    for idx in range(2):
        _, group = create_dummy_prompt(str(idx), prompt_length=60)
        scheduler.add_seq_group(group)
        groups.append(group)
        assert group.is_prefill()

    # Step 1: the second request only fits partially and is chunked.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    chunk_sizes = [meta.token_chunk_size for meta in metas]

    assert set(get_sequence_groups(sched_out)) == set(groups)
    assert chunk_sizes[0] == 60
    assert chunk_sizes[1] == 4
    assert not groups[0].is_prefill()
    assert groups[1].is_prefill()
    assert sched_out.num_prefill_groups == 2
    assert sched_out.num_batched_tokens == 64
    # Only the first seq group has a new token appended.
    append_new_token(groups[0], 1)

    # Add 2 more requests.
    for idx in range(2, 4):
        _, group = create_dummy_prompt(str(idx), prompt_length=60)
        scheduler.add_seq_group(group)
        groups.append(group)

    # Step 2: a decode, the continuing chunked prefill and the first chunk of
    # the third request share the budget.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    chunk_sizes = [meta.token_chunk_size for meta in metas]
    assert len(get_sequence_groups(sched_out)) == 3
    # 7-token first chunk of a newly added request.
    assert chunk_sizes[0] == 7
    # 56-token remainder of the prefill that was chunked in step 1.
    assert chunk_sizes[1] == 56
    # 1-token decode.
    assert chunk_sizes[2] == 1
    # Two of the three entries are prefills.
    assert sched_out.num_prefill_groups == 2
    assert sched_out.num_batched_tokens == 64

    # The first 2 requests are now in the decoding phase.
    append_new_token(groups[0], 1)
    assert not groups[0].is_prefill()
    append_new_token(groups[1], 1)
    assert not groups[1].is_prefill()
    # The third request is still in the prefill stage.
    assert groups[2].is_prefill()
def test_maximal_decoding():
    """Verify decoding requests are prioritized."""
    block_size = 4
    max_seqs = 2
    max_model_len = 2
    max_num_batched_tokens = 2
    sched_cfg = SchedulerConfig(
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 8
    cache_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, cache_cfg, None)
    groups: List[SequenceGroup] = []

    # Two 2-token prompts; the budget is 2 tokens per step.
    for idx in range(2):
        _, group = create_dummy_prompt(str(idx), prompt_length=2)
        scheduler.add_seq_group(group)
        groups.append(group)
        assert group.is_prefill()

    # Step 1: only the first prefill is scheduled.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert len(get_sequence_groups(sched_out)) == 1
    assert metas[0].token_chunk_size == 2
    assert not groups[0].is_prefill()
    assert groups[1].is_prefill()
    assert sched_out.num_prefill_groups == 1
    assert sched_out.num_batched_tokens == 2
    # Only the first seq group has a new token appended.
    append_new_token(groups[0], 1)

    # Create one more seq_group.
    _, group = create_dummy_prompt("3", prompt_length=2)
    scheduler.add_seq_group(group)
    groups.append(group)
    assert group.is_prefill()

    # Step 2: the first request's decode plus a 1-token prefill chunk.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    chunk_sizes = [meta.token_chunk_size for meta in metas]
    assert len(get_sequence_groups(sched_out)) == 2
    assert chunk_sizes[0] == 1
    assert chunk_sizes[1] == 1
    assert not groups[0].is_prefill()
    assert groups[1].is_prefill()
    assert groups[2].is_prefill()
    assert sched_out.num_prefill_groups == 1
    assert sched_out.num_batched_tokens == 2
    append_new_token(groups[0], 1)

    # Step 3: decoding + the running prefill are prioritized.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    chunk_sizes = [meta.token_chunk_size for meta in metas]
    assert len(get_sequence_groups(sched_out)) == 2
    assert chunk_sizes[0] == 1
    assert chunk_sizes[1] == 1
    assert not groups[0].is_prefill()
    assert not groups[1].is_prefill()
    assert sched_out.num_prefill_groups == 1
    assert sched_out.num_batched_tokens == 2
    append_new_token(groups[0], 1)
    append_new_token(groups[1], 1)

    # Step 4: only decodes are scheduled.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    chunk_sizes = [meta.token_chunk_size for meta in metas]
    assert len(get_sequence_groups(sched_out)) == 2
    assert chunk_sizes[0] == 1
    assert chunk_sizes[1] == 1
    assert not groups[0].is_prefill()
    assert not groups[1].is_prefill()
    assert sched_out.num_prefill_groups == 0
    assert sched_out.num_batched_tokens == 2
    append_new_token(groups[0], 1)
    append_new_token(groups[1], 1)

    # Step 5: after aborting the first (decoding) request, the waiting
    # prefill is scheduled alongside the remaining decode.
    scheduler.abort_seq_group(groups[0].request_id)
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    chunk_sizes = [meta.token_chunk_size for meta in metas]
    assert len(get_sequence_groups(sched_out)) == 2
    assert chunk_sizes[0] == 1
    assert chunk_sizes[1] == 1
    assert not groups[1].is_prefill()
    assert groups[2].is_prefill()
    assert sched_out.num_prefill_groups == 1
    assert sched_out.num_batched_tokens == 2
def test_prompt_limit():
    """Verify max_num_batched_tokens < max_model_len is possible."""
    block_size = 4
    max_seqs = 32
    max_model_len = 64
    max_num_batched_tokens = 32
    sched_cfg = SchedulerConfig(
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 8
    cache_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, cache_cfg, None)
    groups: List[SequenceGroup] = []

    # A 48-token prompt: longer than the per-step budget, shorter than
    # max_model_len.
    _, group = create_dummy_prompt("1", prompt_length=48)
    scheduler.add_seq_group(group)
    groups.append(group)
    assert group.is_prefill()

    # The prompt should still be scheduled, limited to a 32-token chunk.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert len(get_sequence_groups(sched_out)) == 1
    assert metas[0].token_chunk_size == 32
    assert groups[0].is_prefill()
    assert sched_out.num_prefill_groups == 1
    assert sched_out.num_batched_tokens == 32
def test_prompt_limit_exceed():
    """A 48-token prompt with max_model_len == 32 ends up in
    `ignored_seq_groups`."""
    block_size = 4
    max_seqs = 64
    max_model_len = 32
    max_num_batched_tokens = 64
    sched_cfg = SchedulerConfig(
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 8
    cache_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, cache_cfg, None)
    groups: List[SequenceGroup] = []

    _, group = create_dummy_prompt("2", prompt_length=48)
    scheduler.add_seq_group(group)
    groups.append(group)
    assert group.is_prefill()

    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    ignored = sched_out.ignored_seq_groups
    assert len(ignored) == 1
    assert sched_out.ignored_seq_groups[0] == group
def test_swap():
    """Verify swapping works with chunked prefill requests"""
    block_size = 4
    max_seqs = 30
    max_model_len = 200
    max_num_batched_tokens = 30
    sched_cfg = SchedulerConfig(
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 8
    cache_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, cache_cfg, None)

    _, first_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
    scheduler.add_seq_group(first_group)
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    # The 60-token prompt is chunked: one budget-sized prefill chunk runs now.
    assert len(sched_out.scheduled_seq_groups) == 1
    assert sched_out.num_prefill_groups == 1
    assert first_group.is_prefill()
    assert sched_out.num_batched_tokens == max_num_batched_tokens

    # Make the block manager refuse to append slots for request "1" so that
    # the request gets swapped out.
    def refuse_request_one(seq_group, num_lookahead_slots):
        return seq_group.request_id != "1"

    scheduler.block_manager.can_append_slots = MagicMock(
        side_effect=refuse_request_one)

    # The running prefill is now swapped out.
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert len(sched_out.scheduled_seq_groups) == 0
    assert sched_out.num_batched_tokens == 0
    assert sched_out.blocks_to_swap_out != {}
    assert sched_out.blocks_to_swap_in == {}

    # Add 1 more task. Swap-in should be prioritized over the new prefill.
    _, second_group = create_dummy_prompt("2", prompt_length=60)
    scheduler.add_seq_group(second_group)
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert len(sched_out.scheduled_seq_groups) == 1
    # A single group is scheduled with a full budget, and blocks are
    # swapped in.
    assert sched_out.num_batched_tokens == 30
    assert sched_out.blocks_to_swap_in != {}
    assert sched_out.blocks_to_swap_out == {}
def test_running_prefill_prioritized_over_swap():
    """A running prefill and its decode are scheduled ahead of a swapped-out
    request; the swap-in only happens once that request is aborted."""
    block_size = 4
    max_seqs = 30
    max_model_len = 200
    max_num_batched_tokens = 30
    sched_cfg = SchedulerConfig(
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 8
    cache_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, cache_cfg, None)

    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
    scheduler.add_seq_group(seq_group)
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    # The 60-token prompt is chunked: one budget-sized prefill chunk runs now.
    assert len(sched_out.scheduled_seq_groups) == 1
    assert sched_out.num_prefill_groups == 1
    assert seq_group.is_prefill()
    assert sched_out.num_batched_tokens == max_num_batched_tokens

    # Make the block manager refuse to append slots for request "1" so that
    # the request gets swapped out.
    def refuse_request_one(seq_group, num_lookahead_slots):
        return seq_group.request_id != "1"

    scheduler.block_manager.can_append_slots = MagicMock(
        side_effect=refuse_request_one)

    # The running prefill is now swapped out.
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert len(sched_out.scheduled_seq_groups) == 0
    assert sched_out.num_batched_tokens == 0
    assert sched_out.blocks_to_swap_out != {}
    assert sched_out.blocks_to_swap_in == {}

    # Add 1 more task. Swap-in is reported as impossible, so the new prefill
    # runs instead.
    scheduler.block_manager.can_swap_in = MagicMock(return_value=False)

    _, seq_group2 = create_dummy_prompt("2", prompt_length=60)
    scheduler.add_seq_group(seq_group2)
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert len(sched_out.scheduled_seq_groups) == 1
    # A full-budget prefill chunk of the new request; nothing is swapped.
    assert sched_out.num_batched_tokens == 30
    assert sched_out.blocks_to_swap_in == {}
    assert sched_out.blocks_to_swap_out == {}
    assert sched_out.scheduled_seq_groups[0].seq_group == seq_group2

    # Now although swap is possible, the running prefill is prioritized.
    scheduler.block_manager.can_swap_in.return_value = True
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert len(sched_out.scheduled_seq_groups) == 1
    # The remaining 30 prompt tokens of the second request; still no swap.
    assert sched_out.num_batched_tokens == 30
    assert sched_out.blocks_to_swap_in == {}
    assert sched_out.blocks_to_swap_out == {}
    assert not seq_group2.is_prefill()
    assert sched_out.scheduled_seq_groups[0].seq_group == seq_group2
    append_new_token(seq_group2, 1)

    # Decoding is prioritized.
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert len(sched_out.scheduled_seq_groups) == 1
    # A single decode token for the second request; still no swap.
    assert sched_out.num_batched_tokens == 1
    assert sched_out.blocks_to_swap_in == {}
    assert sched_out.blocks_to_swap_out == {}
    assert not seq_group2.is_prefill()
    assert sched_out.scheduled_seq_groups[0].seq_group == seq_group2
    append_new_token(seq_group2, 1)

    # Since we abort the second sequence group, we can finally swap in.
    scheduler.abort_seq_group(seq_group2.request_id)
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert len(sched_out.scheduled_seq_groups) == 1
    assert sched_out.num_batched_tokens == 30
    assert sched_out.blocks_to_swap_in != {}
    assert sched_out.blocks_to_swap_out == {}
def test_chunked_prefill_preempt():
    """Verify preempt works with chunked prefill requests"""
    block_size = 4
    max_seqs = 30
    max_model_len = 200
    max_num_batched_tokens = 30
    sched_cfg = SchedulerConfig(
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 8
    cache_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, cache_cfg, None)

    _, seq_group = create_dummy_prompt("1", prompt_length=60)
    scheduler.add_seq_group(seq_group)
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    # The 60-token prompt is chunked: one budget-sized prefill chunk runs now.
    assert len(sched_out.scheduled_seq_groups) == 1
    assert sched_out.num_prefill_groups == 1
    assert seq_group.is_prefill()
    assert sched_out.num_batched_tokens == max_num_batched_tokens

    # Make the block manager refuse to append slots for request "1" so that
    # the request gets preempted.
    def refuse_request_one(seq_group, num_lookahead_slots):
        return seq_group.request_id != "1"

    scheduler.block_manager.can_append_slots = MagicMock(
        side_effect=refuse_request_one)

    # The running prefill is now preempted; no blocks are swapped either way.
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert len(sched_out.scheduled_seq_groups) == 0
    assert sched_out.num_batched_tokens == 0
    assert sched_out.blocks_to_swap_out == {}
    assert sched_out.blocks_to_swap_in == {}

    # Make sure we can reschedule the preempted request.
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert len(sched_out.scheduled_seq_groups) == 1
    assert sched_out.num_prefill_groups == 1
    assert seq_group.is_prefill()
    assert sched_out.num_batched_tokens == max_num_batched_tokens
    assert seq_group.get_num_uncomputed_tokens() == 30

    # Let every append succeed from here on; the second chunk then finishes
    # the prefill.
    def allow_every_append(seq_group, num_lookahead_slots):
        return True

    scheduler.block_manager.can_append_slots.side_effect = allow_every_append
    _, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert len(sched_out.scheduled_seq_groups) == 1
    assert sched_out.num_prefill_groups == 1
    assert not seq_group.is_prefill()
    assert sched_out.num_batched_tokens == max_num_batched_tokens
def test_chunked_prefill_max_seqs():
    """With max_seqs == 2, at most two sequence groups are scheduled per step
    even when token budget remains."""
    block_size = 4
    max_seqs = 2
    max_model_len = 80
    max_num_batched_tokens = 64
    sched_cfg = SchedulerConfig(
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 8
    cache_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, cache_cfg, None)
    groups = []

    _, group = create_dummy_prompt("1", prompt_length=65)
    scheduler.add_seq_group(group)
    groups.append(group)

    # The first prefill is chunked to the full token budget.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert metas[0].token_chunk_size == max_num_batched_tokens
    assert len(get_sequence_groups(sched_out)) == 1

    # Add new requests.
    # NOTE(review): request id "1" is created a second time here (the first
    # prompt above already uses it) -- confirm the duplicate id is intended.
    for idx in range(4):
        _, group = create_dummy_prompt(str(idx), prompt_length=65)
        scheduler.add_seq_group(group)
        groups.append(group)

    # Make sure only 2 requests are scheduled.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    assert sched_out.num_batched_tokens == max_num_batched_tokens
    assert len(get_sequence_groups(sched_out)) == 2
    assert not groups[0].is_prefill()
    assert groups[1].is_prefill()
    append_new_token(groups[0], 1)

    # Although we have enough token budget, we can only schedule max_seqs.
    metas, sched_out = schedule_and_update_computed_tokens(scheduler)
    chunk_sizes = [meta.token_chunk_size for meta in metas]
    assert chunk_sizes[0] == 2
    assert chunk_sizes[1] == 1
    assert sched_out.num_batched_tokens == 3
    assert len(get_sequence_groups(sched_out)) == max_seqs
    assert not groups[0].is_prefill()
    assert not groups[1].is_prefill()
tests/core/test_scheduler.py
View file @
99b471c2
import
time
import
time
from
collections
import
deque
from
typing
import
List
from
typing
import
List
from
unittest.mock
import
MagicMock
import
pytest
# noqa
import
pytest
# noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
from
vllm.core.scheduler
import
Scheduler
from
vllm.core.interfaces
import
AllocStatus
from
vllm.sequence
import
Logprob
,
SequenceGroup
from
vllm.core.policy
import
PolicyFactory
from
vllm.core.scheduler
import
Scheduler
,
SchedulingBudget
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
Logprob
,
SequenceGroup
,
SequenceStatus
from
.utils
import
create_dummy_prompt
from
.utils
import
create_dummy_prompt
...
@@ -14,6 +19,26 @@ def get_sequence_groups(scheduler_output):
...
@@ -14,6 +19,26 @@ def get_sequence_groups(scheduler_output):
return
[
s
.
seq_group
for
s
in
scheduler_output
.
scheduled_seq_groups
]
return
[
s
.
seq_group
for
s
in
scheduler_output
.
scheduled_seq_groups
]
def append_new_token(out, token_id: int):
    """Append ``token_id`` to every sequence scheduled in ``out``.

    Args:
        out: Scheduler output; its scheduled sequence groups are obtained
            through ``get_sequence_groups``.
        token_id: Token appended to each sequence. The same value is also
            used as the key of the logprob dict and as the ``Logprob``
            constructor argument.
    """
    for scheduled_group in get_sequence_groups(out):
        for sequence in scheduled_group.get_seqs():
            logprobs = {token_id: Logprob(token_id)}
            sequence.append_token_id(token_id, logprobs)
def schedule_and_update_computed_tokens(scheduler):
    """Run one scheduling step and mark the scheduled chunks as computed.

    Calls ``scheduler.schedule()`` and, for each scheduled sequence group
    paired with its metadata entry, advances the group's computed-token
    count by that entry's ``token_chunk_size``.

    Returns:
        The ``(metadata list, scheduler output)`` pair exactly as returned by
        ``scheduler.schedule()``.
    """
    seq_group_metadata_list, scheduler_outputs = scheduler.schedule()
    scheduled_with_meta = zip(scheduler_outputs.scheduled_seq_groups,
                              seq_group_metadata_list)
    for scheduled, metadata in scheduled_with_meta:
        chunk_size = metadata.token_chunk_size
        scheduled.seq_group.update_num_computed_tokens(chunk_size)
    return seq_group_metadata_list, scheduler_outputs
def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int):
    """Mark a chunk as computed on ``seq_group`` and append one token.

    Args:
        token_chunk_size: Number of tokens passed to
            ``seq_group.update_num_computed_tokens``.
        seq_group: Sequence group to update; every sequence returned by
            ``get_seqs()`` receives the new token.
        token_id: Token appended to each sequence, also used as the logprob
            dict key and the ``Logprob`` constructor argument.
    """
    seq_group.update_num_computed_tokens(token_chunk_size)
    for sequence in seq_group.get_seqs():
        logprobs = {token_id: Logprob(token_id)}
        sequence.append_token_id(token_id, logprobs)
def
test_scheduler_add_seq_group
():
def
test_scheduler_add_seq_group
():
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
)
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
)
...
@@ -71,20 +96,52 @@ def test_scheduler_schedule_simple():
...
@@ -71,20 +96,52 @@ def test_scheduler_schedule_simple():
# Schedule seq groups prompts.
# Schedule seq groups prompts.
num_tokens
=
block_size
*
num_seq_group
num_tokens
=
block_size
*
num_seq_group
seq_group_meta
,
out
=
schedule
r
.
schedule
(
)
seq_group_meta
,
out
=
schedule
_and_update_computed_tokens
(
schedule
r
)
assert
set
(
get_sequence_groups
(
out
))
==
set
(
running
)
assert
set
(
get_sequence_groups
(
out
))
==
set
(
running
)
assert
out
.
num_batched_tokens
==
num_tokens
assert
out
.
num_batched_tokens
==
num_tokens
assert
(
not
out
.
blocks_to_copy
and
not
out
.
blocks_to_swap_in
assert
(
not
out
.
blocks_to_copy
and
not
out
.
blocks_to_swap_in
and
not
out
.
blocks_to_swap_out
)
and
not
out
.
blocks_to_swap_out
)
assert
len
(
seq_group_meta
)
==
num_seq_group
assert
len
(
seq_group_meta
)
==
num_seq_group
append_new_token
(
out
,
1
)
# Schedule seq groups generation.
# Schedule seq groups generation.
seq_group_meta
,
out
=
schedule
r
.
schedule
(
)
seq_group_meta
,
out
=
schedule
_and_update_computed_tokens
(
schedule
r
)
assert
set
(
get_sequence_groups
(
out
))
==
set
(
running
)
assert
set
(
get_sequence_groups
(
out
))
==
set
(
running
)
assert
out
.
num_batched_tokens
==
num_seq_group
assert
out
.
num_batched_tokens
==
num_seq_group
assert
(
not
out
.
blocks_to_copy
and
not
out
.
blocks_to_swap_in
assert
(
not
out
.
blocks_to_copy
and
not
out
.
blocks_to_swap_in
and
not
out
.
blocks_to_swap_out
)
and
not
out
.
blocks_to_swap_out
)
assert
len
(
seq_group_meta
)
==
num_seq_group
assert
len
(
seq_group_meta
)
==
num_seq_group
append_new_token
(
out
,
1
)
def test_scheduler_prefill_prioritized():
    """Verify running batched tokens are not applied to prefill requests."""
    token_budget = 30
    scheduler_config = SchedulerConfig(token_budget, 2, 30)
    cache_config = CacheConfig(4, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 2
    cache_config.num_gpu_blocks = 2
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # Request A: a 1-token prompt, scheduled on its own in the first step.
    seq_group_a = create_dummy_prompt("1", 1)[1]
    scheduler.add_seq_group(seq_group_a)
    _, first_out = schedule_and_update_computed_tokens(scheduler)
    assert get_sequence_groups(first_out) == [seq_group_a]

    # Request B: a new prefill whose 30-token prompt equals the whole token
    # budget (30).
    seq_group_b = create_dummy_prompt("2", 30)[1]
    scheduler.add_seq_group(seq_group_b)

    # The prefill must be prioritized: the next step is expected to contain
    # only B, even though A is already running.
    _, second_out = schedule_and_update_computed_tokens(scheduler)
    assert get_sequence_groups(second_out) == [seq_group_b]
def
test_scheduler_schedule_preempt_abort
():
def
test_scheduler_schedule_preempt_abort
():
...
@@ -103,7 +160,7 @@ def test_scheduler_schedule_preempt_abort():
...
@@ -103,7 +160,7 @@ def test_scheduler_schedule_preempt_abort():
scheduler
.
add_seq_group
(
seq_group_b
)
scheduler
.
add_seq_group
(
seq_group_b
)
# Schedule seq groups prompts.
# Schedule seq groups prompts.
seq_group_meta
,
out
=
schedule
r
.
schedule
(
)
seq_group_meta
,
out
=
schedule
_and_update_computed_tokens
(
schedule
r
)
assert
get_sequence_groups
(
out
)
==
[
seq_group_a
,
seq_group_b
]
assert
get_sequence_groups
(
out
)
==
[
seq_group_a
,
seq_group_b
]
assert
out
.
num_batched_tokens
==
block_size
*
2
# seq_a and seq_b
assert
out
.
num_batched_tokens
==
block_size
*
2
# seq_a and seq_b
assert
(
not
out
.
blocks_to_copy
and
not
out
.
blocks_to_swap_in
assert
(
not
out
.
blocks_to_copy
and
not
out
.
blocks_to_swap_in
...
@@ -113,12 +170,10 @@ def test_scheduler_schedule_preempt_abort():
...
@@ -113,12 +170,10 @@ def test_scheduler_schedule_preempt_abort():
# Append "generated" tokens, allowing the sequence to mark prompt tokens as
# Append "generated" tokens, allowing the sequence to mark prompt tokens as
# processed.
# processed.
token_id
=
0
append_new_token
(
out
,
1
)
seq_a
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
seq_b
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
# Schedule seq groups generation and preempt seq group b.
# Schedule seq groups generation and preempt seq group b.
seq_group_meta
,
out
=
schedule
r
.
schedule
(
)
seq_group_meta
,
out
=
schedule
_and_update_computed_tokens
(
schedule
r
)
assert
get_sequence_groups
(
out
)
==
[
seq_group_a
]
assert
get_sequence_groups
(
out
)
==
[
seq_group_a
]
assert
out
.
num_batched_tokens
==
1
assert
out
.
num_batched_tokens
==
1
assert
(
not
out
.
blocks_to_copy
and
not
out
.
blocks_to_swap_in
assert
(
not
out
.
blocks_to_copy
and
not
out
.
blocks_to_swap_in
...
@@ -128,7 +183,7 @@ def test_scheduler_schedule_preempt_abort():
...
@@ -128,7 +183,7 @@ def test_scheduler_schedule_preempt_abort():
# Abort seq group a. Re-schedule seq group b prompt with recomputation.
# Abort seq group a. Re-schedule seq group b prompt with recomputation.
scheduler
.
abort_seq_group
(
"1"
)
scheduler
.
abort_seq_group
(
"1"
)
seq_group_meta
,
out
=
schedule
r
.
schedule
(
)
seq_group_meta
,
out
=
schedule
_and_update_computed_tokens
(
schedule
r
)
assert
get_sequence_groups
(
out
)
==
[
seq_group_b
]
assert
get_sequence_groups
(
out
)
==
[
seq_group_b
]
assert
out
.
num_batched_tokens
==
5
# 4 prompt + 1 generation.
assert
out
.
num_batched_tokens
==
5
# 4 prompt + 1 generation.
assert
(
not
out
.
blocks_to_copy
and
not
out
.
blocks_to_swap_in
assert
(
not
out
.
blocks_to_copy
and
not
out
.
blocks_to_swap_in
...
@@ -158,12 +213,14 @@ def test_scheduler_max_seqs():
...
@@ -158,12 +213,14 @@ def test_scheduler_max_seqs():
scheduler
.
add_seq_group
(
all_seq_groups
[
0
])
scheduler
.
add_seq_group
(
all_seq_groups
[
0
])
# Schedule seq groups prompts.
# Schedule seq groups prompts.
_
,
out
=
schedule
r
.
schedule
(
)
seq_group_meta
,
out
=
schedule
_and_update_computed_tokens
(
schedule
r
)
assert
set
(
get_sequence_groups
(
out
))
==
set
([
all_seq_groups
[
0
]])
assert
set
(
get_sequence_groups
(
out
))
==
set
([
all_seq_groups
[
0
]])
append_new_token
(
out
,
1
)
# Schedule seq groups generation.
# Schedule seq groups generation.
_
,
out
=
schedule
r
.
schedule
(
)
seq_group_meta
,
out
=
schedule
_and_update_computed_tokens
(
schedule
r
)
assert
set
(
get_sequence_groups
(
out
))
==
set
([
all_seq_groups
[
0
]])
assert
set
(
get_sequence_groups
(
out
))
==
set
([
all_seq_groups
[
0
]])
append_new_token
(
out
,
1
)
# Append 2 more seq group
# Append 2 more seq group
scheduler
.
add_seq_group
(
all_seq_groups
[
1
])
scheduler
.
add_seq_group
(
all_seq_groups
[
1
])
...
@@ -172,12 +229,11 @@ def test_scheduler_max_seqs():
...
@@ -172,12 +229,11 @@ def test_scheduler_max_seqs():
# Schedule seq groups prompts.
# Schedule seq groups prompts.
# Only 1 seq group should be scheduled since max_seq_group is 2
# Only 1 seq group should be scheduled since max_seq_group is 2
# and one is prompting.
# and one is prompting.
_
,
out
=
schedule
r
.
schedule
(
)
_
,
out
=
schedule
_and_update_computed_tokens
(
schedule
r
)
assert
set
(
get_sequence_groups
(
out
))
==
set
([
all_seq_groups
[
1
]])
assert
set
(
get_sequence_groups
(
out
))
==
set
([
all_seq_groups
[
1
]])
def
test_scheduler_delay_factor
():
def
test_scheduler_delay_factor
():
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
16
,
delay_factor
=
0.5
)
scheduler_config
=
SchedulerConfig
(
100
,
64
,
16
,
delay_factor
=
0.5
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
...
@@ -186,24 +242,631 @@ def test_scheduler_delay_factor():
...
@@ -186,24 +242,631 @@ def test_scheduler_delay_factor():
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
# schedule first prompt
# schedule first prompt
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
block_size
)
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
seq_group_meta
,
out
=
schedule
r
.
schedule
(
)
seq_group_meta
,
out
=
schedule
_and_update_computed_tokens
(
schedule
r
)
assert
out
.
prompt_run
assert
out
.
num_prefill_groups
>
0
assert
seq_group_meta
[
0
].
request_id
==
'0'
assert
seq_group_meta
[
0
].
request_id
==
'0'
append_new_token
(
out
,
1
)
# wait for a second before scheduling next prompt
# wait for a second before scheduling next prompt
time
.
sleep
(
1
)
time
.
sleep
(
1
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
block_size
)
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
# second prompt should *not* be scheduled
# second prompt should *not* be scheduled
seq_group_meta
,
out
=
schedule
r
.
schedule
(
)
seq_group_meta
,
out
=
schedule
_and_update_computed_tokens
(
schedule
r
)
assert
not
out
.
prompt_run
assert
out
.
num_prefill_groups
==
0
assert
seq_group_meta
[
0
].
request_id
==
'0'
assert
seq_group_meta
[
0
].
request_id
==
'0'
append_new_token
(
out
,
1
)
# wait for more than 0.5 second and try again
# wait for more than 0.5 second and try again
time
.
sleep
(
0.6
)
time
.
sleep
(
0.6
)
seq_group_meta
,
out
=
schedule
r
.
schedule
(
)
seq_group_meta
,
out
=
schedule
_and_update_computed_tokens
(
schedule
r
)
assert
out
.
prompt_run
assert
out
.
num_prefill_groups
>
0
assert
seq_group_meta
[
0
].
request_id
==
'1'
assert
seq_group_meta
[
0
].
request_id
==
'1'
append_new_token
(
out
,
1
)
def test_swapped_out_prioritized():
    """Verify a swapped-out request is swapped back in before new prefills.

    Three ``best_of=2`` requests are prefilled, the last one ("2") is forced
    out by making ``can_append_slots`` fail for it, and a fourth request is
    then queued. The next step must swap request "2" back in rather than
    prefill the new request.
    """
    scheduler = initialize_scheduler(max_num_seqs=6)
    # best_of=2 * 3 == 6 sequences.
    for i in range(3):
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=60,
                                           best_of=2)
        scheduler.add_seq_group(seq_group)
    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
    # All three prefills are scheduled in the first step.
    assert len(out.scheduled_seq_groups) == 3
    append_new_token(out, 1)

    # Refuse new slots for the last request ("2") only, so that it is the one
    # swapped out.
    def cannot_append_last_group(seq_group, num_lookahead_slots):
        return seq_group.request_id != "2"

    scheduler.block_manager.can_append_slots = MagicMock()
    scheduler.block_manager.can_append_slots.side_effect = (
        cannot_append_last_group)

    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
    assert len(out.scheduled_seq_groups) == 2
    assert out.num_batched_tokens == 2
    assert out.blocks_to_swap_out != {}
    assert out.blocks_to_swap_in == {}
    append_new_token(out, 1)

    # Add 1 more task. Swap should be prioritized over prefill.
    # Use an explicit, unused request id instead of the loop variable that
    # leaked out of the ``for`` above (which duplicated request id "2").
    _, seq_group = create_dummy_prompt("3", prompt_length=60, best_of=2)
    scheduler.add_seq_group(seq_group)
    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
    append_new_token(out, 1)
    assert len(out.scheduled_seq_groups) == 3
    # 3 decodes. It is swapped in.
    assert out.num_batched_tokens == 3
    assert out.blocks_to_swap_in != {}
    assert out.blocks_to_swap_out == {}
def initialize_scheduler(*,
                         max_num_seqs=1000,
                         max_token_budget=1000,
                         max_model_len=1000,
                         lora_config=None):
    """Build a ``Scheduler`` with a small, fixed cache for unit tests.

    Args:
        max_num_seqs: Second positional argument of ``SchedulerConfig``.
        max_token_budget: First positional argument of ``SchedulerConfig``.
        max_model_len: Third positional argument of ``SchedulerConfig``.
        lora_config: Passed through to ``Scheduler`` unchanged.

    Returns:
        A scheduler whose cache config uses block size 4 and has 8 CPU and
        8 GPU blocks.
    """
    cache_config = CacheConfig(4, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
    scheduler_config = SchedulerConfig(max_token_budget, max_num_seqs,
                                       max_model_len)
    return Scheduler(scheduler_config, cache_config, lora_config)
def create_token_budget(token_budget: int = 10000,
                        max_num_seqs: int = 10000) -> SchedulingBudget:
    """Return a fresh ``SchedulingBudget`` with the given limits.

    Both limits default to 10000, which the tests in this file treat as
    effectively unlimited.
    """
    limits = {"token_budget": token_budget, "max_num_seqs": max_num_seqs}
    return SchedulingBudget(**limits)
def add_token_budget(budget: SchedulingBudget,
                     num_batched_tokens: int = 0,
                     num_curr_seqs: int = 0):
    """Pre-charge ``budget`` on behalf of a throwaway request.

    A dummy 60-token prompt with request id '10' is created only to supply a
    request id; the given token and sequence counts are then added to the
    budget under that id.
    """
    _, placeholder_group = create_dummy_prompt('10', prompt_length=60)
    request_id = placeholder_group.request_id
    budget.add_num_batched_tokens(request_id, num_batched_tokens)
    budget.add_num_seqs(request_id, num_curr_seqs)
def test_prefill_schedule_max_prompt_len():
    """
    Test prompt longer than max_model_len is ignored by prefill scheduling.

    A 60-token prompt is submitted to a scheduler built with
    ``max_model_len=30``; it must end up in ``ignored_seq_groups`` without
    consuming any budget and without remaining in the waiting queue.
    """
    scheduler = initialize_scheduler(max_model_len=30)
    # request_id is a string everywhere else in this file (and is annotated
    # ``str`` on create_dummy_prompt), so pass "0" rather than the int 0.
    _, seq_group = create_dummy_prompt("0", prompt_length=60)
    waiting = deque([seq_group])
    budget = create_token_budget()
    remaining_waiting, output = scheduler._schedule_prefills(
        waiting, budget, None)
    assert len(output.ignored_seq_groups) == 1
    assert len(output.seq_groups) == 0
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 0
    assert len(remaining_waiting) == 0
def test_prefill_schedule_token_budget():
    """
    Test token budget respected.

    Covers three situations: an empty budget, a budget that fits exactly one
    60-token prompt, and a budget that is already partly consumed before
    prefill scheduling runs.
    """
    scheduler = initialize_scheduler()
    waiting = deque()
    budget = create_token_budget(token_budget=0)
    for i in range(2):
        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
        waiting.append(seq_group)

    # 0 token budget == nothing is scheduled.
    remaining_waiting, output = scheduler._schedule_prefills(
        waiting, budget, None)
    assert len(output.ignored_seq_groups) == 0
    assert len(output.seq_groups) == 0
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 0
    assert len(remaining_waiting) == 2

    # 60 token budget == 1 request scheduled.
    budget = create_token_budget(token_budget=60)
    remaining_waiting, output = scheduler._schedule_prefills(
        waiting, budget, None)
    assert len(output.ignored_seq_groups) == 0
    assert len(output.seq_groups) == 1
    assert budget.num_batched_tokens == 60
    assert budget.num_curr_seqs == 1
    assert len(remaining_waiting) == 1

    # Test when current_batched_tokens respected.
    scheduler = initialize_scheduler()
    waiting = deque()
    budget = create_token_budget(token_budget=60)
    add_token_budget(budget, 30, 0)
    # Explicit request id: do not depend on the loop variable leaking out of
    # the ``for`` above.
    _, seq_group = create_dummy_prompt("2", prompt_length=60)
    # Cannot schedule a prompt that doesn't fit the budget.
    waiting.append(seq_group)
    remaining_waiting, output = scheduler._schedule_prefills(
        waiting, budget, None)
    assert len(output.ignored_seq_groups) == 0
    assert len(output.seq_groups) == 0
    assert budget.num_batched_tokens == 30
    assert budget.num_curr_seqs == 0
    assert len(remaining_waiting) == 1

    # With 90 tokens of budget, 30 already used, the 60-token prompt fits.
    budget = create_token_budget(token_budget=90)
    add_token_budget(budget, 30, 0)
    remaining_waiting, output = scheduler._schedule_prefills(
        waiting, budget, None)
    assert len(output.seq_groups) == 1
    assert budget.num_batched_tokens == 90
    assert budget.num_curr_seqs == 1
    assert len(remaining_waiting) == 0
def test_prefill_schedule_max_seqs():
    """
    Test max seq respected.

    First checks that only ``max_num_seqs`` prompts are scheduled from the
    waiting queue, then that sequences already counted in the budget block
    any further prefill.
    """
    scheduler = initialize_scheduler()
    waiting = deque()
    budget = create_token_budget(max_num_seqs=2)
    for i in range(3):
        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
        waiting.append(seq_group)
    remaining_waiting, output = scheduler._schedule_prefills(
        waiting, budget, None)
    assert len(output.ignored_seq_groups) == 0
    assert len(output.seq_groups) == 2
    assert budget.num_batched_tokens == 120
    assert budget.num_curr_seqs == 2
    assert len(remaining_waiting) == 1

    # Verify curr_num_seqs respected.
    waiting = deque()
    budget = create_token_budget(max_num_seqs=2)
    add_token_budget(budget, 0, 2)
    # Explicit, unused request id: the original reused the leaked loop
    # variable, which duplicated the id of the still-waiting request "2".
    _, seq_group = create_dummy_prompt("3", prompt_length=60)
    waiting.append(seq_group)
    remaining_waiting, output = scheduler._schedule_prefills(
        waiting, budget, None)
    assert len(output.ignored_seq_groups) == 0
    assert len(output.seq_groups) == 0
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 2
    assert len(remaining_waiting) == 1
def test_prefill_schedule_max_lora():
    """
    Test max lora is respected and prioritized.
    """
    scheduler = initialize_scheduler(
        lora_config=LoRAConfig(max_lora_rank=8, max_loras=1))
    budget = create_token_budget(token_budget=120)
    curr_loras = set()

    # Queue layout -- 0: LoRA, 1: LoRA, 2: regular, 3: regular.
    # With max_loras=1 the first pass is expected to pick requests 0 and 2;
    # request 1, skipped only because of the LoRA limit, must then be the
    # next one scheduled.
    waiting = deque()
    for idx in range(4):
        lora_request = None
        if idx < 2:
            lora_request = LoRARequest(lora_name=str(idx),
                                       lora_int_id=idx + 1,
                                       lora_local_path="abc")
        _, seq_group = create_dummy_prompt(str(idx),
                                           prompt_length=60,
                                           lora_request=lora_request)
        waiting.append(seq_group)

    # Schedule 2 requests (0 and 2)
    remaining_waiting, output = scheduler._schedule_prefills(
        waiting, budget, curr_loras)
    assert len(output.ignored_seq_groups) == 0
    assert len(output.seq_groups) == 2
    assert budget.num_batched_tokens == 120
    assert budget.num_curr_seqs == 2
    assert len(remaining_waiting) == 2
    assert len(curr_loras) == 1

    # The second LoRA request comes next under FCFS. Start from an empty
    # curr_loras set so that it is allowed to run.
    curr_loras = set()
    budget = create_token_budget(token_budget=60)
    remaining_waiting, output = scheduler._schedule_prefills(
        remaining_waiting, budget, curr_loras)
    scheduled = output.seq_groups
    assert len(scheduled) == 1
    assert scheduled[0].seq_group.request_id == "1"
    assert len(remaining_waiting) == 1
    assert len(curr_loras) == 1
    assert budget.num_batched_tokens == 60
def test_prefill_schedule_no_block_manager_capacity():
    """
    Test sequence cannot be scheduled due to block manager has no capacity.
    """
    # (mocked can_allocate result, expected ignored, expected still waiting)
    # LATER keeps all three prompts waiting; NEVER moves all three to the
    # ignored list.
    scenarios = (
        (AllocStatus.LATER, 0, 3),
        (AllocStatus.NEVER, 3, 0),
    )
    for alloc_status, expected_ignored, expected_waiting in scenarios:
        scheduler = initialize_scheduler()
        waiting = deque()
        budget = create_token_budget()
        for idx in range(3):
            waiting.append(create_dummy_prompt(str(idx), prompt_length=60)[1])
        scheduler.block_manager.can_allocate = MagicMock(
            return_value=alloc_status)

        remaining_waiting, output = scheduler._schedule_prefills(
            waiting, budget, None)

        assert len(output.ignored_seq_groups) == expected_ignored
        assert len(output.seq_groups) == 0
        assert budget.num_batched_tokens == 0
        assert budget.num_curr_seqs == 0
        assert len(remaining_waiting) == expected_waiting
def test_decode_schedule_preempted():
    """
    Test decodes cannot be scheduled and preempted.
    """
    scheduler = initialize_scheduler()
    fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs")
    running = deque()
    for idx in range(3):
        seq_group = create_dummy_prompt(str(idx), prompt_length=60)[1]
        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        running.append(seq_group)

    # Slots can be appended for every request except "1".
    scheduler.block_manager.can_append_slots = MagicMock(
        side_effect=lambda seq_group, num_lookahead_slots:
        (seq_group.request_id != "1"))

    # 1 cannot be scheduled, and the lowest priority (request 2)
    # should be preempted. 1 will also be preempted.
    budget = create_token_budget()
    remaining_running, output = scheduler._schedule_running(
        running, budget, None, fcfs_policy)
    assert len(remaining_running) == 0
    decodes = output.decode_seq_groups
    assert len(decodes) == 1
    assert len(output.prefill_seq_groups) == 0
    assert decodes[0].seq_group.request_id == "0"
    assert len(output.preempted) == 2
    # Verify budgets are updated.
    assert budget.num_batched_tokens == 1
    # NOTE: When enable_chunk is False, num_seqs budget is not updated, so
    # budget.num_curr_seqs is deliberately not asserted here.
    # Both should be preempted, not swapped.
    assert output.blocks_to_swap_out == {}
    # Nothing is copied.
    assert output.blocks_to_copy == {}
def test_decode_swap_beam_search():
    """
    Test best_of > 1 swap out blocks
    """
    scheduler = initialize_scheduler()
    fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs")
    budget = create_token_budget()
    running = deque()
    for idx in range(3):
        seq_group = create_dummy_prompt(str(idx),
                                        prompt_length=60,
                                        best_of=2)[1]
        scheduler._allocate_and_set_running(seq_group)
        running.append(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        # Charge the budget up front for every running group.
        request_id = seq_group.request_id
        budget.add_num_seqs(request_id, seq_group.get_max_num_running_seqs())
        budget.add_num_batched_tokens(
            request_id, seq_group.num_seqs(SequenceStatus.RUNNING))

    # The last request ("2") cannot get new slots, so it should be the one
    # swapped out.
    scheduler.block_manager.can_append_slots = MagicMock(
        side_effect=lambda seq_group, num_lookahead_slots:
        (seq_group.request_id != "2"))
    expected_swap_mapping = {"5": "7"}
    scheduler.block_manager.swap_out = MagicMock(
        return_value=expected_swap_mapping)

    remaining_running, output = scheduler._schedule_running(
        running, budget, None, fcfs_policy)
    assert len(remaining_running) == 0
    decodes = output.decode_seq_groups
    assert len(decodes) == 2
    assert len(output.prefill_seq_groups) == 0
    assert decodes[0].seq_group.request_id == "0"
    assert decodes[1].seq_group.request_id == "1"
    assert len(output.preempted) == 0
    assert len(output.swapped_out) == 1
    # The budget should no longer include the swapped-out request.
    assert budget.num_batched_tokens == 2
    # The swapped-out group's sequences are removed from the count, leaving 4.
    assert budget.num_curr_seqs == 4
    # The request is swapped (not recomputed), so the mocked swap mapping is
    # reported.
    assert output.blocks_to_swap_out == expected_swap_mapping
    # Nothing is copied.
    assert output.blocks_to_copy == {}
def test_schedule_decode_blocks_to_copy_update():
    """
    Verify blocks_to_copy is updated.
    """
    scheduler = initialize_scheduler()
    seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)[1]
    fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs")
    scheduler._allocate_and_set_running(seq_group)
    append_new_token_seq_group(60, seq_group, 1)
    running = deque([seq_group])

    # Make append_slots report a copy from block 2 to block 3.
    copy_mapping = {2: [3]}
    scheduler.block_manager.append_slots = MagicMock(
        return_value=copy_mapping)

    budget = create_token_budget()
    remaining_running, output = scheduler._schedule_running(
        running, budget, None, fcfs_policy)
    assert len(remaining_running) == 0
    assert len(output.decode_seq_groups) == 1
    assert len(output.prefill_seq_groups) == 0
    assert len(output.preempted) == 0
    assert len(output.swapped_out) == 0
    # Nothing is preempted.
    assert output.blocks_to_swap_out == {}
    # The source -> destination mapping returned by append_slots should be
    # reflected in blocks_to_copy.
    assert output.blocks_to_copy == {2: [3]}
def test_schedule_swapped_simple():
    """Verify a single swapped-out group is swapped back in as a decode."""
    scheduler = initialize_scheduler()
    fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs")
    blocks_to_swap_out = {}

    # Allocate one best_of=2 request, give it a token, then swap it out.
    seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)[1]
    scheduler._allocate_and_set_running(seq_group)
    append_new_token_seq_group(60, seq_group, 1)
    scheduler._swap_out(seq_group, blocks_to_swap_out)
    swapped = deque([seq_group])

    budget = create_token_budget()
    remaining_swapped, output = scheduler._schedule_swapped(
        swapped, budget, None, fcfs_policy)
    assert len(remaining_swapped) == 0
    assert budget.num_batched_tokens == 1
    assert budget.num_curr_seqs == 2
    assert len(output.decode_seq_groups) == 1
    assert len(output.prefill_seq_groups) == 0

    # swap in is the reverse of swap out
    inverted_swap_in = {
        value: key
        for key, value in output.blocks_to_swap_in.items()
    }
    assert blocks_to_swap_out == inverted_swap_in
def test_schedule_swapped_max_token_budget():
    """Verify swap-in scheduling stops when the token budget is used up."""
    scheduler = initialize_scheduler()
    fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs")
    blocks_to_swap_out = {}
    swapped = deque()
    for _ in range(2):
        seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)[1]
        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        scheduler._swap_out(seq_group, blocks_to_swap_out)
        swapped.append(seq_group)

    # A budget of one token admits exactly one of the two groups.
    budget = create_token_budget(token_budget=1)
    remaining_swapped, output = scheduler._schedule_swapped(
        swapped, budget, None, fcfs_policy)
    assert len(remaining_swapped) == 1
    assert budget.num_batched_tokens == 1
    assert budget.num_curr_seqs == 2
    assert len(output.decode_seq_groups) == 1
    assert len(output.prefill_seq_groups) == 0

    # Verify num_batched_tokens are respected.
    budget = create_token_budget(token_budget=1)
    add_token_budget(budget, 1, 0)
    remaining_swapped, output = scheduler._schedule_swapped(
        remaining_swapped, budget, None, fcfs_policy)
    assert len(remaining_swapped) == 1
    assert budget.num_batched_tokens == 1
    assert budget.num_curr_seqs == 0
    assert len(output.decode_seq_groups) == 0
    assert len(output.prefill_seq_groups) == 0
def test_schedule_swapped_max_seqs():
    """Verify swap-in scheduling respects the max_num_seqs budget."""
    scheduler = initialize_scheduler()
    fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs")
    blocks_to_swap_out = {}
    swapped = deque()
    for idx in range(4):
        seq_group = create_dummy_prompt(str(idx), prompt_length=60)[1]
        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        scheduler._swap_out(seq_group, blocks_to_swap_out)
        swapped.append(seq_group)

    # Only two of the four swapped groups fit under max_num_seqs=2.
    budget = create_token_budget(max_num_seqs=2)
    remaining_swapped, output = scheduler._schedule_swapped(
        swapped, budget, None, fcfs_policy)
    assert len(remaining_swapped) == 2
    assert budget.num_batched_tokens == 2
    assert budget.num_curr_seqs == 2
    assert len(output.decode_seq_groups) == 2
    assert len(output.prefill_seq_groups) == 0

    # Verify num_curr_seqs are respected: the same, already full budget
    # admits nothing more.
    remaining_swapped, output = scheduler._schedule_swapped(
        remaining_swapped, budget, None, fcfs_policy)
    assert len(remaining_swapped) == 2
    assert budget.num_batched_tokens == 2
    assert budget.num_curr_seqs == 2
    assert len(output.decode_seq_groups) == 0
    assert len(output.prefill_seq_groups) == 0
def test_schedule_swapped_max_loras():
    """Verify swap-in scheduling admits only max_loras distinct LoRAs."""
    scheduler = initialize_scheduler(
        lora_config=LoRAConfig(max_lora_rank=8, max_loras=1))
    fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = set()
    blocks_to_swap_out = {}
    swapped = deque()
    # Two swapped-out requests, each with its own LoRA (int ids 1 and 2).
    for idx in range(2):
        lora_request = LoRARequest(lora_name=str(idx),
                                   lora_int_id=idx + 1,
                                   lora_local_path="abc")
        seq_group = create_dummy_prompt(str(idx),
                                        prompt_length=60,
                                        lora_request=lora_request)[1]
        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        scheduler._swap_out(seq_group, blocks_to_swap_out)
        swapped.append(seq_group)

    budget = create_token_budget()
    remaining_swapped, output = scheduler._schedule_swapped(
        swapped, budget, curr_loras, fcfs_policy)
    assert len(remaining_swapped) == 1
    assert budget.num_batched_tokens == 1
    assert budget.num_curr_seqs == 1
    assert len(output.decode_seq_groups) == 1
    assert len(output.prefill_seq_groups) == 0
    assert len(curr_loras) == 1
def test_schedule_swapped_cannot_swap_in():
    """Verify nothing is scheduled when the block manager refuses swap-in."""
    scheduler = initialize_scheduler()
    fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs")
    blocks_to_swap_out = {}
    swapped = deque()
    for _ in range(2):
        seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)[1]
        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        scheduler._swap_out(seq_group, blocks_to_swap_out)
        swapped.append(seq_group)

    # Make every swap-in attempt fail.
    scheduler.block_manager.can_swap_in = MagicMock(return_value=False)

    # Since we cannot swap in, none of the requests are swapped in.
    budget = create_token_budget()
    remaining_swapped, output = scheduler._schedule_swapped(
        swapped, budget, None, fcfs_policy)
    assert len(remaining_swapped) == 2
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 0
    assert len(output.decode_seq_groups) == 0
    assert len(output.prefill_seq_groups) == 0
def test_schedule_swapped_blocks_to_copy():
    """Verify blocks_to_copy from append_slots is reported on swap-in."""
    scheduler = initialize_scheduler()
    fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs")
    seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)[1]
    scheduler._allocate_and_set_running(seq_group)
    append_new_token_seq_group(60, seq_group, 1)
    blocks_to_swap_out = {}
    scheduler._swap_out(seq_group, blocks_to_swap_out)
    swapped = deque([seq_group])

    # Make append_slots report a copy from block 2 to block 3.
    scheduler.block_manager.append_slots = MagicMock(return_value={2: [3]})

    budget = create_token_budget()
    remaining_swapped, output = scheduler._schedule_swapped(
        swapped, budget, None, fcfs_policy)
    assert len(remaining_swapped) == 0
    assert len(output.decode_seq_groups) == 1
    assert len(output.prefill_seq_groups) == 0
    assert output.blocks_to_copy == {2: [3]}
def test_scheduling_budget():
    """Exercise SchedulingBudget limits and its per-request accounting."""
    TOKEN_BUDGET = 4
    MAX_SEQS = 4
    budget = SchedulingBudget(token_budget=TOKEN_BUDGET, max_num_seqs=MAX_SEQS)

    # (num_new_tokens, num_new_seqs, schedulable?) against an empty budget.
    empty_budget_cases = (
        (1, 1, True),
        (4, 4, True),
        (1, 5, False),
        (5, 1, False),
        (5, 5, False),
    )
    for new_tokens, new_seqs, schedulable in empty_budget_cases:
        verdict = budget.can_schedule(num_new_tokens=new_tokens,
                                      num_new_seqs=new_seqs)
        assert bool(verdict) == schedulable
    assert budget.remaining_token_budget() == TOKEN_BUDGET

    # Verify add/subtract num batched tokens.
    request_id = create_dummy_prompt("1", 3)[1].request_id
    budget.add_num_batched_tokens(request_id, 2)
    assert budget.remaining_token_budget() == 2
    assert budget.num_batched_tokens == 2
    assert budget.can_schedule(num_new_tokens=2, num_new_seqs=1)
    assert not budget.can_schedule(num_new_tokens=3, num_new_seqs=1)
    # Adding tokens again for the same request id is a no-op.
    budget.add_num_batched_tokens(request_id, 2)
    assert budget.remaining_token_budget() == 2
    assert budget.num_batched_tokens == 2
    # Subtracting twice for the same request id only takes effect once.
    for _ in range(2):
        budget.subtract_num_batched_tokens(request_id, 2)
        assert budget.remaining_token_budget() == 4
        assert budget.num_batched_tokens == 0

    # Verify add/subtract max seqs.
    request_id = create_dummy_prompt("1", 3)[1].request_id
    budget.add_num_seqs(request_id, 2)
    assert budget.can_schedule(num_new_tokens=1, num_new_seqs=2)
    assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=3)
    assert budget.num_curr_seqs == 2
    # Adding sequences again for the same request id is a no-op.
    budget.add_num_seqs(request_id, 2)
    assert budget.num_curr_seqs == 2
    # Subtracting twice for the same request id only takes effect once.
    for _ in range(2):
        budget.subtract_num_seqs(request_id, 2)
        assert budget.num_curr_seqs == 0
tests/core/utils.py
View file @
99b471c2
import
time
import
time
from
typing
import
Tuple
from
typing
import
Iterable
,
Optional
,
Tuple
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
Logprob
,
Sequence
,
SequenceGroup
from
vllm.sequence
import
Logprob
,
Sequence
,
SequenceGroup
def
create_dummy_prompt
(
def
create_dummy_prompt
(
request_id
:
str
,
request_id
:
str
,
prompt_length
:
int
,
prompt_length
:
int
,
block_size
:
int
=
None
)
->
Tuple
[
Sequence
,
SequenceGroup
]:
block_size
:
Optional
[
int
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
use_beam_search
:
bool
=
False
,
best_of
:
int
=
1
,
)
->
Tuple
[
Sequence
,
SequenceGroup
]:
if
not
block_size
:
if
not
block_size
:
block_size
=
prompt_length
block_size
=
prompt_length
...
@@ -17,22 +22,27 @@ def create_dummy_prompt(
...
@@ -17,22 +22,27 @@ def create_dummy_prompt(
prompt_tokens
=
list
(
range
(
prompt_length
))
prompt_tokens
=
list
(
range
(
prompt_length
))
prompt_str
=
" "
.
join
([
str
(
t
)
for
t
in
prompt_tokens
])
prompt_str
=
" "
.
join
([
str
(
t
)
for
t
in
prompt_tokens
])
prompt
=
Sequence
(
int
(
request_id
),
prompt_str
,
prompt_tokens
,
block_size
)
prompt
=
Sequence
(
int
(
request_id
),
prompt_str
,
prompt_tokens
,
block_size
)
seq_group
=
SequenceGroup
(
request_id
,
[
prompt
],
SamplingParams
(),
seq_group
=
SequenceGroup
(
time
.
time
(),
None
)
request_id
,
[
prompt
],
SamplingParams
(
use_beam_search
=
use_beam_search
,
best_of
=
best_of
),
time
.
time
(),
lora_request
)
return
prompt
,
seq_group
return
prompt
,
seq_group
def
create_seq_group
(
def
create_seq_group
(
seq_prompt_len
s
=
1024
,
seq_prompt_len
:
int
=
1024
,
seq_output_lens
=
(
128
,
),
seq_output_lens
:
Iterable
[
int
]
=
(
128
,
),
request_id
=
'0'
,
request_id
:
str
=
'0'
,
seq_id_start
=
0
,
seq_id_start
:
int
=
0
,
)
->
SequenceGroup
:
sampling_params
:
Optional
[
SamplingParams
]
=
None
)
->
SequenceGroup
:
assert
len
(
seq_output_lens
)
>
0
assert
len
(
seq_output_lens
)
>
0
prompt_token_ids
=
[
0
]
*
seq_prompt_lens
if
sampling_params
is
None
:
sampling_params
=
SamplingParams
()
prompt_token_ids
=
[
0
]
*
seq_prompt_len
seqs
=
[]
seqs
=
[]
for
seq_id_offset
,
output_len
in
enumerate
(
seq_output_lens
):
for
seq_id_offset
,
output_len
in
enumerate
(
seq_output_lens
):
...
@@ -53,7 +63,7 @@ def create_seq_group(
...
@@ -53,7 +63,7 @@ def create_seq_group(
seq_group
=
SequenceGroup
(
seq_group
=
SequenceGroup
(
request_id
=
request_id
,
request_id
=
request_id
,
seqs
=
seqs
,
seqs
=
seqs
,
sampling_params
=
S
ampling
P
arams
()
,
sampling_params
=
s
ampling
_p
arams
,
arrival_time
=
time
.
time
(),
arrival_time
=
time
.
time
(),
)
)
...
...
tests/distributed/test_basic_distributed_correctness.py
View file @
99b471c2
...
@@ -33,11 +33,16 @@ def test_models(
...
@@ -33,11 +33,16 @@ def test_models(
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
)
->
None
:
)
->
None
:
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
)
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
del
vllm_model
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment