Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1591c68f
Commit
1591c68f
authored
May 25, 2024
by
zhuwenwen
Browse files
merge v0.4.2
parents
09bcf00b
c7f2cf2b
Changes
265
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1039 additions
and
47 deletions
+1039
-47
requirements-cpu.txt
requirements-cpu.txt
+1
-1
requirements-cuda.txt
requirements-cuda.txt
+2
-2
requirements-dev.txt
requirements-dev.txt
+1
-3
setup.py
setup.py
+32
-17
tests/async_engine/test_async_llm_engine.py
tests/async_engine/test_async_llm_engine.py
+2
-0
tests/async_engine/test_chat_template.py
tests/async_engine/test_chat_template.py
+14
-11
tests/async_engine/test_merge_async_iterators.py
tests/async_engine/test_merge_async_iterators.py
+41
-0
tests/async_engine/test_openapi_server_ray.py
tests/async_engine/test_openapi_server_ray.py
+157
-0
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+11
-1
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+0
-1
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+223
-0
tests/conftest.py
tests/conftest.py
+2
-1
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+146
-0
tests/core/block/test_prefix_caching_block.py
tests/core/block/test_prefix_caching_block.py
+125
-0
tests/core/test_block_manager.py
tests/core/test_block_manager.py
+1
-1
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+3
-2
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+29
-1
tests/distributed/test_basic_distributed_correctness.py
tests/distributed/test_basic_distributed_correctness.py
+9
-5
tests/distributed/test_pynccl.py
tests/distributed/test_pynccl.py
+64
-1
tests/engine/test_multiproc_workers.py
tests/engine/test_multiproc_workers.py
+176
-0
No files found.
requirements-cpu.txt
View file @
1591c68f
...
...
@@ -2,5 +2,5 @@
-r requirements-common.txt
# Dependencies for x86_64 CPUs
torch == 2.
2.1
+cpu
torch == 2.
3.0
+cpu
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
requirements-cuda.txt
View file @
1591c68f
...
...
@@ -5,5 +5,5 @@
ray >= 2.9
nvidia-ml-py # for pynvml package
vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library
torch == 2.
2.1
xformers == 0.0.2
5
# Requires PyTorch 2.
2.1
torch == 2.
3.0
xformers == 0.0.2
6.post1
# Requires PyTorch 2.
3.0
requirements-dev.txt
View file @
1591c68f
...
...
@@ -14,19 +14,17 @@ types-setuptools
# testing
pytest
tensorizer==2.9.0
a0
tensorizer==2.9.0
pytest-forked
pytest-asyncio
pytest-rerunfailures
pytest-shard
httpx
einops # required for MPT
openai
requests
ray
peft
awscli
ai2-olmo # required for OLMo
# Benchmarking
aiohttp
...
...
setup.py
View file @
1591c68f
import
importlib.util
import
io
import
logging
import
os
...
...
@@ -17,10 +18,23 @@ from typing import Optional, Union
import
subprocess
from
pathlib
import
Path
def
load_module_from_path
(
module_name
,
path
):
spec
=
importlib
.
util
.
spec_from_file_location
(
module_name
,
path
)
module
=
importlib
.
util
.
module_from_spec
(
spec
)
sys
.
modules
[
module_name
]
=
module
spec
.
loader
.
exec_module
(
module
)
return
module
ROOT_DIR
=
os
.
path
.
dirname
(
__file__
)
logger
=
logging
.
getLogger
(
__name__
)
# Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
VLLM_TARGET_DEVICE
=
os
.
getenv
(
"VLLM_TARGET_DEVICE"
,
"cuda"
)
# cannot import envs directly because it depends on vllm,
# which is not installed yet
envs
=
load_module_from_path
(
'envs'
,
os
.
path
.
join
(
ROOT_DIR
,
'vllm'
,
'envs.py'
))
VLLM_TARGET_DEVICE
=
envs
.
VLLM_TARGET_DEVICE
# vLLM only supports Linux platform
assert
sys
.
platform
.
startswith
(
...
...
@@ -64,10 +78,10 @@ class cmake_build_ext(build_ext):
def
compute_num_jobs
(
self
):
# `num_jobs` is either the value of the MAX_JOBS environment variable
# (if defined) or the number of CPUs available.
num_jobs
=
os
.
environ
.
get
(
"MAX_JOBS"
,
None
)
num_jobs
=
envs
.
MAX_JOBS
if
num_jobs
is
not
None
:
num_jobs
=
int
(
num_jobs
)
logger
.
info
(
f
"Using MAX_JOBS=
{
num_jobs
}
as the number of jobs."
)
logger
.
info
(
"Using MAX_JOBS=
%d
as the number of jobs."
,
num_jobs
)
else
:
try
:
# os.sched_getaffinity() isn't universally available, so fall
...
...
@@ -82,11 +96,12 @@ class cmake_build_ext(build_ext):
# environment variable (if defined) or 1.
# when it is set, we reduce `num_jobs` to avoid
# overloading the system.
nvcc_threads
=
os
.
get
env
(
"
NVCC_THREADS
"
,
None
)
nvcc_threads
=
env
s
.
NVCC_THREADS
if
nvcc_threads
is
not
None
:
nvcc_threads
=
int
(
nvcc_threads
)
logger
.
info
(
f
"Using NVCC_THREADS=
{
nvcc_threads
}
as the number"
" of nvcc threads."
)
logger
.
info
(
"Using NVCC_THREADS=%d as the number of nvcc threads."
,
nvcc_threads
)
else
:
nvcc_threads
=
1
num_jobs
=
max
(
1
,
num_jobs
//
nvcc_threads
)
...
...
@@ -107,7 +122,7 @@ class cmake_build_ext(build_ext):
# Select the build type.
# Note: optimization level + debug info are set by the build type
default_cfg
=
"Debug"
if
self
.
debug
else
"RelWithDebInfo"
cfg
=
os
.
get
env
(
"
CMAKE_BUILD_TYPE
"
,
default_cfg
)
cfg
=
env
s
.
CMAKE_BUILD_TYPE
or
default_cfg
# where .so files will be written, should be the same for all extensions
# that use the same CMakeLists.txt.
...
...
@@ -121,7 +136,7 @@ class cmake_build_ext(build_ext):
'-DVLLM_TARGET_DEVICE={}'
.
format
(
VLLM_TARGET_DEVICE
),
]
verbose
=
bool
(
int
(
os
.
get
env
(
'
VERBOSE
'
,
'0'
)))
verbose
=
env
s
.
VERBOSE
if
verbose
:
cmake_args
+=
[
'-DCMAKE_VERBOSE_MAKEFILE=ON'
]
...
...
@@ -208,8 +223,7 @@ def _is_neuron() -> bool:
subprocess
.
run
([
"neuron-ls"
],
capture_output
=
True
,
check
=
True
)
except
(
FileNotFoundError
,
PermissionError
,
subprocess
.
CalledProcessError
):
torch_neuronx_installed
=
False
return
torch_neuronx_installed
or
os
.
environ
.
get
(
"VLLM_BUILD_WITH_NEURON"
,
False
)
return
torch_neuronx_installed
or
envs
.
VLLM_BUILD_WITH_NEURON
def
_is_cpu
()
->
bool
:
...
...
@@ -217,7 +231,7 @@ def _is_cpu() -> bool:
def
_install_punica
()
->
bool
:
return
bool
(
int
(
os
.
get
env
(
"
VLLM_INSTALL_PUNICA_KERNELS
"
,
"0"
)))
return
env
s
.
VLLM_INSTALL_PUNICA_KERNELS
def
get_hipcc_rocm_version
():
...
...
@@ -333,8 +347,8 @@ def get_version_add(sha: Optional[str] = None) -> str:
version
+=
".torch"
+
torch
.
__version__
[:
5
]
with
open
(
add_version_path
,
encoding
=
"utf-8"
,
mode
=
"w"
)
as
file
:
file
.
write
(
"__version__='0.4.
0
'
\n
"
)
file
.
write
(
"__dcu_version__='0.4.
0
+{}'
\n
"
.
format
(
version
))
file
.
write
(
"__version__='0.4.
2
'
\n
"
)
file
.
write
(
"__dcu_version__='0.4.
2
+{}'
\n
"
.
format
(
version
))
file
.
close
()
...
...
@@ -435,7 +449,8 @@ if not _is_neuron():
package_data
=
{
"vllm"
:
[
"py.typed"
,
"model_executor/layers/fused_moe/configs/*.json"
]
}
if
os
.
environ
.
get
(
"VLLM_USE_PRECOMPILED"
):
if
envs
.
VLLM_USE_PRECOMPILED
:
ext_modules
=
[]
package_data
[
"vllm"
].
append
(
"*.so"
)
setup
(
...
...
@@ -461,12 +476,12 @@ setup(
"Topic :: Scientific/Engineering :: Artificial Intelligence"
,
],
packages
=
find_packages
(
exclude
=
(
"benchmarks"
,
"csrc"
,
"docs"
,
"examples"
,
"tests"
)),
"tests
*
"
)),
python_requires
=
">=3.8"
,
install_requires
=
get_requirements
(),
ext_modules
=
ext_modules
,
extras_require
=
{
"tensorizer"
:
[
"tensorizer==2.9.0
a1
"
],
"tensorizer"
:
[
"tensorizer==2.9.0"
],
},
cmdclass
=
{
"build_ext"
:
cmake_build_ext
}
if
not
_is_neuron
()
else
{},
package_data
=
package_data
,
...
...
tests/async_engine/test_async_llm_engine.py
View file @
1591c68f
...
...
@@ -91,4 +91,6 @@ async def test_new_requests_event():
assert
engine
.
engine
.
step_calls
==
old_step_calls
+
1
engine
=
MockAsyncLLMEngine
(
worker_use_ray
=
True
,
engine_use_ray
=
True
)
assert
engine
.
get_model_config
()
is
not
None
assert
engine
.
get_tokenizer
()
is
not
None
assert
engine
.
get_decoding_config
()
is
not
None
tests/async_engine/test_chat_template.py
View file @
1591c68f
...
...
@@ -60,12 +60,13 @@ class MockServingChat:
tokenizer
:
MockTokenizer
def
test_load_chat_template
():
@
pytest
.
mark
.
asyncio
async
def
test_load_chat_template
():
# Testing chatml template
tokenizer
=
MockTokenizer
()
mock_serving_chat
=
MockServingChat
(
tokenizer
)
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
chatml_jinja_path
)
await
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
chatml_jinja_path
)
template_content
=
tokenizer
.
chat_template
...
...
@@ -76,7 +77,8 @@ def test_load_chat_template():
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant
\\
n' }}{% endif %}"""
# noqa: E501
def
test_no_load_chat_template_filelike
():
@
pytest
.
mark
.
asyncio
async
def
test_no_load_chat_template_filelike
():
# Testing a template path that does not exist
template
=
"../../examples/does_not_exist"
tokenizer
=
MockTokenizer
()
...
...
@@ -84,18 +86,19 @@ def test_no_load_chat_template_filelike():
mock_serving_chat
=
MockServingChat
(
tokenizer
)
with
pytest
.
raises
(
ValueError
,
match
=
"looks like a file path"
):
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
await
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
def
test_no_load_chat_template_literallike
():
@
pytest
.
mark
.
asyncio
async
def
test_no_load_chat_template_literallike
():
# Testing a literal template string
template
=
"{{ messages }}"
tokenizer
=
MockTokenizer
()
mock_serving_chat
=
MockServingChat
(
tokenizer
)
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
await
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
template_content
=
tokenizer
.
chat_template
assert
template_content
==
template
...
...
@@ -110,8 +113,8 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
# Initialize the tokenizer
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model
)
mock_serving_chat
=
MockServingChat
(
tokenizer
)
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
await
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
# Create a mock request object using keyword arguments
mock_request
=
ChatCompletionRequest
(
...
...
tests/async_engine/test_merge_async_iterators.py
0 → 100644
View file @
1591c68f
import
asyncio
from
typing
import
AsyncIterator
,
Tuple
import
pytest
from
vllm.utils
import
merge_async_iterators
@
pytest
.
mark
.
asyncio
async
def
test_merge_async_iterators
():
async
def
mock_async_iterator
(
idx
:
int
)
->
AsyncIterator
[
str
]:
try
:
while
True
:
yield
f
"item from iterator
{
idx
}
"
await
asyncio
.
sleep
(
0.1
)
except
asyncio
.
CancelledError
:
pass
iterators
=
[
mock_async_iterator
(
i
)
for
i
in
range
(
3
)]
merged_iterator
:
AsyncIterator
[
Tuple
[
int
,
str
]]
=
merge_async_iterators
(
*
iterators
)
async
def
stream_output
(
generator
:
AsyncIterator
[
Tuple
[
int
,
str
]]):
async
for
idx
,
output
in
generator
:
print
(
f
"idx:
{
idx
}
, output:
{
output
}
"
)
task
=
asyncio
.
create_task
(
stream_output
(
merged_iterator
))
await
asyncio
.
sleep
(
0.5
)
task
.
cancel
()
with
pytest
.
raises
(
asyncio
.
CancelledError
):
await
task
for
iterator
in
iterators
:
try
:
await
asyncio
.
wait_for
(
anext
(
iterator
),
1
)
except
StopAsyncIteration
:
# All iterators should be cancelled and print this message.
print
(
"Iterator was cancelled normally"
)
except
(
Exception
,
asyncio
.
CancelledError
)
as
e
:
raise
AssertionError
()
from
e
tests/async_engine/test_openapi_server_ray.py
0 → 100644
View file @
1591c68f
# imports for the OpenAI API server tests (Ray engine)
import
os
import
subprocess
import
sys
import
time
import
openai
# use the official client for correctness check
import
pytest
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import
ray
import
requests
MAX_SERVER_START_WAIT_S
=
600
# wait up to 600 seconds for the server to start
# any model with a chat template should work here
MODEL_NAME
=
"facebook/opt-125m"
@
ray
.
remote
(
num_gpus
=
1
)
class
ServerRunner
:
def
__init__
(
self
,
args
):
env
=
os
.
environ
.
copy
()
env
[
"PYTHONUNBUFFERED"
]
=
"1"
self
.
proc
=
subprocess
.
Popen
(
[
"python3"
,
"-m"
,
"vllm.entrypoints.openai.api_server"
]
+
args
,
env
=
env
,
stdout
=
sys
.
stdout
,
stderr
=
sys
.
stderr
,
)
self
.
_wait_for_server
()
def
ready
(
self
):
return
True
def
_wait_for_server
(
self
):
# run health check
start
=
time
.
time
()
while
True
:
try
:
if
requests
.
get
(
"http://localhost:8000/health"
).
status_code
==
200
:
break
except
Exception
as
err
:
if
self
.
proc
.
poll
()
is
not
None
:
raise
RuntimeError
(
"Server exited unexpectedly."
)
from
err
time
.
sleep
(
0.5
)
if
time
.
time
()
-
start
>
MAX_SERVER_START_WAIT_S
:
raise
RuntimeError
(
"Server failed to start in time."
)
from
err
def
__del__
(
self
):
if
hasattr
(
self
,
"proc"
):
self
.
proc
.
terminate
()
@
pytest
.
fixture
(
scope
=
"session"
)
def
server
():
ray
.
init
()
server_runner
=
ServerRunner
.
remote
([
"--model"
,
MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--max-model-len"
,
"2048"
,
"--enforce-eager"
,
"--engine-use-ray"
])
ray
.
get
(
server_runner
.
ready
.
remote
())
yield
server_runner
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"session"
)
def
client
():
client
=
openai
.
AsyncOpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"token-abc123"
,
)
yield
client
@
pytest
.
mark
.
asyncio
async
def
test_check_models
(
server
,
client
:
openai
.
AsyncOpenAI
):
models
=
await
client
.
models
.
list
()
models
=
models
.
data
served_model
=
models
[
0
]
assert
served_model
.
id
==
MODEL_NAME
assert
all
(
model
.
root
==
MODEL_NAME
for
model
in
models
)
@
pytest
.
mark
.
asyncio
async
def
test_single_completion
(
server
,
client
:
openai
.
AsyncOpenAI
):
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
temperature
=
0.0
)
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
completion
.
choices
[
0
].
text
)
>=
5
assert
completion
.
choices
[
0
].
finish_reason
==
"length"
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
5
,
prompt_tokens
=
6
,
total_tokens
=
11
)
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
completion
.
choices
[
0
].
text
)
>=
5
@
pytest
.
mark
.
asyncio
async
def
test_single_chat_session
(
server
,
client
:
openai
.
AsyncOpenAI
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"what is 1+1?"
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
assert
chat_completion
.
id
is
not
None
assert
chat_completion
.
choices
is
not
None
and
len
(
chat_completion
.
choices
)
==
1
assert
chat_completion
.
choices
[
0
].
message
is
not
None
assert
chat_completion
.
choices
[
0
].
logprobs
is
not
None
assert
chat_completion
.
choices
[
0
].
logprobs
.
top_logprobs
is
not
None
assert
len
(
chat_completion
.
choices
[
0
].
logprobs
.
top_logprobs
[
0
])
==
5
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
10
assert
message
.
role
==
"assistant"
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
message
.
content
})
# test multi-turn dialogue
messages
.
append
({
"role"
:
"user"
,
"content"
:
"express your result in json"
})
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
10
,
)
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
tests/basic_correctness/test_basic_correctness.py
View file @
1591c68f
...
...
@@ -2,12 +2,15 @@
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
"""
import
os
import
pytest
MODELS
=
[
"facebook/opt-125m"
,
"meta-llama/Llama-2-7b-hf"
,
]
VLLM_ATTENTION_BACKEND
=
"VLLM_ATTENTION_BACKEND"
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
...
@@ -23,11 +26,18 @@ def test_models(
max_tokens
:
int
,
enforce_eager
:
bool
,
)
->
None
:
backend_by_env_var
=
os
.
getenv
(
VLLM_ATTENTION_BACKEND
)
if
backend_by_env_var
==
"FLASHINFER"
and
enforce_eager
is
False
:
pytest
.
skip
(
"Skipping non-eager test for FlashInferBackend."
)
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
)
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
...
...
tests/basic_correctness/test_chunked_prefill.py
View file @
1591c68f
...
...
@@ -55,7 +55,6 @@ def test_models(
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
print
(
vllm_outputs
[
0
])
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
...
tests/basic_correctness/test_preemption.py
0 → 100644
View file @
1591c68f
"""Compare the short outputs of HF and vLLM when using greedy sampling.
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
pytest tests/basic_correctness/test_preemption.py`.
"""
import
pytest
from
vllm
import
SamplingParams
from
vllm.core.scheduler
import
(
ARTIFICIAL_PREEMPTION_MAX_CNT
,
ENABLE_ARTIFICIAL_PREEMPT
)
MODELS
=
[
"facebook/opt-125m"
,
]
assert
ENABLE_ARTIFICIAL_PREEMPT
is
True
,
(
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
"tests/basic_correctness/test_preemption.py`"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"chunked_prefill_token_size"
,
[
16
])
def
test_chunked_prefill_recompute
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
chunked_prefill_token_size
:
int
,
)
->
None
:
"""Ensure that chunked prefill works with preemption."""
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
enable_chunked_prefill
=
False
max_num_batched_tokens
=
None
if
chunked_prefill_token_size
!=
-
1
:
enable_chunked_prefill
=
True
max_num_batched_tokens
=
chunked_prefill_token_size
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_seqs
=
max_num_seqs
,
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
.
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
del
vllm_model
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
assert
hf_output_str
==
vllm_output_str
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
def
test_preemption
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
"""By default, recompute preemption is enabled"""
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
.
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
del
vllm_model
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
assert
hf_output_str
==
vllm_output_str
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"beam_width"
,
[
4
])
def
test_swap
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
beam_width
:
int
,
)
->
None
:
"""Using beam search enables swapping."""
example_prompts
=
example_prompts
[:
1
]
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_beam_search
(
example_prompts
,
beam_width
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
swap_space
=
10
)
vllm_outputs
=
vllm_model
.
generate_beam_search
(
example_prompts
,
beam_width
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
.
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
del
vllm_model
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
_
=
hf_outputs
[
i
]
vllm_output_ids
,
_
=
vllm_outputs
[
i
]
assert
len
(
hf_output_ids
)
==
len
(
vllm_output_ids
)
for
j
in
range
(
len
(
hf_output_ids
)):
assert
hf_output_ids
[
j
]
==
vllm_output_ids
[
j
],
(
f
"Test
{
i
}
output
{
j
}
:
\n
HF:
{
hf_output_ids
}
\n
"
f
"vLLM:
{
vllm_output_ids
}
"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"beam_width"
,
[
4
])
def
test_swap_infeasible
(
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
beam_width
:
int
,
)
->
None
:
"""Verify infeasible swap request will be ignored."""
BLOCK_SIZE
=
16
prefill_blocks
=
2
decode_blocks
=
max_tokens
//
BLOCK_SIZE
example_prompts
=
example_prompts
[:
1
]
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
swap_space
=
10
,
block_size
=
BLOCK_SIZE
,
# Since beam search has more than 1 sequence, prefill + decode blocks
# are not enough to finish.
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
,
max_model_len
=
(
prefill_blocks
+
decode_blocks
)
*
BLOCK_SIZE
,
)
sampling_params
=
SamplingParams
(
n
=
beam_width
,
use_beam_search
=
True
,
temperature
=
0.0
,
max_tokens
=
max_tokens
,
ignore_eos
=
True
)
req_outputs
=
vllm_model
.
model
.
generate
(
example_prompts
,
sampling_params
=
sampling_params
,
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
.
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
del
vllm_model
# Verify the request is ignored and does not hang.
assert
req_outputs
[
0
].
outputs
[
0
].
finish_reason
==
"length"
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
def
test_preemption_infeasible
(
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
"""Verify infeasible preemption request will be ignored."""
BLOCK_SIZE
=
16
prefill_blocks
=
2
decode_blocks
=
max_tokens
//
BLOCK_SIZE
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
block_size
=
BLOCK_SIZE
,
# Not enough gpu blocks to complete a single sequence.
# preemption should happen, and the sequence should be
# ignored instead of hanging forever.
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
//
2
,
max_model_len
=
((
prefill_blocks
+
decode_blocks
//
2
)
*
BLOCK_SIZE
),
)
sampling_params
=
SamplingParams
(
max_tokens
=
max_tokens
,
ignore_eos
=
True
)
req_outputs
=
vllm_model
.
model
.
generate
(
example_prompts
,
sampling_params
=
sampling_params
,
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
.
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
del
vllm_model
# Verify the requests are ignored and do not hang.
for
req_output
in
req_outputs
:
outputs
=
req_output
.
outputs
assert
len
(
outputs
)
==
1
assert
outputs
[
0
].
finish_reason
==
"length"
tests/conftest.py
View file @
1591c68f
...
...
@@ -296,6 +296,7 @@ class VllmRunner:
tensor_parallel_size
:
int
=
1
,
block_size
:
int
=
16
,
enable_chunked_prefill
:
bool
=
False
,
swap_space
=
4
,
**
kwargs
,
)
->
None
:
self
.
model
=
LLM
(
...
...
@@ -303,7 +304,7 @@ class VllmRunner:
tokenizer
=
tokenizer_name
,
trust_remote_code
=
True
,
dtype
=
dtype
,
swap_space
=
0
,
swap_space
=
swap_space
,
disable_log_stats
=
disable_log_stats
,
tensor_parallel_size
=
tensor_parallel_size
,
max_model_len
=
max_model_len
,
...
...
tests/core/block/e2e/test_correctness.py
View file @
1591c68f
...
...
@@ -300,6 +300,152 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
assert
baseline_token_ids
==
test_token_ids
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Use a small model for a fast test.
"model"
:
"facebook/opt-125m"
,
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
# Enable prefill cache
"enable_prefix_caching"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"use_v2_block_manager"
:
False
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
"""Verify block manager v2 produces same outputs as block manager v1, even
when there is preemption.
This constructs two LLMs, each with a limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted in the v2 block manager.
NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
"""
output_len
=
1024
temperature
=
0.0
# We want to ensure equality even with preemption.
# The GPU block budget is capped at 5 * (1 + cdiv(output_len, block_size))
# so that only about five sequences can fit at a time (once the sequences
# grow).
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
prompts
=
[
prompt
for
prompt
,
_
in
zip
(
cycle
(
prompts
),
range
(
batch_size
))]
sampling_params
=
SamplingParams
(
max_tokens
=
output_len
,
ignore_eos
=
True
,
temperature
=
temperature
,
)
print
(
'Getting token ids from block manager v1'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids from block manager v2'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
for
expected_token_ids
,
actual_token_ids
in
zip
(
baseline_token_ids
,
test_token_ids
):
assert
expected_token_ids
==
actual_token_ids
assert
baseline_token_ids
==
test_token_ids
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Use a small model for a fast test.
"model"
:
"facebook/opt-125m"
,
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
# Test APC in v2 block
"use_v2_block_manager"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"enable_prefix_caching"
:
False
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"enable_prefix_caching"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_auto_prefix_caching_with_preemption
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
"""Verify block manager v2 with auto prefix caching enabled produces same
outputs as auto prefix caching disabled, even when there is preemption.
This constructs two LLMs, each with a limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that auto
prefix caching itself at least doesn't produce incorrect results.
"""
output_len
=
1024
temperature
=
0.0
# We want to ensure equality even with preemption.
# The GPU block budget is capped at 5 * (1 + cdiv(output_len, block_size))
# so that only about five sequences can fit at a time (once the sequences
# grow).
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
prompts
=
[
prompt
for
prompt
,
_
in
zip
(
cycle
(
prompts
),
range
(
batch_size
))]
sampling_params
=
SamplingParams
(
max_tokens
=
output_len
,
ignore_eos
=
True
,
temperature
=
temperature
,
)
print
(
'Getting token ids with APC disabled'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids with APC enabled'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
for
expected_token_ids
,
actual_token_ids
in
zip
(
baseline_token_ids
,
test_token_ids
):
assert
expected_token_ids
==
actual_token_ids
assert
baseline_token_ids
==
test_token_ids
def
get_token_ids_from_llm_generator
(
llm_generator
,
prompts
,
sampling_params
):
for
llm
in
llm_generator
:
outputs
=
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
...
...
tests/core/block/test_prefix_caching_block.py
View file @
1591c68f
...
...
@@ -358,6 +358,131 @@ class TestPrefixCachingBlockAllocator:
i
)
allocator
.
free
(
block
)
@
staticmethod
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
1024
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
20
)))
def
test_get_common_computed_block_ids
(
num_blocks
:
int
,
block_size
:
int
,
seed
:
int
):
"""Verify that get_common_computed_block_ids returns the correct result
by creating two immutable chains that share a prefix up to a given
position and checking that the number of common computed blocks it
reports matches that shared prefix.
"""
random
.
seed
(
seed
)
allocator
=
PrefixCachingBlockAllocator
(
num_blocks
=
num_blocks
*
2
,
block_size
=
block_size
)
num_blocks_to_consume
=
random
.
randint
(
1
,
num_blocks
-
1
)
# Create token ids that fill exactly num_blocks_to_consume blocks.
token_ids
=
list
(
range
(
num_blocks_to_consume
*
block_size
))
blocks
=
list
(
range
(
num_blocks_to_consume
))
first_chain
=
TestPrefixCachingBlockAllocator
.
create_immutable_chain
(
block_size
=
block_size
,
token_ids
=
token_ids
,
allocator
=
allocator
,
)
# mark all blocks in first chain as computed
allocator
.
mark_blocks_as_computed
(
blocks
)
# From zero_point onward, second_chain's token ids are set to -1, which
# makes it differ from first_chain from that position on.
zero_point
=
random
.
randint
(
1
,
len
(
token_ids
)
-
1
)
zero_point_blocks
=
zero_point
//
block_size
token_ids
[
zero_point
:]
=
[
-
1
]
*
(
len
(
token_ids
)
-
zero_point
)
second_chain
=
TestPrefixCachingBlockAllocator
.
create_immutable_chain
(
block_size
=
block_size
,
token_ids
=
token_ids
,
allocator
=
allocator
,
)
first_computed_ids
=
[
first_chain
[
i
].
block_id
for
i
in
range
(
num_blocks_to_consume
)
]
second_computed_ids
=
[
second_chain
[
i
].
block_id
for
i
in
range
(
num_blocks_to_consume
)
]
res
=
allocator
.
get_common_computed_block_ids
(
[
first_computed_ids
,
second_computed_ids
])
assert
(
len
(
res
)
==
zero_point_blocks
)
# Test case where two last accessed times are equal
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_eviction_order(num_blocks: int, block_size: int, seed: int):
    """Check which freed block is reused when the allocator is exhausted.

    Two chains are created and together consume every block. The first
    (two blocks) is stamped with access time 1 and freed, then the second
    is stamped with access time 2 and freed. The next allocation is
    expected to reuse the *last* block of the first chain: per the test's
    intent, the first chain is the least recently accessed, and among its
    two blocks the last one covers the larger token count.
    """
    random.seed(seed)
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)

    # Tokens for one block more than the allocator holds; the final block's
    # worth is kept back for the allocation at the end of the test.
    token_ids = list(range((num_blocks + 1) * block_size))

    first_chain_blocks = 2
    first_chain_tokens = block_size * first_chain_blocks

    # The first chain takes the first two blocks' worth of tokens.
    first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids[:first_chain_tokens],
        allocator=allocator,
    )

    # Only the first chain's blocks are allocated at this point.
    assert allocator.get_num_free_blocks() == (num_blocks -
                                               first_chain_blocks)

    # Stamp every block of the first chain with access time 1.
    allocator.mark_blocks_as_accessed(
        [blk.block_id for blk in first_chain], 1)

    # The second chain takes everything except the held-back final block.
    second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids[first_chain_tokens:-block_size],
        allocator=allocator,
    )

    # Both chains together leave no free block.
    assert allocator.get_num_free_blocks() == (0)

    assert len(first_chain) == first_chain_blocks
    expected_reused_id = first_chain[-1].block_id

    # Release the first chain.
    for blk in first_chain:
        allocator.free(blk)

    # Stamp every block of the second chain with access time 2, then
    # release it as well.
    allocator.mark_blocks_as_accessed(
        [blk.block_id for blk in second_chain], 2)
    for blk in second_chain:
        allocator.free(blk)

    # A fresh single-block chain must land on the expected freed block.
    replacement = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids[-block_size:],
        allocator=allocator,
    )

    assert replacement[0].block_id == expected_reused_id
@
staticmethod
def
create_immutable_chain
(
block_size
:
int
,
...
...
tests/core/test_block_manager.py
View file @
1591c68f
...
...
@@ -224,7 +224,7 @@ def test_swap():
# Swap seq group from CPU -> GPU.
cpu_blocks
=
block_manager
.
get_block_table
(
prompt
)
assert
block_manager
.
can_swap_in
(
seq_group
)
assert
block_manager
.
can_swap_in
(
seq_group
)
==
AllocStatus
.
OK
before_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
before_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
mapping
=
block_manager
.
swap_in
(
seq_group
)
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
1591c68f
...
...
@@ -4,6 +4,7 @@ from unittest.mock import MagicMock
import
pytest
# noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.core.interfaces
import
AllocStatus
from
vllm.core.scheduler
import
Scheduler
from
vllm.sequence
import
Logprob
,
SequenceGroup
...
...
@@ -410,7 +411,7 @@ def test_running_prefill_prioritized_over_swap():
# Add 1 more task. Swap is not possible, so prefill is running.
scheduler
.
block_manager
.
can_swap_in
=
MagicMock
()
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
False
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
AllocStatus
.
LATER
_
,
seq_group2
=
create_dummy_prompt
(
"2"
,
prompt_length
=
60
)
scheduler
.
add_seq_group
(
seq_group2
)
...
...
@@ -423,7 +424,7 @@ def test_running_prefill_prioritized_over_swap():
assert
out
.
scheduled_seq_groups
[
0
].
seq_group
==
seq_group2
# Now although swap is possible, running prefill is prioritized.
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
True
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
AllocStatus
.
OK
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
len
(
out
.
scheduled_seq_groups
)
==
1
# 3 decodes. It is swapped in.
...
...
tests/core/test_scheduler.py
View file @
1591c68f
...
...
@@ -791,7 +791,7 @@ def test_schedule_swapped_cannot_swap_in():
# The last request should be swapped out.
scheduler
.
block_manager
.
can_swap_in
=
MagicMock
()
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
False
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
AllocStatus
.
LATER
# Since we cannot swap in, none of the requests are swapped in.
budget
=
create_token_budget
()
remaining_swapped
,
output
=
scheduler
.
_schedule_swapped
(
...
...
@@ -803,6 +803,34 @@ def test_schedule_swapped_cannot_swap_in():
assert
len
(
output
.
prefill_seq_groups
)
==
0
def test_infeasible_swap():
    """Swapped groups that can never be swapped in are reported infeasible.

    Two sequence groups are allocated, advanced by one token and swapped
    out. `can_swap_in` is then mocked to return `AllocStatus.NEVER`, and
    `_schedule_swapped` is expected to drop both groups from the swapped
    queue, list them in `infeasible_seq_groups`, and schedule nothing.
    """
    scheduler = initialize_scheduler()
    fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs")
    swapped_queue = deque()
    swap_out_mapping = {}

    for _ in range(2):
        _, group = create_dummy_prompt("1", prompt_length=60, best_of=2)
        scheduler._allocate_and_set_running(group)
        append_new_token_seq_group(60, group, 1)
        scheduler._swap_out(group, swap_out_mapping)
        swapped_queue.append(group)

    # Pretend the block manager can never bring these groups back.
    scheduler.block_manager.can_swap_in = MagicMock(
        return_value=AllocStatus.NEVER)

    budget = create_token_budget()
    # Third positional argument is curr_loras, which this test leaves unset.
    remaining_swapped, output = scheduler._schedule_swapped(
        swapped_queue, budget, None, fcfs_policy)

    # Nothing stays queued; both groups are flagged as infeasible.
    assert len(remaining_swapped) == 0
    assert len(output.infeasible_seq_groups) == 2

    # No budget is consumed and nothing is scheduled.
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 0
    assert len(output.decode_seq_groups) == 0
    assert len(output.prefill_seq_groups) == 0
def
test_schedule_swapped_blocks_to_copy
():
scheduler
=
initialize_scheduler
()
swapped
=
deque
()
...
...
tests/distributed/test_basic_distributed_correctness.py
View file @
1591c68f
...
...
@@ -18,6 +18,7 @@ import torch
MODELS
=
[
os
.
environ
[
"TEST_DIST_MODEL"
],
]
VLLM_ATTENTION_BACKEND
=
"VLLM_ATTENTION_BACKEND"
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
...
...
@@ -33,16 +34,19 @@ def test_models(
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
enforce_eager
=
False
backend_by_env_var
=
os
.
getenv
(
VLLM_ATTENTION_BACKEND
)
if
backend_by_env_var
==
"FLASHINFER"
:
enforce_eager
=
True
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
)
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
enforce_eager
=
enforce_eager
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
...
...
tests/distributed/test_pynccl.py
View file @
1591c68f
...
...
@@ -3,9 +3,13 @@ import multiprocessing
import
pytest
import
torch
import
vllm.distributed.device_communicators.pynccl_utils
as
pynccl_utils
from
vllm.distributed.communication_op
import
tensor_model_parallel_all_reduce
from
vllm.distributed.device_communicators.pynccl
import
(
NCCLCommunicator
,
ncclGetUniqueId
)
from
vllm.distributed.parallel_state
import
init_distributed_environment
from
vllm.distributed.parallel_state
import
(
ensure_model_parallel_initialized
,
get_tensor_model_parallel_cpu_group
,
init_distributed_environment
,
with_pynccl_for_all_reduce
)
from
vllm.utils
import
update_environment_variables
...
...
@@ -58,6 +62,65 @@ def test_pynccl():
distributed_run
(
worker_fn
,
2
)
@worker_fn_wrapper
def multiple_tp_worker_fn():
    """Worker body: two independent 2-rank groups all-reduce via pynccl.

    Ranks 0 and 1 form one group and ranks 2 and 3 the other. The first
    group all-reduces a tensor of ones twice and expects a mean of 4; the
    second group all-reduces once and expects a mean of 2. The differing
    round counts show that the groups do not have to move in lock-step.
    """
    rank = torch.distributed.get_rank()
    device = torch.device(f"cuda:{rank}")

    # Both groups are created on every rank, in the same order
    # (torch.distributed.new_group is documented as a collective call).
    lower_pair = torch.distributed.new_group(ranks=[0, 1], backend="gloo")
    upper_pair = torch.distributed.new_group(ranks=[2, 3], backend="gloo")

    in_lower_pair = rank in [0, 1]
    comm = NCCLCommunicator(
        group=lower_pair if in_lower_pair else upper_pair, device=device)

    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)

    # two groups can communicate independently
    if in_lower_pair:
        rounds, expected_mean = 2, 4
    else:
        rounds, expected_mean = 1, 2

    for _ in range(rounds):
        comm.all_reduce(tensor)

    result = tensor.mean().cpu().item()
    assert result == expected_mean
@pytest.mark.skipif(torch.cuda.device_count() < 4,
                    reason="Need at least 4 GPUs to run the test.")
def test_pynccl_multiple_tp():
    """Test pynccl with multiple tp groups, in a standalone way.

    "Standalone" means the worker calls `comm.all_reduce` directly rather
    than going through vllm's tensor-parallel helpers.
    """
    # presumably the number of worker processes to launch; it matches the
    # 4-GPU requirement of the skip condition - TODO confirm.
    worker_count = 4
    distributed_run(multiple_tp_worker_fn, worker_count)
@worker_fn_wrapper
def multiple_tp_with_vllm_worker_fn():
    """Worker body: two tp groups all-reduce through vllm's helper.

    After model-parallel state is initialised with
    `ensure_model_parallel_initialized(2, 2)`, ranks 0 and 1 call
    `tensor_model_parallel_all_reduce` twice on a tensor of ones and expect
    a mean of 4, while the remaining ranks call it once and expect a mean
    of 2.
    """
    rank = torch.distributed.get_rank()
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(rank)

    ensure_model_parallel_initialized(2, 2)
    pynccl_utils.init_process_group(
        group=get_tensor_model_parallel_cpu_group())

    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)

    if rank in [0, 1]:
        rounds, expected_mean = 2, 4
    else:
        rounds, expected_mean = 1, 2

    with with_pynccl_for_all_reduce():
        # two tp groups can communicate independently
        for _ in range(rounds):
            tensor = tensor_model_parallel_all_reduce(tensor)
        result = tensor.mean().cpu().item()
        assert result == expected_mean
@pytest.mark.skipif(torch.cuda.device_count() < 4,
                    reason="Need at least 4 GPUs to run the test.")
def test_pynccl_multiple_tp_with_vllm():
    """Test pynccl with multiple tp groups, together with vllm.

    Here the worker goes through `tensor_model_parallel_all_reduce` instead
    of calling the communicator directly.
    """
    # presumably the number of worker processes to launch; it matches the
    # 4-GPU requirement of the skip condition - TODO confirm.
    worker_count = 4
    distributed_run(multiple_tp_with_vllm_worker_fn, worker_count)
@
worker_fn_wrapper
def
worker_fn_with_cudagraph
():
with
torch
.
no_grad
():
...
...
tests/engine/test_multiproc_workers.py
0 → 100644
View file @
1591c68f
import
asyncio
from
concurrent.futures
import
ThreadPoolExecutor
from
functools
import
partial
from
time
import
sleep
from
typing
import
Any
,
List
,
Tuple
import
pytest
from
vllm.executor.multiproc_worker_utils
import
(
ProcessWorkerWrapper
,
ResultHandler
,
WorkerMonitor
)
class DummyWorker:
    """Dummy version of vllm.worker.worker.Worker"""

    def __init__(self, rank: int):
        self.rank = rank

    def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
        """Echo ``worker_input`` together with this worker's rank.

        Raises:
            Exception: ``worker_input`` itself, when it is an exception
                instance (used to simulate a failing task).
        """
        sleep(0.05)

        if isinstance(worker_input, Exception):
            # simulate error case
            raise worker_input

        # Return the caller's payload, not the builtin `input`, so the tests
        # can verify that arguments really make the round trip.
        return self.rank, worker_input


def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
    """Start 8 DummyWorker wrappers plus their result handler and monitor.

    Returns:
        The list of worker wrappers (index == rank) and the started monitor.
    """
    result_handler = ResultHandler()
    workers = [
        ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank))
        for rank in range(8)
    ]

    worker_monitor = WorkerMonitor(workers, result_handler)
    assert not worker_monitor.is_alive()

    result_handler.start()
    worker_monitor.start()
    assert worker_monitor.is_alive()

    return workers, worker_monitor


def _assert_everything_stopped(workers: List[ProcessWorkerWrapper],
                               worker_monitor: WorkerMonitor) -> None:
    """Assert that the monitor and every worker process have terminated."""
    assert not worker_monitor.is_alive()
    assert all(not worker.process.is_alive() for worker in workers)


def test_local_workers() -> None:
    """Test workers with sync task submission"""
    workers, worker_monitor = _start_workers()

    def execute_workers(worker_input: str) -> None:
        worker_outputs = [
            worker.execute_method("worker_method", worker_input)
            for worker in workers
        ]

        for rank, output in enumerate(worker_outputs):
            assert output.get() == (rank, worker_input)

    # Test concurrent submission from different threads. The context manager
    # shuts the pool down even if one of the submissions fails.
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(partial(execute_workers, f"thread {thread_num}"))
            for thread_num in range(4)
        ]

        for future in futures:
            future.result()

    # Test error case
    exception = ValueError("fake error")
    result = workers[0].execute_method("worker_method", exception)
    with pytest.raises(ValueError) as excinfo:
        result.get()
    assert str(excinfo.value) == "fake error"

    # Test cleanup when a worker fails
    assert worker_monitor.is_alive()
    workers[3].process.kill()

    # Other workers should get shut down here
    worker_monitor.join(2)

    # Ensure everything is stopped
    _assert_everything_stopped(workers, worker_monitor)

    # Further attempts to submit tasks should fail
    with pytest.raises(ChildProcessError):
        workers[0].execute_method("worker_method", "test")


def test_local_workers_clean_shutdown() -> None:
    """Test clean shutdown"""
    workers, worker_monitor = _start_workers()

    assert worker_monitor.is_alive()
    assert all(worker.process.is_alive() for worker in workers)

    # Clean shutdown
    worker_monitor.close()

    worker_monitor.join(5)

    # Ensure everything is stopped
    _assert_everything_stopped(workers, worker_monitor)

    # Further attempts to submit tasks should fail
    with pytest.raises(ChildProcessError):
        workers[0].execute_method("worker_method", "test")


@pytest.mark.asyncio
async def test_local_workers_async() -> None:
    """Test local workers with async task submission"""
    workers, worker_monitor = _start_workers()

    async def execute_workers(worker_input: str) -> None:
        worker_coros = [
            worker.execute_method_async("worker_method", worker_input)
            for worker in workers
        ]

        results = await asyncio.gather(*worker_coros)
        for rank, result in enumerate(results):
            assert result == (rank, worker_input)

    tasks = [
        asyncio.create_task(execute_workers(f"task {task_num}"))
        for task_num in range(4)
    ]

    for task in tasks:
        await task

    # Test error case
    exception = ValueError("fake error")
    with pytest.raises(ValueError) as excinfo:
        await workers[0].execute_method_async("worker_method", exception)
    assert str(excinfo.value) == "fake error"

    # Test cleanup when a worker fails
    assert worker_monitor.is_alive()
    workers[3].process.kill()

    # Other workers should get shut down here
    worker_monitor.join(2)

    # Ensure everything is stopped
    _assert_everything_stopped(workers, worker_monitor)

    # Further attempts to submit tasks should fail
    with pytest.raises(ChildProcessError):
        await workers[0].execute_method_async("worker_method", "test")
Prev
1
2
3
4
5
6
7
8
…
14
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment