Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
af7f4372
Commit
af7f4372
authored
Sep 03, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1
parents
5e19cdef
09c77926
Changes
448
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
612 additions
and
200 deletions
+612
-200
pyproject.toml
pyproject.toml
+3
-1
requirements-build.txt
requirements-build.txt
+2
-1
requirements-common.txt
requirements-common.txt
+10
-6
requirements-cuda.txt
requirements-cuda.txt
+2
-2
requirements-lint.txt
requirements-lint.txt
+1
-1
requirements-openvino.txt
requirements-openvino.txt
+3
-29
requirements-test.txt
requirements-test.txt
+8
-2
setup.py
setup.py
+27
-9
tests/async_engine/api_server_async_engine.py
tests/async_engine/api_server_async_engine.py
+5
-4
tests/async_engine/test_api_server.py
tests/async_engine/test_api_server.py
+8
-1
tests/async_engine/test_async_llm_engine.py
tests/async_engine/test_async_llm_engine.py
+94
-13
tests/async_engine/test_chat_template.py
tests/async_engine/test_chat_template.py
+8
-13
tests/async_engine/test_openapi_server_ray.py
tests/async_engine/test_openapi_server_ray.py
+12
-4
tests/async_engine/test_request_tracker.py
tests/async_engine/test_request_tracker.py
+14
-13
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+96
-8
tests/basic_correctness/test_cpu_offload.py
tests/basic_correctness/test_cpu_offload.py
+0
-38
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+18
-0
tests/compile/test_full_graph.py
tests/compile/test_full_graph.py
+20
-0
tests/conftest.py
tests/conftest.py
+266
-52
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+15
-3
No files found.
Too many changes to show.
To preserve performance only
448 of 448+
files are displayed.
Plain diff
Email patch
pyproject.toml
View file @
af7f4372
[build-system]
[build-system]
# Should be mirrored in requirements-build.txt
# Should be mirrored in requirements-build.txt
requires
=
[
requires
=
[
"cmake>=3.2
1
"
,
"cmake>=3.2
6
"
,
"ninja"
,
"ninja"
,
"packaging"
,
"packaging"
,
"setuptools >= 49.4.0"
,
"setuptools >= 49.4.0"
,
"torch == 2.4.0"
,
"torch == 2.4.0"
,
"wheel"
,
"wheel"
,
"jinja2"
,
]
]
build-backend
=
"setuptools.build_meta"
build-backend
=
"setuptools.build_meta"
...
@@ -56,6 +57,7 @@ files = [
...
@@ -56,6 +57,7 @@ files = [
"vllm/*.py"
,
"vllm/*.py"
,
"vllm/adapter_commons"
,
"vllm/adapter_commons"
,
"vllm/assets"
,
"vllm/assets"
,
"vllm/entrypoints"
,
"vllm/inputs"
,
"vllm/inputs"
,
"vllm/logging"
,
"vllm/logging"
,
"vllm/multimodal"
,
"vllm/multimodal"
,
...
...
requirements-build.txt
View file @
af7f4372
# Should be mirrored in pyproject.toml
# Should be mirrored in pyproject.toml
cmake>=3.2
1
cmake>=3.2
6
ninja
ninja
packaging
packaging
setuptools>=49.4.0
setuptools>=49.4.0
torch==2.4.0
torch==2.4.0
wheel
wheel
jinja2
requirements-common.txt
View file @
af7f4372
cmake >= 3.21
ninja # For faster builds.
psutil
psutil
sentencepiece # Required for LLaMA tokenizer.
sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
numpy < 2.0.0
...
@@ -8,17 +6,23 @@ tqdm
...
@@ -8,17 +6,23 @@ tqdm
py-cpuinfo
py-cpuinfo
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfix.
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfix.
tokenizers >= 0.19.1 # Required for Llama 3.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
fastapi
fastapi
aiohttp
aiohttp
openai
openai
>= 1.0 # Ensure modern openai package (ensure types module present)
uvicorn[standard]
uvicorn[standard]
pydantic >= 2.
0
# Required for OpenAI server.
pydantic >= 2.
8
# Required for OpenAI server.
pillow # Required for image processing
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.
3
lm-format-enforcer == 0.10.
6
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
typing_extensions
>= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
pyzmq
msgspec
librosa # Required for audio processing
soundfile # Required for audio processing
gguf == 0.9.1
importlib_metadata
requirements-cuda.txt
View file @
af7f4372
...
@@ -7,5 +7,5 @@ nvidia-ml-py # for pynvml package
...
@@ -7,5 +7,5 @@ nvidia-ml-py # for pynvml package
torch == 2.4.0
torch == 2.4.0
# These must be updated alongside torch
# These must be updated alongside torch
torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.27.post2 # Requires PyTorch 2.4.0
xformers == 0.0.27.post2
; platform_system == 'Linux' and platform_machine == 'x86_64'
# Requires PyTorch 2.4.0
vllm-flash-attn == 2.6.1 # Requires PyTorch 2.4.0
vllm-flash-attn == 2.6.1
; platform_system == 'Linux' and platform_machine == 'x86_64'
# Requires PyTorch 2.4.0
requirements-lint.txt
View file @
af7f4372
...
@@ -8,7 +8,7 @@ isort==5.13.2
...
@@ -8,7 +8,7 @@ isort==5.13.2
clang-format==18.1.5
clang-format==18.1.5
# type checking
# type checking
mypy==1.
9.0
mypy==1.
11.1
types-PyYAML
types-PyYAML
types-requests
types-requests
types-setuptools
types-setuptools
requirements-openvino.txt
View file @
af7f4372
# Common dependencies
# Common dependencies
# -r requirements-common.txt
-r requirements-common.txt
# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
cmake >= 3.21
ninja # For faster builds.
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers < 4.43
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
openai
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.3
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
# OpenVINO dependencies
# OpenVINO dependencies
torch >= 2.1.2
torch >= 2.1.2
openvino ~= 2024.3.0.dev
openvino ~= 2024.3.0
openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
optimum-intel[openvino] >= 1.18.2
optimum-intel[openvino] >= 1.18.1
requirements-test.txt
View file @
af7f4372
...
@@ -11,7 +11,7 @@ pytest-shard
...
@@ -11,7 +11,7 @@ pytest-shard
# testing utils
# testing utils
awscli
awscli
einops # required for MPT
einops # required for MPT
and qwen-vl
httpx
httpx
peft
peft
requests
requests
...
@@ -19,9 +19,15 @@ ray
...
@@ -19,9 +19,15 @@ ray
sentence-transformers # required for embedding
sentence-transformers # required for embedding
compressed-tensors==0.4.0 # required for compressed-tensors
compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
# Benchmarking
# Benchmarking
aiohttp
aiohttp
# quantization
# quantization
bitsandbytes==0.42.0
bitsandbytes==0.42.0
\ No newline at end of file
buildkite-test-collector==0.1.8
\ No newline at end of file
setup.py
View file @
af7f4372
...
@@ -68,9 +68,12 @@ envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
...
@@ -68,9 +68,12 @@ envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
VLLM_TARGET_DEVICE
=
envs
.
VLLM_TARGET_DEVICE
VLLM_TARGET_DEVICE
=
envs
.
VLLM_TARGET_DEVICE
# vLLM only supports Linux platform
if
not
sys
.
platform
.
startswith
(
"linux"
):
assert
sys
.
platform
.
startswith
(
logger
.
warning
(
"linux"
),
"vLLM only supports Linux platform (including WSL)."
"vLLM only supports Linux platform (including WSL). "
"Building on %s, "
"so vLLM may not be able to run correctly"
,
sys
.
platform
)
VLLM_TARGET_DEVICE
=
"empty"
MAIN_CUDA_VERSION
=
"12.1"
MAIN_CUDA_VERSION
=
"12.1"
...
@@ -188,6 +191,10 @@ class cmake_build_ext(build_ext):
...
@@ -188,6 +191,10 @@ class cmake_build_ext(build_ext):
# match.
# match.
cmake_args
+=
[
'-DVLLM_PYTHON_EXECUTABLE={}'
.
format
(
sys
.
executable
)]
cmake_args
+=
[
'-DVLLM_PYTHON_EXECUTABLE={}'
.
format
(
sys
.
executable
)]
# Pass the python path to cmake so it can reuse the build dependencies
# on subsequent calls to python.
cmake_args
+=
[
'-DVLLM_PYTHON_PATH={}'
.
format
(
":"
.
join
(
sys
.
path
))]
#
#
# Setup parallelism and build tool
# Setup parallelism and build tool
#
#
...
@@ -238,6 +245,10 @@ class cmake_build_ext(build_ext):
...
@@ -238,6 +245,10 @@ class cmake_build_ext(build_ext):
subprocess
.
check_call
([
"cmake"
,
*
build_args
],
cwd
=
self
.
build_temp
)
subprocess
.
check_call
([
"cmake"
,
*
build_args
],
cwd
=
self
.
build_temp
)
def
_no_device
()
->
bool
:
return
VLLM_TARGET_DEVICE
==
"empty"
def
_is_cuda
()
->
bool
:
def
_is_cuda
()
->
bool
:
has_cuda
=
torch
.
version
.
cuda
is
not
None
has_cuda
=
torch
.
version
.
cuda
is
not
None
return
(
VLLM_TARGET_DEVICE
==
"cuda"
and
has_cuda
return
(
VLLM_TARGET_DEVICE
==
"cuda"
and
has_cuda
...
@@ -279,7 +290,7 @@ def _build_custom_ops() -> bool:
...
@@ -279,7 +290,7 @@ def _build_custom_ops() -> bool:
def
_build_core_ext
()
->
bool
:
def
_build_core_ext
()
->
bool
:
return
not
_is_neuron
()
and
not
_is_
t
pu
()
return
not
(
_is_neuron
()
or
_is_tpu
()
or
_is_openvino
()
or
_is_
x
pu
()
)
def
get_hipcc_rocm_version
():
def
get_hipcc_rocm_version
():
...
@@ -398,13 +409,13 @@ try:
...
@@ -398,13 +409,13 @@ try:
import vllm.commit_id
import vllm.commit_id
__commit__ = vllm.commit_id.__commit__
__commit__ = vllm.commit_id.__commit__
except Exception as e:
except Exception as e:
warnings.warn(f"Failed to read commit hash:
\
\
n + str(e)
",
warnings.warn(f"Failed to read commit hash:
\
n
{
e
}
",
RuntimeWarning,
RuntimeWarning,
stacklevel=2)
stacklevel=2)
__commit__ = "COMMIT_HASH_PLACEHOLDER"
__commit__ = "COMMIT_HASH_PLACEHOLDER"
__version__ = "0.5.
4
"
__version__ = "0.5.
5
"
__dcu_version__ = f'0.5.
4
+
{
version
}
'
__dcu_version__ = f'0.5.
5
+
{
version
}
'
"""
"""
...
@@ -424,7 +435,9 @@ def get_version():
...
@@ -424,7 +435,9 @@ def get_version():
def
get_vllm_version
()
->
str
:
def
get_vllm_version
()
->
str
:
# version = find_version(get_path("vllm", "version.py"))
# version = find_version(get_path("vllm", "version.py"))
if
_is_cuda
():
if
_no_device
():
version
+=
"+empty"
elif
_is_cuda
():
cuda_version
=
str
(
get_nvcc_cuda_version
())
cuda_version
=
str
(
get_nvcc_cuda_version
())
if
cuda_version
!=
MAIN_CUDA_VERSION
:
if
cuda_version
!=
MAIN_CUDA_VERSION
:
cuda_version_str
=
cuda_version
.
replace
(
"."
,
""
)[:
3
]
cuda_version_str
=
cuda_version
.
replace
(
"."
,
""
)[:
3
]
...
@@ -479,7 +492,9 @@ def get_requirements() -> List[str]:
...
@@ -479,7 +492,9 @@ def get_requirements() -> List[str]:
resolved_requirements
.
append
(
line
)
resolved_requirements
.
append
(
line
)
return
resolved_requirements
return
resolved_requirements
if
_is_cuda
():
if
_no_device
():
requirements
=
_read_requirements
(
"requirements-cuda.txt"
)
elif
_is_cuda
():
requirements
=
_read_requirements
(
"requirements-cuda.txt"
)
requirements
=
_read_requirements
(
"requirements-cuda.txt"
)
cuda_major
,
cuda_minor
=
torch
.
version
.
cuda
.
split
(
"."
)
cuda_major
,
cuda_minor
=
torch
.
version
.
cuda
.
split
(
"."
)
modified_requirements
=
[]
modified_requirements
=
[]
...
@@ -528,6 +543,9 @@ if envs.VLLM_USE_PRECOMPILED:
...
@@ -528,6 +543,9 @@ if envs.VLLM_USE_PRECOMPILED:
ext_modules
=
[]
ext_modules
=
[]
package_data
[
"vllm"
].
append
(
"*.so"
)
package_data
[
"vllm"
].
append
(
"*.so"
)
if
_no_device
():
ext_modules
=
[]
setup
(
setup
(
name
=
"vllm"
,
name
=
"vllm"
,
version
=
get_vllm_version
(),
version
=
get_vllm_version
(),
...
...
tests/async_engine/api_server_async_engine.py
View file @
af7f4372
"""vllm.entrypoints.api_server with some extra logging for testing."""
"""vllm.entrypoints.api_server with some extra logging for testing."""
from
typing
import
Any
,
Dict
from
typing
import
Any
,
Dict
,
Iterable
import
uvicorn
import
uvicorn
from
fastapi.responses
import
JSONResponse
,
Response
from
fastapi.responses
import
JSONResponse
,
Response
...
@@ -18,9 +18,10 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
...
@@ -18,9 +18,10 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
super
().
__init__
(
*
args
,
**
kwargs
)
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
_num_aborts
=
0
self
.
_num_aborts
=
0
async
def
abort
(
self
,
request_id
:
str
)
->
None
:
async
def
_engine_abort
(
self
,
request_ids
:
Iterable
[
str
]):
await
super
().
abort
(
request_id
)
ids
=
list
(
request_ids
)
self
.
_num_aborts
+=
1
self
.
_num_aborts
+=
len
(
ids
)
await
super
().
_engine_abort
(
ids
)
def
testing_stats
(
self
)
->
Dict
[
str
,
Any
]:
def
testing_stats
(
self
)
->
Dict
[
str
,
Any
]:
return
{
"num_aborted_requests"
:
self
.
_num_aborts
}
return
{
"num_aborted_requests"
:
self
.
_num_aborts
}
...
...
tests/async_engine/test_api_server.py
View file @
af7f4372
import
os
import
subprocess
import
subprocess
import
sys
import
sys
import
time
import
time
...
@@ -35,11 +36,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
...
@@ -35,11 +36,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
"127.0.0.1"
,
"--tokenizer-pool-size"
,
"127.0.0.1"
,
"--tokenizer-pool-size"
,
str
(
tokenizer_pool_size
)
str
(
tokenizer_pool_size
)
]
]
# Copy the environment variables and set `VLLM_ALLOW_ENGINE_USE_RAY=1`
# to prevent `--engine-use-ray` from raising an exception due to its deprecation
env_vars
=
os
.
environ
.
copy
()
env_vars
[
"VLLM_ALLOW_ENGINE_USE_RAY"
]
=
"1"
if
engine_use_ray
:
if
engine_use_ray
:
commands
.
append
(
"--engine-use-ray"
)
commands
.
append
(
"--engine-use-ray"
)
if
worker_use_ray
:
if
worker_use_ray
:
commands
.
append
(
"--worker-use-ray"
)
commands
.
append
(
"--worker-use-ray"
)
uvicorn_process
=
subprocess
.
Popen
(
commands
)
uvicorn_process
=
subprocess
.
Popen
(
commands
,
env
=
env_vars
)
yield
yield
uvicorn_process
.
terminate
()
uvicorn_process
.
terminate
()
...
...
tests/async_engine/test_async_llm_engine.py
View file @
af7f4372
import
asyncio
import
asyncio
import
os
from
asyncio
import
CancelledError
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Optional
import
pytest
import
pytest
import
pytest_asyncio
import
torch
import
torch
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.config
import
ParallelConfig
from
vllm.config
import
ParallelConfig
from
vllm.engine.async_llm_engine
import
AsyncEngineArgs
,
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncEngineArgs
,
AsyncLLMEngine
from
vllm.outputs
import
RequestOutput
as
RealRequestOutput
from
..conftest
import
cleanup
from
..utils
import
wait_for_gpu_memory_to_clear
from
..utils
import
wait_for_gpu_memory_to_clear
...
@@ -106,21 +112,49 @@ async def test_new_requests_event():
...
@@ -106,21 +112,49 @@ async def test_new_requests_event():
assert
engine
.
engine
.
add_request_calls
==
3
assert
engine
.
engine
.
add_request_calls
==
3
assert
engine
.
engine
.
step_calls
==
old_step_calls
+
1
assert
engine
.
engine
.
step_calls
==
old_step_calls
+
1
# Allow deprecated engine_use_ray to not raise exception
os
.
environ
[
"VLLM_ALLOW_ENGINE_USE_RAY"
]
=
"1"
engine
=
MockAsyncLLMEngine
(
worker_use_ray
=
True
,
engine_use_ray
=
True
)
engine
=
MockAsyncLLMEngine
(
worker_use_ray
=
True
,
engine_use_ray
=
True
)
assert
engine
.
get_model_config
()
is
not
None
assert
engine
.
get_model_config
()
is
not
None
assert
engine
.
get_tokenizer
()
is
not
None
assert
engine
.
get_tokenizer
()
is
not
None
assert
engine
.
get_decoding_config
()
is
not
None
assert
engine
.
get_decoding_config
()
is
not
None
os
.
environ
.
pop
(
"VLLM_ALLOW_ENGINE_USE_RAY"
)
def
test_asyncio_run
():
def
start_engine
():
wait_for_gpu_memory_to_clear
(
wait_for_gpu_memory_to_clear
(
devices
=
list
(
range
(
torch
.
cuda
.
device_count
())),
devices
=
list
(
range
(
torch
.
cuda
.
device_count
())),
threshold_bytes
=
2
*
2
**
30
,
threshold_bytes
=
2
*
2
**
30
,
timeout_s
=
60
,
timeout_s
=
60
,
)
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
return
AsyncLLMEngine
.
from_engine_args
(
AsyncEngineArgs
(
model
=
"facebook/opt-125m"
))
AsyncEngineArgs
(
model
=
"facebook/opt-125m"
,
enforce_eager
=
True
))
@
pytest_asyncio
.
fixture
(
scope
=
"module"
)
async
def
async_engine
():
engine
=
await
asyncio
.
get_event_loop
().
run_in_executor
(
executor
=
None
,
func
=
start_engine
)
try
:
yield
engine
finally
:
engine
.
shutdown_background_loop
()
del
engine
await
asyncio
.
sleep
(
0.1
)
cleanup
()
@
pytest
.
fixture
()
def
should_do_global_cleanup_after_test
(
request
)
->
bool
:
# So we can share the async engine fixture between these tests
return
False
@
pytest
.
mark
.
asyncio
(
scope
=
"module"
)
async
def
test_asyncio_run
(
async_engine
):
async
def
run
(
prompt
:
str
):
async
def
run
(
prompt
:
str
):
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
...
@@ -128,17 +162,64 @@ def test_asyncio_run():
...
@@ -128,17 +162,64 @@ def test_asyncio_run():
max_tokens
=
32
,
max_tokens
=
32
,
)
)
async
for
output
in
engine
.
generate
(
prompt
,
async
for
output
in
async_
engine
.
generate
(
prompt
,
sampling_params
,
sampling_params
,
request_id
=
prompt
):
request_id
=
prompt
):
final_output
=
output
final_output
=
output
return
final_output
return
final_output
async
def
generate
():
results
=
await
asyncio
.
gather
(
return
await
asyncio
.
gather
(
run
(
"test0"
),
run
(
"test0"
),
run
(
"test1"
),
run
(
"test1"
),
)
)
results
=
asyncio
.
run
(
generate
())
assert
len
(
results
)
==
2
assert
len
(
results
)
==
2
@
pytest
.
mark
.
asyncio
(
scope
=
"module"
)
async
def
test_cancellation
(
async_engine
):
sampling_params
=
SamplingParams
(
temperature
=
0
,
min_tokens
=
10
,
max_tokens
=
10
,
)
i
=
0
with
pytest
.
raises
(
CancelledError
):
async
for
output
in
async_engine
.
generate
(
"test2"
,
sampling_params
,
request_id
=
"test2"
):
assert
not
output
.
finished
i
+=
1
if
i
==
5
:
await
async_engine
.
abort
(
"test2"
)
assert
i
==
5
@
pytest
.
mark
.
asyncio
(
scope
=
"module"
)
async
def
test_delayed_generator
(
async_engine
):
sampling_params
=
SamplingParams
(
temperature
=
0
,
min_tokens
=
10
,
max_tokens
=
10
,
)
stream
=
async_engine
.
generate
(
"test3"
,
sampling_params
,
request_id
=
"test3"
)
i
=
0
final_output
:
Optional
[
RealRequestOutput
]
=
None
async
for
output
in
stream
:
final_output
=
output
if
i
==
0
:
# wait for generation to complete before consuming
# the remaining messages
await
asyncio
.
sleep
(
1
)
if
i
<
9
:
assert
not
output
.
finished
i
+=
1
assert
i
==
10
assert
final_output
is
not
None
assert
len
(
final_output
.
outputs
[
0
].
token_ids
)
==
10
assert
final_output
.
finished
tests/async_engine/test_chat_template.py
View file @
af7f4372
import
os
import
pathlib
import
pytest
import
pytest
from
vllm.entrypoints.chat_utils
import
load_chat_template
from
vllm.entrypoints.chat_utils
import
apply_chat_template
,
load_chat_template
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
chatml_jinja_path
=
pathlib
.
Path
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
from
..utils
import
VLLM_PATH
__file__
))).
parent
.
parent
/
"examples/template_chatml.jinja"
chatml_jinja_path
=
VLLM_PATH
/
"examples/template_chatml.jinja"
assert
chatml_jinja_path
.
exists
()
assert
chatml_jinja_path
.
exists
()
# Define models, templates, and their corresponding expected outputs
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT
=
[
MODEL_TEMPLATE_GENERATON_OUTPUT
=
[
(
"facebook/opt-125m"
,
None
,
True
,
"Hello</s>Hi there!</s>What is the capital of</s>"
),
(
"facebook/opt-125m"
,
None
,
False
,
"Hello</s>Hi there!</s>What is the capital of</s>"
),
(
"facebook/opt-125m"
,
chatml_jinja_path
,
True
,
"""<|im_start|>user
(
"facebook/opt-125m"
,
chatml_jinja_path
,
True
,
"""<|im_start|>user
Hello<|im_end|>
Hello<|im_end|>
<|im_start|>assistant
<|im_start|>assistant
...
@@ -93,11 +87,12 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
...
@@ -93,11 +87,12 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
add_generation_prompt
=
add_generation_prompt
)
add_generation_prompt
=
add_generation_prompt
)
# Call the function and get the result
# Call the function and get the result
result
=
tokenizer
.
apply_chat_template
(
result
=
apply_chat_template
(
tokenizer
,
conversation
=
mock_request
.
messages
,
conversation
=
mock_request
.
messages
,
tokenize
=
False
,
chat_template
=
mock_request
.
chat_template
or
template_content
,
add_generation_prompt
=
mock_request
.
add_generation_prompt
,
add_generation_prompt
=
mock_request
.
add_generation_prompt
,
chat_template
=
mock_request
.
chat_template
or
template_content
)
)
# Test assertion
# Test assertion
assert
result
==
expected_output
,
(
assert
result
==
expected_output
,
(
...
...
tests/async_engine/test_openapi_server_ray.py
View file @
af7f4372
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
from
..utils
import
RemoteOpenAIServer
from
..utils
import
VLLM_PATH
,
RemoteOpenAIServer
# any model with a chat template should work here
# any model with a chat template should work here
MODEL_NAME
=
"facebook/opt-125m"
MODEL_NAME
=
"facebook/opt-125m"
chatml_jinja_path
=
VLLM_PATH
/
"examples/template_chatml.jinja"
assert
chatml_jinja_path
.
exists
()
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
...
@@ -16,10 +18,16 @@ def server():
...
@@ -16,10 +18,16 @@ def server():
"--max-model-len"
,
"--max-model-len"
,
"2048"
,
"2048"
,
"--enforce-eager"
,
"--enforce-eager"
,
"--engine-use-ray"
"--engine-use-ray"
,
"--chat-template"
,
str
(
chatml_jinja_path
),
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
# Allow `--engine-use-ray`; otherwise launching the server throws
# an error because it tries to use a deprecated feature
env_dict
=
{
"VLLM_ALLOW_ENGINE_USE_RAY"
:
"1"
}
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
,
env_dict
=
env_dict
)
as
remote_server
:
yield
remote_server
yield
remote_server
...
@@ -83,7 +91,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI):
...
@@ -83,7 +91,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI):
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
13
,
total_tokens
=
23
)
completion_tokens
=
10
,
prompt_tokens
=
55
,
total_tokens
=
65
)
message
=
choice
.
message
message
=
choice
.
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
10
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
10
...
...
tests/async_engine/test_request_tracker.py
View file @
af7f4372
...
@@ -10,23 +10,23 @@ async def test_request_tracker():
...
@@ -10,23 +10,23 @@ async def test_request_tracker():
stream_1
=
tracker
.
add_request
(
"1"
)
stream_1
=
tracker
.
add_request
(
"1"
)
assert
tracker
.
new_requests_event
.
is_set
()
assert
tracker
.
new_requests_event
.
is_set
()
await
tracker
.
wait_for_new_requests
()
await
tracker
.
wait_for_new_requests
()
new
,
finish
ed
=
tracker
.
get_new_and_
finish
ed_requests
()
new
,
abort
ed
=
tracker
.
get_new_and_
abort
ed_requests
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
len
(
new
)
==
1
assert
len
(
new
)
==
1
assert
new
[
0
][
"request_id"
]
==
"1"
assert
new
[
0
][
"request_id"
]
==
"1"
assert
not
finish
ed
assert
not
abort
ed
assert
not
stream_1
.
finished
assert
not
stream_1
.
finished
stream_2
=
tracker
.
add_request
(
"2"
)
stream_2
=
tracker
.
add_request
(
"2"
)
stream_3
=
tracker
.
add_request
(
"3"
)
stream_3
=
tracker
.
add_request
(
"3"
)
assert
tracker
.
new_requests_event
.
is_set
()
assert
tracker
.
new_requests_event
.
is_set
()
await
tracker
.
wait_for_new_requests
()
await
tracker
.
wait_for_new_requests
()
new
,
finish
ed
=
tracker
.
get_new_and_
finish
ed_requests
()
new
,
abort
ed
=
tracker
.
get_new_and_
abort
ed_requests
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
len
(
new
)
==
2
assert
len
(
new
)
==
2
assert
new
[
0
][
"request_id"
]
==
"2"
assert
new
[
0
][
"request_id"
]
==
"2"
assert
new
[
1
][
"request_id"
]
==
"3"
assert
new
[
1
][
"request_id"
]
==
"3"
assert
not
finish
ed
assert
not
abort
ed
assert
not
stream_2
.
finished
assert
not
stream_2
.
finished
assert
not
stream_3
.
finished
assert
not
stream_3
.
finished
...
@@ -36,9 +36,9 @@ async def test_request_tracker():
...
@@ -36,9 +36,9 @@ async def test_request_tracker():
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
not
tracker
.
new_requests_event
.
is_set
()
tracker
.
abort_request
(
"1"
)
tracker
.
abort_request
(
"1"
)
new
,
finish
ed
=
tracker
.
get_new_and_
finish
ed_requests
()
new
,
abort
ed
=
tracker
.
get_new_and_
abort
ed_requests
()
assert
len
(
finish
ed
)
==
1
assert
len
(
abort
ed
)
==
1
assert
"1"
in
finish
ed
assert
"1"
in
abort
ed
assert
not
new
assert
not
new
assert
stream_1
.
finished
assert
stream_1
.
finished
...
@@ -46,9 +46,11 @@ async def test_request_tracker():
...
@@ -46,9 +46,11 @@ async def test_request_tracker():
tracker
.
abort_request
(
"4"
)
tracker
.
abort_request
(
"4"
)
assert
tracker
.
new_requests_event
.
is_set
()
assert
tracker
.
new_requests_event
.
is_set
()
await
tracker
.
wait_for_new_requests
()
await
tracker
.
wait_for_new_requests
()
new
,
finished
=
tracker
.
get_new_and_finished_requests
()
new
,
aborted
=
tracker
.
get_new_and_aborted_requests
()
assert
len
(
finished
)
==
1
# aborted new requests will cancel each other out -
assert
"4"
in
finished
# there's no need for them to propagate into the
# engine
assert
not
aborted
assert
not
new
assert
not
new
assert
stream_4
.
finished
assert
stream_4
.
finished
...
@@ -57,10 +59,9 @@ async def test_request_tracker():
...
@@ -57,10 +59,9 @@ async def test_request_tracker():
tracker
.
process_request_output
(
tracker
.
process_request_output
(
RequestOutput
(
"2"
,
"output"
,
[],
[],
[],
finished
=
True
))
RequestOutput
(
"2"
,
"output"
,
[],
[],
[],
finished
=
True
))
await
tracker
.
wait_for_new_requests
()
await
tracker
.
wait_for_new_requests
()
new
,
finish
ed
=
tracker
.
get_new_and_
finish
ed_requests
()
new
,
abort
ed
=
tracker
.
get_new_and_
abort
ed_requests
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
len
(
finished
)
==
1
assert
not
aborted
assert
"2"
in
finished
assert
len
(
new
)
==
1
assert
len
(
new
)
==
1
assert
new
[
0
][
"request_id"
]
==
"5"
assert
new
[
0
][
"request_id"
]
==
"5"
assert
stream_2
.
finished
assert
stream_2
.
finished
...
...
tests/basic_correctness/test_chunked_prefill.py
View file @
af7f4372
...
@@ -6,14 +6,27 @@ prefill requests are chunked.
...
@@ -6,14 +6,27 @@ prefill requests are chunked.
Run `pytest tests/basic_correctness/test_chunked_prefill.py`.
Run `pytest tests/basic_correctness/test_chunked_prefill.py`.
"""
"""
import
pytest
import
pytest
from
..models.utils
import
check_outputs_equal
from
..models.utils
import
check_logprobs_close
,
check_outputs_equal
MODELS
=
[
MODELS
=
[
"facebook/opt-125m"
,
"facebook/opt-125m"
,
"meta-llama/Llama-2-7b-hf"
,
"meta-llama/Llama-2-7b-hf"
,
]
]
E5M2_KV_MODELS
=
[
"facebook/opt-125m"
,
"meta-llama/Llama-2-7b-chat-hf"
,
]
E4M3_KV_MODELS
=
[
"meta-llama/Llama-2-7b-chat-hf"
,
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V"
,
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
]
KV_CACHE_QUANTIZATION_PATHS
=
{
"meta-llama/Llama-2-7b-chat-hf"
:
"./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
}
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
@@ -35,12 +48,12 @@ def test_models(
...
@@ -35,12 +48,12 @@ def test_models(
enforce_eager
:
bool
,
enforce_eager
:
bool
,
tensor_parallel_size
:
int
,
tensor_parallel_size
:
int
,
)
->
None
:
)
->
None
:
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
"""
enable_chunked_prefill
=
False
Checks exact match decode between huggingface model and vllm runner with
max_num_batched_tokens
=
None
chunked prefill.
if
chunked_prefill_token_size
!=
-
1
:
"""
enable_
chunked_prefill
=
Tru
e
max_num_seqs
=
chunked_prefill
_token_siz
e
max_num_batched_tokens
=
chunked_prefill_token_size
max_num_batched_tokens
=
chunked_prefill_token_size
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
...
@@ -49,7 +62,7 @@ def test_models(
...
@@ -49,7 +62,7 @@ def test_models(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
True
,
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
...
@@ -62,3 +75,78 @@ def test_models(
...
@@ -62,3 +75,78 @@ def test_models(
name_0
=
"hf"
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
name_1
=
"vllm"
,
)
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype,model"
,
[(
"fp8_e5m2"
,
m
)
for
m
in
E5M2_KV_MODELS
]
+
[(
"fp8_e4m3"
,
m
)
for
m
in
E4M3_KV_MODELS
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"chunked_prefill_token_size"
,
[
4
,
16
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
False
,
True
])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
1
])
def
test_models_with_fp8_kv_cache
(
vllm_runner
,
example_prompts
,
kv_cache_dtype
:
str
,
model
:
str
,
max_tokens
:
int
,
chunked_prefill_token_size
:
int
,
enforce_eager
:
bool
,
tensor_parallel_size
:
int
,
)
->
None
:
"""
Only checks log probs match between chunked-prefill and
non-chunked-prefill version of vLLM model runner.
This test is used when there is discrepancy in kernels
/ numerics (e.g. when using lower-precision types like FP8).
"""
NUM_LOG_PROBS
=
8
if
model
==
"facebook/opt-125m"
:
pytest
.
skip
(
"#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m"
)
max_num_seqs
=
chunked_prefill_token_size
max_num_batched_tokens
=
chunked_prefill_token_size
extra_kwargs
=
{}
if
model
in
KV_CACHE_QUANTIZATION_PATHS
:
extra_kwargs
[
"quantization_param_path"
]
=
KV_CACHE_QUANTIZATION_PATHS
[
model
]
with
vllm_runner
(
model
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
kv_cache_dtype
=
kv_cache_dtype
,
**
extra_kwargs
,
)
as
vllm_model
:
no_chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
with
vllm_runner
(
model
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
True
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
kv_cache_dtype
=
kv_cache_dtype
,
**
extra_kwargs
,
)
as
vllm_model
:
chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
check_logprobs_close
(
outputs_0_lst
=
no_chunked_prefill_outputs
,
outputs_1_lst
=
chunked_prefill_outputs
,
name_0
=
"no_chunked_prefill"
,
name_1
=
"chunked_prefill"
,
)
tests/basic_correctness/test_cpu_offload.py
View file @
af7f4372
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
..utils
import
compare_two_settings
from
..utils
import
compare_two_settings
def
test_cpu_offload
():
def
test_cpu_offload
():
compare_two_settings
(
"meta-llama/Llama-2-7b-hf"
,
[],
compare_two_settings
(
"meta-llama/Llama-2-7b-hf"
,
[],
[
"--cpu-offload-gb"
,
"4"
])
[
"--cpu-offload-gb"
,
"4"
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"fp8 is not supported on this GPU type."
)
def
test_cpu_offload_fp8
():
# Test quantization of an unquantized checkpoint
compare_two_settings
(
"meta-llama/Meta-Llama-3-8B-Instruct"
,
[
"--quantization"
,
"fp8"
],
[
"--quantization"
,
"fp8"
,
"--cpu-offload-gb"
,
"2"
])
# Test loading a quantized checkpoint
compare_two_settings
(
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
,
[],
[
"--cpu-offload-gb"
,
"2"
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"awq"
),
reason
=
"awq is not supported on this GPU type."
)
def
test_cpu_offload_awq
():
compare_two_settings
(
"casperhansen/llama-3-8b-instruct-awq"
,
[],
[
"--cpu-offload-gb"
,
"2"
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
reason
=
"gptq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_compressed_tensors
():
# Test wNa16
compare_two_settings
(
"nm-testing/tinyllama-oneshot-w4a16-channel-v2"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
# Test w4a16_marlin24
compare_two_settings
(
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
# Test w8a8
compare_two_settings
(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
tests/basic_correctness/test_preemption.py
View file @
af7f4372
...
@@ -8,6 +8,7 @@ pytest tests/basic_correctness/test_preemption.py`.
...
@@ -8,6 +8,7 @@ pytest tests/basic_correctness/test_preemption.py`.
import
pytest
import
pytest
from
prometheus_client
import
REGISTRY
from
prometheus_client
import
REGISTRY
import
vllm.envs
as
envs
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.core.scheduler
import
(
ARTIFICIAL_PREEMPTION_MAX_CNT
,
from
vllm.core.scheduler
import
(
ARTIFICIAL_PREEMPTION_MAX_CNT
,
ENABLE_ARTIFICIAL_PREEMPT
)
ENABLE_ARTIFICIAL_PREEMPT
)
...
@@ -24,6 +25,13 @@ assert ENABLE_ARTIFICIAL_PREEMPT is True, (
...
@@ -24,6 +25,13 @@ assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"tests/basic_correctness/test_preemption.py`"
)
"tests/basic_correctness/test_preemption.py`"
)
@
pytest
.
fixture
def
worker_use_ray
()
->
bool
:
# When SPMD worker is used, use ray_use_worker=True
# to test delta input optimization works with preemption.
return
envs
.
VLLM_USE_RAY_SPMD_WORKER
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
...
@@ -36,6 +44,7 @@ def test_chunked_prefill_recompute(
...
@@ -36,6 +44,7 @@ def test_chunked_prefill_recompute(
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
chunked_prefill_token_size
:
int
,
chunked_prefill_token_size
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
)
->
None
:
"""Ensure that chunked prefill works with preemption."""
"""Ensure that chunked prefill works with preemption."""
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
...
@@ -54,6 +63,7 @@ def test_chunked_prefill_recompute(
...
@@ -54,6 +63,7 @@ def test_chunked_prefill_recompute(
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
worker_use_ray
=
worker_use_ray
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
...
@@ -80,6 +90,7 @@ def test_preemption(
...
@@ -80,6 +90,7 @@ def test_preemption(
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
)
->
None
:
"""By default, recompute preemption is enabled"""
"""By default, recompute preemption is enabled"""
...
@@ -90,6 +101,7 @@ def test_preemption(
...
@@ -90,6 +101,7 @@ def test_preemption(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
disable_log_stats
=
False
,
disable_log_stats
=
False
,
worker_use_ray
=
worker_use_ray
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
...
@@ -134,6 +146,7 @@ def test_swap(
...
@@ -134,6 +146,7 @@ def test_swap(
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
beam_width
:
int
,
beam_width
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
)
->
None
:
"""Use beam search enables swapping."""
"""Use beam search enables swapping."""
example_prompts
=
example_prompts
[:
1
]
example_prompts
=
example_prompts
[:
1
]
...
@@ -146,6 +159,7 @@ def test_swap(
...
@@ -146,6 +159,7 @@ def test_swap(
dtype
=
dtype
,
dtype
=
dtype
,
swap_space
=
10
,
swap_space
=
10
,
disable_log_stats
=
False
,
disable_log_stats
=
False
,
worker_use_ray
=
worker_use_ray
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_beam_search
(
example_prompts
,
vllm_outputs
=
vllm_model
.
generate_beam_search
(
example_prompts
,
beam_width
,
max_tokens
)
beam_width
,
max_tokens
)
...
@@ -191,6 +205,7 @@ def test_swap_infeasible(
...
@@ -191,6 +205,7 @@ def test_swap_infeasible(
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
beam_width
:
int
,
beam_width
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
)
->
None
:
"""Verify infeasible swap request will be ignored."""
"""Verify infeasible swap request will be ignored."""
BLOCK_SIZE
=
16
BLOCK_SIZE
=
16
...
@@ -207,6 +222,7 @@ def test_swap_infeasible(
...
@@ -207,6 +222,7 @@ def test_swap_infeasible(
# decode blocks are not enough to finish.
# decode blocks are not enough to finish.
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
,
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
,
max_model_len
=
(
prefill_blocks
+
decode_blocks
)
*
BLOCK_SIZE
,
max_model_len
=
(
prefill_blocks
+
decode_blocks
)
*
BLOCK_SIZE
,
worker_use_ray
=
worker_use_ray
,
)
as
vllm_model
:
)
as
vllm_model
:
sampling_params
=
SamplingParams
(
n
=
beam_width
,
sampling_params
=
SamplingParams
(
n
=
beam_width
,
use_beam_search
=
True
,
use_beam_search
=
True
,
...
@@ -234,6 +250,7 @@ def test_preemption_infeasible(
...
@@ -234,6 +250,7 @@ def test_preemption_infeasible(
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
)
->
None
:
"""Verify infeasible preemption request will be ignored."""
"""Verify infeasible preemption request will be ignored."""
BLOCK_SIZE
=
16
BLOCK_SIZE
=
16
...
@@ -248,6 +265,7 @@ def test_preemption_infeasible(
...
@@ -248,6 +265,7 @@ def test_preemption_infeasible(
# ignored instead of hanging forever.
# ignored instead of hanging forever.
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
//
2
,
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
//
2
,
max_model_len
=
((
prefill_blocks
+
decode_blocks
//
2
)
*
BLOCK_SIZE
),
max_model_len
=
((
prefill_blocks
+
decode_blocks
//
2
)
*
BLOCK_SIZE
),
worker_use_ray
=
worker_use_ray
,
)
as
vllm_model
:
)
as
vllm_model
:
sampling_params
=
SamplingParams
(
max_tokens
=
max_tokens
,
sampling_params
=
SamplingParams
(
max_tokens
=
max_tokens
,
ignore_eos
=
True
)
ignore_eos
=
True
)
...
...
tests/compile/test_full_graph.py
0 → 100644
View file @
af7f4372
import
os
import
pytest
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"meta-llama/Meta-Llama-3-8B"
])
def
test_full_graph
(
model
):
# make sure these models can be captured in full graph mode
os
.
environ
[
"VLLM_TEST_DYNAMO_GRAPH_CAPTURE"
]
=
"1"
from
vllm
import
LLM
,
SamplingParams
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
sampling_params
=
SamplingParams
(
temperature
=
0
)
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B"
)
llm
.
generate
(
prompts
,
sampling_params
)
tests/conftest.py
View file @
af7f4372
import
contextlib
import
contextlib
import
gc
import
gc
import
json
import
os
import
os
import
sys
import
sys
import
tempfile
from
collections
import
UserList
from
collections
import
UserList
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
TypedDict
,
TypeVar
,
Union
from
enum
import
Enum
from
typing
import
(
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
TypedDict
,
TypeVar
,
Union
)
import
numpy
as
np
import
pytest
import
pytest
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
huggingface_hub
import
snapshot_download
from
PIL
import
Image
from
PIL
import
Image
from
transformers
import
(
AutoModelForCausalLM
,
Auto
ModelForVision2Seq
,
from
transformers
import
(
AutoModelForCausalLM
,
Auto
Tokenizer
,
BatchEncoding
,
AutoTokenizer
,
BatchEncoding
,
BatchFeature
)
BatchFeature
)
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.config
import
TokenizerPoolConfig
from
vllm.config
import
TokenizerPoolConfig
from
vllm.connections
import
global_http_connection
from
vllm.connections
import
global_http_connection
from
vllm.distributed
import
(
destroy_distributed_environment
,
from
vllm.distributed
import
(
destroy_distributed_environment
,
destroy_model_parallel
)
destroy_model_parallel
,
from
vllm.inputs
import
TextPrompt
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.inputs
import
(
ExplicitEncoderDecoderPrompt
,
TextPrompt
,
to_enc_dec_tuple_list
,
zip_enc_dec_prompts
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.outputs
import
RequestOutput
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
cuda_device_count_stateless
,
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
cuda_device_count_stateless
,
is_cpu
)
identity
,
is_cpu
)
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -82,6 +92,21 @@ def init_test_http_connection():
...
@@ -82,6 +92,21 @@ def init_test_http_connection():
global_http_connection
.
reuse_client
=
False
global_http_connection
.
reuse_client
=
False
@
pytest
.
fixture
def
dist_init
():
temp_file
=
tempfile
.
mkstemp
()[
1
]
init_distributed_environment
(
world_size
=
1
,
rank
=
0
,
distributed_init_method
=
f
"file://
{
temp_file
}
"
,
local_rank
=
0
,
backend
=
"nccl"
,
)
initialize_model_parallel
(
1
,
1
)
yield
cleanup
()
def
cleanup
():
def
cleanup
():
destroy_model_parallel
()
destroy_model_parallel
()
destroy_distributed_environment
()
destroy_distributed_environment
()
...
@@ -120,6 +145,46 @@ def example_prompts() -> List[str]:
...
@@ -120,6 +145,46 @@ def example_prompts() -> List[str]:
return
prompts
return
prompts
class
DecoderPromptType
(
Enum
):
"""For encoder/decoder models only."""
CUSTOM
=
1
NONE
=
2
EMPTY_STR
=
3
@
pytest
.
fixture
def
example_encoder_decoder_prompts
(
)
->
Dict
[
DecoderPromptType
,
List
[
ExplicitEncoderDecoderPrompt
]]:
'''
Returns an encoder prompt list and a decoder prompt list, wherein each pair
of same-index entries in both lists corresponds to an (encoder prompt,
decoder prompt) tuple.
Returns:
* Encoder prompt list
* Decoder prompt list (reverse of encoder prompt list)
'''
encoder_prompts
=
[]
for
filename
in
_TEST_PROMPTS
:
encoder_prompts
+=
_read_prompts
(
filename
)
custom_decoder_prompts
=
encoder_prompts
[::
-
1
]
empty_str_decoder_prompts
=
[
""
]
*
len
(
encoder_prompts
)
none_decoder_prompts
=
[
None
]
*
len
(
encoder_prompts
)
# NONE decoder prompt type
return
{
DecoderPromptType
.
NONE
:
zip_enc_dec_prompts
(
encoder_prompts
,
none_decoder_prompts
),
DecoderPromptType
.
EMPTY_STR
:
zip_enc_dec_prompts
(
encoder_prompts
,
empty_str_decoder_prompts
),
DecoderPromptType
.
CUSTOM
:
zip_enc_dec_prompts
(
encoder_prompts
,
custom_decoder_prompts
),
}
@
pytest
.
fixture
@
pytest
.
fixture
def
example_long_prompts
()
->
List
[
str
]:
def
example_long_prompts
()
->
List
[
str
]:
prompts
=
[]
prompts
=
[]
...
@@ -151,7 +216,9 @@ class HfRunner:
...
@@ -151,7 +216,9 @@ class HfRunner:
*
,
*
,
model_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
model_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
is_embedding_model
:
bool
=
False
,
is_embedding_model
:
bool
=
False
,
is_vision_model
:
bool
=
False
,
auto_cls
=
AutoModelForCausalLM
,
postprocess_inputs
:
Callable
[[
BatchEncoding
],
BatchEncoding
]
=
identity
,
)
->
None
:
)
->
None
:
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
...
@@ -166,11 +233,6 @@ class HfRunner:
...
@@ -166,11 +233,6 @@ class HfRunner:
device
=
"cpu"
,
device
=
"cpu"
,
).
to
(
dtype
=
torch_dtype
))
).
to
(
dtype
=
torch_dtype
))
else
:
else
:
if
is_vision_model
:
auto_cls
=
AutoModelForVision2Seq
else
:
auto_cls
=
AutoModelForCausalLM
model_kwargs
=
model_kwargs
if
model_kwargs
is
not
None
else
{}
model_kwargs
=
model_kwargs
if
model_kwargs
is
not
None
else
{}
self
.
model
=
self
.
wrap_device
(
self
.
model
=
self
.
wrap_device
(
auto_cls
.
from_pretrained
(
auto_cls
.
from_pretrained
(
...
@@ -195,12 +257,14 @@ class HfRunner:
...
@@ -195,12 +257,14 @@ class HfRunner:
torch_dtype
=
torch_dtype
,
torch_dtype
=
torch_dtype
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
)
)
except
Exception
:
except
Exception
as
exc
:
logger
.
warning
(
logger
.
warning
(
"Unable to auto-load
processor from HuggingFace for
"
"Unable to auto-load
HuggingFace processor for model (%s).
"
"
model %s.
Using tokenizer instead."
,
model_name
)
"Using tokenizer instead.
Reason: %s
"
,
model_name
,
exc
)
self
.
processor
=
self
.
tokenizer
self
.
processor
=
self
.
tokenizer
self
.
postprocess_inputs
=
postprocess_inputs
def
generate
(
def
generate
(
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
...
@@ -220,6 +284,7 @@ class HfRunner:
...
@@ -220,6 +284,7 @@ class HfRunner:
processor_kwargs
[
"images"
]
=
images
[
i
]
processor_kwargs
[
"images"
]
=
images
[
i
]
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
postprocess_inputs
(
inputs
)
output_ids
=
self
.
model
.
generate
(
output_ids
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
**
self
.
wrap_device
(
inputs
),
...
@@ -289,6 +354,7 @@ class HfRunner:
...
@@ -289,6 +354,7 @@ class HfRunner:
processor_kwargs
[
"images"
]
=
images
[
i
]
processor_kwargs
[
"images"
]
=
images
[
i
]
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
postprocess_inputs
(
inputs
)
output
=
self
.
model
.
generate
(
output
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
**
self
.
wrap_device
(
inputs
),
...
@@ -314,12 +380,51 @@ class HfRunner:
...
@@ -314,12 +380,51 @@ class HfRunner:
all_logprobs
.
append
(
seq_logprobs
)
all_logprobs
.
append
(
seq_logprobs
)
return
all_logprobs
return
all_logprobs
def
_hidden_states_to_logprobs
(
self
,
hidden_states
,
num_logprobs
,
)
->
Tuple
[
List
[
Dict
[
int
,
float
]],
int
]:
seq_logprobs
:
List
[
torch
.
Tensor
]
=
[]
output_len
=
len
(
hidden_states
)
for
_
,
hidden_state
in
enumerate
(
hidden_states
):
last_hidden_states
=
hidden_state
[
-
1
][
0
]
logits
=
torch
.
matmul
(
last_hidden_states
,
self
.
model
.
get_output_embeddings
().
weight
.
t
(),
)
if
getattr
(
self
.
model
.
get_output_embeddings
(),
"bias"
,
None
)
is
not
None
:
logits
+=
self
.
model
.
get_output_embeddings
().
bias
.
unsqueeze
(
0
)
logprobs
=
F
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
)
seq_logprobs
.
append
(
logprobs
)
# convert to dict
seq_logprobs_lst
:
List
[
Dict
[
int
,
float
]]
=
[]
for
tok_idx
,
tok_logprobs
in
enumerate
(
seq_logprobs
):
# drop prompt logprobs
if
tok_idx
==
0
:
tok_logprobs
=
tok_logprobs
[
-
1
,
:].
reshape
(
1
,
-
1
)
topk
=
tok_logprobs
.
topk
(
num_logprobs
)
tok_logprobs_dct
=
{}
for
token_id
,
logprob
in
zip
(
topk
.
indices
[
0
],
topk
.
values
[
0
]):
tok_logprobs_dct
[
token_id
.
item
()]
=
logprob
.
item
()
seq_logprobs_lst
.
append
(
tok_logprobs_dct
)
return
(
seq_logprobs_lst
,
output_len
,
)
def
generate_greedy_logprobs_limit
(
def
generate_greedy_logprobs_limit
(
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
audios
:
Optional
[
List
[
Tuple
[
np
.
ndarray
,
int
]]]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
List
[
Dict
[
int
,
float
]]]]:
)
->
List
[
Tuple
[
List
[
int
],
str
,
List
[
Dict
[
int
,
float
]]]]:
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
...
@@ -334,7 +439,13 @@ class HfRunner:
...
@@ -334,7 +439,13 @@ class HfRunner:
if
images
is
not
None
and
images
[
i
]
is
not
None
:
if
images
is
not
None
and
images
[
i
]
is
not
None
:
processor_kwargs
[
"images"
]
=
images
[
i
]
processor_kwargs
[
"images"
]
=
images
[
i
]
if
audios
is
not
None
:
audio
,
sr
=
audios
[
i
]
processor_kwargs
[
"audio"
]
=
audio
processor_kwargs
[
"sampling_rate"
]
=
sr
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
postprocess_inputs
(
inputs
)
output
=
self
.
model
.
generate
(
output
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
**
self
.
wrap_device
(
inputs
),
...
@@ -346,37 +457,66 @@ class HfRunner:
...
@@ -346,37 +457,66 @@ class HfRunner:
**
kwargs
,
**
kwargs
,
)
)
seq_logprobs
:
List
[
torch
.
Tensor
]
=
[]
(
for
_
,
hidden_states
in
enumerate
(
output
.
hidden_states
):
seq_logprobs_lst
,
last_hidden_states
=
hidden_states
[
-
1
][
0
]
output_len
,
logits
=
torch
.
matmul
(
)
=
self
.
_hidden_states_to_logprobs
(
output
.
hidden_states
,
last_hidden_states
,
num_logprobs
)
self
.
model
.
get_output_embeddings
().
weight
.
t
(),
)
if
getattr
(
self
.
model
.
get_output_embeddings
(),
"bias"
,
None
)
is
not
None
:
logits
+=
self
.
model
.
get_output_embeddings
(
).
bias
.
unsqueeze
(
0
)
logprobs
=
F
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
)
seq_logprobs
.
append
(
logprobs
)
# convert to dict
all_logprobs
.
append
(
seq_logprobs_lst
)
seq_logprobs_lst
:
List
[
Dict
[
int
,
float
]]
=
[]
seq_ids
=
output
.
sequences
[
0
]
for
tok_idx
,
tok_logprobs
in
enumerate
(
seq_logprobs
):
output_len
=
len
(
seq_logprobs_lst
)
# drop prompt logprobs
output_ids
=
seq_ids
[
-
output_len
:]
if
tok_idx
==
0
:
all_output_ids
.
append
(
output_ids
.
tolist
())
tok_logprobs
=
tok_logprobs
[
-
1
,
:].
reshape
(
1
,
-
1
)
all_output_strs
.
append
(
self
.
tokenizer
.
decode
(
output_ids
))
topk
=
tok_logprobs
.
topk
(
num_logprobs
)
tok_logprobs_dct
=
{}
outputs
=
zip
(
all_output_ids
,
all_output_strs
,
all_logprobs
)
for
token
_id
,
logprob
in
zip
(
topk
.
indices
[
0
],
topk
.
values
[
0
]):
return
[(
output
_id
s
,
output_str
,
output_logprobs
)
tok_logprobs_dct
[
token_id
.
item
()]
=
logprob
.
item
()
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
seq_logprobs_lst
.
append
(
tok_logprobs_dct
)
def
generate_encoder_decoder_greedy_logprobs_limit
(
self
,
encoder_decoder_prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
num_logprobs
:
int
,
**
kwargs
:
Any
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
List
[
Dict
[
int
,
float
]]]]:
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
all_output_ids
:
List
[
List
[
int
]]
=
[]
all_output_strs
:
List
[
str
]
=
[]
for
(
encoder_prompt
,
decoder_prompt
)
in
to_enc_dec_tuple_list
(
encoder_decoder_prompts
):
encoder_input_ids
=
self
.
wrap_device
(
self
.
tokenizer
(
encoder_prompt
,
return_tensors
=
"pt"
).
input_ids
)
decoder_input_ids
=
(
None
if
decoder_prompt
is
None
else
self
.
wrap_device
(
self
.
tokenizer
(
decoder_prompt
,
return_tensors
=
"pt"
).
input_ids
))
output
=
self
.
model
.
generate
(
encoder_input_ids
,
decoder_input_ids
=
decoder_input_ids
,
use_cache
=
True
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
output_hidden_states
=
True
,
return_dict_in_generate
=
True
,
**
kwargs
,
)
(
seq_logprobs_lst
,
output_len
,
)
=
self
.
_hidden_states_to_logprobs
(
output
.
decoder_hidden_states
,
num_logprobs
)
all_logprobs
.
append
(
seq_logprobs_lst
)
all_logprobs
.
append
(
seq_logprobs_lst
)
seq_ids
=
output
.
sequences
[
0
]
seq_ids
=
output
.
sequences
[
0
]
output_len
=
len
(
seq_logprobs_lst
)
output_ids
=
seq_ids
[
-
output_len
:]
output_ids
=
seq_ids
[
-
output_len
:]
all_output_ids
.
append
(
output_ids
.
tolist
())
all_output_ids
.
append
(
output_ids
.
tolist
())
all_output_strs
.
append
(
self
.
tokenizer
.
decode
(
output_ids
))
all_output_strs
.
append
(
self
.
tokenizer
.
decode
(
output_ids
))
...
@@ -416,7 +556,7 @@ class VllmRunner:
...
@@ -416,7 +556,7 @@ class VllmRunner:
block_size
:
int
=
16
,
block_size
:
int
=
16
,
enable_chunked_prefill
:
bool
=
False
,
enable_chunked_prefill
:
bool
=
False
,
swap_space
:
int
=
4
,
swap_space
:
int
=
4
,
enforce_eager
:
bool
=
False
,
enforce_eager
:
Optional
[
bool
]
=
False
,
**
kwargs
,
**
kwargs
,
)
->
None
:
)
->
None
:
self
.
model
=
LLM
(
self
.
model
=
LLM
(
...
@@ -438,7 +578,8 @@ class VllmRunner:
...
@@ -438,7 +578,8 @@ class VllmRunner:
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
sampling_params
:
SamplingParams
,
sampling_params
:
SamplingParams
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
images
:
Optional
[
Union
[
List
[
Image
.
Image
],
List
[
List
[
Image
.
Image
]]]]
=
None
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
if
images
is
not
None
:
if
images
is
not
None
:
assert
len
(
prompts
)
==
len
(
images
)
assert
len
(
prompts
)
==
len
(
images
)
...
@@ -465,11 +606,27 @@ class VllmRunner:
...
@@ -465,11 +606,27 @@ class VllmRunner:
outputs
.
append
((
req_sample_output_ids
,
req_sample_output_strs
))
outputs
.
append
((
req_sample_output_ids
,
req_sample_output_strs
))
return
outputs
return
outputs
def
_final_steps_generate_w_logprobs
(
self
,
req_outputs
:
List
[
RequestOutput
],
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
outputs
:
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
=
[]
for
req_output
in
req_outputs
:
for
sample
in
req_output
.
outputs
:
output_str
=
sample
.
text
output_ids
=
list
(
sample
.
token_ids
)
output_logprobs
=
sample
.
logprobs
outputs
.
append
((
output_ids
,
output_str
,
output_logprobs
))
return
outputs
def
generate_w_logprobs
(
def
generate_w_logprobs
(
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
sampling_params
:
SamplingParams
,
sampling_params
:
SamplingParams
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
images
:
Optional
[
Union
[
List
[
Image
.
Image
],
List
[
List
[
Image
.
Image
]]]]
=
None
,
audios
:
Optional
[
Union
[
List
[
Tuple
[
np
.
ndarray
,
int
]],
List
[
List
[
Tuple
[
np
.
ndarray
,
int
]]]]]
=
None
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
assert
sampling_params
.
logprobs
is
not
None
assert
sampling_params
.
logprobs
is
not
None
...
@@ -481,16 +638,27 @@ class VllmRunner:
...
@@ -481,16 +638,27 @@ class VllmRunner:
for
i
,
image
in
enumerate
(
images
):
for
i
,
image
in
enumerate
(
images
):
inputs
[
i
][
"multi_modal_data"
]
=
{
"image"
:
image
}
inputs
[
i
][
"multi_modal_data"
]
=
{
"image"
:
image
}
if
audios
is
not
None
:
for
i
,
audio
in
enumerate
(
audios
):
inputs
[
i
][
"multi_modal_data"
]
=
{
"audio"
:
audio
}
req_outputs
=
self
.
model
.
generate
(
inputs
,
req_outputs
=
self
.
model
.
generate
(
inputs
,
sampling_params
=
sampling_params
)
sampling_params
=
sampling_params
)
outputs
:
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
=
[]
return
self
.
_final_steps_generate_w_logprobs
(
req_outputs
)
for
req_output
in
req_outputs
:
for
sample
in
req_output
.
outputs
:
def
generate_encoder_decoder_w_logprobs
(
output_str
=
sample
.
text
self
,
output_ids
=
sample
.
token_ids
encoder_decoder_prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
output_logprobs
=
sample
.
logprobs
sampling_params
:
SamplingParams
,
outputs
.
append
((
output_ids
,
output_str
,
output_logprobs
))
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
return
outputs
'''
Logprobs generation for vLLM encoder/decoder models
'''
assert
sampling_params
.
logprobs
is
not
None
req_outputs
=
self
.
model
.
generate
(
encoder_decoder_prompts
,
sampling_params
=
sampling_params
)
return
self
.
_final_steps_generate_w_logprobs
(
req_outputs
)
def
generate_greedy
(
def
generate_greedy
(
self
,
self
,
...
@@ -510,6 +678,8 @@ class VllmRunner:
...
@@ -510,6 +678,8 @@ class VllmRunner:
num_logprobs
:
int
,
num_logprobs
:
int
,
images
:
Optional
[
Union
[
List
[
Image
.
Image
],
images
:
Optional
[
Union
[
List
[
Image
.
Image
],
List
[
List
[
Image
.
Image
]]]]
=
None
,
List
[
List
[
Image
.
Image
]]]]
=
None
,
audios
:
Optional
[
Union
[
List
[
Tuple
[
np
.
ndarray
,
int
]],
List
[
List
[
Tuple
[
np
.
ndarray
,
int
]]]]]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
...
@@ -518,7 +688,28 @@ class VllmRunner:
...
@@ -518,7 +688,28 @@ class VllmRunner:
stop_token_ids
=
stop_token_ids
)
stop_token_ids
=
stop_token_ids
)
outputs
=
self
.
generate_w_logprobs
(
prompts
,
outputs
=
self
.
generate_w_logprobs
(
prompts
,
greedy_logprobs_params
,
greedy_logprobs_params
,
images
=
images
)
images
=
images
,
audios
=
audios
)
return
[(
output_ids
,
output_str
,
output_logprobs
)
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
def
generate_encoder_decoder_greedy_logprobs
(
self
,
encoder_decoder_prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
use_beam_search
=
False
,
max_tokens
=
max_tokens
,
logprobs
=
num_logprobs
)
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
outputs
=
self
.
generate_encoder_decoder_w_logprobs
(
encoder_decoder_prompts
,
greedy_logprobs_params
)
return
[(
output_ids
,
output_str
,
output_logprobs
)
return
[(
output_ids
,
output_str
,
output_logprobs
)
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
...
@@ -593,3 +784,26 @@ def num_gpus_available():
...
@@ -593,3 +784,26 @@ def num_gpus_available():
in current process."""
in current process."""
return
cuda_device_count_stateless
()
return
cuda_device_count_stateless
()
temp_dir
=
tempfile
.
gettempdir
()
_dummy_path
=
os
.
path
.
join
(
temp_dir
,
"dummy_opt"
)
@
pytest
.
fixture
def
dummy_opt_path
():
json_path
=
os
.
path
.
join
(
_dummy_path
,
"config.json"
)
if
not
os
.
path
.
exists
(
_dummy_path
):
snapshot_download
(
repo_id
=
"facebook/opt-125m"
,
local_dir
=
_dummy_path
,
ignore_patterns
=
[
"*.bin"
,
"*.bin.index.json"
,
"*.pt"
,
"*.h5"
,
"*.msgpack"
])
assert
os
.
path
.
exists
(
json_path
)
with
open
(
json_path
,
"r"
)
as
f
:
config
=
json
.
load
(
f
)
config
[
"architectures"
]
=
[
"MyOPTForCausalLM"
]
with
open
(
json_path
,
"w"
)
as
f
:
json
.
dump
(
config
,
f
)
return
_dummy_path
tests/core/block/e2e/test_correctness.py
View file @
af7f4372
...
@@ -261,11 +261,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
...
@@ -261,11 +261,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
# skip cuda graph creation for fast test.
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
"enable_chunked_prefill"
:
True
,
"enable_chunked_prefill"
:
True
,
"max_num_batched_tokens"
:
2
,
"max_num_seqs"
:
2
,
},
},
])
])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{
"block_size"
:
8
,
"max_num_batched_tokens"
:
2
,
"max_num_seqs"
:
2
,
},
{
"block_size"
:
8
,
"max_num_batched_tokens"
:
3
,
"max_num_seqs"
:
2
,
},
{
"block_size"
:
8
,
"max_num_batched_tokens"
:
256
,
"max_num_seqs"
:
10
,
}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[
{
{
"use_v2_block_manager"
:
False
,
"use_v2_block_manager"
:
False
,
...
@@ -294,6 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
...
@@ -294,6 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
"The president of the United States is"
,
"The president of the United States is"
,
(
"1 + "
*
50
)
+
" 1 = "
,
# Longer prompt.
"The capital of France is"
,
"The capital of France is"
,
"The future of AI is"
,
"The future of AI is"
,
]
]
...
...
Prev
1
…
3
4
5
6
7
8
9
10
11
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment