Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
af7f4372
Commit
af7f4372
authored
Sep 03, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1
parents
5e19cdef
09c77926
Changes
465
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
612 additions
and
200 deletions
+612
-200
pyproject.toml
pyproject.toml
+3
-1
requirements-build.txt
requirements-build.txt
+2
-1
requirements-common.txt
requirements-common.txt
+10
-6
requirements-cuda.txt
requirements-cuda.txt
+2
-2
requirements-lint.txt
requirements-lint.txt
+1
-1
requirements-openvino.txt
requirements-openvino.txt
+3
-29
requirements-test.txt
requirements-test.txt
+8
-2
setup.py
setup.py
+27
-9
tests/async_engine/api_server_async_engine.py
tests/async_engine/api_server_async_engine.py
+5
-4
tests/async_engine/test_api_server.py
tests/async_engine/test_api_server.py
+8
-1
tests/async_engine/test_async_llm_engine.py
tests/async_engine/test_async_llm_engine.py
+94
-13
tests/async_engine/test_chat_template.py
tests/async_engine/test_chat_template.py
+8
-13
tests/async_engine/test_openapi_server_ray.py
tests/async_engine/test_openapi_server_ray.py
+12
-4
tests/async_engine/test_request_tracker.py
tests/async_engine/test_request_tracker.py
+14
-13
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+96
-8
tests/basic_correctness/test_cpu_offload.py
tests/basic_correctness/test_cpu_offload.py
+0
-38
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+18
-0
tests/compile/test_full_graph.py
tests/compile/test_full_graph.py
+20
-0
tests/conftest.py
tests/conftest.py
+266
-52
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+15
-3
No files found.
Too many changes to show.
To preserve performance only
465 of 465+
files are displayed.
Plain diff
Email patch
pyproject.toml
View file @
af7f4372
[build-system]
# Should be mirrored in requirements-build.txt
requires
=
[
"cmake>=3.2
1
"
,
"cmake>=3.2
6
"
,
"ninja"
,
"packaging"
,
"setuptools >= 49.4.0"
,
"torch == 2.4.0"
,
"wheel"
,
"jinja2"
,
]
build-backend
=
"setuptools.build_meta"
...
...
@@ -56,6 +57,7 @@ files = [
"vllm/*.py"
,
"vllm/adapter_commons"
,
"vllm/assets"
,
"vllm/entrypoints"
,
"vllm/inputs"
,
"vllm/logging"
,
"vllm/multimodal"
,
...
...
requirements-build.txt
View file @
af7f4372
# Should be mirrored in pyproject.toml
cmake>=3.2
1
cmake>=3.2
6
ninja
packaging
setuptools>=49.4.0
torch==2.4.0
wheel
jinja2
requirements-common.txt
View file @
af7f4372
cmake >= 3.21
ninja # For faster builds.
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
...
...
@@ -8,17 +6,23 @@ tqdm
py-cpuinfo
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfix.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
fastapi
aiohttp
openai
openai
>= 1.0 # Ensure modern openai package (ensure types module present)
uvicorn[standard]
pydantic >= 2.
0
# Required for OpenAI server.
pydantic >= 2.
8
# Required for OpenAI server.
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.
3
lm-format-enforcer == 0.10.
6
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
typing_extensions
>= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
msgspec
librosa # Required for audio processing
soundfile # Required for audio processing
gguf == 0.9.1
importlib_metadata
requirements-cuda.txt
View file @
af7f4372
...
...
@@ -7,5 +7,5 @@ nvidia-ml-py # for pynvml package
torch == 2.4.0
# These must be updated alongside torch
torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.27.post2 # Requires PyTorch 2.4.0
vllm-flash-attn == 2.6.1 # Requires PyTorch 2.4.0
xformers == 0.0.27.post2
; platform_system == 'Linux' and platform_machine == 'x86_64'
# Requires PyTorch 2.4.0
vllm-flash-attn == 2.6.1
; platform_system == 'Linux' and platform_machine == 'x86_64'
# Requires PyTorch 2.4.0
requirements-lint.txt
View file @
af7f4372
...
...
@@ -8,7 +8,7 @@ isort==5.13.2
clang-format==18.1.5
# type checking
mypy==1.
9.0
mypy==1.
11.1
types-PyYAML
types-requests
types-setuptools
requirements-openvino.txt
View file @
af7f4372
# Common dependencies
# -r requirements-common.txt
# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
cmake >= 3.21
ninja # For faster builds.
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers < 4.43
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
openai
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.3
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
-r requirements-common.txt
# OpenVINO dependencies
torch >= 2.1.2
openvino ~= 2024.3.0.dev
openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
optimum-intel[openvino] >= 1.18.1
openvino ~= 2024.3.0
optimum-intel[openvino] >= 1.18.2
requirements-test.txt
View file @
af7f4372
...
...
@@ -11,7 +11,7 @@ pytest-shard
# testing utils
awscli
einops # required for MPT
einops # required for MPT
and qwen-vl
httpx
peft
requests
...
...
@@ -19,9 +19,15 @@ ray
sentence-transformers # required for embedding
compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
# Benchmarking
aiohttp
# quantization
bitsandbytes==0.42.0
buildkite-test-collector==0.1.8
\ No newline at end of file
setup.py
View file @
af7f4372
...
...
@@ -68,9 +68,12 @@ envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
VLLM_TARGET_DEVICE
=
envs
.
VLLM_TARGET_DEVICE
# vLLM only supports Linux platform
assert
sys
.
platform
.
startswith
(
"linux"
),
"vLLM only supports Linux platform (including WSL)."
if
not
sys
.
platform
.
startswith
(
"linux"
):
logger
.
warning
(
"vLLM only supports Linux platform (including WSL). "
"Building on %s, "
"so vLLM may not be able to run correctly"
,
sys
.
platform
)
VLLM_TARGET_DEVICE
=
"empty"
MAIN_CUDA_VERSION
=
"12.1"
...
...
@@ -188,6 +191,10 @@ class cmake_build_ext(build_ext):
# match.
cmake_args
+=
[
'-DVLLM_PYTHON_EXECUTABLE={}'
.
format
(
sys
.
executable
)]
# Pass the python path to cmake so it can reuse the build dependencies
# on subsequent calls to python.
cmake_args
+=
[
'-DVLLM_PYTHON_PATH={}'
.
format
(
":"
.
join
(
sys
.
path
))]
#
# Setup parallelism and build tool
#
...
...
@@ -238,6 +245,10 @@ class cmake_build_ext(build_ext):
subprocess
.
check_call
([
"cmake"
,
*
build_args
],
cwd
=
self
.
build_temp
)
def _no_device() -> bool:
    """Return True when the build targets no device backend.

    That is the case when ``VLLM_TARGET_DEVICE`` holds the sentinel value
    ``"empty"`` (this file assigns it when building on a non-Linux platform).
    """
    target_device = VLLM_TARGET_DEVICE
    return target_device == "empty"
def
_is_cuda
()
->
bool
:
has_cuda
=
torch
.
version
.
cuda
is
not
None
return
(
VLLM_TARGET_DEVICE
==
"cuda"
and
has_cuda
...
...
@@ -279,7 +290,7 @@ def _build_custom_ops() -> bool:
def
_build_core_ext
()
->
bool
:
return
not
_is_neuron
()
and
not
_is_
t
pu
()
return
not
(
_is_neuron
()
or
_is_tpu
()
or
_is_openvino
()
or
_is_
x
pu
()
)
def
get_hipcc_rocm_version
():
...
...
@@ -398,13 +409,13 @@ try:
import vllm.commit_id
__commit__ = vllm.commit_id.__commit__
except Exception as e:
warnings.warn(f"Failed to read commit hash:
\
\
n + str(e)
",
warnings.warn(f"Failed to read commit hash:
\
n
{
e
}
",
RuntimeWarning,
stacklevel=2)
__commit__ = "COMMIT_HASH_PLACEHOLDER"
__version__ = "0.5.
4
"
__dcu_version__ = f'0.5.
4
+
{
version
}
'
__version__ = "0.5.
5
"
__dcu_version__ = f'0.5.
5
+
{
version
}
'
"""
...
...
@@ -424,7 +435,9 @@ def get_version():
def
get_vllm_version
()
->
str
:
# version = find_version(get_path("vllm", "version.py"))
if
_is_cuda
():
if
_no_device
():
version
+=
"+empty"
elif
_is_cuda
():
cuda_version
=
str
(
get_nvcc_cuda_version
())
if
cuda_version
!=
MAIN_CUDA_VERSION
:
cuda_version_str
=
cuda_version
.
replace
(
"."
,
""
)[:
3
]
...
...
@@ -479,7 +492,9 @@ def get_requirements() -> List[str]:
resolved_requirements
.
append
(
line
)
return
resolved_requirements
if
_is_cuda
():
if
_no_device
():
requirements
=
_read_requirements
(
"requirements-cuda.txt"
)
elif
_is_cuda
():
requirements
=
_read_requirements
(
"requirements-cuda.txt"
)
cuda_major
,
cuda_minor
=
torch
.
version
.
cuda
.
split
(
"."
)
modified_requirements
=
[]
...
...
@@ -528,6 +543,9 @@ if envs.VLLM_USE_PRECOMPILED:
ext_modules
=
[]
package_data
[
"vllm"
].
append
(
"*.so"
)
if
_no_device
():
ext_modules
=
[]
setup
(
name
=
"vllm"
,
version
=
get_vllm_version
(),
...
...
tests/async_engine/api_server_async_engine.py
View file @
af7f4372
"""vllm.entrypoints.api_server with some extra logging for testing."""
from
typing
import
Any
,
Dict
from
typing
import
Any
,
Dict
,
Iterable
import
uvicorn
from
fastapi.responses
import
JSONResponse
,
Response
...
...
@@ -18,9 +18,10 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
_num_aborts
=
0
async
def
abort
(
self
,
request_id
:
str
)
->
None
:
await
super
().
abort
(
request_id
)
self
.
_num_aborts
+=
1
async def _engine_abort(self, request_ids: Iterable[str]):
    """Record how many requests are being aborted, then delegate upward.

    Args:
        request_ids: Ids of the requests to abort.
    """
    # Materialize once: the ids are needed both for counting and for the
    # parent call, and the iterable may not be re-iterable.
    aborted_ids = list(request_ids)
    self._num_aborts = self._num_aborts + len(aborted_ids)
    await super()._engine_abort(aborted_ids)
def
testing_stats
(
self
)
->
Dict
[
str
,
Any
]:
return
{
"num_aborted_requests"
:
self
.
_num_aborts
}
...
...
tests/async_engine/test_api_server.py
View file @
af7f4372
import
os
import
subprocess
import
sys
import
time
...
...
@@ -35,11 +36,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
"127.0.0.1"
,
"--tokenizer-pool-size"
,
str
(
tokenizer_pool_size
)
]
# Copy the environment variables and add `VLLM_ALLOW_ENGINE_USE_RAY=1`
# so that `--engine-use-ray` does not raise an exception due to its deprecation
env_vars
=
os
.
environ
.
copy
()
env_vars
[
"VLLM_ALLOW_ENGINE_USE_RAY"
]
=
"1"
if
engine_use_ray
:
commands
.
append
(
"--engine-use-ray"
)
if
worker_use_ray
:
commands
.
append
(
"--worker-use-ray"
)
uvicorn_process
=
subprocess
.
Popen
(
commands
)
uvicorn_process
=
subprocess
.
Popen
(
commands
,
env
=
env_vars
)
yield
uvicorn_process
.
terminate
()
...
...
tests/async_engine/test_async_llm_engine.py
View file @
af7f4372
import
asyncio
import
os
from
asyncio
import
CancelledError
from
dataclasses
import
dataclass
from
typing
import
Optional
import
pytest
import
pytest_asyncio
import
torch
from
vllm
import
SamplingParams
from
vllm.config
import
ParallelConfig
from
vllm.engine.async_llm_engine
import
AsyncEngineArgs
,
AsyncLLMEngine
from
vllm.outputs
import
RequestOutput
as
RealRequestOutput
from
..conftest
import
cleanup
from
..utils
import
wait_for_gpu_memory_to_clear
...
...
@@ -106,21 +112,49 @@ async def test_new_requests_event():
assert
engine
.
engine
.
add_request_calls
==
3
assert
engine
.
engine
.
step_calls
==
old_step_calls
+
1
# Allow deprecated engine_use_ray to not raise exception
os
.
environ
[
"VLLM_ALLOW_ENGINE_USE_RAY"
]
=
"1"
engine
=
MockAsyncLLMEngine
(
worker_use_ray
=
True
,
engine_use_ray
=
True
)
assert
engine
.
get_model_config
()
is
not
None
assert
engine
.
get_tokenizer
()
is
not
None
assert
engine
.
get_decoding_config
()
is
not
None
os
.
environ
.
pop
(
"VLLM_ALLOW_ENGINE_USE_RAY"
)
def
test_asyncio_run
():
def
start_engine
():
wait_for_gpu_memory_to_clear
(
devices
=
list
(
range
(
torch
.
cuda
.
device_count
())),
threshold_bytes
=
2
*
2
**
30
,
timeout_s
=
60
,
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
AsyncEngineArgs
(
model
=
"facebook/opt-125m"
))
return
AsyncLLMEngine
.
from_engine_args
(
AsyncEngineArgs
(
model
=
"facebook/opt-125m"
,
enforce_eager
=
True
))
@pytest_asyncio.fixture(scope="module")
async def async_engine():
    """Module-scoped fixture that yields the engine built by ``start_engine``.

    ``start_engine`` is run in the event loop's default executor. On
    teardown the engine's background loop is shut down, the local reference
    is dropped, and ``cleanup()`` is called after a short sleep.
    """
    loop = asyncio.get_event_loop()
    engine = await loop.run_in_executor(None, start_engine)
    try:
        yield engine
    finally:
        # Teardown order matters: stop the background loop first, release
        # our reference, give the loop a moment, then run cleanup().
        engine.shutdown_background_loop()
        del engine
        await asyncio.sleep(0.1)
        cleanup()
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Always return False for the tests in this module.

    NOTE(review): presumably overrides a same-named fixture consulted by a
    cleanup hook defined elsewhere (e.g. conftest) -- confirm.
    """
    # So we can share the async engine fixture between these tests
    return False
@
pytest
.
mark
.
asyncio
(
scope
=
"module"
)
async
def
test_asyncio_run
(
async_engine
):
async
def
run
(
prompt
:
str
):
sampling_params
=
SamplingParams
(
...
...
@@ -128,17 +162,64 @@ def test_asyncio_run():
max_tokens
=
32
,
)
async
for
output
in
engine
.
generate
(
prompt
,
async
for
output
in
async_
engine
.
generate
(
prompt
,
sampling_params
,
request_id
=
prompt
):
final_output
=
output
return
final_output
async
def
generate
():
return
await
asyncio
.
gather
(
results
=
await
asyncio
.
gather
(
run
(
"test0"
),
run
(
"test1"
),
)
results
=
asyncio
.
run
(
generate
())
assert
len
(
results
)
==
2
@pytest.mark.asyncio(scope="module")
async def test_cancellation(async_engine):
    """Abort a request after five outputs and expect ``CancelledError``.

    No output seen before the abort may be marked finished, and exactly
    five outputs must have been received when the stream is cancelled.
    """
    params = SamplingParams(
        temperature=0,
        min_tokens=10,
        max_tokens=10,
    )

    received = 0
    with pytest.raises(CancelledError):
        stream = async_engine.generate("test2", params, request_id="test2")
        async for output in stream:
            assert not output.finished
            received += 1
            if received == 5:
                await async_engine.abort("test2")

    assert received == 5
@pytest.mark.asyncio(scope="module")
async def test_delayed_generator(async_engine):
    """Consume a stream slowly and check that every output is still delivered.

    After the first output the consumer sleeps for one second, then reads
    the rest. Ten outputs are expected in total; only the last may be
    finished, and it must carry ten token ids.
    """
    params = SamplingParams(
        temperature=0,
        min_tokens=10,
        max_tokens=10,
    )
    stream = async_engine.generate("test3", params, request_id="test3")

    seen = 0
    last_output: Optional[RealRequestOutput] = None
    async for output in stream:
        last_output = output
        if seen == 0:
            # wait for generation to complete before consuming
            # the remaining messages
            await asyncio.sleep(1)
        if seen < 9:
            assert not output.finished
        seen += 1

    assert seen == 10
    assert last_output is not None
    assert len(last_output.outputs[0].token_ids) == 10
    assert last_output.finished
tests/async_engine/test_chat_template.py
View file @
af7f4372
import
os
import
pathlib
import
pytest
from
vllm.entrypoints.chat_utils
import
load_chat_template
from
vllm.entrypoints.chat_utils
import
apply_chat_template
,
load_chat_template
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
chatml_jinja_path
=
pathlib
.
Path
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))).
parent
.
parent
/
"examples/template_chatml.jinja"
from
..utils
import
VLLM_PATH
chatml_jinja_path
=
VLLM_PATH
/
"examples/template_chatml.jinja"
assert
chatml_jinja_path
.
exists
()
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT
=
[
(
"facebook/opt-125m"
,
None
,
True
,
"Hello</s>Hi there!</s>What is the capital of</s>"
),
(
"facebook/opt-125m"
,
None
,
False
,
"Hello</s>Hi there!</s>What is the capital of</s>"
),
(
"facebook/opt-125m"
,
chatml_jinja_path
,
True
,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
...
...
@@ -93,11 +87,12 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
add_generation_prompt
=
add_generation_prompt
)
# Call the function and get the result
result
=
tokenizer
.
apply_chat_template
(
result
=
apply_chat_template
(
tokenizer
,
conversation
=
mock_request
.
messages
,
tokenize
=
False
,
chat_template
=
mock_request
.
chat_template
or
template_content
,
add_generation_prompt
=
mock_request
.
add_generation_prompt
,
chat_template
=
mock_request
.
chat_template
or
template_content
)
)
# Test assertion
assert
result
==
expected_output
,
(
...
...
tests/async_engine/test_openapi_server_ray.py
View file @
af7f4372
import
openai
# use the official client for correctness check
import
pytest
from
..utils
import
RemoteOpenAIServer
from
..utils
import
VLLM_PATH
,
RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME
=
"facebook/opt-125m"
chatml_jinja_path
=
VLLM_PATH
/
"examples/template_chatml.jinja"
assert
chatml_jinja_path
.
exists
()
@
pytest
.
fixture
(
scope
=
"module"
)
...
...
@@ -16,10 +18,16 @@ def server():
"--max-model-len"
,
"2048"
,
"--enforce-eager"
,
"--engine-use-ray"
"--engine-use-ray"
,
"--chat-template"
,
str
(
chatml_jinja_path
),
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
# Allow `--engine-use-ray`; otherwise launching the server throws
# an error because it tries to use a deprecated feature
env_dict
=
{
"VLLM_ALLOW_ENGINE_USE_RAY"
:
"1"
}
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
,
env_dict
=
env_dict
)
as
remote_server
:
yield
remote_server
...
...
@@ -83,7 +91,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI):
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
13
,
total_tokens
=
23
)
completion_tokens
=
10
,
prompt_tokens
=
55
,
total_tokens
=
65
)
message
=
choice
.
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
10
...
...
tests/async_engine/test_request_tracker.py
View file @
af7f4372
...
...
@@ -10,23 +10,23 @@ async def test_request_tracker():
stream_1
=
tracker
.
add_request
(
"1"
)
assert
tracker
.
new_requests_event
.
is_set
()
await
tracker
.
wait_for_new_requests
()
new
,
finish
ed
=
tracker
.
get_new_and_
finish
ed_requests
()
new
,
abort
ed
=
tracker
.
get_new_and_
abort
ed_requests
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
len
(
new
)
==
1
assert
new
[
0
][
"request_id"
]
==
"1"
assert
not
finish
ed
assert
not
abort
ed
assert
not
stream_1
.
finished
stream_2
=
tracker
.
add_request
(
"2"
)
stream_3
=
tracker
.
add_request
(
"3"
)
assert
tracker
.
new_requests_event
.
is_set
()
await
tracker
.
wait_for_new_requests
()
new
,
finish
ed
=
tracker
.
get_new_and_
finish
ed_requests
()
new
,
abort
ed
=
tracker
.
get_new_and_
abort
ed_requests
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
len
(
new
)
==
2
assert
new
[
0
][
"request_id"
]
==
"2"
assert
new
[
1
][
"request_id"
]
==
"3"
assert
not
finish
ed
assert
not
abort
ed
assert
not
stream_2
.
finished
assert
not
stream_3
.
finished
...
...
@@ -36,9 +36,9 @@ async def test_request_tracker():
assert
not
tracker
.
new_requests_event
.
is_set
()
tracker
.
abort_request
(
"1"
)
new
,
finish
ed
=
tracker
.
get_new_and_
finish
ed_requests
()
assert
len
(
finish
ed
)
==
1
assert
"1"
in
finish
ed
new
,
abort
ed
=
tracker
.
get_new_and_
abort
ed_requests
()
assert
len
(
abort
ed
)
==
1
assert
"1"
in
abort
ed
assert
not
new
assert
stream_1
.
finished
...
...
@@ -46,9 +46,11 @@ async def test_request_tracker():
tracker
.
abort_request
(
"4"
)
assert
tracker
.
new_requests_event
.
is_set
()
await
tracker
.
wait_for_new_requests
()
new
,
finished
=
tracker
.
get_new_and_finished_requests
()
assert
len
(
finished
)
==
1
assert
"4"
in
finished
new
,
aborted
=
tracker
.
get_new_and_aborted_requests
()
# aborted new requests will cancel each other out -
# there's no need for them to propagate into the
# engine
assert
not
aborted
assert
not
new
assert
stream_4
.
finished
...
...
@@ -57,10 +59,9 @@ async def test_request_tracker():
tracker
.
process_request_output
(
RequestOutput
(
"2"
,
"output"
,
[],
[],
[],
finished
=
True
))
await
tracker
.
wait_for_new_requests
()
new
,
finish
ed
=
tracker
.
get_new_and_
finish
ed_requests
()
new
,
abort
ed
=
tracker
.
get_new_and_
abort
ed_requests
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
len
(
finished
)
==
1
assert
"2"
in
finished
assert
not
aborted
assert
len
(
new
)
==
1
assert
new
[
0
][
"request_id"
]
==
"5"
assert
stream_2
.
finished
...
...
tests/basic_correctness/test_chunked_prefill.py
View file @
af7f4372
...
...
@@ -6,14 +6,27 @@ prefill requests are chunked.
Run `pytest tests/basic_correctness/test_chunked_prefill.py`.
"""
import
pytest
from
..models.utils
import
check_outputs_equal
from
..models.utils
import
check_logprobs_close
,
check_outputs_equal
MODELS
=
[
"facebook/opt-125m"
,
"meta-llama/Llama-2-7b-hf"
,
]
E5M2_KV_MODELS
=
[
"facebook/opt-125m"
,
"meta-llama/Llama-2-7b-chat-hf"
,
]
E4M3_KV_MODELS
=
[
"meta-llama/Llama-2-7b-chat-hf"
,
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V"
,
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
]
KV_CACHE_QUANTIZATION_PATHS
=
{
"meta-llama/Llama-2-7b-chat-hf"
:
"./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
}
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
...
@@ -35,11 +48,11 @@ def test_models(
enforce_eager
:
bool
,
tensor_parallel_size
:
int
,
)
->
None
:
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
enable_chunked_prefill
=
False
max_num_batched_tokens
=
None
if
chunked_prefill_token_size
!=
-
1
:
enable_
chunked_prefill
=
Tru
e
"""
Checks exact match decode between huggingface model and vllm runner with
chunked prefill.
"""
max_num_seqs
=
chunked_prefill
_token_siz
e
max_num_batched_tokens
=
chunked_prefill_token_size
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
...
...
@@ -49,7 +62,7 @@ def test_models(
model
,
dtype
=
dtype
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
True
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
...
...
@@ -62,3 +75,78 @@ def test_models(
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@pytest.mark.parametrize(
    "kv_cache_dtype,model",
    [("fp8_e5m2", m) for m in E5M2_KV_MODELS] +
    [("fp8_e4m3", m) for m in E4M3_KV_MODELS])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
def test_models_with_fp8_kv_cache(
    vllm_runner,
    example_prompts,
    kv_cache_dtype: str,
    model: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    tensor_parallel_size: int,
) -> None:
    """
    Compare log probs of the vLLM runner with and without chunked prefill.

    Only closeness of log probs is checked (not exact token equality); this
    test is used when there is discrepancy in kernels / numerics (e.g. when
    using lower-precision types like FP8).
    """
    NUM_LOG_PROBS = 8

    if model == "facebook/opt-125m":
        pytest.skip(
            "#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m"
        )

    # Some models need an explicit KV-cache scales file.
    if model in KV_CACHE_QUANTIZATION_PATHS:
        extra_kwargs = {
            "quantization_param_path": KV_CACHE_QUANTIZATION_PATHS[model]
        }
    else:
        extra_kwargs = {}

    # Keyword arguments common to both runs; the chunk size doubles as the
    # sequence limit and (for the chunked run) the batched-token limit.
    shared_kwargs = dict(
        tensor_parallel_size=tensor_parallel_size,
        enforce_eager=enforce_eager,
        max_num_seqs=chunked_prefill_token_size,
        kv_cache_dtype=kv_cache_dtype,
        **extra_kwargs,
    )

    # Baseline: chunked prefill not requested.
    with vllm_runner(model, **shared_kwargs) as vllm_model:
        no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

    # Same configuration with chunked prefill enabled.
    with vllm_runner(
            model,
            max_num_batched_tokens=chunked_prefill_token_size,
            enable_chunked_prefill=True,
            **shared_kwargs,
    ) as vllm_model:
        chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

    check_logprobs_close(
        outputs_0_lst=no_chunked_prefill_outputs,
        outputs_1_lst=chunked_prefill_outputs,
        name_0="no_chunked_prefill",
        name_1="chunked_prefill",
    )
tests/basic_correctness/test_cpu_offload.py
View file @
af7f4372
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
..utils
import
compare_two_settings
def
test_cpu_offload
():
compare_two_settings
(
"meta-llama/Llama-2-7b-hf"
,
[],
[
"--cpu-offload-gb"
,
"4"
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"fp8 is not supported on this GPU type."
)
def
test_cpu_offload_fp8
():
# Test quantization of an unquantized checkpoint
compare_two_settings
(
"meta-llama/Meta-Llama-3-8B-Instruct"
,
[
"--quantization"
,
"fp8"
],
[
"--quantization"
,
"fp8"
,
"--cpu-offload-gb"
,
"2"
])
# Test loading a quantized checkpoint
compare_two_settings
(
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
,
[],
[
"--cpu-offload-gb"
,
"2"
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"awq"
),
reason
=
"awq is not supported on this GPU type."
)
def
test_cpu_offload_awq
():
compare_two_settings
(
"casperhansen/llama-3-8b-instruct-awq"
,
[],
[
"--cpu-offload-gb"
,
"2"
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
reason
=
"gptq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_compressed_tensors
():
# Test wNa16
compare_two_settings
(
"nm-testing/tinyllama-oneshot-w4a16-channel-v2"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
# Test w4a16_marlin24
compare_two_settings
(
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
# Test w8a8
compare_two_settings
(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
tests/basic_correctness/test_preemption.py
View file @
af7f4372
...
...
@@ -8,6 +8,7 @@ pytest tests/basic_correctness/test_preemption.py`.
import
pytest
from
prometheus_client
import
REGISTRY
import
vllm.envs
as
envs
from
vllm
import
SamplingParams
from
vllm.core.scheduler
import
(
ARTIFICIAL_PREEMPTION_MAX_CNT
,
ENABLE_ARTIFICIAL_PREEMPT
)
...
...
@@ -24,6 +25,13 @@ assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"tests/basic_correctness/test_preemption.py`"
)
@pytest.fixture
def worker_use_ray() -> bool:
    """Provide the ``worker_use_ray`` value handed to the runners in this module.

    Returns:
        The value of ``envs.VLLM_USE_RAY_SPMD_WORKER``.
    """
    # When SPMD worker is used, use worker_use_ray=True
    # to test delta input optimization works with preemption.
    return envs.VLLM_USE_RAY_SPMD_WORKER
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
...
...
@@ -36,6 +44,7 @@ def test_chunked_prefill_recompute(
dtype
:
str
,
max_tokens
:
int
,
chunked_prefill_token_size
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
"""Ensure that chunked prefill works with preemption."""
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
...
...
@@ -54,6 +63,7 @@ def test_chunked_prefill_recompute(
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_seqs
=
max_num_seqs
,
worker_use_ray
=
worker_use_ray
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
...
...
@@ -80,6 +90,7 @@ def test_preemption(
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
"""By default, recompute preemption is enabled"""
...
...
@@ -90,6 +101,7 @@ def test_preemption(
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
worker_use_ray
=
worker_use_ray
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
...
...
@@ -134,6 +146,7 @@ def test_swap(
dtype
:
str
,
max_tokens
:
int
,
beam_width
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
"""Use beam search enables swapping."""
example_prompts
=
example_prompts
[:
1
]
...
...
@@ -146,6 +159,7 @@ def test_swap(
dtype
=
dtype
,
swap_space
=
10
,
disable_log_stats
=
False
,
worker_use_ray
=
worker_use_ray
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_beam_search
(
example_prompts
,
beam_width
,
max_tokens
)
...
...
@@ -191,6 +205,7 @@ def test_swap_infeasible(
dtype
:
str
,
max_tokens
:
int
,
beam_width
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
"""Verify infeasible swap request will be ignored."""
BLOCK_SIZE
=
16
...
...
@@ -207,6 +222,7 @@ def test_swap_infeasible(
# decode blocks are not enough to finish.
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
,
max_model_len
=
(
prefill_blocks
+
decode_blocks
)
*
BLOCK_SIZE
,
worker_use_ray
=
worker_use_ray
,
)
as
vllm_model
:
sampling_params
=
SamplingParams
(
n
=
beam_width
,
use_beam_search
=
True
,
...
...
@@ -234,6 +250,7 @@ def test_preemption_infeasible(
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
"""Verify infeasible preemption request will be ignored."""
BLOCK_SIZE
=
16
...
...
@@ -248,6 +265,7 @@ def test_preemption_infeasible(
# ignored instead of hanging forever.
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
//
2
,
max_model_len
=
((
prefill_blocks
+
decode_blocks
//
2
)
*
BLOCK_SIZE
),
worker_use_ray
=
worker_use_ray
,
)
as
vllm_model
:
sampling_params
=
SamplingParams
(
max_tokens
=
max_tokens
,
ignore_eos
=
True
)
...
...
tests/compile/test_full_graph.py
0 → 100644
View file @
af7f4372
import
os
import
pytest
@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
def test_full_graph(model):
    """Check that ``model`` can be loaded and run in full graph mode.

    The test sets ``VLLM_TEST_DYNAMO_GRAPH_CAPTURE=1`` before importing
    vllm, builds an ``LLM`` for the parametrized model and runs a small
    greedy generation. No output values are asserted; the test passes if
    nothing raises.
    """
    # make sure these models can be captured in full graph mode
    env_key = "VLLM_TEST_DYNAMO_GRAPH_CAPTURE"
    previous_value = os.environ.get(env_key)
    os.environ[env_key] = "1"
    try:
        # Imported after the environment variable is set, as in the original.
        from vllm import LLM, SamplingParams

        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]
        sampling_params = SamplingParams(temperature=0)
        # Use the parametrized model rather than a hard-coded name, so that
        # adding entries to the parametrize list actually tests them.
        llm = LLM(model=model)
        llm.generate(prompts, sampling_params)
    finally:
        # Do not leak the flag into tests that run later in this process.
        if previous_value is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = previous_value
tests/conftest.py
View file @
af7f4372
import
contextlib
import
gc
import
json
import
os
import
sys
import
tempfile
from
collections
import
UserList
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
TypedDict
,
TypeVar
,
Union
from
enum
import
Enum
from
typing
import
(
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
TypedDict
,
TypeVar
,
Union
)
import
numpy
as
np
import
pytest
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
huggingface_hub
import
snapshot_download
from
PIL
import
Image
from
transformers
import
(
AutoModelForCausalLM
,
Auto
ModelForVision2Seq
,
AutoTokenizer
,
BatchEncoding
,
BatchFeature
)
from
transformers
import
(
AutoModelForCausalLM
,
Auto
Tokenizer
,
BatchEncoding
,
BatchFeature
)
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.config
import
TokenizerPoolConfig
from
vllm.connections
import
global_http_connection
from
vllm.distributed
import
(
destroy_distributed_environment
,
destroy_model_parallel
)
from
vllm.inputs
import
TextPrompt
destroy_model_parallel
,
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.inputs
import
(
ExplicitEncoderDecoderPrompt
,
TextPrompt
,
to_enc_dec_tuple_list
,
zip_enc_dec_prompts
)
from
vllm.logger
import
init_logger
from
vllm.outputs
import
RequestOutput
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
cuda_device_count_stateless
,
is_cpu
)
identity
,
is_cpu
)
logger
=
init_logger
(
__name__
)
...
...
@@ -82,6 +92,21 @@ def init_test_http_connection():
global_http_connection
.
reuse_client
=
False
@pytest.fixture
def dist_init():
    """Initialize a single-process distributed environment for one test.

    A temporary file is used as the ``file://`` rendezvous target for
    ``init_distributed_environment`` (world size 1, rank 0, ``nccl``
    backend), followed by ``initialize_model_parallel(1, 1)``. After the
    test, ``cleanup()`` is called and the temporary file is removed.
    """
    # mkstemp() returns an *open* descriptor plus the path. Only the path is
    # needed, so close the descriptor right away instead of leaking it.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)
    try:
        init_distributed_environment(
            world_size=1,
            rank=0,
            distributed_init_method=f"file://{temp_file}",
            local_rank=0,
            backend="nccl",
        )
        initialize_model_parallel(1, 1)
        yield
        cleanup()
    finally:
        # The rendezvous file may already have been removed by the backend.
        with contextlib.suppress(OSError):
            os.remove(temp_file)
def
cleanup
():
destroy_model_parallel
()
destroy_distributed_environment
()
...
...
@@ -120,6 +145,46 @@ def example_prompts() -> List[str]:
return
prompts
class DecoderPromptType(Enum):
    """Kind of decoder prompt paired with an encoder prompt.

    Only meaningful for encoder/decoder models.
    """
    # An explicit, caller-chosen decoder prompt string.
    CUSTOM = 1
    # No decoder prompt at all (``None``).
    NONE = 2
    # An empty-string decoder prompt (``""``).
    EMPTY_STR = 3
@pytest.fixture
def example_encoder_decoder_prompts(
) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]:
    '''
    Build example encoder/decoder prompt pairs for every decoder prompt type.

    The encoder prompts are read from each file listed in `_TEST_PROMPTS`.
    The same encoder prompts are then paired, index by index (via
    `zip_enc_dec_prompts`), with three different decoder prompt lists.

    Returns:

    * A dict keyed by `DecoderPromptType`:
        * `NONE`: every decoder prompt is `None`
        * `EMPTY_STR`: every decoder prompt is the empty string
        * `CUSTOM`: the decoder prompts are the encoder prompts in
          reverse order
    '''

    encoder_prompts = []
    for filename in _TEST_PROMPTS:
        encoder_prompts += _read_prompts(filename)

    num_prompts = len(encoder_prompts)
    custom_decoder_prompts = encoder_prompts[::-1]
    empty_str_decoder_prompts = [""] * num_prompts
    none_decoder_prompts = [None] * num_prompts

    # One entry per decoder prompt type, all sharing the same encoder prompts.
    return {
        DecoderPromptType.NONE:
        zip_enc_dec_prompts(encoder_prompts, none_decoder_prompts),
        DecoderPromptType.EMPTY_STR:
        zip_enc_dec_prompts(encoder_prompts, empty_str_decoder_prompts),
        DecoderPromptType.CUSTOM:
        zip_enc_dec_prompts(encoder_prompts, custom_decoder_prompts),
    }
@
pytest
.
fixture
def
example_long_prompts
()
->
List
[
str
]:
prompts
=
[]
...
...
@@ -151,7 +216,9 @@ class HfRunner:
*
,
model_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
is_embedding_model
:
bool
=
False
,
is_vision_model
:
bool
=
False
,
auto_cls
=
AutoModelForCausalLM
,
postprocess_inputs
:
Callable
[[
BatchEncoding
],
BatchEncoding
]
=
identity
,
)
->
None
:
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
...
...
@@ -166,11 +233,6 @@ class HfRunner:
device
=
"cpu"
,
).
to
(
dtype
=
torch_dtype
))
else
:
if
is_vision_model
:
auto_cls
=
AutoModelForVision2Seq
else
:
auto_cls
=
AutoModelForCausalLM
model_kwargs
=
model_kwargs
if
model_kwargs
is
not
None
else
{}
self
.
model
=
self
.
wrap_device
(
auto_cls
.
from_pretrained
(
...
...
@@ -195,12 +257,14 @@ class HfRunner:
torch_dtype
=
torch_dtype
,
trust_remote_code
=
True
,
)
except
Exception
:
except
Exception
as
exc
:
logger
.
warning
(
"Unable to auto-load
processor from HuggingFace for
"
"
model %s.
Using tokenizer instead."
,
model_name
)
"Unable to auto-load
HuggingFace processor for model (%s).
"
"Using tokenizer instead.
Reason: %s
"
,
model_name
,
exc
)
self
.
processor
=
self
.
tokenizer
self
.
postprocess_inputs
=
postprocess_inputs
def
generate
(
self
,
prompts
:
List
[
str
],
...
...
@@ -220,6 +284,7 @@ class HfRunner:
processor_kwargs
[
"images"
]
=
images
[
i
]
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
postprocess_inputs
(
inputs
)
output_ids
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
...
...
@@ -289,6 +354,7 @@ class HfRunner:
processor_kwargs
[
"images"
]
=
images
[
i
]
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
postprocess_inputs
(
inputs
)
output
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
...
...
@@ -314,12 +380,51 @@ class HfRunner:
all_logprobs
.
append
(
seq_logprobs
)
return
all_logprobs
def _hidden_states_to_logprobs(
    self,
    hidden_states,
    num_logprobs,
) -> Tuple[List[Dict[int, float]], int]:
    '''
    Convert per-step hidden states into top-k logprob dicts.

    For every entry of `hidden_states`, the last element is taken and then
    its first element (presumably last layer / first batch item -- TODO
    confirm against the HuggingFace `generate` output layout). That tensor
    is projected through the model's output embeddings (plus bias, when the
    embedding module has one) and normalized with a float32 log-softmax.

    Arguments:

    * hidden_states: sequence with one entry per generation step
    * num_logprobs: number of top logprobs to keep per step

    Returns:

    * List with one `{token id: logprob}` dict per step
    * Number of steps, i.e. `len(hidden_states)`
    '''
    # The output projection does not change between steps, so look it up
    # once instead of on every iteration.
    output_embeddings = self.model.get_output_embeddings()
    weight_t = output_embeddings.weight.t()
    bias = getattr(output_embeddings, "bias", None)

    seq_logprobs_lst: List[Dict[int, float]] = []
    for tok_idx, hidden_state in enumerate(hidden_states):
        last_hidden_states = hidden_state[-1][0]
        logits = torch.matmul(last_hidden_states, weight_t)
        if bias is not None:
            logits += bias.unsqueeze(0)
        logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)

        # drop prompt logprobs: the first step carries one row per prompt
        # position, only the last row belongs to the first generated token
        if tok_idx == 0:
            logprobs = logprobs[-1, :].reshape(1, -1)

        topk = logprobs.topk(num_logprobs)
        seq_logprobs_lst.append(
            dict(zip(topk.indices[0].tolist(), topk.values[0].tolist())))

    return seq_logprobs_lst, len(hidden_states)
def
generate_greedy_logprobs_limit
(
self
,
prompts
:
List
[
str
],
max_tokens
:
int
,
num_logprobs
:
int
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
audios
:
Optional
[
List
[
Tuple
[
np
.
ndarray
,
int
]]]
=
None
,
**
kwargs
:
Any
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
List
[
Dict
[
int
,
float
]]]]:
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
...
...
@@ -334,7 +439,13 @@ class HfRunner:
if
images
is
not
None
and
images
[
i
]
is
not
None
:
processor_kwargs
[
"images"
]
=
images
[
i
]
if
audios
is
not
None
:
audio
,
sr
=
audios
[
i
]
processor_kwargs
[
"audio"
]
=
audio
processor_kwargs
[
"sampling_rate"
]
=
sr
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
postprocess_inputs
(
inputs
)
output
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
...
...
@@ -346,37 +457,66 @@ class HfRunner:
**
kwargs
,
)
seq_logprobs
:
List
[
torch
.
Tensor
]
=
[]
for
_
,
hidden_states
in
enumerate
(
output
.
hidden_states
):
last_hidden_states
=
hidden_states
[
-
1
][
0
]
logits
=
torch
.
matmul
(
last_hidden_states
,
self
.
model
.
get_output_embeddings
().
weight
.
t
(),
)
if
getattr
(
self
.
model
.
get_output_embeddings
(),
"bias"
,
None
)
is
not
None
:
logits
+=
self
.
model
.
get_output_embeddings
(
).
bias
.
unsqueeze
(
0
)
logprobs
=
F
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
)
seq_logprobs
.
append
(
logprobs
)
(
seq_logprobs_lst
,
output_len
,
)
=
self
.
_hidden_states_to_logprobs
(
output
.
hidden_states
,
num_logprobs
)
# convert to dict
seq_logprobs_lst
:
List
[
Dict
[
int
,
float
]]
=
[]
for
tok_idx
,
tok_logprobs
in
enumerate
(
seq_logprobs
):
# drop prompt logprobs
if
tok_idx
==
0
:
tok_logprobs
=
tok_logprobs
[
-
1
,
:].
reshape
(
1
,
-
1
)
topk
=
tok_logprobs
.
topk
(
num_logprobs
)
all_logprobs
.
append
(
seq_logprobs_lst
)
seq_ids
=
output
.
sequences
[
0
]
output_len
=
len
(
seq_logprobs_lst
)
output_ids
=
seq_ids
[
-
output_len
:]
all_output_ids
.
append
(
output_ids
.
tolist
())
all_output_strs
.
append
(
self
.
tokenizer
.
decode
(
output_ids
))
tok_logprobs_dct
=
{}
for
token
_id
,
logprob
in
zip
(
topk
.
indices
[
0
],
topk
.
values
[
0
]):
tok_logprobs_dct
[
token_id
.
item
()]
=
logprob
.
item
()
outputs
=
zip
(
all_output_ids
,
all_output_strs
,
all_logprobs
)
return
[(
output
_id
s
,
output_str
,
output_logprobs
)
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
seq_logprobs_lst
.
append
(
tok_logprobs_dct
)
def
generate_encoder_decoder_greedy_logprobs_limit
(
self
,
encoder_decoder_prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
num_logprobs
:
int
,
**
kwargs
:
Any
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
List
[
Dict
[
int
,
float
]]]]:
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
all_output_ids
:
List
[
List
[
int
]]
=
[]
all_output_strs
:
List
[
str
]
=
[]
for
(
encoder_prompt
,
decoder_prompt
)
in
to_enc_dec_tuple_list
(
encoder_decoder_prompts
):
encoder_input_ids
=
self
.
wrap_device
(
self
.
tokenizer
(
encoder_prompt
,
return_tensors
=
"pt"
).
input_ids
)
decoder_input_ids
=
(
None
if
decoder_prompt
is
None
else
self
.
wrap_device
(
self
.
tokenizer
(
decoder_prompt
,
return_tensors
=
"pt"
).
input_ids
))
output
=
self
.
model
.
generate
(
encoder_input_ids
,
decoder_input_ids
=
decoder_input_ids
,
use_cache
=
True
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
output_hidden_states
=
True
,
return_dict_in_generate
=
True
,
**
kwargs
,
)
(
seq_logprobs_lst
,
output_len
,
)
=
self
.
_hidden_states_to_logprobs
(
output
.
decoder_hidden_states
,
num_logprobs
)
all_logprobs
.
append
(
seq_logprobs_lst
)
seq_ids
=
output
.
sequences
[
0
]
output_len
=
len
(
seq_logprobs_lst
)
output_ids
=
seq_ids
[
-
output_len
:]
all_output_ids
.
append
(
output_ids
.
tolist
())
all_output_strs
.
append
(
self
.
tokenizer
.
decode
(
output_ids
))
...
...
@@ -416,7 +556,7 @@ class VllmRunner:
block_size
:
int
=
16
,
enable_chunked_prefill
:
bool
=
False
,
swap_space
:
int
=
4
,
enforce_eager
:
bool
=
False
,
enforce_eager
:
Optional
[
bool
]
=
False
,
**
kwargs
,
)
->
None
:
self
.
model
=
LLM
(
...
...
@@ -438,7 +578,8 @@ class VllmRunner:
self
,
prompts
:
List
[
str
],
sampling_params
:
SamplingParams
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
images
:
Optional
[
Union
[
List
[
Image
.
Image
],
List
[
List
[
Image
.
Image
]]]]
=
None
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
if
images
is
not
None
:
assert
len
(
prompts
)
==
len
(
images
)
...
...
@@ -465,11 +606,27 @@ class VllmRunner:
outputs
.
append
((
req_sample_output_ids
,
req_sample_output_strs
))
return
outputs
def _final_steps_generate_w_logprobs(
    self,
    req_outputs: List[RequestOutput],
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
    '''
    Flatten request outputs into per-sample result tuples.

    Every sample found in the `outputs` of every request contributes one
    `(token ids as a list, text, logprobs)` tuple, in iteration order.
    '''
    return [(list(completion.token_ids), completion.text,
             completion.logprobs) for request in req_outputs
            for completion in request.outputs]
def
generate_w_logprobs
(
self
,
prompts
:
List
[
str
],
sampling_params
:
SamplingParams
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
images
:
Optional
[
Union
[
List
[
Image
.
Image
],
List
[
List
[
Image
.
Image
]]]]
=
None
,
audios
:
Optional
[
Union
[
List
[
Tuple
[
np
.
ndarray
,
int
]],
List
[
List
[
Tuple
[
np
.
ndarray
,
int
]]]]]
=
None
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
assert
sampling_params
.
logprobs
is
not
None
...
...
@@ -481,16 +638,27 @@ class VllmRunner:
for
i
,
image
in
enumerate
(
images
):
inputs
[
i
][
"multi_modal_data"
]
=
{
"image"
:
image
}
if
audios
is
not
None
:
for
i
,
audio
in
enumerate
(
audios
):
inputs
[
i
][
"multi_modal_data"
]
=
{
"audio"
:
audio
}
req_outputs
=
self
.
model
.
generate
(
inputs
,
sampling_params
=
sampling_params
)
outputs
:
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
=
[]
for
req_output
in
req_outputs
:
for
sample
in
req_output
.
outputs
:
output_str
=
sample
.
text
output_ids
=
sample
.
token_ids
output_logprobs
=
sample
.
logprobs
outputs
.
append
((
output_ids
,
output_str
,
output_logprobs
))
return
outputs
return
self
.
_final_steps_generate_w_logprobs
(
req_outputs
)
def generate_encoder_decoder_w_logprobs(
    self,
    encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
    sampling_params: SamplingParams,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
    '''
    Generate from explicit encoder/decoder prompts and return, per sample,
    the output produced by `_final_steps_generate_w_logprobs`.

    `sampling_params.logprobs` must be set (asserted).
    '''
    assert sampling_params.logprobs is not None

    return self._final_steps_generate_w_logprobs(
        self.model.generate(encoder_decoder_prompts,
                            sampling_params=sampling_params))
def
generate_greedy
(
self
,
...
...
@@ -510,6 +678,8 @@ class VllmRunner:
num_logprobs
:
int
,
images
:
Optional
[
Union
[
List
[
Image
.
Image
],
List
[
List
[
Image
.
Image
]]]]
=
None
,
audios
:
Optional
[
Union
[
List
[
Tuple
[
np
.
ndarray
,
int
]],
List
[
List
[
Tuple
[
np
.
ndarray
,
int
]]]]]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
...
...
@@ -518,7 +688,28 @@ class VllmRunner:
stop_token_ids
=
stop_token_ids
)
outputs
=
self
.
generate_w_logprobs
(
prompts
,
greedy_logprobs_params
,
images
=
images
)
images
=
images
,
audios
=
audios
)
return
[(
output_ids
,
output_str
,
output_logprobs
)
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
def generate_encoder_decoder_greedy_logprobs(
    self,
    encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
    max_tokens: int,
    num_logprobs: int,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
    '''
    Greedy logprobs generation for vLLM encoder/decoder models

    Builds greedy sampling parameters (temperature 0.0, no beam search)
    limited to `max_tokens` new tokens with `num_logprobs` logprobs, and
    delegates to `generate_encoder_decoder_w_logprobs`.

    Returns:

    * One `(output ids, output text, output logprobs)` tuple per sample
    '''
    # NOTE: the docstring has to be the first statement of the function;
    # placed after the assignment below it would be a no-op expression.
    greedy_logprobs_params = SamplingParams(temperature=0.0,
                                            use_beam_search=False,
                                            max_tokens=max_tokens,
                                            logprobs=num_logprobs)

    outputs = self.generate_encoder_decoder_w_logprobs(
        encoder_decoder_prompts, greedy_logprobs_params)

    return [(output_ids, output_str, output_logprobs)
            for output_ids, output_str, output_logprobs in outputs]
...
...
@@ -593,3 +784,26 @@ def num_gpus_available():
in current process."""
return
cuda_device_count_stateless
()
# Location of the locally cached dummy OPT checkpoint used by
# ``dummy_opt_path``.
temp_dir = tempfile.gettempdir()
_dummy_path = os.path.join(temp_dir, "dummy_opt")


@pytest.fixture
def dummy_opt_path():
    """Return the path of a local ``facebook/opt-125m`` snapshot whose
    ``config.json`` declares the ``MyOPTForCausalLM`` architecture.

    The snapshot is downloaded on first use, skipping files that match the
    ``ignore_patterns`` below. The ``architectures`` entry of its
    ``config.json`` is then rewritten if it does not already hold the
    expected value.
    """
    json_path = os.path.join(_dummy_path, "config.json")
    # Key the cache check on config.json rather than on the directory: a
    # directory left behind by an interrupted download must not be mistaken
    # for a complete snapshot.
    if not os.path.exists(json_path):
        snapshot_download(repo_id="facebook/opt-125m",
                          local_dir=_dummy_path,
                          ignore_patterns=[
                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
                              "*.msgpack"
                          ])
    assert os.path.exists(json_path)

    with open(json_path, "r") as f:
        config = json.load(f)
    # Idempotent patch: only rewrite the file when the architecture entry
    # is not already the expected one.
    if config.get("architectures") != ["MyOPTForCausalLM"]:
        config["architectures"] = ["MyOPTForCausalLM"]
        with open(json_path, "w") as f:
            json.dump(config, f)
    return _dummy_path
tests/core/block/e2e/test_correctness.py
View file @
af7f4372
...
...
@@ -261,11 +261,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
"enable_chunked_prefill"
:
True
,
"max_num_batched_tokens"
:
2
,
"max_num_seqs"
:
2
,
},
])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{
"block_size"
:
8
,
"max_num_batched_tokens"
:
2
,
"max_num_seqs"
:
2
,
},
{
"block_size"
:
8
,
"max_num_batched_tokens"
:
3
,
"max_num_seqs"
:
2
,
},
{
"block_size"
:
8
,
"max_num_batched_tokens"
:
256
,
"max_num_seqs"
:
10
,
}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[
{
"use_v2_block_manager"
:
False
,
...
...
@@ -294,6 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
(
"1 + "
*
50
)
+
" 1 = "
,
# Longer prompt.
"The capital of France is"
,
"The future of AI is"
,
]
...
...
Prev
1
…
3
4
5
6
7
8
9
10
11
…
24
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment