Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
96ae75ad
Commit
96ae75ad
authored
Jan 04, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev
parents
f9f4a735
2339d59f
Changes
374
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
420 additions
and
276 deletions
+420
-276
tests/weight_loading/run_model_weight_loading_test.sh
tests/weight_loading/run_model_weight_loading_test.sh
+4
-0
tests/weight_loading/test_weight_loading.py
tests/weight_loading/test_weight_loading.py
+7
-0
vllm/_custom_ops.py
vllm/_custom_ops.py
+109
-53
vllm/adapter_commons/models.py
vllm/adapter_commons/models.py
+4
-5
vllm/assets/audio.py
vllm/assets/audio.py
+12
-7
vllm/assets/base.py
vllm/assets/base.py
+3
-4
vllm/assets/image.py
vllm/assets/image.py
+1
-2
vllm/assets/video.py
vllm/assets/video.py
+3
-6
vllm/attention/backends/rocm_flash_attn.py
vllm/attention/backends/rocm_flash_attn.py
+1
-1
vllm/attention/layer.py
vllm/attention/layer.py
+1
-0
vllm/benchmarks/benchmark_throughput.py
vllm/benchmarks/benchmark_throughput.py
+91
-42
vllm/block.py
vllm/block.py
+0
-88
vllm/compilation/backends.py
vllm/compilation/backends.py
+5
-4
vllm/compilation/multi_output_match.py
vllm/compilation/multi_output_match.py
+2
-1
vllm/compilation/pass_manager.py
vllm/compilation/pass_manager.py
+2
-2
vllm/config.py
vllm/config.py
+120
-15
vllm/core/evictor.py
vllm/core/evictor.py
+2
-2
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+41
-24
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+7
-0
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+5
-20
No files found.
tests/weight_loading/run_model_weight_loading_test.sh
View file @
96ae75ad
...
@@ -26,6 +26,10 @@ do
...
@@ -26,6 +26,10 @@ do
export
QUANTIZATION
=
${
array
[0]
}
export
QUANTIZATION
=
${
array
[0]
}
export
MODEL_NAME
=
${
array
[1]
}
export
MODEL_NAME
=
${
array
[1]
}
export
REVISION
=
${
array
[2]
}
export
REVISION
=
${
array
[2]
}
# If array length is larger than 3, then MIN_CAPABILITY is provided
if
[
${#
array
[@]
}
-gt
3
]
;
then
export
MIN_CAPABILITY
=
${
array
[3]
}
fi
pytest
-s
weight_loading/test_weight_loading.py
||
LOCAL_SUCCESS
=
$?
pytest
-s
weight_loading/test_weight_loading.py
||
LOCAL_SUCCESS
=
$?
if
[[
$LOCAL_SUCCESS
==
0
]]
;
then
if
[[
$LOCAL_SUCCESS
==
0
]]
;
then
...
...
tests/weight_loading/test_weight_loading.py
View file @
96ae75ad
import
os
import
os
import
pytest
import
torch
import
torch
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.platforms
import
current_platform
MAX_MODEL_LEN
=
1024
MAX_MODEL_LEN
=
1024
MODEL_NAME
=
os
.
environ
.
get
(
"MODEL_NAME"
,
MODEL_NAME
=
os
.
environ
.
get
(
"MODEL_NAME"
,
os
.
path
.
join
(
models_path_prefix
,
"robertgshaw2/zephyr-7b-beta-channelwise-gptq"
))
os
.
path
.
join
(
models_path_prefix
,
"robertgshaw2/zephyr-7b-beta-channelwise-gptq"
))
REVISION
=
os
.
environ
.
get
(
"REVISION"
,
"main"
)
REVISION
=
os
.
environ
.
get
(
"REVISION"
,
"main"
)
QUANTIZATION
=
os
.
environ
.
get
(
"QUANTIZATION"
,
"gptq_marlin"
)
QUANTIZATION
=
os
.
environ
.
get
(
"QUANTIZATION"
,
"gptq_marlin"
)
MIN_CAPABILITY
=
os
.
environ
.
get
(
"MIN_CAPABILITY"
,
"89"
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
int
(
MIN_CAPABILITY
)),
reason
=
"Current system does not have minimum capability."
)
def
test_weight_loading
(
vllm_runner
):
def
test_weight_loading
(
vllm_runner
):
"""
"""
Test parameter weight loading with tp>1.
Test parameter weight loading with tp>1.
...
...
vllm/_custom_ops.py
View file @
96ae75ad
import
contextlib
import
contextlib
import
functools
import
importlib
import
importlib
from
typing
import
TYPE_CHECKING
,
List
,
Optional
,
Tuple
,
Union
,
Type
from
typing
import
TYPE_CHECKING
,
List
,
Optional
,
Tuple
,
Union
,
Type
...
@@ -44,34 +42,6 @@ else:
...
@@ -44,34 +42,6 @@ else:
from
torch.library
import
impl_abstract
as
register_fake
from
torch.library
import
impl_abstract
as
register_fake
def
hint_on_error
(
fn
):
@
functools
.
wraps
(
fn
)
def
wrapper
(
*
args
,
**
kwargs
):
try
:
return
fn
(
*
args
,
**
kwargs
)
except
NotImplementedError
as
e
:
msg
=
(
"Error in calling custom op %s: %s
\n
"
"Not implemented or built, mostly likely because the current current device "
"does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set "
"incorrectly while building)"
)
logger
.
error
(
msg
,
fn
.
__name__
,
e
)
raise
NotImplementedError
(
msg
%
(
fn
.
__name__
,
e
))
from
e
except
AttributeError
as
e
:
msg
=
(
"Error in calling custom op %s: %s
\n
"
"Possibly you have built or installed an obsolete version of vllm.
\n
"
"Please try a clean build and install of vllm,"
"or remove old built files such as vllm/*cpython*.so and build/ ."
)
logger
.
error
(
msg
,
fn
.
__name__
,
e
)
raise
e
return
wrapper
# activation ops
# activation ops
def
silu_and_mul
(
out
:
torch
.
Tensor
,
x
:
torch
.
Tensor
)
->
None
:
def
silu_and_mul
(
out
:
torch
.
Tensor
,
x
:
torch
.
Tensor
)
->
None
:
torch
.
ops
.
_C
.
silu_and_mul
(
out
,
x
)
torch
.
ops
.
_C
.
silu_and_mul
(
out
,
x
)
...
@@ -984,6 +954,114 @@ def cutlass_scaled_mm_azp(a: torch.Tensor,
...
@@ -984,6 +954,114 @@ def cutlass_scaled_mm_azp(a: torch.Tensor,
return
out
return
out
def
cutlass_sparse_scaled_mm_supported
(
cuda_device_capability
:
int
)
->
bool
:
return
torch
.
ops
.
_C
.
cutlass_sparse_scaled_mm_supported
(
cuda_device_capability
)
def
cutlass_sparse_compress
(
a
:
torch
.
Tensor
)
\
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""
Compresses a sparse matrix for use with Cutlass sparse operations.
This function takes a dense tensor and compresses it into two components:
non-zero elements and metadata. The compressed representation is compatible
with Cutlass sparse kernels.
Args:
a (torch.Tensor):
The input tensor to be compressed. Must have one of the following data types:
- `torch.int8`
- `torch.float8_e4m3fn`
- `torch.bfloat16`
- `torch.float16`
Returns:
Tuple[torch.Tensor, torch.Tensor]:
A tuple containing:
- `a_nzs` (torch.Tensor): A tensor containing non-zero elements of `a`.
- `a_meta` (torch.Tensor): A tensor containing metadata for the sparse representation.
Raises:
ValueError: If the compression operation fails.
Notes:
- The `a_meta` tensor has a data type of `torch.uint8`.
- Each metadata element encodes the sparsity of 4 non-zero elements (i.e., `elemsPerMetaElem = 4`).
- The shape of `a_nzs` is `(m, k // 2)`, where `m` and `k` are the dimensions of the input tensor.
- The shape of `a_meta` is `(m, k // 2 // elemsPerMetaElem)`.
"""
assert
(
a
.
dtype
in
[
torch
.
int8
,
torch
.
float8_e4m3fn
,
torch
.
bfloat16
,
torch
.
float16
])
assert
(
a
.
is_contiguous
())
# a_meta.dtype: torch.uint8 so elemsPerMetaElem = 8b / 2b_per_nz = 4
elemsPerMetaElem
=
4
m
=
a
.
shape
[
0
]
k
=
a
.
shape
[
1
]
assert
(
k
%
2
==
0
)
a_nzs
=
torch
.
empty
((
m
,
k
//
2
),
dtype
=
a
.
dtype
,
device
=
a
.
device
)
a_meta
=
torch
.
empty
((
m
,
k
//
2
//
elemsPerMetaElem
),
dtype
=
torch
.
uint8
,
device
=
a
.
device
)
if
not
(
torch
.
ops
.
_C
.
cutlass_sparse_compress_entry
(
a_nzs
,
a_meta
,
a
)):
raise
ValueError
assert
(
a_nzs
.
is_contiguous
())
assert
(
a_meta
.
is_contiguous
())
return
a_nzs
,
a_meta
def
cutlass_scaled_sparse_mm
(
a
:
torch
.
Tensor
,
bt_nzs
:
torch
.
Tensor
,
bt_meta
:
torch
.
Tensor
,
scale_a
:
torch
.
Tensor
,
scale_b
:
torch
.
Tensor
,
out_dtype
:
torch
.
dtype
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
"""
Performs a scaled sparse matrix multiplication using Cutlass.
Steps:
1. Create a dense matrix `a` of shape (m, k) on the CUDA device:
`a = torch.randn((m, k), device='cuda')`.
2. Create a dense matrix `b` of shape (k, n) on the CUDA device:
`b = torch.randn((k, n), device='cuda')`.
3. Prune matrix `b` to 2:4 sparsity along the specified dimension:
`b = prune_to_2_4(b, dim=0)`.
4. Compress the transposed sparse matrix `b.t()`:
`bt_nzs, bt_meta = cutlass_sparse_compress(b.t())`.
5. Perform sparse matrix multiplication using the compressed matrix,
applying scaling factors for `a` and `b`, and the output data type:
`out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b, out_dtype)`.
Returns:
- The result of the scaled sparse matrix multiplication.
"""
assert
(
bt_nzs
.
shape
[
0
]
%
16
==
0
and
bt_nzs
.
shape
[
1
]
%
16
==
0
)
assert
(
out_dtype
is
torch
.
bfloat16
or
out_dtype
is
torch
.
float16
)
assert
bias
is
None
or
bias
.
shape
[
0
]
==
bt_nzs
.
shape
[
0
]
\
and
bias
.
dtype
==
out_dtype
m
=
a
.
shape
[
0
]
n
=
bt_nzs
.
shape
[
0
]
out
=
torch
.
empty
((
m
,
n
),
dtype
=
out_dtype
,
device
=
a
.
device
)
torch
.
ops
.
_C
.
cutlass_scaled_sparse_mm
(
out
,
a
,
bt_nzs
,
bt_meta
,
scale_a
,
scale_b
,
bias
)
return
out
# aqlm
# aqlm
def
aqlm_gemm
(
input
:
torch
.
Tensor
,
codes
:
torch
.
Tensor
,
def
aqlm_gemm
(
input
:
torch
.
Tensor
,
codes
:
torch
.
Tensor
,
codebooks
:
torch
.
Tensor
,
scales
:
torch
.
Tensor
,
codebooks
:
torch
.
Tensor
,
scales
:
torch
.
Tensor
,
...
@@ -1426,6 +1504,7 @@ def register_graph_buffers(fa: int, handles: List[List[int]],
...
@@ -1426,6 +1504,7 @@ def register_graph_buffers(fa: int, handles: List[List[int]],
offsets
:
List
[
List
[
int
]])
->
None
:
offsets
:
List
[
List
[
int
]])
->
None
:
torch
.
ops
.
_C_custom_ar
.
register_graph_buffers
(
fa
,
handles
,
offsets
)
torch
.
ops
.
_C_custom_ar
.
register_graph_buffers
(
fa
,
handles
,
offsets
)
def
read_cache
(
def
read_cache
(
keys
:
torch
.
Tensor
,
keys
:
torch
.
Tensor
,
values
:
torch
.
Tensor
,
values
:
torch
.
Tensor
,
...
@@ -1449,26 +1528,3 @@ def write_cache_multi_layers(
...
@@ -1449,26 +1528,3 @@ def write_cache_multi_layers(
torch
.
ops
.
_C_cache_ops
.
write_cache_multi_layers
(
keys
,
values
,
key_caches
,
torch
.
ops
.
_C_cache_ops
.
write_cache_multi_layers
(
keys
,
values
,
key_caches
,
value_caches
,
slot_mapping
,
value_caches
,
slot_mapping
,
kv_cache_dtype
)
kv_cache_dtype
)
# temporary fix for https://github.com/vllm-project/vllm/issues/5456
# TODO: remove this in v0.6.0
names_and_values
=
globals
()
names_and_values_to_update
=
{}
# prepare variables to avoid dict size change during iteration
k
,
v
,
arg
=
None
,
None
,
None
fn_type
=
type
(
lambda
x
:
x
)
for
k
,
v
in
names_and_values
.
items
():
# find functions that are defined in this file and have torch.Tensor
# in their annotations. `arg == "torch.Tensor"` is used to handle
# the case when users use `import __annotations__` to turn type
# hints into strings.
if
isinstance
(
v
,
fn_type
)
\
and
v
.
__code__
.
co_filename
==
__file__
\
and
any
(
arg
is
torch
.
Tensor
or
arg
==
"torch.Tensor"
for
arg
in
v
.
__annotations__
.
values
()):
names_and_values_to_update
[
k
]
=
hint_on_error
(
v
)
names_and_values
.
update
(
names_and_values_to_update
)
del
names_and_values_to_update
,
names_and_values
,
v
,
k
,
fn_type
vllm/adapter_commons/models.py
View file @
96ae75ad
from
abc
import
ABC
,
abstractmethod
from
abc
import
ABC
,
abstractmethod
from
typing
import
Any
,
Callable
,
Dict
,
Hashable
,
Optional
,
TypeVar
from
typing
import
Any
,
Callable
,
Dict
,
Optional
,
TypeVar
from
torch
import
nn
from
torch
import
nn
...
@@ -24,14 +24,13 @@ class AdapterModel(ABC):
...
@@ -24,14 +24,13 @@ class AdapterModel(ABC):
T
=
TypeVar
(
'T'
)
T
=
TypeVar
(
'T'
)
class
AdapterLRUCache
(
LRUCache
[
T
]):
class
AdapterLRUCache
(
LRUCache
[
int
,
T
]):
def
__init__
(
self
,
capacity
:
int
,
deactivate_fn
:
Callable
[[
Hashable
],
def
__init__
(
self
,
capacity
:
int
,
deactivate_fn
:
Callable
[[
int
],
object
]):
None
]):
super
().
__init__
(
capacity
)
super
().
__init__
(
capacity
)
self
.
deactivate_fn
=
deactivate_fn
self
.
deactivate_fn
=
deactivate_fn
def
_on_remove
(
self
,
key
:
Hashable
,
value
:
Optional
[
T
]):
def
_on_remove
(
self
,
key
:
int
,
value
:
Optional
[
T
]):
logger
.
debug
(
"Removing adapter int id: %d"
,
key
)
logger
.
debug
(
"Removing adapter int id: %d"
,
key
)
self
.
deactivate_fn
(
key
)
self
.
deactivate_fn
(
key
)
return
super
().
_on_remove
(
key
,
value
)
return
super
().
_on_remove
(
key
,
value
)
...
...
vllm/assets/audio.py
View file @
96ae75ad
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Literal
,
Tuple
from
typing
import
Literal
from
urllib.parse
import
urljoin
from
urllib.parse
import
urljoin
import
librosa
import
numpy.typing
as
npt
import
numpy
as
np
from
vllm.assets.base
import
get_vllm_public_assets
,
vLLM_S3_BUCKET_URL
from
vllm.utils
import
PlaceholderModule
from
.base
import
VLLM_S3_BUCKET_URL
,
get_vllm_public_assets
try
:
import
librosa
except
ImportError
:
librosa
=
PlaceholderModule
(
"librosa"
)
# type: ignore[assignment]
ASSET_DIR
=
"multimodal_asset"
ASSET_DIR
=
"multimodal_asset"
...
@@ -15,8 +21,7 @@ class AudioAsset:
...
@@ -15,8 +21,7 @@ class AudioAsset:
name
:
Literal
[
"winning_call"
,
"mary_had_lamb"
]
name
:
Literal
[
"winning_call"
,
"mary_had_lamb"
]
@
property
@
property
def
audio_and_sample_rate
(
self
)
->
Tuple
[
np
.
ndarray
,
int
]:
def
audio_and_sample_rate
(
self
)
->
tuple
[
npt
.
NDArray
,
int
]:
audio_path
=
get_vllm_public_assets
(
filename
=
f
"
{
self
.
name
}
.ogg"
,
audio_path
=
get_vllm_public_assets
(
filename
=
f
"
{
self
.
name
}
.ogg"
,
s3_prefix
=
ASSET_DIR
)
s3_prefix
=
ASSET_DIR
)
y
,
sr
=
librosa
.
load
(
audio_path
,
sr
=
None
)
y
,
sr
=
librosa
.
load
(
audio_path
,
sr
=
None
)
...
@@ -25,4 +30,4 @@ class AudioAsset:
...
@@ -25,4 +30,4 @@ class AudioAsset:
@
property
@
property
def
url
(
self
)
->
str
:
def
url
(
self
)
->
str
:
return
urljoin
(
v
LLM_S3_BUCKET_URL
,
f
"
{
ASSET_DIR
}
/
{
self
.
name
}
.ogg"
)
return
urljoin
(
V
LLM_S3_BUCKET_URL
,
f
"
{
ASSET_DIR
}
/
{
self
.
name
}
.ogg"
)
vllm/assets/base.py
View file @
96ae75ad
...
@@ -4,9 +4,8 @@ from typing import Optional
...
@@ -4,9 +4,8 @@ from typing import Optional
import
vllm.envs
as
envs
import
vllm.envs
as
envs
from
vllm.connections
import
global_http_connection
from
vllm.connections
import
global_http_connection
from
vllm.envs
import
VLLM_IMAGE_FETCH_TIMEOUT
v
LLM_S3_BUCKET_URL
=
"https://vllm-public-assets.s3.us-west-2.amazonaws.com"
V
LLM_S3_BUCKET_URL
=
"https://vllm-public-assets.s3.us-west-2.amazonaws.com"
def
get_cache_dir
()
->
Path
:
def
get_cache_dir
()
->
Path
:
...
@@ -32,8 +31,8 @@ def get_vllm_public_assets(filename: str,
...
@@ -32,8 +31,8 @@ def get_vllm_public_assets(filename: str,
if
s3_prefix
is
not
None
:
if
s3_prefix
is
not
None
:
filename
=
s3_prefix
+
"/"
+
filename
filename
=
s3_prefix
+
"/"
+
filename
global_http_connection
.
download_file
(
global_http_connection
.
download_file
(
f
"
{
v
LLM_S3_BUCKET_URL
}
/
{
filename
}
"
,
f
"
{
V
LLM_S3_BUCKET_URL
}
/
{
filename
}
"
,
asset_path
,
asset_path
,
timeout
=
VLLM_IMAGE_FETCH_TIMEOUT
)
timeout
=
envs
.
VLLM_IMAGE_FETCH_TIMEOUT
)
return
asset_path
return
asset_path
vllm/assets/image.py
View file @
96ae75ad
...
@@ -4,7 +4,7 @@ from typing import Literal
...
@@ -4,7 +4,7 @@ from typing import Literal
import
torch
import
torch
from
PIL
import
Image
from
PIL
import
Image
from
vllm.assets
.base
import
get_vllm_public_assets
from
.base
import
get_vllm_public_assets
VLM_IMAGES_DIR
=
"vision_model_images"
VLM_IMAGES_DIR
=
"vision_model_images"
...
@@ -15,7 +15,6 @@ class ImageAsset:
...
@@ -15,7 +15,6 @@ class ImageAsset:
@
property
@
property
def
pil_image
(
self
)
->
Image
.
Image
:
def
pil_image
(
self
)
->
Image
.
Image
:
image_path
=
get_vllm_public_assets
(
filename
=
f
"
{
self
.
name
}
.jpg"
,
image_path
=
get_vllm_public_assets
(
filename
=
f
"
{
self
.
name
}
.jpg"
,
s3_prefix
=
VLM_IMAGES_DIR
)
s3_prefix
=
VLM_IMAGES_DIR
)
return
Image
.
open
(
image_path
)
return
Image
.
open
(
image_path
)
...
...
vllm/assets/video.py
View file @
96ae75ad
...
@@ -2,13 +2,13 @@ from dataclasses import dataclass
...
@@ -2,13 +2,13 @@ from dataclasses import dataclass
from
functools
import
lru_cache
from
functools
import
lru_cache
from
typing
import
List
,
Literal
from
typing
import
List
,
Literal
import
cv2
import
numpy
as
np
import
numpy
as
np
import
numpy.typing
as
npt
import
numpy.typing
as
npt
from
huggingface_hub
import
hf_hub_download
from
huggingface_hub
import
hf_hub_download
from
PIL
import
Image
from
PIL
import
Image
from
vllm.multimodal.utils
import
(
sample_frames_from_video
,
from
vllm.multimodal.video
import
sample_frames_from_video
try_import_video_packages
)
from
.base
import
get_cache_dir
from
.base
import
get_cache_dir
...
@@ -19,7 +19,7 @@ def download_video_asset(filename: str) -> str:
...
@@ -19,7 +19,7 @@ def download_video_asset(filename: str) -> str:
Download and open an image from huggingface
Download and open an image from huggingface
repo: raushan-testing-hf/videos-test
repo: raushan-testing-hf/videos-test
"""
"""
video_directory
=
get_cache_dir
()
/
"video-eample-data"
video_directory
=
get_cache_dir
()
/
"video-e
x
ample-data"
video_directory
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
video_directory
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
video_path
=
video_directory
/
filename
video_path
=
video_directory
/
filename
...
@@ -35,8 +35,6 @@ def download_video_asset(filename: str) -> str:
...
@@ -35,8 +35,6 @@ def download_video_asset(filename: str) -> str:
def
video_to_ndarrays
(
path
:
str
,
num_frames
:
int
=
-
1
)
->
npt
.
NDArray
:
def
video_to_ndarrays
(
path
:
str
,
num_frames
:
int
=
-
1
)
->
npt
.
NDArray
:
cv2
,
_
=
try_import_video_packages
()
cap
=
cv2
.
VideoCapture
(
path
)
cap
=
cv2
.
VideoCapture
(
path
)
if
not
cap
.
isOpened
():
if
not
cap
.
isOpened
():
raise
ValueError
(
f
"Could not open video file
{
path
}
"
)
raise
ValueError
(
f
"Could not open video file
{
path
}
"
)
...
@@ -59,7 +57,6 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
...
@@ -59,7 +57,6 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
def
video_to_pil_images_list
(
path
:
str
,
def
video_to_pil_images_list
(
path
:
str
,
num_frames
:
int
=
-
1
)
->
List
[
Image
.
Image
]:
num_frames
:
int
=
-
1
)
->
List
[
Image
.
Image
]:
cv2
,
_
=
try_import_video_packages
()
frames
=
video_to_ndarrays
(
path
,
num_frames
)
frames
=
video_to_ndarrays
(
path
,
num_frames
)
return
[
return
[
Image
.
fromarray
(
cv2
.
cvtColor
(
frame
,
cv2
.
COLOR_BGR2RGB
))
Image
.
fromarray
(
cv2
.
cvtColor
(
frame
,
cv2
.
COLOR_BGR2RGB
))
...
...
vllm/attention/backends/rocm_flash_attn.py
View file @
96ae75ad
...
@@ -447,7 +447,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
...
@@ -447,7 +447,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
Returns:
Returns:
shape = [num_tokens, num_heads * head_size]
shape = [num_tokens, num_heads * head_size]
"""
"""
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
# If the feature combo become valid
if
attn_type
!=
AttentionType
.
DECODER
:
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
vllm/attention/layer.py
View file @
96ae75ad
...
@@ -191,6 +191,7 @@ class MultiHeadAttention(nn.Module):
...
@@ -191,6 +191,7 @@ class MultiHeadAttention(nn.Module):
kv_cache_dtype
=
None
,
kv_cache_dtype
=
None
,
block_size
=
16
,
block_size
=
16
,
is_attention_free
=
False
)
is_attention_free
=
False
)
attn_backend
=
backend_name_to_enum
(
attn_backend
.
get_name
())
if
attn_backend
in
{
_Backend
.
FLASH_ATTN
,
_Backend
.
FLASH_ATTN_VLLM_V1
}:
if
attn_backend
in
{
_Backend
.
FLASH_ATTN
,
_Backend
.
FLASH_ATTN_VLLM_V1
}:
attn_backend
=
_Backend
.
XFORMERS
attn_backend
=
_Backend
.
XFORMERS
...
...
vllm/benchmarks/benchmark_throughput.py
View file @
96ae75ad
...
@@ -4,7 +4,8 @@ import dataclasses
...
@@ -4,7 +4,8 @@ import dataclasses
import
json
import
json
import
random
import
random
import
time
import
time
from
typing
import
List
,
Optional
from
functools
import
cache
from
typing
import
Dict
,
List
,
Optional
,
Tuple
import
numpy
as
np
import
numpy
as
np
import
torch
import
torch
...
@@ -20,8 +21,11 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
...
@@ -20,8 +21,11 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from
vllm.entrypoints.openai.api_server
import
(
from
vllm.entrypoints.openai.api_server
import
(
build_async_engine_client_from_engine_args
)
build_async_engine_client_from_engine_args
)
from
vllm.inputs
import
TextPrompt
from
vllm.inputs
import
TextPrompt
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.utils
import
get_adapter_absolute_path
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
get_lora_tokenizer
from
vllm.utils
import
FlexibleArgumentParser
,
merge_async_iterators
from
vllm.utils
import
FlexibleArgumentParser
,
merge_async_iterators
...
@@ -31,15 +35,17 @@ class SampleRequest:
...
@@ -31,15 +35,17 @@ class SampleRequest:
Attributes:
Attributes:
prompt: The input text prompt for the model.
prompt: The input text prompt for the model.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
prompt_len: The length of the prompt in tokens.
prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens.
expected_output_len: The expected length of the output in tokens.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
lora_request: Optional LoRARequest specifying the LoRA to use.
"""
"""
prompt
:
str
prompt
:
str
prompt_len
:
int
prompt_len
:
int
expected_output_len
:
int
expected_output_len
:
int
multi_modal_data
:
Optional
[
MultiModalDataDict
]
=
None
multi_modal_data
:
Optional
[
MultiModalDataDict
]
=
None
lora_request
:
Optional
[
LoRARequest
]
=
None
def
_get_prompt_for_image_model
(
question
:
str
,
*
,
model
:
str
)
->
str
:
def
_get_prompt_for_image_model
(
question
:
str
,
*
,
model
:
str
)
->
str
:
...
@@ -63,8 +69,30 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
...
@@ -63,8 +69,30 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
raise
ValueError
(
f
"Unsupported model
{
model
}
"
)
raise
ValueError
(
f
"Unsupported model
{
model
}
"
)
@
cache
def
lora_path_on_disk
(
lora_path
:
str
)
->
str
:
return
get_adapter_absolute_path
(
lora_path
)
lora_tokenizer_cache
:
Dict
[
int
,
AnyTokenizer
]
=
{}
def
get_random_lora_request
(
args
:
argparse
.
Namespace
)
->
Tuple
[
LoRARequest
,
Optional
[
AnyTokenizer
]]:
global
lora_tokenizer_cache
lora_id
=
random
.
randint
(
1
,
args
.
max_loras
)
lora_request
=
LoRARequest
(
lora_name
=
str
(
lora_id
),
lora_int_id
=
lora_id
,
lora_path
=
lora_path_on_disk
(
args
.
lora_path
))
if
lora_id
not
in
lora_tokenizer_cache
:
lora_tokenizer_cache
[
lora_id
]
=
get_lora_tokenizer
(
lora_request
)
return
lora_request
,
lora_tokenizer_cache
[
lora_id
]
def
sample_requests
(
tokenizer
:
PreTrainedTokenizerBase
,
def
sample_requests
(
tokenizer
:
PreTrainedTokenizerBase
,
args
:
argparse
.
Namespace
)
->
List
[
SampleRequest
]:
args
:
argparse
.
Namespace
)
->
List
[
SampleRequest
]:
dataset_path
:
str
=
args
.
dataset
dataset_path
:
str
=
args
.
dataset
num_requests
:
int
=
args
.
num_prompts
num_requests
:
int
=
args
.
num_prompts
fixed_output_len
:
Optional
[
int
]
=
args
.
output_len
fixed_output_len
:
Optional
[
int
]
=
args
.
output_len
...
@@ -82,7 +110,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
...
@@ -82,7 +110,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
# Filter out sequences that are too long or too short
# Filter out sequences that are too long or too short
filtered_dataset
:
List
[
SampleRequest
]
=
[]
filtered_dataset
:
List
[
SampleRequest
]
=
[]
for
data
in
dataset
:
for
data
in
tqdm
(
dataset
,
total
=
len
(
filtered_dataset
),
desc
=
"sampling requests"
):
if
len
(
filtered_dataset
)
==
num_requests
:
if
len
(
filtered_dataset
)
==
num_requests
:
break
break
...
@@ -105,9 +135,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
...
@@ -105,9 +135,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
continue
continue
prompt
=
_get_prompt_for_image_model
(
question
=
prompt
,
model
=
model
)
prompt
=
_get_prompt_for_image_model
(
question
=
prompt
,
model
=
model
)
request_tokenizer
=
tokenizer
lora_request
:
Optional
[
LoRARequest
]
=
None
if
args
.
enable_lora
:
lora_request
,
lora_tokenizer
=
get_random_lora_request
(
args
)
if
lora_tokenizer
:
request_tokenizer
=
lora_tokenizer
# Tokenize the prompts and completions.
# Tokenize the prompts and completions.
prompt_token_ids
=
tokenizer
(
prompt
).
input_ids
prompt_token_ids
=
request_
tokenizer
(
prompt
).
input_ids
completion_token_ids
=
tokenizer
(
completion
).
input_ids
completion_token_ids
=
request_
tokenizer
(
completion
).
input_ids
prompt_len
=
len
(
prompt_token_ids
)
prompt_len
=
len
(
prompt_token_ids
)
output_len
=
len
(
completion_token_ids
output_len
=
len
(
completion_token_ids
)
if
fixed_output_len
is
None
else
fixed_output_len
)
if
fixed_output_len
is
None
else
fixed_output_len
...
@@ -121,7 +158,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
...
@@ -121,7 +158,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
SampleRequest
(
prompt
=
prompt
,
SampleRequest
(
prompt
=
prompt
,
prompt_len
=
prompt_len
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
multi_modal_data
))
multi_modal_data
=
multi_modal_data
,
lora_request
=
lora_request
))
return
filtered_dataset
return
filtered_dataset
...
@@ -150,11 +188,14 @@ def run_vllm(
...
@@ -150,11 +188,14 @@ def run_vllm(
ignore_eos
=
True
,
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
max_tokens
=
request
.
expected_output_len
,
))
))
lora_requests
:
Optional
[
List
[
LoRARequest
]]
=
None
if
engine_args
.
enable_lora
:
lora_requests
=
[
request
.
lora_request
for
request
in
requests
]
# warmup
# warmup
warmup_prompts
:
List
[
TextPrompt
]
=
[]
warmup_prompts
:
List
[
TextPrompt
]
=
[]
warmup_sampling_params
:
List
[
SamplingParams
]
=
[]
warmup_sampling_params
:
List
[
SamplingParams
]
=
[]
for
request
in
warmup_
promp
ts
:
for
request
in
warmup_
reques
ts
:
warmup_prompts
.
append
(
warmup_prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
TextPrompt
(
prompt
=
request
.
prompt
,
multi_modal_data
=
request
.
multi_modal_data
))
multi_modal_data
=
request
.
multi_modal_data
))
...
@@ -191,9 +232,13 @@ def run_vllm(
...
@@ -191,9 +232,13 @@ def run_vllm(
if
not
use_beam_search
:
if
not
use_beam_search
:
start
=
time
.
perf_counter
()
start
=
time
.
perf_counter
()
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
llm
.
generate
(
prompts
,
sampling_params
,
lora_request
=
lora_requests
,
use_tqdm
=
True
)
end
=
time
.
perf_counter
()
end
=
time
.
perf_counter
()
else
:
else
:
assert
lora_requests
is
None
,
"BeamSearch API does not support LoRA"
prompts
=
[
request
.
prompt
for
request
in
requests
]
prompts
=
[
request
.
prompt
for
request
in
requests
]
# output_len should be the same for all requests.
# output_len should be the same for all requests.
output_len
=
requests
[
0
][
2
]
output_len
=
requests
[
0
][
2
]
...
@@ -225,6 +270,7 @@ async def run_vllm_async(
...
@@ -225,6 +270,7 @@ async def run_vllm_async(
# Add the requests to the engine.
# Add the requests to the engine.
prompts
:
List
[
TextPrompt
]
=
[]
prompts
:
List
[
TextPrompt
]
=
[]
sampling_params
:
List
[
SamplingParams
]
=
[]
sampling_params
:
List
[
SamplingParams
]
=
[]
lora_requests
:
List
[
Optional
[
LoRARequest
]]
=
[]
for
request
in
requests
:
for
request
in
requests
:
prompts
.
append
(
prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
TextPrompt
(
prompt
=
request
.
prompt
,
...
@@ -237,11 +283,16 @@ async def run_vllm_async(
...
@@ -237,11 +283,16 @@ async def run_vllm_async(
ignore_eos
=
True
,
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
max_tokens
=
request
.
expected_output_len
,
))
))
lora_requests
.
append
(
request
.
lora_request
)
generators
=
[]
generators
=
[]
start
=
time
.
perf_counter
()
start
=
time
.
perf_counter
()
for
i
,
(
prompt
,
sp
)
in
enumerate
(
zip
(
prompts
,
sampling_params
)):
for
i
,
(
prompt
,
sp
,
generator
=
llm
.
generate
(
prompt
,
sp
,
request_id
=
f
"test
{
i
}
"
)
lr
)
in
enumerate
(
zip
(
prompts
,
sampling_params
,
lora_requests
)):
generator
=
llm
.
generate
(
prompt
,
sp
,
lora_request
=
lr
,
request_id
=
f
"test
{
i
}
"
)
generators
.
append
(
generator
)
generators
.
append
(
generator
)
all_gens
=
merge_async_iterators
(
*
generators
)
all_gens
=
merge_async_iterators
(
*
generators
)
async
for
i
,
res
in
all_gens
:
async
for
i
,
res
in
all_gens
:
...
@@ -340,6 +391,14 @@ def main(args: argparse.Namespace):
...
@@ -340,6 +391,14 @@ def main(args: argparse.Namespace):
vocab_size
=
tokenizer
.
vocab_size
vocab_size
=
tokenizer
.
vocab_size
requests
=
[]
requests
=
[]
for
_
in
range
(
args
.
num_prompts
):
for
_
in
range
(
args
.
num_prompts
):
request_tokenizer
=
tokenizer
lora_request
:
Optional
[
LoRARequest
]
=
None
if
args
.
enable_lora
:
lora_request
,
lora_tokenizer
=
get_random_lora_request
(
args
)
if
lora_tokenizer
:
request_tokenizer
=
lora_tokenizer
# Synthesize a prompt with the given input length.
# Synthesize a prompt with the given input length.
candidate_ids
=
[
candidate_ids
=
[
random
.
randint
(
0
,
vocab_size
-
1
)
random
.
randint
(
0
,
vocab_size
-
1
)
...
@@ -348,8 +407,8 @@ def main(args: argparse.Namespace):
...
@@ -348,8 +407,8 @@ def main(args: argparse.Namespace):
# As tokenizer may add additional tokens like BOS, we need to try
# As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length.
# different lengths to get the desired input length.
for
_
in
range
(
5
):
# Max attempts to correct
for
_
in
range
(
5
):
# Max attempts to correct
candidate_prompt
=
tokenizer
.
decode
(
candidate_ids
)
candidate_prompt
=
request_
tokenizer
.
decode
(
candidate_ids
)
tokenized_len
=
len
(
tokenizer
.
encode
(
candidate_prompt
))
tokenized_len
=
len
(
request_
tokenizer
.
encode
(
candidate_prompt
))
if
tokenized_len
==
args
.
input_len
:
if
tokenized_len
==
args
.
input_len
:
break
break
...
@@ -366,40 +425,14 @@ def main(args: argparse.Namespace):
...
@@ -366,40 +425,14 @@ def main(args: argparse.Namespace):
requests
.
append
(
requests
.
append
(
SampleRequest
(
prompt
=
candidate_prompt
,
SampleRequest
(
prompt
=
candidate_prompt
,
prompt_len
=
args
.
input_len
,
prompt_len
=
args
.
input_len
,
expected_output_len
=
args
.
output_len
))
expected_output_len
=
args
.
output_len
,
lora_request
=
lora_request
))
else
:
else
:
requests
=
sample_requests
(
tokenizer
,
args
)
requests
=
sample_requests
(
tokenizer
,
args
)
is_multi_modal
=
any
(
request
.
multi_modal_data
is
not
None
is_multi_modal
=
any
(
request
.
multi_modal_data
is
not
None
for
request
in
requests
)
for
request
in
requests
)
if
args
.
backend
==
"vllm"
:
if
args
.
backend
==
"vllm"
:
# if args.async_engine:
# run_args = [
# requests, args.model, args.tokenizer, args.quantization,
# args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
# args.trust_remote_code, args.dtype, args.max_model_len,
# args.enforce_eager, args.kv_cache_dtype,
# args.quantization_param_path, args.device,
# args.enable_prefix_caching, args.enable_chunked_prefill,
# args.max_num_batched_tokens, args.distributed_executor_backend,
# args.gpu_memory_utilization, args.num_scheduler_steps,
# args.use_v2_block_manager, args.download_dir, args.load_format,
# args.disable_async_output_proc
# ]
# else:
# run_args = [
# warmup_requests, requests, args.model, args.tokenizer, args.quantization,
# args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
# args.trust_remote_code, args.dtype, args.max_model_len,
# args.enforce_eager, args.kv_cache_dtype,
# args.quantization_param_path, args.device,
# args.enable_prefix_caching, args.enable_chunked_prefill,
# args.max_num_batched_tokens, args.distributed_executor_backend,
# args.gpu_memory_utilization, args.num_scheduler_steps,
# args.use_v2_block_manager, args.download_dir, args.load_format,
# args.disable_async_output_proc
# ]
if
args
.
async_engine
:
if
args
.
async_engine
:
elapsed_time
=
uvloop
.
run
(
elapsed_time
=
uvloop
.
run
(
run_vllm_async
(
run_vllm_async
(
...
@@ -409,7 +442,7 @@ def main(args: argparse.Namespace):
...
@@ -409,7 +442,7 @@ def main(args: argparse.Namespace):
args
.
disable_frontend_multiprocessing
,
args
.
disable_frontend_multiprocessing
,
))
))
else
:
else
:
elapsed_time
=
run_vllm
(
requests
,
args
.
n
,
elapsed_time
=
run_vllm
(
warmup_requests
,
requests
,
args
.
n
,
EngineArgs
.
from_cli_args
(
args
))
EngineArgs
.
from_cli_args
(
args
))
elif
args
.
backend
==
"hf"
:
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
assert
args
.
tensor_parallel_size
==
1
...
@@ -496,6 +529,14 @@ if __name__ == "__main__":
...
@@ -496,6 +529,14 @@ if __name__ == "__main__":
action
=
'store_true'
,
action
=
'store_true'
,
default
=
False
,
default
=
False
,
help
=
"Disable decoupled async engine frontend."
)
help
=
"Disable decoupled async engine frontend."
)
# LoRA
parser
.
add_argument
(
"--lora-path"
,
type
=
str
,
default
=
None
,
help
=
"Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier."
)
parser
=
AsyncEngineArgs
.
add_cli_args
(
parser
)
parser
=
AsyncEngineArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
if
args
.
tokenizer
is
None
:
if
args
.
tokenizer
is
None
:
...
@@ -505,6 +546,8 @@ if __name__ == "__main__":
...
@@ -505,6 +546,8 @@ if __name__ == "__main__":
assert
args
.
output_len
is
not
None
assert
args
.
output_len
is
not
None
else
:
else
:
assert
args
.
input_len
is
None
assert
args
.
input_len
is
None
if
args
.
enable_lora
:
assert
args
.
lora_path
is
not
None
if
args
.
backend
==
"vllm"
:
if
args
.
backend
==
"vllm"
:
if
args
.
hf_max_batch_size
is
not
None
:
if
args
.
hf_max_batch_size
is
not
None
:
...
@@ -514,6 +557,9 @@ if __name__ == "__main__":
...
@@ -514,6 +557,9 @@ if __name__ == "__main__":
raise
ValueError
(
"HF max batch size is required for HF backend."
)
raise
ValueError
(
"HF max batch size is required for HF backend."
)
if
args
.
quantization
is
not
None
:
if
args
.
quantization
is
not
None
:
raise
ValueError
(
"Quantization is only for vLLM backend."
)
raise
ValueError
(
"Quantization is only for vLLM backend."
)
if
args
.
enable_lora
is
not
None
:
raise
ValueError
(
"LoRA benchmarking is only supported for vLLM"
" backend"
)
elif
args
.
backend
==
"mii"
:
elif
args
.
backend
==
"mii"
:
if
args
.
dtype
!=
"auto"
:
if
args
.
dtype
!=
"auto"
:
raise
ValueError
(
"dtype must be auto for MII backend."
)
raise
ValueError
(
"dtype must be auto for MII backend."
)
...
@@ -526,4 +572,7 @@ if __name__ == "__main__":
...
@@ -526,4 +572,7 @@ if __name__ == "__main__":
if
args
.
tokenizer
!=
args
.
model
:
if
args
.
tokenizer
!=
args
.
model
:
raise
ValueError
(
"Tokenizer must be the same as the model for MII "
raise
ValueError
(
"Tokenizer must be the same as the model for MII "
"backend."
)
"backend."
)
if
args
.
enable_lora
is
not
None
:
raise
ValueError
(
"LoRA benchmarking is only supported for vLLM"
" backend"
)
main
(
args
)
main
(
args
)
\ No newline at end of file
vllm/block.py
deleted
100644 → 0
View file @
f9f4a735
"""Token blocks."""
from
typing
import
TYPE_CHECKING
,
Iterator
,
List
,
Optional
from
vllm.utils
import
Device
# Initial value stored in ``PhysicalTokenBlock.last_accessed`` when a block is
# constructed, i.e. before any access time has been recorded for it.
DEFAULT_LAST_ACCESSED_TIME: float = -1
class PhysicalTokenBlock:
    """State record for a single block of the KV cache.

    Besides the values handed to the constructor, every block tracks a
    reference count, a last-accessed timestamp and a ``computed`` flag, all
    of which start at their defaults.
    """

    def __init__(
        self,
        device: Device,
        block_number: int,
        block_size: int,
        block_hash: int,
        num_hashed_tokens: int,
    ) -> None:
        """Store the caller-supplied fields and reset the bookkeeping state.

        Args:
            device: Device the block belongs to.
            block_number: Number identifying the block.
            block_size: Size of the block.
            block_hash: Hash value associated with the block.
            num_hashed_tokens: Number of hashed tokens recorded for the block.
        """
        # Caller-supplied description of the block.
        self.device = device
        self.block_number = block_number
        self.block_size = block_size
        self.block_hash = block_hash
        self.num_hashed_tokens = num_hashed_tokens

        # Bookkeeping state; a fresh block is unreferenced, never accessed
        # and not yet computed.
        self.ref_count = 0
        self.last_accessed = DEFAULT_LAST_ACCESSED_TIME
        self.computed = False

    def __repr__(self) -> str:
        """Return a debug string (block_size and block_hash are not shown)."""
        pieces = [
            f'PhysicalTokenBlock(device={self.device}, ',
            f'block_number={self.block_number}, ',
            f'num_hashed_tokens={self.num_hashed_tokens}, ',
            f'ref_count={self.ref_count}, ',
            f'last_accessed={self.last_accessed}, ',
            f'computed={self.computed})',
        ]
        return "".join(pieces)
class BlockTable:
    """Ordered collection of blocks plus a cached list of their block numbers.

    ``_blocks`` and ``_block_ids`` are kept in lockstep by every mutating
    method, so ``ids()`` never has to recompute the block numbers.
    """

    def __init__(self, blocks: Optional[List[PhysicalTokenBlock]] = None):
        """Create a table, optionally pre-populated from ``blocks``."""
        self._blocks: List[PhysicalTokenBlock] = []
        self._block_ids: List[int] = []

        if blocks is None:
            return
        # Go through append() so both internal lists are filled together.
        for item in blocks:
            self.append(item)

    def append(self, block: PhysicalTokenBlock):
        """Add ``block`` at the end and record its block number."""
        self._blocks.append(block)
        self._block_ids.append(block.block_number)

    def __len__(self) -> int:
        """Return the number of blocks held."""
        return len(self._blocks)

    def __getitem__(self, key):
        """Return the block (or slice of blocks) stored at ``key``."""
        return self._blocks[key]

    if TYPE_CHECKING:
        # Declared for static type checkers only; this definition does not
        # exist at runtime.

        def __iter__(self) -> Iterator[PhysicalTokenBlock]:
            raise RuntimeError("Method should be automatically generated")

    def __setitem__(self, key, value):
        """Replace one block (index key) or several blocks (slice key)."""
        if not isinstance(key, slice):
            # Single position: ``value`` is one block.
            self._blocks[key] = value
            self._block_ids[key] = value.block_number
            return

        # Slice: ``value`` is a collection of blocks; mirror the same slice
        # assignment onto the cached block numbers.
        self._blocks[key] = value
        self._block_ids[key] = [entry.block_number for entry in value]

    def reset(self):
        """Drop all blocks by rebinding both internal lists to new ones."""
        self._blocks, self._block_ids = [], []

    def copy(self) -> "BlockTable":
        """Return a new table holding the same block objects."""
        return BlockTable(self._blocks)

    def list(self) -> List[PhysicalTokenBlock]:
        """Return the internal list of blocks (not a copy)."""
        return self._blocks

    def ids(self) -> List[int]:
        """Return the internal list of cached block numbers (not a copy)."""
        return self._block_ids
vllm/compilation/backends.py
View file @
96ae75ad
...
@@ -141,14 +141,14 @@ class AlwaysHitShapeEnv:
...
@@ -141,14 +141,14 @@ class AlwaysHitShapeEnv:
return
""
return
""
def
wrap_inductor
(
graph
,
def
wrap_inductor
(
graph
:
fx
.
GraphModule
,
example_inputs
,
example_inputs
,
additional_inductor_config
,
additional_inductor_config
,
compilation_config
:
CompilationConfig
,
compilation_config
:
CompilationConfig
,
graph_index
:
int
=
0
,
graph_index
:
int
=
0
,
num_graphs
:
int
=
1
,
num_graphs
:
int
=
1
,
runtime_shape
:
Optional
[
int
]
=
None
,
runtime_shape
:
Optional
[
int
]
=
None
,
use_inductor
:
bool
=
True
):
use_inductor
:
bool
=
True
)
->
Any
:
if
graph_index
==
0
:
if
graph_index
==
0
:
# before compiling the first graph, record the start time
# before compiling the first graph, record the start time
global
compilation_start_time
global
compilation_start_time
...
@@ -208,7 +208,7 @@ def wrap_inductor(graph,
...
@@ -208,7 +208,7 @@ def wrap_inductor(graph,
from
torch._inductor.compile_fx
import
graph_returns_tuple
from
torch._inductor.compile_fx
import
graph_returns_tuple
returns_tuple
=
graph_returns_tuple
(
graph
)
returns_tuple
=
graph_returns_tuple
(
graph
)
# this is the
graph
we return to Dynamo to run
# this is the
callable
we return to Dynamo to run
def
compiled_graph
(
*
args
):
def
compiled_graph
(
*
args
):
# convert args to list
# convert args to list
list_args
=
list
(
args
)
list_args
=
list
(
args
)
...
@@ -247,7 +247,7 @@ def wrap_inductor(graph,
...
@@ -247,7 +247,7 @@ def wrap_inductor(graph,
# see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa
# see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa
return
return
def
_get_shape_env
():
def
_get_shape_env
()
->
AlwaysHitShapeEnv
:
return
AlwaysHitShapeEnv
()
return
AlwaysHitShapeEnv
()
with
patch
(
# for hijacking the hash of the compiled graph
with
patch
(
# for hijacking the hash of the compiled graph
...
@@ -537,6 +537,7 @@ class VllmBackend:
...
@@ -537,6 +537,7 @@ class VllmBackend:
example_inputs
[
x
].
clone
()
for
x
in
self
.
sym_tensor_indices
example_inputs
[
x
].
clone
()
for
x
in
self
.
sym_tensor_indices
]
]
# this is the callable we return to Dynamo to run
def
copy_and_call
(
*
args
):
def
copy_and_call
(
*
args
):
list_args
=
list
(
args
)
list_args
=
list
(
args
)
for
i
,
index
in
enumerate
(
self
.
sym_tensor_indices
):
for
i
,
index
in
enumerate
(
self
.
sym_tensor_indices
):
...
...
vllm/compilation/multi_output_match.py
View file @
96ae75ad
...
@@ -7,6 +7,7 @@ from torch import fx
...
@@ -7,6 +7,7 @@ from torch import fx
from
torch._higher_order_ops.auto_functionalize
import
auto_functionalized
from
torch._higher_order_ops.auto_functionalize
import
auto_functionalized
from
torch._inductor
import
pattern_matcher
as
pm
from
torch._inductor
import
pattern_matcher
as
pm
from
torch._ops
import
OpOverload
from
torch._ops
import
OpOverload
from
torch.fx
import
Node
from
vllm.compilation.fx_utils
import
find_auto_fn
from
vllm.compilation.fx_utils
import
find_auto_fn
...
@@ -97,7 +98,7 @@ class MultiOutputMatch(abc.ABC):
...
@@ -97,7 +98,7 @@ class MultiOutputMatch(abc.ABC):
self
.
graph
.
call_function
(
operator
.
getitem
,
(
tuple_node
,
idx
))
self
.
graph
.
call_function
(
operator
.
getitem
,
(
tuple_node
,
idx
))
for
idx
in
indices
)
for
idx
in
indices
)
def
insert_auto_fn
(
self
,
op
:
OpOverload
,
kwargs
):
def
insert_auto_fn
(
self
,
op
:
OpOverload
,
kwargs
)
->
Node
:
"""
"""
Insert an auto_functionalized node with the given op and kwargs.
Insert an auto_functionalized node with the given op and kwargs.
"""
"""
...
...
vllm/compilation/pass_manager.py
View file @
96ae75ad
from
typing
import
List
from
typing
import
Any
,
Dict
,
List
from
torch
import
fx
as
fx
from
torch
import
fx
as
fx
...
@@ -53,7 +53,7 @@ class PostGradPassManager:
...
@@ -53,7 +53,7 @@ class PostGradPassManager:
assert
isinstance
(
pass_
,
InductorPass
)
assert
isinstance
(
pass_
,
InductorPass
)
self
.
passes
.
append
(
pass_
)
self
.
passes
.
append
(
pass_
)
def
__getstate__
(
self
):
def
__getstate__
(
self
)
->
Dict
[
str
,
List
[
Any
]]
:
"""
"""
Custom pickling for the pass manager, as some passes cannot be pickled.
Custom pickling for the pass manager, as some passes cannot be pickled.
Pickling occurs because the pass manager is set as the value of
Pickling occurs because the pass manager is set as the value of
...
...
vllm/config.py
View file @
96ae75ad
...
@@ -22,12 +22,15 @@ from vllm.logger import init_logger
...
@@ -22,12 +22,15 @@ from vllm.logger import init_logger
from
vllm.model_executor.layers.quantization
import
(
QUANTIZATION_METHODS
,
from
vllm.model_executor.layers.quantization
import
(
QUANTIZATION_METHODS
,
get_quantization_config
)
get_quantization_config
)
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
,
interface
from
vllm.tracing
import
is_otel_available
,
otel_import_error_traceback
from
vllm.tracing
import
is_otel_available
,
otel_import_error_traceback
from
vllm.transformers_utils.config
import
(
from
vllm.transformers_utils.config
import
(
ConfigFormat
,
get_config
,
get_hf_image_processor_config
,
ConfigFormat
,
get_config
,
get_hf_image_processor_config
,
get_hf_text_config
,
get_pooling_config
,
get_hf_text_config
,
get_pooling_config
,
get_sentence_transformer_tokenizer_config
,
is_encoder_decoder
,
uses_mrope
)
get_sentence_transformer_tokenizer_config
,
is_encoder_decoder
,
try_get_generation_config
,
uses_mrope
)
from
vllm.transformers_utils.s3_utils
import
S3Model
from
vllm.transformers_utils.utils
import
is_s3
from
vllm.utils
import
(
GiB_bytes
,
LayerBlockType
,
cuda_device_count_stateless
,
from
vllm.utils
import
(
GiB_bytes
,
LayerBlockType
,
cuda_device_count_stateless
,
get_cpu_memory
,
print_warning_once
,
random_uuid
,
get_cpu_memory
,
print_warning_once
,
random_uuid
,
resolve_obj_by_qualname
)
resolve_obj_by_qualname
)
...
@@ -148,9 +151,8 @@ class ModelConfig:
...
@@ -148,9 +151,8 @@ class ModelConfig:
HuggingFace config.
HuggingFace config.
mm_processor_kwargs: Arguments to be forwarded to the model's processor
mm_processor_kwargs: Arguments to be forwarded to the model's processor
for multi-modal data, e.g., image processor.
for multi-modal data, e.g., image processor.
mm_cache_preprocessor: If true, then enables caching of the multi-modal
disable_mm_preprocessor_cache: If true, then disables caching of the
preprocessor/mapper. Otherwise, the mapper executes each time, and
multi-modal preprocessor/mapper. (not recommended)
for better performance consider enabling frontend process.
override_neuron_config: Initialize non default neuron config or
override_neuron_config: Initialize non default neuron config or
override default neuron config that are specific to Neuron devices,
override default neuron config that are specific to Neuron devices,
this argument will be used to configure the neuron config that
this argument will be used to configure the neuron config that
...
@@ -159,8 +161,9 @@ class ModelConfig:
...
@@ -159,8 +161,9 @@ class ModelConfig:
override default pooling config for the pooling model.
override default pooling config for the pooling model.
logits_processor_pattern: Optional regex pattern specifying valid
logits_processor_pattern: Optional regex pattern specifying valid
logits processor qualified names that can be passed with the
logits processor qualified names that can be passed with the
`logits_processors` extra completion argument. Defaults to None,
`logits_processors` extra completion argument. Defaults to None,
which allows no processors.
which allows no processors.
generation_config: Configuration parameter file for generation.
"""
"""
def
compute_hash
(
self
)
->
str
:
def
compute_hash
(
self
)
->
str
:
...
@@ -216,10 +219,11 @@ class ModelConfig:
...
@@ -216,10 +219,11 @@ class ModelConfig:
config_format
:
ConfigFormat
=
ConfigFormat
.
AUTO
,
config_format
:
ConfigFormat
=
ConfigFormat
.
AUTO
,
hf_overrides
:
Optional
[
HfOverrides
]
=
None
,
hf_overrides
:
Optional
[
HfOverrides
]
=
None
,
mm_processor_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
mm_processor_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
mm_cache
_preprocessor
:
bool
=
False
,
disable_mm
_preprocessor
_cache
:
bool
=
False
,
override_neuron_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
override_neuron_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
override_pooler_config
:
Optional
[
"PoolerConfig"
]
=
None
,
override_pooler_config
:
Optional
[
"PoolerConfig"
]
=
None
,
logits_processor_pattern
:
Optional
[
str
]
=
None
)
->
None
:
logits_processor_pattern
:
Optional
[
str
]
=
None
,
generation_config
:
Optional
[
str
]
=
None
)
->
None
:
self
.
model
=
model
self
.
model
=
model
self
.
tokenizer
=
tokenizer
self
.
tokenizer
=
tokenizer
self
.
tokenizer_mode
=
tokenizer_mode
self
.
tokenizer_mode
=
tokenizer_mode
...
@@ -254,6 +258,8 @@ class ModelConfig:
...
@@ -254,6 +258,8 @@ class ModelConfig:
f
"'Please instead use `--hf-overrides '
{
hf_override
!
r
}
'`"
)
f
"'Please instead use `--hf-overrides '
{
hf_override
!
r
}
'`"
)
warnings
.
warn
(
DeprecationWarning
(
msg
),
stacklevel
=
2
)
warnings
.
warn
(
DeprecationWarning
(
msg
),
stacklevel
=
2
)
self
.
maybe_pull_model_tokenizer_for_s3
(
model
,
tokenizer
)
# The tokenizer version is consistent with the model version by default.
# The tokenizer version is consistent with the model version by default.
if
tokenizer_revision
is
None
:
if
tokenizer_revision
is
None
:
self
.
tokenizer_revision
=
revision
self
.
tokenizer_revision
=
revision
...
@@ -286,7 +292,7 @@ class ModelConfig:
...
@@ -286,7 +292,7 @@ class ModelConfig:
self
.
dtype
=
_get_and_verify_dtype
(
self
.
hf_text_config
,
dtype
)
self
.
dtype
=
_get_and_verify_dtype
(
self
.
hf_text_config
,
dtype
)
self
.
use_async_output_proc
=
use_async_output_proc
self
.
use_async_output_proc
=
use_async_output_proc
self
.
mm_processor_kwargs
=
mm_processor_kwargs
self
.
mm_processor_kwargs
=
mm_processor_kwargs
self
.
mm_cache
_preprocessor
=
mm
_cache_preprocessor
self
.
disable_mm
_preprocessor_cache
=
disable_mm
_preprocessor
_cache
# Set enforce_eager to False if the value is unset.
# Set enforce_eager to False if the value is unset.
if
self
.
enforce_eager
is
None
:
if
self
.
enforce_eager
is
None
:
...
@@ -349,10 +355,36 @@ class ModelConfig:
...
@@ -349,10 +355,36 @@ class ModelConfig:
self
.
pooler_config
=
self
.
_init_pooler_config
(
override_pooler_config
)
self
.
pooler_config
=
self
.
_init_pooler_config
(
override_pooler_config
)
self
.
logits_processor_pattern
=
logits_processor_pattern
self
.
logits_processor_pattern
=
logits_processor_pattern
self
.
generation_config
=
generation_config
self
.
_verify_quantization
()
self
.
_verify_quantization
()
self
.
_verify_cuda_graph
()
self
.
_verify_cuda_graph
()
self
.
_verify_bnb_config
()
self
.
_verify_bnb_config
()
def maybe_pull_model_tokenizer_for_s3(self, model: str,
                                      tokenizer: str) -> None:
    """
    Pull the model config or tokenizer to a temporary
    directory in case of S3.

    The model and the tokenizer are handled independently: each one is
    only pulled when its own path is an S3 path, and each is pulled from
    its own location.

    Args:
        model: The model name or path.
        tokenizer: The tokenizer name or path.
    """
    if is_s3(model):
        self.s3_model = S3Model()
        # Only the config files are fetched here, not the weights.
        self.s3_model.pull_files(model, allow_pattern=["*config.json"])
        # Keep the original location before repointing ``self.model`` at
        # the directory the files were pulled into.
        self.model_weights = self.model
        self.model = self.s3_model.dir

    if is_s3(tokenizer):
        self.s3_tokenizer = S3Model()
        # Fetch from the tokenizer's own location. The previous code passed
        # ``model`` here, which is wrong whenever the tokenizer lives at a
        # different path than the model (and fails outright when only the
        # tokenizer is on S3). Weight files are excluded from the pull.
        self.s3_tokenizer.pull_files(
            tokenizer, ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
        self.tokenizer = self.s3_tokenizer.dir
def
_init_multimodal_config
(
def
_init_multimodal_config
(
self
,
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]]
self
,
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]]
)
->
Optional
[
"MultiModalConfig"
]:
)
->
Optional
[
"MultiModalConfig"
]:
...
@@ -564,6 +596,12 @@ class ModelConfig:
...
@@ -564,6 +596,12 @@ class ModelConfig:
self
.
max_seq_len_to_capture
=
min
(
self
.
max_seq_len_to_capture
,
self
.
max_seq_len_to_capture
=
min
(
self
.
max_seq_len_to_capture
,
self
.
max_model_len
)
self
.
max_model_len
)
if
(
self
.
hf_config
.
model_type
==
'deepseek_v3'
and
not
self
.
enforce_eager
):
logger
.
warning
(
"CUDA graph is not supported for Deepseek V3 yet, "
"fallback to the eager mode."
)
self
.
enforce_eager
=
True
def
_verify_bnb_config
(
self
)
->
None
:
def
_verify_bnb_config
(
self
)
->
None
:
"""
"""
The current version of bitsandbytes (0.44.0) with 8-bit models does not
The current version of bitsandbytes (0.44.0) with 8-bit models does not
...
@@ -598,7 +636,7 @@ class ModelConfig:
...
@@ -598,7 +636,7 @@ class ModelConfig:
self
.
use_async_output_proc
=
False
self
.
use_async_output_proc
=
False
return
return
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
# If the feature combo become valid
if
not
current_platform
.
is_async_output_supported
(
self
.
enforce_eager
):
if
not
current_platform
.
is_async_output_supported
(
self
.
enforce_eager
):
logger
.
warning
(
logger
.
warning
(
...
@@ -618,7 +656,7 @@ class ModelConfig:
...
@@ -618,7 +656,7 @@ class ModelConfig:
if
self
.
runner_type
==
"pooling"
:
if
self
.
runner_type
==
"pooling"
:
self
.
use_async_output_proc
=
False
self
.
use_async_output_proc
=
False
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
# If the feature combo become valid
if
speculative_config
:
if
speculative_config
:
logger
.
warning
(
"Async output processing is not supported with"
logger
.
warning
(
"Async output processing is not supported with"
...
@@ -680,8 +718,9 @@ class ModelConfig:
...
@@ -680,8 +718,9 @@ class ModelConfig:
def
get_head_size
(
self
)
->
int
:
def
get_head_size
(
self
)
->
int
:
# TODO remove hard code
# TODO remove hard code
if
hasattr
(
self
.
hf_text_config
,
"model_type"
if
hasattr
(
self
.
hf_text_config
,
)
and
self
.
hf_text_config
.
model_type
==
'deepseek_v2'
:
"model_type"
)
and
(
self
.
hf_text_config
.
model_type
in
(
'deepseek_v2'
,
'deepseek_v3'
)):
# FlashAttention supports only head_size 32, 64, 128, 256,
# FlashAttention supports only head_size 32, 64, 128, 256,
# we need to pad head_size 192 to 256
# we need to pad head_size 192 to 256
return
256
return
256
...
@@ -814,6 +853,56 @@ class ModelConfig:
...
@@ -814,6 +853,56 @@ class ModelConfig:
return
self
.
multimodal_config
return
self
.
multimodal_config
def try_get_generation_config(self) -> Dict[str, Any]:
    """Load the generation config and return it as a plain dictionary.

    When ``self.generation_config`` is ``None`` or ``"auto"`` the config is
    looked up from ``self.model`` at ``self.revision``; otherwise
    ``self.generation_config`` itself is used as the location to load from.

    Returns:
        The result of ``to_diff_dict()`` on the loaded config, or an empty
        dictionary when no config could be loaded.
    """
    source = self.generation_config

    if source is None or source == "auto":
        loaded = try_get_generation_config(
            self.model,
            trust_remote_code=self.trust_remote_code,
            revision=self.revision,
        )
    else:
        loaded = try_get_generation_config(
            source,
            trust_remote_code=self.trust_remote_code,
        )

    return {} if loaded is None else loaded.to_diff_dict()
def get_diff_sampling_param(self) -> Dict[str, Any]:
    """
    Collect the sampling parameters found in the generation config.

    Nothing is looked up unless `generation_config` is set; in that
    case the generation config is loaded and only the supported
    sampling keys that carry a non-None value are kept.

    Returns:
        Dict[str, Any]: The supported sampling parameters present in
        the generation config, or an empty dictionary when
        `generation_config` is not set or none of them is present.
    """
    # When generation_config is not set
    if self.generation_config is None:
        return {}

    loaded = self.try_get_generation_config()
    sampling_keys = (
        "repetition_penalty",
        "temperature",
        "top_k",
        "top_p",
        "min_p",
    )
    # Keys that are missing and keys explicitly set to None are both
    # dropped, so a config without any supported key yields {}.
    candidates = ((name, loaded.get(name)) for name in sampling_keys)
    return {name: value for name, value in candidates if value is not None}
@
property
@
property
def
is_encoder_decoder
(
self
)
->
bool
:
def
is_encoder_decoder
(
self
)
->
bool
:
"""Extract the HF encoder/decoder model flag."""
"""Extract the HF encoder/decoder model flag."""
...
@@ -917,6 +1006,10 @@ class CacheConfig:
...
@@ -917,6 +1006,10 @@ class CacheConfig:
raise
ValueError
(
raise
ValueError
(
"GPU memory utilization must be less than 1.0. Got "
"GPU memory utilization must be less than 1.0. Got "
f
"
{
self
.
gpu_memory_utilization
}
."
)
f
"
{
self
.
gpu_memory_utilization
}
."
)
if
(
current_platform
.
is_cuda
()
and
self
.
block_size
is
not
None
and
self
.
block_size
>
32
):
raise
ValueError
(
"CUDA Paged Attention kernel only supports "
f
"block sizes up to 32. Got
{
self
.
block_size
}
."
)
def
_verify_cache_dtype
(
self
)
->
None
:
def
_verify_cache_dtype
(
self
)
->
None
:
if
self
.
cache_dtype
==
"auto"
:
if
self
.
cache_dtype
==
"auto"
:
...
@@ -1041,6 +1134,7 @@ class LoadFormat(str, enum.Enum):
...
@@ -1041,6 +1134,7 @@ class LoadFormat(str, enum.Enum):
GGUF
=
"gguf"
GGUF
=
"gguf"
BITSANDBYTES
=
"bitsandbytes"
BITSANDBYTES
=
"bitsandbytes"
MISTRAL
=
"mistral"
MISTRAL
=
"mistral"
RUNAI_STREAMER
=
"runai_streamer"
@
dataclass
@
dataclass
...
@@ -1977,7 +2071,7 @@ class LoRAConfig:
...
@@ -1977,7 +2071,7 @@ class LoRAConfig:
model_config
.
quantization
)
model_config
.
quantization
)
def
verify_with_scheduler_config
(
self
,
scheduler_config
:
SchedulerConfig
):
def
verify_with_scheduler_config
(
self
,
scheduler_config
:
SchedulerConfig
):
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
# If the feature combo become valid
if
scheduler_config
.
chunked_prefill_enabled
:
if
scheduler_config
.
chunked_prefill_enabled
:
logger
.
warning
(
"LoRA with chunked prefill is still experimental "
logger
.
warning
(
"LoRA with chunked prefill is still experimental "
...
@@ -2155,6 +2249,17 @@ def _get_and_verify_dtype(
...
@@ -2155,6 +2249,17 @@ def _get_and_verify_dtype(
else
:
else
:
torch_dtype
=
config_dtype
torch_dtype
=
config_dtype
if
(
current_platform
.
is_cpu
()
and
current_platform
.
get_cpu_architecture
()
==
interface
.
CpuArchEnum
.
POWERPC
and
(
config_dtype
==
torch
.
float16
or
config_dtype
==
torch
.
float32
)):
logger
.
info
(
"For POWERPC, we cast models to bfloat16 instead of "
"using float16 by default. Float16 is not currently "
"supported for POWERPC."
)
torch_dtype
=
torch
.
bfloat16
if
current_platform
.
is_hpu
()
and
config_dtype
==
torch
.
float16
:
if
current_platform
.
is_hpu
()
and
config_dtype
==
torch
.
float16
:
logger
.
info
(
logger
.
info
(
"For HPU, we cast models to bfloat16 instead of"
"For HPU, we cast models to bfloat16 instead of"
...
@@ -3165,7 +3270,7 @@ class VllmConfig:
...
@@ -3165,7 +3270,7 @@ class VllmConfig:
f
"enable_prefix_caching=
{
self
.
cache_config
.
enable_prefix_caching
}
, "
f
"enable_prefix_caching=
{
self
.
cache_config
.
enable_prefix_caching
}
, "
f
"chunked_prefill_enabled=
{
self
.
scheduler_config
.
chunked_prefill_enabled
}
, "
# noqa
f
"chunked_prefill_enabled=
{
self
.
scheduler_config
.
chunked_prefill_enabled
}
, "
# noqa
f
"use_async_output_proc=
{
self
.
model_config
.
use_async_output_proc
}
, "
f
"use_async_output_proc=
{
self
.
model_config
.
use_async_output_proc
}
, "
f
"
mm_cache
_preprocessor=
{
self
.
model_config
.
mm_cache
_preprocessor
!
r
}
, "
# noqa
f
"
disable_mm
_preprocessor
_cache
=
{
self
.
model_config
.
disable_mm
_preprocessor
_cache
!
r
}
, "
# noqa
f
"mm_processor_kwargs=
{
self
.
model_config
.
mm_processor_kwargs
}
, "
f
"mm_processor_kwargs=
{
self
.
model_config
.
mm_processor_kwargs
}
, "
f
"pooler_config=
{
self
.
model_config
.
pooler_config
!
r
}
, "
f
"pooler_config=
{
self
.
model_config
.
pooler_config
!
r
}
, "
f
"compilation_config=
{
self
.
compilation_config
!
r
}
"
)
f
"compilation_config=
{
self
.
compilation_config
!
r
}
"
)
...
...
vllm/core/evictor.py
View file @
96ae75ad
...
@@ -13,7 +13,7 @@ class EvictionPolicy(enum.Enum):
...
@@ -13,7 +13,7 @@ class EvictionPolicy(enum.Enum):
class
Evictor
(
ABC
):
class
Evictor
(
ABC
):
"""The Evictor subclasses should be used by the BlockAllocator class to
"""The Evictor subclasses should be used by the BlockAllocator class to
handle eviction of freed
PhysicalToken
Blocks.
handle eviction of freed Blocks.
"""
"""
@
abstractmethod
@
abstractmethod
...
@@ -70,7 +70,7 @@ class BlockMetaData:
...
@@ -70,7 +70,7 @@ class BlockMetaData:
class
LRUEvictor
(
Evictor
):
class
LRUEvictor
(
Evictor
):
"""Evicts in a least-recently-used order using the last_accessed timestamp
"""Evicts in a least-recently-used order using the last_accessed timestamp
that's recorded in the
PhysicalToken
Block. If there are multiple blocks with
that's recorded in the Block. If there are multiple blocks with
the same last_accessed time, then the one with the largest num_hashed_tokens
the same last_accessed time, then the one with the largest num_hashed_tokens
will be evicted. If two blocks each have the lowest last_accessed time and
will be evicted. If two blocks each have the lowest last_accessed time and
highest num_hashed_tokens value, then one will be chose arbitrarily
highest num_hashed_tokens value, then one will be chose arbitrarily
...
...
vllm/engine/arg_utils.py
View file @
96ae75ad
...
@@ -141,7 +141,7 @@ class EngineArgs:
...
@@ -141,7 +141,7 @@ class EngineArgs:
tokenizer_pool_extra_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
tokenizer_pool_extra_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]]
=
None
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]]
=
None
mm_processor_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
mm_processor_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
mm_cache
_preprocessor
:
bool
=
False
disable_mm
_preprocessor
_cache
:
bool
=
False
enable_lora
:
bool
=
False
enable_lora
:
bool
=
False
enable_lora_bias
:
bool
=
False
enable_lora_bias
:
bool
=
False
max_loras
:
int
=
1
max_loras
:
int
=
1
...
@@ -200,6 +200,8 @@ class EngineArgs:
...
@@ -200,6 +200,8 @@ class EngineArgs:
kv_transfer_config
:
Optional
[
KVTransferConfig
]
=
None
kv_transfer_config
:
Optional
[
KVTransferConfig
]
=
None
generation_config
:
Optional
[
str
]
=
None
def
__post_init__
(
self
):
def
__post_init__
(
self
):
if
not
self
.
tokenizer
:
if
not
self
.
tokenizer
:
self
.
tokenizer
=
self
.
model
self
.
tokenizer
=
self
.
model
...
@@ -208,6 +210,7 @@ class EngineArgs:
...
@@ -208,6 +210,7 @@ class EngineArgs:
# by user.
# by user.
if
self
.
enable_prefix_caching
is
None
:
if
self
.
enable_prefix_caching
is
None
:
self
.
enable_prefix_caching
=
bool
(
envs
.
VLLM_USE_V1
)
self
.
enable_prefix_caching
=
bool
(
envs
.
VLLM_USE_V1
)
# Override max_num_seqs if it's not set by user.
# Override max_num_seqs if it's not set by user.
if
self
.
max_num_seqs
is
None
:
if
self
.
max_num_seqs
is
None
:
self
.
max_num_seqs
=
256
if
not
envs
.
VLLM_USE_V1
else
1024
self
.
max_num_seqs
=
256
if
not
envs
.
VLLM_USE_V1
else
1024
...
@@ -316,6 +319,8 @@ class EngineArgs:
...
@@ -316,6 +319,8 @@ class EngineArgs:
'* "tensorizer" will load the weights using tensorizer from '
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples '
'CoreWeave. See the Tensorize vLLM Model script in the Examples '
'section for more information.
\n
'
'section for more information.
\n
'
'* "runai_streamer" will load the Safetensors weights using Run:ai'
'Model Streamer
\n
'
'* "bitsandbytes" will load the weights using bitsandbytes '
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.
\n
'
)
'quantization.
\n
'
)
parser
.
add_argument
(
parser
.
add_argument
(
...
@@ -371,7 +376,7 @@ class EngineArgs:
...
@@ -371,7 +376,7 @@ class EngineArgs:
choices
=
[
'outlines'
,
'lm-format-enforcer'
,
'xgrammar'
],
choices
=
[
'outlines'
,
'lm-format-enforcer'
,
'xgrammar'
],
help
=
'Which engine will be used for guided decoding'
help
=
'Which engine will be used for guided decoding'
' (JSON schema / regex etc) by default. Currently support '
' (JSON schema / regex etc) by default. Currently support '
'https://github.com/outlines-dev/outlines,'
'https://github.com/outlines-dev/outlines,
'
'https://github.com/mlc-ai/xgrammar, and '
'https://github.com/mlc-ai/xgrammar, and '
'https://github.com/noamgat/lm-format-enforcer.'
'https://github.com/noamgat/lm-format-enforcer.'
' Can be overridden per request via guided_decoding_backend'
' Can be overridden per request via guided_decoding_backend'
...
@@ -426,10 +431,12 @@ class EngineArgs:
...
@@ -426,10 +431,12 @@ class EngineArgs:
parser
.
add_argument
(
'--block-size'
,
parser
.
add_argument
(
'--block-size'
,
type
=
int
,
type
=
int
,
default
=
EngineArgs
.
block_size
,
default
=
EngineArgs
.
block_size
,
choices
=
[
8
,
16
,
32
],
choices
=
[
8
,
16
,
32
,
64
,
128
],
help
=
'Token block size for contiguous chunks of '
help
=
'Token block size for contiguous chunks of '
'tokens. This is ignored on neuron devices and '
'tokens. This is ignored on neuron devices and '
'set to max-model-len'
)
'set to max-model-len. On CUDA devices, '
'only block sizes up to 32 are supported. '
'On HPU devices, block size defaults to 128.'
)
parser
.
add_argument
(
parser
.
add_argument
(
"--enable-prefix-caching"
,
"--enable-prefix-caching"
,
...
@@ -606,11 +613,10 @@ class EngineArgs:
...
@@ -606,11 +613,10 @@ class EngineArgs:
help
=
(
'Overrides for the multimodal input mapping/processing, '
help
=
(
'Overrides for the multimodal input mapping/processing, '
'e.g., image processor. For example: {"num_crops": 4}.'
))
'e.g., image processor. For example: {"num_crops": 4}.'
))
parser
.
add_argument
(
parser
.
add_argument
(
'--
mm-cache
-preprocessor'
,
'--
disable-mm
-preprocessor
-cache
'
,
action
=
'store_true'
,
action
=
'store_true'
,
help
=
'If true, then enables caching of the multi-modal '
help
=
'If true, then disables caching of the multi-modal '
'preprocessor/mapper. Otherwise, the mapper executes each time'
'preprocessor/mapper. (not recommended)'
)
', and for better performance consider enabling frontend process.'
)
# LoRA related configs
# LoRA related configs
parser
.
add_argument
(
'--enable-lora'
,
parser
.
add_argument
(
'--enable-lora'
,
...
@@ -957,6 +963,16 @@ class EngineArgs:
...
@@ -957,6 +963,16 @@ class EngineArgs:
default
=
"auto"
,
default
=
"auto"
,
help
=
'The worker class to use for distributed execution.'
)
help
=
'The worker class to use for distributed execution.'
)
parser
.
add_argument
(
"--generation-config"
,
type
=
nullable_str
,
default
=
None
,
help
=
"The folder path to the generation config. "
"Defaults to None, will use the default generation config in vLLM. "
"If set to 'auto', the generation config will be automatically "
"loaded from model. If set to a folder path, the generation config "
"will be loaded from the specified folder path."
)
return
parser
return
parser
@
classmethod
@
classmethod
...
@@ -997,10 +1013,11 @@ class EngineArgs:
...
@@ -997,10 +1013,11 @@ class EngineArgs:
use_async_output_proc
=
not
self
.
disable_async_output_proc
,
use_async_output_proc
=
not
self
.
disable_async_output_proc
,
config_format
=
self
.
config_format
,
config_format
=
self
.
config_format
,
mm_processor_kwargs
=
self
.
mm_processor_kwargs
,
mm_processor_kwargs
=
self
.
mm_processor_kwargs
,
mm_cache
_preprocessor
=
self
.
mm_cache
_preprocessor
,
disable_mm
_preprocessor
_cache
=
self
.
disable_mm
_preprocessor
_cache
,
override_neuron_config
=
self
.
override_neuron_config
,
override_neuron_config
=
self
.
override_neuron_config
,
override_pooler_config
=
self
.
override_pooler_config
,
override_pooler_config
=
self
.
override_pooler_config
,
logits_processor_pattern
=
self
.
logits_processor_pattern
)
logits_processor_pattern
=
self
.
logits_processor_pattern
,
generation_config
=
self
.
generation_config
)
def
create_load_config
(
self
)
->
LoadConfig
:
def
create_load_config
(
self
)
->
LoadConfig
:
return
LoadConfig
(
return
LoadConfig
(
...
@@ -1043,11 +1060,11 @@ class EngineArgs:
...
@@ -1043,11 +1060,11 @@ class EngineArgs:
device_config
=
DeviceConfig
(
device
=
self
.
device
)
device_config
=
DeviceConfig
(
device
=
self
.
device
)
model_config
=
self
.
create_model_config
()
model_config
=
self
.
create_model_config
()
if
model_config
.
is_multimodal_model
:
if
(
model_config
.
is_multimodal_model
and
not
envs
.
VLLM_USE_V1
if
self
.
enable_prefix_caching
:
and
self
.
enable_prefix_caching
)
:
logger
.
warning
(
logger
.
warning
(
"--enable-prefix-caching is currently not "
"--enable-prefix-caching is currently not
"
"supported for multimodal models in v0 and
"
"supported for multimodal models and
has been disabled."
)
"
has been disabled."
)
self
.
enable_prefix_caching
=
False
self
.
enable_prefix_caching
=
False
cache_config
=
CacheConfig
(
cache_config
=
CacheConfig
(
...
@@ -1149,7 +1166,7 @@ class EngineArgs:
...
@@ -1149,7 +1166,7 @@ class EngineArgs:
num_speculative_heads
=
self
.
num_speculative_heads
num_speculative_heads
=
self
.
num_speculative_heads
)
)
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
# If the feature combo become valid
if
self
.
num_scheduler_steps
>
1
:
if
self
.
num_scheduler_steps
>
1
:
if
speculative_config
is
not
None
:
if
speculative_config
is
not
None
:
...
@@ -1269,11 +1286,14 @@ class EngineArgs:
...
@@ -1269,11 +1286,14 @@ class EngineArgs:
# When no user override, set the default values based on the usage
# When no user override, set the default values based on the usage
# context.
# context.
# TODO(woosuk): Tune the default values for different hardware.
# TODO(woosuk): Tune the default values for different hardware.
if
self
.
max_num_batched_tokens
is
None
:
default_max_num_batched_tokens
=
{
if
usage_context
==
UsageContext
.
LLM_CLASS
:
UsageContext
.
LLM_CLASS
:
8192
,
self
.
max_num_batched_tokens
=
8192
UsageContext
.
OPENAI_API_SERVER
:
2048
,
elif
usage_context
==
UsageContext
.
OPENAI_API_SERVER
:
}
self
.
max_num_batched_tokens
=
2048
if
(
self
.
max_num_batched_tokens
is
None
and
usage_context
in
default_max_num_batched_tokens
):
self
.
max_num_batched_tokens
=
default_max_num_batched_tokens
[
usage_context
]
logger
.
warning
(
logger
.
warning
(
"Setting max_num_batched_tokens to %d for %s usage context."
,
"Setting max_num_batched_tokens to %d for %s usage context."
,
self
.
max_num_batched_tokens
,
usage_context
.
value
)
self
.
max_num_batched_tokens
,
usage_context
.
value
)
...
@@ -1283,9 +1303,6 @@ class EngineArgs:
...
@@ -1283,9 +1303,6 @@ class EngineArgs:
Override the EngineConfig's configs based on the usage context for V1.
Override the EngineConfig's configs based on the usage context for V1.
"""
"""
assert
envs
.
VLLM_USE_V1
,
"V1 is not enabled"
assert
envs
.
VLLM_USE_V1
,
"V1 is not enabled"
if
engine_config
.
model_config
.
is_multimodal_model
:
# TODO (ywang96): Enable APC by default when VLM supports it.
assert
not
engine_config
.
cache_config
.
enable_prefix_caching
@
dataclass
@
dataclass
...
...
vllm/engine/async_llm_engine.py
View file @
96ae75ad
...
@@ -1256,3 +1256,10 @@ class AsyncLLMEngine(EngineClient):
...
@@ -1256,3 +1256,10 @@ class AsyncLLMEngine(EngineClient):
self
.
engine
.
model_executor
.
stop_profile
()
self
.
engine
.
model_executor
.
stop_profile
()
else
:
else
:
self
.
engine
.
model_executor
.
_run_workers
(
"stop_profile"
)
self
.
engine
.
model_executor
.
_run_workers
(
"stop_profile"
)
# TODO(v1): Remove this class proxy when V1 goes default.
if
envs
.
VLLM_USE_V1
:
from
vllm.v1.engine.async_llm
import
AsyncLLM
AsyncLLMEngine
=
AsyncLLM
# type: ignore
vllm/engine/llm_engine.py
View file @
96ae75ad
...
@@ -6,8 +6,8 @@ from collections import deque
...
@@ -6,8 +6,8 @@ from collections import deque
from
contextlib
import
contextmanager
from
contextlib
import
contextmanager
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
functools
import
partial
from
functools
import
partial
from
typing
import
(
TYPE_CHECKING
,
Any
,
Callable
,
ClassVar
,
Deque
,
Dict
,
from
typing
import
(
TYPE_CHECKING
,
Callable
,
ClassVar
,
Deque
,
Dict
,
Iterable
,
Iterable
,
List
,
Mapping
,
NamedTuple
,
Optional
)
List
,
Mapping
,
NamedTuple
,
Optional
)
from
typing
import
Sequence
as
GenericSequence
from
typing
import
Sequence
as
GenericSequence
from
typing
import
Set
,
Type
,
Union
,
cast
,
overload
from
typing
import
Set
,
Type
,
Union
,
cast
,
overload
...
@@ -53,7 +53,6 @@ from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
...
@@ -53,7 +53,6 @@ from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
SequenceGroupOutput
,
SequenceStatus
,
CompletionSequenceGroupOutput
,
VLLM_INVALID_TOKEN_ID
)
SequenceGroupOutput
,
SequenceStatus
,
CompletionSequenceGroupOutput
,
VLLM_INVALID_TOKEN_ID
)
from
vllm.tracing
import
(
SpanAttributes
,
SpanKind
,
extract_trace_context
,
from
vllm.tracing
import
(
SpanAttributes
,
SpanKind
,
extract_trace_context
,
init_tracer
)
init_tracer
)
from
vllm.transformers_utils.config
import
try_get_generation_config
from
vllm.transformers_utils.detokenizer
import
Detokenizer
from
vllm.transformers_utils.detokenizer
import
Detokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer_group
import
(
from
vllm.transformers_utils.tokenizer_group
import
(
...
@@ -66,20 +65,6 @@ from vllm.version import __version__ as VLLM_VERSION
...
@@ -66,20 +65,6 @@ from vllm.version import __version__ as VLLM_VERSION
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
_LOCAL_LOGGING_INTERVAL_SEC
=
5
_LOCAL_LOGGING_INTERVAL_SEC
=
5
def
_load_generation_config_dict
(
model_config
:
ModelConfig
)
->
Dict
[
str
,
Any
]:
config
=
try_get_generation_config
(
model_config
.
model
,
trust_remote_code
=
model_config
.
trust_remote_code
,
revision
=
model_config
.
revision
,
)
if
config
is
None
:
return
{}
return
config
.
to_diff_dict
()
_G
=
TypeVar
(
"_G"
,
bound
=
BaseTokenizerGroup
,
default
=
BaseTokenizerGroup
)
_G
=
TypeVar
(
"_G"
,
bound
=
BaseTokenizerGroup
,
default
=
BaseTokenizerGroup
)
_O
=
TypeVar
(
"_O"
,
RequestOutput
,
PoolingRequestOutput
)
_O
=
TypeVar
(
"_O"
,
RequestOutput
,
PoolingRequestOutput
)
...
@@ -149,7 +134,7 @@ class LLMEngine:
...
@@ -149,7 +134,7 @@ class LLMEngine:
and the :class:`AsyncLLMEngine` class wraps this class for online serving.
and the :class:`AsyncLLMEngine` class wraps this class for online serving.
The config arguments are derived from :class:`~vllm.EngineArgs`. (See
The config arguments are derived from :class:`~vllm.EngineArgs`. (See
:ref:`engine
_
args`)
:ref:`engine
-
args`)
Args:
Args:
model_config: The configuration related to the LLM model.
model_config: The configuration related to the LLM model.
...
@@ -275,8 +260,8 @@ class LLMEngine:
...
@@ -275,8 +260,8 @@ class LLMEngine:
return
tokenizer_group
.
get_lora_tokenizer
(
sequence
.
lora_request
)
return
tokenizer_group
.
get_lora_tokenizer
(
sequence
.
lora_request
)
self
.
seq_counter
=
Counter
()
self
.
seq_counter
=
Counter
()
self
.
generation_config_fields
=
_load_generation_config_dict
(
self
.
generation_config_fields
=
(
self
.
model_config
)
self
.
model_config
.
try_get_generation_config
()
)
self
.
input_preprocessor
=
InputPreprocessor
(
self
.
model_config
,
self
.
input_preprocessor
=
InputPreprocessor
(
self
.
model_config
,
self
.
tokenizer
,
self
.
tokenizer
,
...
...
Prev
1
…
9
10
11
12
13
14
15
16
17
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment