Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
96ae75ad
Commit
96ae75ad
authored
Jan 04, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev
parents
f9f4a735
2339d59f
Changes
374
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
420 additions
and
276 deletions
+420
-276
tests/weight_loading/run_model_weight_loading_test.sh
tests/weight_loading/run_model_weight_loading_test.sh
+4
-0
tests/weight_loading/test_weight_loading.py
tests/weight_loading/test_weight_loading.py
+7
-0
vllm/_custom_ops.py
vllm/_custom_ops.py
+109
-53
vllm/adapter_commons/models.py
vllm/adapter_commons/models.py
+4
-5
vllm/assets/audio.py
vllm/assets/audio.py
+12
-7
vllm/assets/base.py
vllm/assets/base.py
+3
-4
vllm/assets/image.py
vllm/assets/image.py
+1
-2
vllm/assets/video.py
vllm/assets/video.py
+3
-6
vllm/attention/backends/rocm_flash_attn.py
vllm/attention/backends/rocm_flash_attn.py
+1
-1
vllm/attention/layer.py
vllm/attention/layer.py
+1
-0
vllm/benchmarks/benchmark_throughput.py
vllm/benchmarks/benchmark_throughput.py
+91
-42
vllm/block.py
vllm/block.py
+0
-88
vllm/compilation/backends.py
vllm/compilation/backends.py
+5
-4
vllm/compilation/multi_output_match.py
vllm/compilation/multi_output_match.py
+2
-1
vllm/compilation/pass_manager.py
vllm/compilation/pass_manager.py
+2
-2
vllm/config.py
vllm/config.py
+120
-15
vllm/core/evictor.py
vllm/core/evictor.py
+2
-2
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+41
-24
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+7
-0
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+5
-20
No files found.
tests/weight_loading/run_model_weight_loading_test.sh
View file @
96ae75ad
...
...
@@ -26,6 +26,10 @@ do
export
QUANTIZATION
=
${
array
[0]
}
export
MODEL_NAME
=
${
array
[1]
}
export
REVISION
=
${
array
[2]
}
# If array length is larger than 3, then MIN_CAPABILITY is provided
if
[
${#
array
[@]
}
-gt
3
]
;
then
export
MIN_CAPABILITY
=
${
array
[3]
}
fi
pytest
-s
weight_loading/test_weight_loading.py
||
LOCAL_SUCCESS
=
$?
if
[[
$LOCAL_SUCCESS
==
0
]]
;
then
...
...
tests/weight_loading/test_weight_loading.py
View file @
96ae75ad
import
os
import
pytest
import
torch
from
..utils
import
models_path_prefix
from
vllm.platforms
import
current_platform
MAX_MODEL_LEN
=
1024
MODEL_NAME
=
os
.
environ
.
get
(
"MODEL_NAME"
,
os
.
path
.
join
(
models_path_prefix
,
"robertgshaw2/zephyr-7b-beta-channelwise-gptq"
))
REVISION
=
os
.
environ
.
get
(
"REVISION"
,
"main"
)
QUANTIZATION
=
os
.
environ
.
get
(
"QUANTIZATION"
,
"gptq_marlin"
)
MIN_CAPABILITY
=
os
.
environ
.
get
(
"MIN_CAPABILITY"
,
"89"
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
int
(
MIN_CAPABILITY
)),
reason
=
"Current system does not have minimum capability."
)
def
test_weight_loading
(
vllm_runner
):
"""
Test parameter weight loading with tp>1.
...
...
vllm/_custom_ops.py
View file @
96ae75ad
import
contextlib
import
functools
import
importlib
from
typing
import
TYPE_CHECKING
,
List
,
Optional
,
Tuple
,
Union
,
Type
...
...
@@ -44,34 +42,6 @@ else:
from
torch.library
import
impl_abstract
as
register_fake
def
hint_on_error
(
fn
):
@
functools
.
wraps
(
fn
)
def
wrapper
(
*
args
,
**
kwargs
):
try
:
return
fn
(
*
args
,
**
kwargs
)
except
NotImplementedError
as
e
:
msg
=
(
"Error in calling custom op %s: %s
\n
"
"Not implemented or built, mostly likely because the current current device "
"does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set "
"incorrectly while building)"
)
logger
.
error
(
msg
,
fn
.
__name__
,
e
)
raise
NotImplementedError
(
msg
%
(
fn
.
__name__
,
e
))
from
e
except
AttributeError
as
e
:
msg
=
(
"Error in calling custom op %s: %s
\n
"
"Possibly you have built or installed an obsolete version of vllm.
\n
"
"Please try a clean build and install of vllm,"
"or remove old built files such as vllm/*cpython*.so and build/ ."
)
logger
.
error
(
msg
,
fn
.
__name__
,
e
)
raise
e
return
wrapper
# activation ops
def
silu_and_mul
(
out
:
torch
.
Tensor
,
x
:
torch
.
Tensor
)
->
None
:
torch
.
ops
.
_C
.
silu_and_mul
(
out
,
x
)
...
...
@@ -984,6 +954,114 @@ def cutlass_scaled_mm_azp(a: torch.Tensor,
return
out
def
cutlass_sparse_scaled_mm_supported
(
cuda_device_capability
:
int
)
->
bool
:
return
torch
.
ops
.
_C
.
cutlass_sparse_scaled_mm_supported
(
cuda_device_capability
)
def
cutlass_sparse_compress
(
a
:
torch
.
Tensor
)
\
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""
Compresses a sparse matrix for use with Cutlass sparse operations.
This function takes a dense tensor and compresses it into two components:
non-zero elements and metadata. The compressed representation is compatible
with Cutlass sparse kernels.
Args:
a (torch.Tensor):
The input tensor to be compressed. Must have one of the following data types:
- `torch.int8`
- `torch.float8_e4m3fn`
- `torch.bfloat16`
- `torch.float16`
Returns:
Tuple[torch.Tensor, torch.Tensor]:
A tuple containing:
- `a_nzs` (torch.Tensor): A tensor containing non-zero elements of `a`.
- `a_meta` (torch.Tensor): A tensor containing metadata for the sparse representation.
Raises:
ValueError: If the compression operation fails.
Notes:
- The `a_meta` tensor has a data type of `torch.uint8`.
- Each metadata element encodes the sparsity of 4 non-zero elements (i.e., `elemsPerMetaElem = 4`).
- The shape of `a_nzs` is `(m, k // 2)`, where `m` and `k` are the dimensions of the input tensor.
- The shape of `a_meta` is `(m, k // 2 // elemsPerMetaElem)`.
"""
assert
(
a
.
dtype
in
[
torch
.
int8
,
torch
.
float8_e4m3fn
,
torch
.
bfloat16
,
torch
.
float16
])
assert
(
a
.
is_contiguous
())
# a_meta.dtype: torch.uint8 so elemsPerMetaElem = 8b / 2b_per_nz = 4
elemsPerMetaElem
=
4
m
=
a
.
shape
[
0
]
k
=
a
.
shape
[
1
]
assert
(
k
%
2
==
0
)
a_nzs
=
torch
.
empty
((
m
,
k
//
2
),
dtype
=
a
.
dtype
,
device
=
a
.
device
)
a_meta
=
torch
.
empty
((
m
,
k
//
2
//
elemsPerMetaElem
),
dtype
=
torch
.
uint8
,
device
=
a
.
device
)
if
not
(
torch
.
ops
.
_C
.
cutlass_sparse_compress_entry
(
a_nzs
,
a_meta
,
a
)):
raise
ValueError
assert
(
a_nzs
.
is_contiguous
())
assert
(
a_meta
.
is_contiguous
())
return
a_nzs
,
a_meta
def
cutlass_scaled_sparse_mm
(
a
:
torch
.
Tensor
,
bt_nzs
:
torch
.
Tensor
,
bt_meta
:
torch
.
Tensor
,
scale_a
:
torch
.
Tensor
,
scale_b
:
torch
.
Tensor
,
out_dtype
:
torch
.
dtype
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
"""
Performs a scaled sparse matrix multiplication using Cutlass.
Steps:
1. Create a dense matrix `a` of shape (m, k) on the CUDA device:
`a = torch.randn((m, k), device='cuda')`.
2. Create a dense matrix `b` of shape (k, n) on the CUDA device:
`b = torch.randn((k, n), device='cuda')`.
3. Prune matrix `b` to 2:4 sparsity along the specified dimension:
`b = prune_to_2_4(b, dim=0)`.
4. Compress the transposed sparse matrix `b.t()`:
`bt_nzs, bt_meta = cutlass_sparse_compress(b.t())`.
5. Perform sparse matrix multiplication using the compressed matrix,
applying scaling factors for `a` and `b`, and the output data type:
`out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b, out_dtype)`.
Returns:
- The result of the scaled sparse matrix multiplication.
"""
assert
(
bt_nzs
.
shape
[
0
]
%
16
==
0
and
bt_nzs
.
shape
[
1
]
%
16
==
0
)
assert
(
out_dtype
is
torch
.
bfloat16
or
out_dtype
is
torch
.
float16
)
assert
bias
is
None
or
bias
.
shape
[
0
]
==
bt_nzs
.
shape
[
0
]
\
and
bias
.
dtype
==
out_dtype
m
=
a
.
shape
[
0
]
n
=
bt_nzs
.
shape
[
0
]
out
=
torch
.
empty
((
m
,
n
),
dtype
=
out_dtype
,
device
=
a
.
device
)
torch
.
ops
.
_C
.
cutlass_scaled_sparse_mm
(
out
,
a
,
bt_nzs
,
bt_meta
,
scale_a
,
scale_b
,
bias
)
return
out
# aqlm
def
aqlm_gemm
(
input
:
torch
.
Tensor
,
codes
:
torch
.
Tensor
,
codebooks
:
torch
.
Tensor
,
scales
:
torch
.
Tensor
,
...
...
@@ -1426,6 +1504,7 @@ def register_graph_buffers(fa: int, handles: List[List[int]],
offsets
:
List
[
List
[
int
]])
->
None
:
torch
.
ops
.
_C_custom_ar
.
register_graph_buffers
(
fa
,
handles
,
offsets
)
def
read_cache
(
keys
:
torch
.
Tensor
,
values
:
torch
.
Tensor
,
...
...
@@ -1449,26 +1528,3 @@ def write_cache_multi_layers(
torch
.
ops
.
_C_cache_ops
.
write_cache_multi_layers
(
keys
,
values
,
key_caches
,
value_caches
,
slot_mapping
,
kv_cache_dtype
)
# temporary fix for https://github.com/vllm-project/vllm/issues/5456
# TODO: remove this in v0.6.0
names_and_values
=
globals
()
names_and_values_to_update
=
{}
# prepare variables to avoid dict size change during iteration
k
,
v
,
arg
=
None
,
None
,
None
fn_type
=
type
(
lambda
x
:
x
)
for
k
,
v
in
names_and_values
.
items
():
# find functions that are defined in this file and have torch.Tensor
# in their annotations. `arg == "torch.Tensor"` is used to handle
# the case when users use `import __annotations__` to turn type
# hints into strings.
if
isinstance
(
v
,
fn_type
)
\
and
v
.
__code__
.
co_filename
==
__file__
\
and
any
(
arg
is
torch
.
Tensor
or
arg
==
"torch.Tensor"
for
arg
in
v
.
__annotations__
.
values
()):
names_and_values_to_update
[
k
]
=
hint_on_error
(
v
)
names_and_values
.
update
(
names_and_values_to_update
)
del
names_and_values_to_update
,
names_and_values
,
v
,
k
,
fn_type
vllm/adapter_commons/models.py
View file @
96ae75ad
from
abc
import
ABC
,
abstractmethod
from
typing
import
Any
,
Callable
,
Dict
,
Hashable
,
Optional
,
TypeVar
from
typing
import
Any
,
Callable
,
Dict
,
Optional
,
TypeVar
from
torch
import
nn
...
...
@@ -24,14 +24,13 @@ class AdapterModel(ABC):
T
=
TypeVar
(
'T'
)
class
AdapterLRUCache
(
LRUCache
[
T
]):
class
AdapterLRUCache
(
LRUCache
[
int
,
T
]):
def
__init__
(
self
,
capacity
:
int
,
deactivate_fn
:
Callable
[[
Hashable
],
None
]):
def
__init__
(
self
,
capacity
:
int
,
deactivate_fn
:
Callable
[[
int
],
object
]):
super
().
__init__
(
capacity
)
self
.
deactivate_fn
=
deactivate_fn
def
_on_remove
(
self
,
key
:
Hashable
,
value
:
Optional
[
T
]):
def
_on_remove
(
self
,
key
:
int
,
value
:
Optional
[
T
]):
logger
.
debug
(
"Removing adapter int id: %d"
,
key
)
self
.
deactivate_fn
(
key
)
return
super
().
_on_remove
(
key
,
value
)
...
...
vllm/assets/audio.py
View file @
96ae75ad
from
dataclasses
import
dataclass
from
typing
import
Literal
,
Tuple
from
typing
import
Literal
from
urllib.parse
import
urljoin
import
librosa
import
numpy
as
np
import
numpy.typing
as
npt
from
vllm.assets.base
import
get_vllm_public_assets
,
vLLM_S3_BUCKET_URL
from
vllm.utils
import
PlaceholderModule
from
.base
import
VLLM_S3_BUCKET_URL
,
get_vllm_public_assets
try
:
import
librosa
except
ImportError
:
librosa
=
PlaceholderModule
(
"librosa"
)
# type: ignore[assignment]
ASSET_DIR
=
"multimodal_asset"
...
...
@@ -15,8 +21,7 @@ class AudioAsset:
name
:
Literal
[
"winning_call"
,
"mary_had_lamb"
]
@
property
def
audio_and_sample_rate
(
self
)
->
Tuple
[
np
.
ndarray
,
int
]:
def
audio_and_sample_rate
(
self
)
->
tuple
[
npt
.
NDArray
,
int
]:
audio_path
=
get_vllm_public_assets
(
filename
=
f
"
{
self
.
name
}
.ogg"
,
s3_prefix
=
ASSET_DIR
)
y
,
sr
=
librosa
.
load
(
audio_path
,
sr
=
None
)
...
...
@@ -25,4 +30,4 @@ class AudioAsset:
@
property
def
url
(
self
)
->
str
:
return
urljoin
(
v
LLM_S3_BUCKET_URL
,
f
"
{
ASSET_DIR
}
/
{
self
.
name
}
.ogg"
)
return
urljoin
(
V
LLM_S3_BUCKET_URL
,
f
"
{
ASSET_DIR
}
/
{
self
.
name
}
.ogg"
)
vllm/assets/base.py
View file @
96ae75ad
...
...
@@ -4,9 +4,8 @@ from typing import Optional
import
vllm.envs
as
envs
from
vllm.connections
import
global_http_connection
from
vllm.envs
import
VLLM_IMAGE_FETCH_TIMEOUT
v
LLM_S3_BUCKET_URL
=
"https://vllm-public-assets.s3.us-west-2.amazonaws.com"
V
LLM_S3_BUCKET_URL
=
"https://vllm-public-assets.s3.us-west-2.amazonaws.com"
def
get_cache_dir
()
->
Path
:
...
...
@@ -32,8 +31,8 @@ def get_vllm_public_assets(filename: str,
if
s3_prefix
is
not
None
:
filename
=
s3_prefix
+
"/"
+
filename
global_http_connection
.
download_file
(
f
"
{
v
LLM_S3_BUCKET_URL
}
/
{
filename
}
"
,
f
"
{
V
LLM_S3_BUCKET_URL
}
/
{
filename
}
"
,
asset_path
,
timeout
=
VLLM_IMAGE_FETCH_TIMEOUT
)
timeout
=
envs
.
VLLM_IMAGE_FETCH_TIMEOUT
)
return
asset_path
vllm/assets/image.py
View file @
96ae75ad
...
...
@@ -4,7 +4,7 @@ from typing import Literal
import
torch
from
PIL
import
Image
from
vllm.assets
.base
import
get_vllm_public_assets
from
.base
import
get_vllm_public_assets
VLM_IMAGES_DIR
=
"vision_model_images"
...
...
@@ -15,7 +15,6 @@ class ImageAsset:
@
property
def
pil_image
(
self
)
->
Image
.
Image
:
image_path
=
get_vllm_public_assets
(
filename
=
f
"
{
self
.
name
}
.jpg"
,
s3_prefix
=
VLM_IMAGES_DIR
)
return
Image
.
open
(
image_path
)
...
...
vllm/assets/video.py
View file @
96ae75ad
...
...
@@ -2,13 +2,13 @@ from dataclasses import dataclass
from
functools
import
lru_cache
from
typing
import
List
,
Literal
import
cv2
import
numpy
as
np
import
numpy.typing
as
npt
from
huggingface_hub
import
hf_hub_download
from
PIL
import
Image
from
vllm.multimodal.utils
import
(
sample_frames_from_video
,
try_import_video_packages
)
from
vllm.multimodal.video
import
sample_frames_from_video
from
.base
import
get_cache_dir
...
...
@@ -19,7 +19,7 @@ def download_video_asset(filename: str) -> str:
Download and open an image from huggingface
repo: raushan-testing-hf/videos-test
"""
video_directory
=
get_cache_dir
()
/
"video-eample-data"
video_directory
=
get_cache_dir
()
/
"video-e
x
ample-data"
video_directory
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
video_path
=
video_directory
/
filename
...
...
@@ -35,8 +35,6 @@ def download_video_asset(filename: str) -> str:
def
video_to_ndarrays
(
path
:
str
,
num_frames
:
int
=
-
1
)
->
npt
.
NDArray
:
cv2
,
_
=
try_import_video_packages
()
cap
=
cv2
.
VideoCapture
(
path
)
if
not
cap
.
isOpened
():
raise
ValueError
(
f
"Could not open video file
{
path
}
"
)
...
...
@@ -59,7 +57,6 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
def
video_to_pil_images_list
(
path
:
str
,
num_frames
:
int
=
-
1
)
->
List
[
Image
.
Image
]:
cv2
,
_
=
try_import_video_packages
()
frames
=
video_to_ndarrays
(
path
,
num_frames
)
return
[
Image
.
fromarray
(
cv2
.
cvtColor
(
frame
,
cv2
.
COLOR_BGR2RGB
))
...
...
vllm/attention/backends/rocm_flash_attn.py
View file @
96ae75ad
...
...
@@ -447,7 +447,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
Returns:
shape = [num_tokens, num_heads * head_size]
"""
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
vllm/attention/layer.py
View file @
96ae75ad
...
...
@@ -191,6 +191,7 @@ class MultiHeadAttention(nn.Module):
kv_cache_dtype
=
None
,
block_size
=
16
,
is_attention_free
=
False
)
attn_backend
=
backend_name_to_enum
(
attn_backend
.
get_name
())
if
attn_backend
in
{
_Backend
.
FLASH_ATTN
,
_Backend
.
FLASH_ATTN_VLLM_V1
}:
attn_backend
=
_Backend
.
XFORMERS
...
...
vllm/benchmarks/benchmark_throughput.py
View file @
96ae75ad
...
...
@@ -4,7 +4,8 @@ import dataclasses
import
json
import
random
import
time
from
typing
import
List
,
Optional
from
functools
import
cache
from
typing
import
Dict
,
List
,
Optional
,
Tuple
import
numpy
as
np
import
torch
...
...
@@ -20,8 +21,11 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from
vllm.entrypoints.openai.api_server
import
(
build_async_engine_client_from_engine_args
)
from
vllm.inputs
import
TextPrompt
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.utils
import
get_adapter_absolute_path
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
get_lora_tokenizer
from
vllm.utils
import
FlexibleArgumentParser
,
merge_async_iterators
...
...
@@ -31,15 +35,17 @@ class SampleRequest:
Attributes:
prompt: The input text prompt for the model.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
lora_request: Optional LoRARequest specifying the LoRA to use.
"""
prompt
:
str
prompt_len
:
int
expected_output_len
:
int
multi_modal_data
:
Optional
[
MultiModalDataDict
]
=
None
lora_request
:
Optional
[
LoRARequest
]
=
None
def
_get_prompt_for_image_model
(
question
:
str
,
*
,
model
:
str
)
->
str
:
...
...
@@ -63,8 +69,30 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
raise
ValueError
(
f
"Unsupported model
{
model
}
"
)
@
cache
def
lora_path_on_disk
(
lora_path
:
str
)
->
str
:
return
get_adapter_absolute_path
(
lora_path
)
lora_tokenizer_cache
:
Dict
[
int
,
AnyTokenizer
]
=
{}
def
get_random_lora_request
(
args
:
argparse
.
Namespace
)
->
Tuple
[
LoRARequest
,
Optional
[
AnyTokenizer
]]:
global
lora_tokenizer_cache
lora_id
=
random
.
randint
(
1
,
args
.
max_loras
)
lora_request
=
LoRARequest
(
lora_name
=
str
(
lora_id
),
lora_int_id
=
lora_id
,
lora_path
=
lora_path_on_disk
(
args
.
lora_path
))
if
lora_id
not
in
lora_tokenizer_cache
:
lora_tokenizer_cache
[
lora_id
]
=
get_lora_tokenizer
(
lora_request
)
return
lora_request
,
lora_tokenizer_cache
[
lora_id
]
def
sample_requests
(
tokenizer
:
PreTrainedTokenizerBase
,
args
:
argparse
.
Namespace
)
->
List
[
SampleRequest
]:
dataset_path
:
str
=
args
.
dataset
num_requests
:
int
=
args
.
num_prompts
fixed_output_len
:
Optional
[
int
]
=
args
.
output_len
...
...
@@ -82,7 +110,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
# Filter out sequences that are too long or too short
filtered_dataset
:
List
[
SampleRequest
]
=
[]
for
data
in
dataset
:
for
data
in
tqdm
(
dataset
,
total
=
len
(
filtered_dataset
),
desc
=
"sampling requests"
):
if
len
(
filtered_dataset
)
==
num_requests
:
break
...
...
@@ -105,9 +135,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
continue
prompt
=
_get_prompt_for_image_model
(
question
=
prompt
,
model
=
model
)
request_tokenizer
=
tokenizer
lora_request
:
Optional
[
LoRARequest
]
=
None
if
args
.
enable_lora
:
lora_request
,
lora_tokenizer
=
get_random_lora_request
(
args
)
if
lora_tokenizer
:
request_tokenizer
=
lora_tokenizer
# Tokenize the prompts and completions.
prompt_token_ids
=
tokenizer
(
prompt
).
input_ids
completion_token_ids
=
tokenizer
(
completion
).
input_ids
prompt_token_ids
=
request_
tokenizer
(
prompt
).
input_ids
completion_token_ids
=
request_
tokenizer
(
completion
).
input_ids
prompt_len
=
len
(
prompt_token_ids
)
output_len
=
len
(
completion_token_ids
)
if
fixed_output_len
is
None
else
fixed_output_len
...
...
@@ -121,7 +158,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
SampleRequest
(
prompt
=
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
multi_modal_data
))
multi_modal_data
=
multi_modal_data
,
lora_request
=
lora_request
))
return
filtered_dataset
...
...
@@ -150,11 +188,14 @@ def run_vllm(
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
))
lora_requests
:
Optional
[
List
[
LoRARequest
]]
=
None
if
engine_args
.
enable_lora
:
lora_requests
=
[
request
.
lora_request
for
request
in
requests
]
# warmup
warmup_prompts
:
List
[
TextPrompt
]
=
[]
warmup_sampling_params
:
List
[
SamplingParams
]
=
[]
for
request
in
warmup_
promp
ts
:
for
request
in
warmup_
reques
ts
:
warmup_prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
multi_modal_data
=
request
.
multi_modal_data
))
...
...
@@ -191,9 +232,13 @@ def run_vllm(
if
not
use_beam_search
:
start
=
time
.
perf_counter
()
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
llm
.
generate
(
prompts
,
sampling_params
,
lora_request
=
lora_requests
,
use_tqdm
=
True
)
end
=
time
.
perf_counter
()
else
:
assert
lora_requests
is
None
,
"BeamSearch API does not support LoRA"
prompts
=
[
request
.
prompt
for
request
in
requests
]
# output_len should be the same for all requests.
output_len
=
requests
[
0
][
2
]
...
...
@@ -225,6 +270,7 @@ async def run_vllm_async(
# Add the requests to the engine.
prompts
:
List
[
TextPrompt
]
=
[]
sampling_params
:
List
[
SamplingParams
]
=
[]
lora_requests
:
List
[
Optional
[
LoRARequest
]]
=
[]
for
request
in
requests
:
prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
...
...
@@ -237,11 +283,16 @@ async def run_vllm_async(
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
))
lora_requests
.
append
(
request
.
lora_request
)
generators
=
[]
start
=
time
.
perf_counter
()
for
i
,
(
prompt
,
sp
)
in
enumerate
(
zip
(
prompts
,
sampling_params
)):
generator
=
llm
.
generate
(
prompt
,
sp
,
request_id
=
f
"test
{
i
}
"
)
for
i
,
(
prompt
,
sp
,
lr
)
in
enumerate
(
zip
(
prompts
,
sampling_params
,
lora_requests
)):
generator
=
llm
.
generate
(
prompt
,
sp
,
lora_request
=
lr
,
request_id
=
f
"test
{
i
}
"
)
generators
.
append
(
generator
)
all_gens
=
merge_async_iterators
(
*
generators
)
async
for
i
,
res
in
all_gens
:
...
...
@@ -340,6 +391,14 @@ def main(args: argparse.Namespace):
vocab_size
=
tokenizer
.
vocab_size
requests
=
[]
for
_
in
range
(
args
.
num_prompts
):
request_tokenizer
=
tokenizer
lora_request
:
Optional
[
LoRARequest
]
=
None
if
args
.
enable_lora
:
lora_request
,
lora_tokenizer
=
get_random_lora_request
(
args
)
if
lora_tokenizer
:
request_tokenizer
=
lora_tokenizer
# Synthesize a prompt with the given input length.
candidate_ids
=
[
random
.
randint
(
0
,
vocab_size
-
1
)
...
...
@@ -348,8 +407,8 @@ def main(args: argparse.Namespace):
# As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length.
for
_
in
range
(
5
):
# Max attempts to correct
candidate_prompt
=
tokenizer
.
decode
(
candidate_ids
)
tokenized_len
=
len
(
tokenizer
.
encode
(
candidate_prompt
))
candidate_prompt
=
request_
tokenizer
.
decode
(
candidate_ids
)
tokenized_len
=
len
(
request_
tokenizer
.
encode
(
candidate_prompt
))
if
tokenized_len
==
args
.
input_len
:
break
...
...
@@ -366,40 +425,14 @@ def main(args: argparse.Namespace):
requests
.
append
(
SampleRequest
(
prompt
=
candidate_prompt
,
prompt_len
=
args
.
input_len
,
expected_output_len
=
args
.
output_len
))
expected_output_len
=
args
.
output_len
,
lora_request
=
lora_request
))
else
:
requests
=
sample_requests
(
tokenizer
,
args
)
is_multi_modal
=
any
(
request
.
multi_modal_data
is
not
None
for
request
in
requests
)
if
args
.
backend
==
"vllm"
:
# if args.async_engine:
# run_args = [
# requests, args.model, args.tokenizer, args.quantization,
# args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
# args.trust_remote_code, args.dtype, args.max_model_len,
# args.enforce_eager, args.kv_cache_dtype,
# args.quantization_param_path, args.device,
# args.enable_prefix_caching, args.enable_chunked_prefill,
# args.max_num_batched_tokens, args.distributed_executor_backend,
# args.gpu_memory_utilization, args.num_scheduler_steps,
# args.use_v2_block_manager, args.download_dir, args.load_format,
# args.disable_async_output_proc
# ]
# else:
# run_args = [
# warmup_requests, requests, args.model, args.tokenizer, args.quantization,
# args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
# args.trust_remote_code, args.dtype, args.max_model_len,
# args.enforce_eager, args.kv_cache_dtype,
# args.quantization_param_path, args.device,
# args.enable_prefix_caching, args.enable_chunked_prefill,
# args.max_num_batched_tokens, args.distributed_executor_backend,
# args.gpu_memory_utilization, args.num_scheduler_steps,
# args.use_v2_block_manager, args.download_dir, args.load_format,
# args.disable_async_output_proc
# ]
if
args
.
async_engine
:
elapsed_time
=
uvloop
.
run
(
run_vllm_async
(
...
...
@@ -409,7 +442,7 @@ def main(args: argparse.Namespace):
args
.
disable_frontend_multiprocessing
,
))
else
:
elapsed_time
=
run_vllm
(
requests
,
args
.
n
,
elapsed_time
=
run_vllm
(
warmup_requests
,
requests
,
args
.
n
,
EngineArgs
.
from_cli_args
(
args
))
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
...
...
@@ -496,6 +529,14 @@ if __name__ == "__main__":
action
=
'store_true'
,
default
=
False
,
help
=
"Disable decoupled async engine frontend."
)
# LoRA
parser
.
add_argument
(
"--lora-path"
,
type
=
str
,
default
=
None
,
help
=
"Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier."
)
parser
=
AsyncEngineArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
if
args
.
tokenizer
is
None
:
...
...
@@ -505,6 +546,8 @@ if __name__ == "__main__":
assert
args
.
output_len
is
not
None
else
:
assert
args
.
input_len
is
None
if
args
.
enable_lora
:
assert
args
.
lora_path
is
not
None
if
args
.
backend
==
"vllm"
:
if
args
.
hf_max_batch_size
is
not
None
:
...
...
@@ -514,6 +557,9 @@ if __name__ == "__main__":
raise
ValueError
(
"HF max batch size is required for HF backend."
)
if
args
.
quantization
is
not
None
:
raise
ValueError
(
"Quantization is only for vLLM backend."
)
if
args
.
enable_lora
is
not
None
:
raise
ValueError
(
"LoRA benchmarking is only supported for vLLM"
" backend"
)
elif
args
.
backend
==
"mii"
:
if
args
.
dtype
!=
"auto"
:
raise
ValueError
(
"dtype must be auto for MII backend."
)
...
...
@@ -526,4 +572,7 @@ if __name__ == "__main__":
if
args
.
tokenizer
!=
args
.
model
:
raise
ValueError
(
"Tokenizer must be the same as the model for MII "
"backend."
)
if
args
.
enable_lora
is
not
None
:
raise
ValueError
(
"LoRA benchmarking is only supported for vLLM"
" backend"
)
main
(
args
)
\ No newline at end of file
vllm/block.py
deleted
100644 → 0
View file @
f9f4a735
"""Token blocks."""
from
typing
import
TYPE_CHECKING
,
Iterator
,
List
,
Optional
from
vllm.utils
import
Device
DEFAULT_LAST_ACCESSED_TIME
:
float
=
-
1
class
PhysicalTokenBlock
:
"""Represents the state of a block in the KV cache."""
def
__init__
(
self
,
device
:
Device
,
block_number
:
int
,
block_size
:
int
,
block_hash
:
int
,
num_hashed_tokens
:
int
,
)
->
None
:
self
.
device
=
device
self
.
block_number
=
block_number
self
.
block_size
=
block_size
self
.
block_hash
=
block_hash
self
.
num_hashed_tokens
=
num_hashed_tokens
self
.
ref_count
=
0
self
.
last_accessed
=
DEFAULT_LAST_ACCESSED_TIME
self
.
computed
=
False
def
__repr__
(
self
)
->
str
:
return
(
f
'PhysicalTokenBlock(device=
{
self
.
device
}
, '
f
'block_number=
{
self
.
block_number
}
, '
f
'num_hashed_tokens=
{
self
.
num_hashed_tokens
}
, '
f
'ref_count=
{
self
.
ref_count
}
, '
f
'last_accessed=
{
self
.
last_accessed
}
, '
f
'computed=
{
self
.
computed
}
)'
)
class
BlockTable
:
"""Holds a list of blocks with caching of their associated block_ids
"""
def
__init__
(
self
,
blocks
:
Optional
[
List
[
PhysicalTokenBlock
]]
=
None
):
self
.
_blocks
:
List
[
PhysicalTokenBlock
]
=
[]
self
.
_block_ids
:
List
[
int
]
=
[]
if
blocks
is
not
None
:
for
block
in
blocks
:
self
.
append
(
block
)
def
append
(
self
,
block
:
PhysicalTokenBlock
):
self
.
_blocks
.
append
(
block
)
self
.
_block_ids
.
append
(
block
.
block_number
)
def
__len__
(
self
)
->
int
:
return
len
(
self
.
_blocks
)
def
__getitem__
(
self
,
key
):
return
self
.
_blocks
[
key
]
if
TYPE_CHECKING
:
def
__iter__
(
self
)
->
Iterator
[
PhysicalTokenBlock
]:
raise
RuntimeError
(
"Method should be automatically generated"
)
def
__setitem__
(
self
,
key
,
value
):
if
isinstance
(
key
,
slice
):
blocks
=
value
self
.
_blocks
[
key
]
=
blocks
self
.
_block_ids
[
key
]
=
[
b
.
block_number
for
b
in
blocks
]
else
:
block
=
value
self
.
_blocks
[
key
]
=
block
self
.
_block_ids
[
key
]
=
block
.
block_number
def
reset
(
self
):
self
.
_blocks
=
[]
self
.
_block_ids
=
[]
def
copy
(
self
)
->
"BlockTable"
:
return
BlockTable
(
self
.
_blocks
)
def
list
(
self
)
->
List
[
PhysicalTokenBlock
]:
return
self
.
_blocks
def
ids
(
self
)
->
List
[
int
]:
return
self
.
_block_ids
vllm/compilation/backends.py
View file @
96ae75ad
...
...
@@ -141,14 +141,14 @@ class AlwaysHitShapeEnv:
return
""
def
wrap_inductor
(
graph
,
def
wrap_inductor
(
graph
:
fx
.
GraphModule
,
example_inputs
,
additional_inductor_config
,
compilation_config
:
CompilationConfig
,
graph_index
:
int
=
0
,
num_graphs
:
int
=
1
,
runtime_shape
:
Optional
[
int
]
=
None
,
use_inductor
:
bool
=
True
):
use_inductor
:
bool
=
True
)
->
Any
:
if
graph_index
==
0
:
# before compiling the first graph, record the start time
global
compilation_start_time
...
...
@@ -208,7 +208,7 @@ def wrap_inductor(graph,
from
torch._inductor.compile_fx
import
graph_returns_tuple
returns_tuple
=
graph_returns_tuple
(
graph
)
# this is the
graph
we return to Dynamo to run
# this is the
callable
we return to Dynamo to run
def
compiled_graph
(
*
args
):
# convert args to list
list_args
=
list
(
args
)
...
...
@@ -247,7 +247,7 @@ def wrap_inductor(graph,
# see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa
return
def
_get_shape_env
():
def
_get_shape_env
()
->
AlwaysHitShapeEnv
:
return
AlwaysHitShapeEnv
()
with
patch
(
# for hijacking the hash of the compiled graph
...
...
@@ -537,6 +537,7 @@ class VllmBackend:
example_inputs
[
x
].
clone
()
for
x
in
self
.
sym_tensor_indices
]
# this is the callable we return to Dynamo to run
def
copy_and_call
(
*
args
):
list_args
=
list
(
args
)
for
i
,
index
in
enumerate
(
self
.
sym_tensor_indices
):
...
...
vllm/compilation/multi_output_match.py
View file @
96ae75ad
...
...
@@ -7,6 +7,7 @@ from torch import fx
from
torch._higher_order_ops.auto_functionalize
import
auto_functionalized
from
torch._inductor
import
pattern_matcher
as
pm
from
torch._ops
import
OpOverload
from
torch.fx
import
Node
from
vllm.compilation.fx_utils
import
find_auto_fn
...
...
@@ -97,7 +98,7 @@ class MultiOutputMatch(abc.ABC):
self
.
graph
.
call_function
(
operator
.
getitem
,
(
tuple_node
,
idx
))
for
idx
in
indices
)
def
insert_auto_fn
(
self
,
op
:
OpOverload
,
kwargs
):
def
insert_auto_fn
(
self
,
op
:
OpOverload
,
kwargs
)
->
Node
:
"""
Insert an auto_functionalized node with the given op and kwargs.
"""
...
...
vllm/compilation/pass_manager.py
View file @
96ae75ad
from
typing
import
List
from
typing
import
Any
,
Dict
,
List
from
torch
import
fx
as
fx
...
...
@@ -53,7 +53,7 @@ class PostGradPassManager:
assert
isinstance
(
pass_
,
InductorPass
)
self
.
passes
.
append
(
pass_
)
def
__getstate__
(
self
):
def
__getstate__
(
self
)
->
Dict
[
str
,
List
[
Any
]]
:
"""
Custom pickling for the pass manager, as some passes cannot be pickled.
Pickling occurs because the pass manager is set as the value of
...
...
vllm/config.py
View file @
96ae75ad
...
...
@@ -22,12 +22,15 @@ from vllm.logger import init_logger
from
vllm.model_executor.layers.quantization
import
(
QUANTIZATION_METHODS
,
get_quantization_config
)
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
,
interface
from
vllm.tracing
import
is_otel_available
,
otel_import_error_traceback
from
vllm.transformers_utils.config
import
(
ConfigFormat
,
get_config
,
get_hf_image_processor_config
,
get_hf_text_config
,
get_pooling_config
,
get_sentence_transformer_tokenizer_config
,
is_encoder_decoder
,
uses_mrope
)
get_sentence_transformer_tokenizer_config
,
is_encoder_decoder
,
try_get_generation_config
,
uses_mrope
)
from
vllm.transformers_utils.s3_utils
import
S3Model
from
vllm.transformers_utils.utils
import
is_s3
from
vllm.utils
import
(
GiB_bytes
,
LayerBlockType
,
cuda_device_count_stateless
,
get_cpu_memory
,
print_warning_once
,
random_uuid
,
resolve_obj_by_qualname
)
...
...
@@ -148,9 +151,8 @@ class ModelConfig:
HuggingFace config.
mm_processor_kwargs: Arguments to be forwarded to the model's processor
for multi-modal data, e.g., image processor.
mm_cache_preprocessor: If true, then enables caching of the multi-modal
preprocessor/mapper. Otherwise, the mapper executes each time, and
for better performance consider enabling frontend process.
disable_mm_preprocessor_cache: If true, then disables caching of the
multi-modal preprocessor/mapper. (not recommended)
override_neuron_config: Initialize non default neuron config or
override default neuron config that are specific to Neuron devices,
this argument will be used to configure the neuron config that
...
...
@@ -159,8 +161,9 @@ class ModelConfig:
override default pooling config for the pooling model.
logits_processor_pattern: Optional regex pattern specifying valid
logits processor qualified names that can be passed with the
`logits_processors` extra completion argument. Defaults to None,
`logits_processors` extra completion argument. Defaults to None,
which allows no processors.
generation_config: Configuration parameter file for generation.
"""
def
compute_hash
(
self
)
->
str
:
...
...
@@ -216,10 +219,11 @@ class ModelConfig:
config_format
:
ConfigFormat
=
ConfigFormat
.
AUTO
,
hf_overrides
:
Optional
[
HfOverrides
]
=
None
,
mm_processor_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
mm_cache
_preprocessor
:
bool
=
False
,
disable_mm
_preprocessor
_cache
:
bool
=
False
,
override_neuron_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
override_pooler_config
:
Optional
[
"PoolerConfig"
]
=
None
,
logits_processor_pattern
:
Optional
[
str
]
=
None
)
->
None
:
logits_processor_pattern
:
Optional
[
str
]
=
None
,
generation_config
:
Optional
[
str
]
=
None
)
->
None
:
self
.
model
=
model
self
.
tokenizer
=
tokenizer
self
.
tokenizer_mode
=
tokenizer_mode
...
...
@@ -254,6 +258,8 @@ class ModelConfig:
f
"'Please instead use `--hf-overrides '
{
hf_override
!
r
}
'`"
)
warnings
.
warn
(
DeprecationWarning
(
msg
),
stacklevel
=
2
)
self
.
maybe_pull_model_tokenizer_for_s3
(
model
,
tokenizer
)
# The tokenizer version is consistent with the model version by default.
if
tokenizer_revision
is
None
:
self
.
tokenizer_revision
=
revision
...
...
@@ -286,7 +292,7 @@ class ModelConfig:
self
.
dtype
=
_get_and_verify_dtype
(
self
.
hf_text_config
,
dtype
)
self
.
use_async_output_proc
=
use_async_output_proc
self
.
mm_processor_kwargs
=
mm_processor_kwargs
self
.
mm_cache
_preprocessor
=
mm
_cache_preprocessor
self
.
disable_mm
_preprocessor_cache
=
disable_mm
_preprocessor
_cache
# Set enforce_eager to False if the value is unset.
if
self
.
enforce_eager
is
None
:
...
...
@@ -349,10 +355,36 @@ class ModelConfig:
self
.
pooler_config
=
self
.
_init_pooler_config
(
override_pooler_config
)
self
.
logits_processor_pattern
=
logits_processor_pattern
self
.
generation_config
=
generation_config
self
.
_verify_quantization
()
self
.
_verify_cuda_graph
()
self
.
_verify_bnb_config
()
def
maybe_pull_model_tokenizer_for_s3
(
self
,
model
:
str
,
tokenizer
:
str
)
->
None
:
"""
Pull the model config or tokenizer to a temporary
directory in case of S3.
Args:
model: The model name or path.
tokenizer: The tokenizer name or path.
"""
if
is_s3
(
model
)
or
is_s3
(
tokenizer
):
if
is_s3
(
model
):
self
.
s3_model
=
S3Model
()
self
.
s3_model
.
pull_files
(
model
,
allow_pattern
=
[
"*config.json"
])
self
.
model_weights
=
self
.
model
self
.
model
=
self
.
s3_model
.
dir
if
is_s3
(
tokenizer
):
self
.
s3_tokenizer
=
S3Model
()
self
.
s3_tokenizer
.
pull_files
(
model
,
ignore_pattern
=
[
"*.pt"
,
"*.safetensors"
,
"*.bin"
])
self
.
tokenizer
=
self
.
s3_tokenizer
.
dir
def
_init_multimodal_config
(
self
,
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]]
)
->
Optional
[
"MultiModalConfig"
]:
...
...
@@ -564,6 +596,12 @@ class ModelConfig:
self
.
max_seq_len_to_capture
=
min
(
self
.
max_seq_len_to_capture
,
self
.
max_model_len
)
if
(
self
.
hf_config
.
model_type
==
'deepseek_v3'
and
not
self
.
enforce_eager
):
logger
.
warning
(
"CUDA graph is not supported for Deepseek V3 yet, "
"fallback to the eager mode."
)
self
.
enforce_eager
=
True
def
_verify_bnb_config
(
self
)
->
None
:
"""
The current version of bitsandbytes (0.44.0) with 8-bit models does not
...
...
@@ -598,7 +636,7 @@ class ModelConfig:
self
.
use_async_output_proc
=
False
return
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
if
not
current_platform
.
is_async_output_supported
(
self
.
enforce_eager
):
logger
.
warning
(
...
...
@@ -618,7 +656,7 @@ class ModelConfig:
if
self
.
runner_type
==
"pooling"
:
self
.
use_async_output_proc
=
False
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
if
speculative_config
:
logger
.
warning
(
"Async output processing is not supported with"
...
...
@@ -680,8 +718,9 @@ class ModelConfig:
def
get_head_size
(
self
)
->
int
:
# TODO remove hard code
if
hasattr
(
self
.
hf_text_config
,
"model_type"
)
and
self
.
hf_text_config
.
model_type
==
'deepseek_v2'
:
if
hasattr
(
self
.
hf_text_config
,
"model_type"
)
and
(
self
.
hf_text_config
.
model_type
in
(
'deepseek_v2'
,
'deepseek_v3'
)):
# FlashAttention supports only head_size 32, 64, 128, 256,
# we need to pad head_size 192 to 256
return
256
...
...
@@ -814,6 +853,56 @@ class ModelConfig:
return
self
.
multimodal_config
def
try_get_generation_config
(
self
)
->
Dict
[
str
,
Any
]:
if
self
.
generation_config
is
None
or
self
.
generation_config
==
"auto"
:
config
=
try_get_generation_config
(
self
.
model
,
trust_remote_code
=
self
.
trust_remote_code
,
revision
=
self
.
revision
,
)
else
:
config
=
try_get_generation_config
(
self
.
generation_config
,
trust_remote_code
=
self
.
trust_remote_code
,
)
if
config
is
None
:
return
{}
return
config
.
to_diff_dict
()
def
get_diff_sampling_param
(
self
)
->
Dict
[
str
,
Any
]:
"""
This method returns a dictionary containing the parameters
that differ from the default sampling parameters, but only
if `generation_config` is set. If `generation_config` is not
set, an empty dictionary is returned.
Returns:
Dict[str, Any]: A dictionary with the differing sampling
parameters if `generation_config` is set, otherwise an
empty dictionary.
"""
if
self
.
generation_config
is
None
:
# When generation_config is not set
return
{}
config
=
self
.
try_get_generation_config
()
available_params
=
[
"repetition_penalty"
,
"temperature"
,
"top_k"
,
"top_p"
,
"min_p"
,
]
if
any
(
p
in
config
for
p
in
available_params
):
diff_sampling_param
=
{
p
:
config
.
get
(
p
)
for
p
in
available_params
if
config
.
get
(
p
)
is
not
None
}
else
:
diff_sampling_param
=
{}
return
diff_sampling_param
@
property
def
is_encoder_decoder
(
self
)
->
bool
:
"""Extract the HF encoder/decoder model flag."""
...
...
@@ -917,6 +1006,10 @@ class CacheConfig:
raise
ValueError
(
"GPU memory utilization must be less than 1.0. Got "
f
"
{
self
.
gpu_memory_utilization
}
."
)
if
(
current_platform
.
is_cuda
()
and
self
.
block_size
is
not
None
and
self
.
block_size
>
32
):
raise
ValueError
(
"CUDA Paged Attention kernel only supports "
f
"block sizes up to 32. Got
{
self
.
block_size
}
."
)
def
_verify_cache_dtype
(
self
)
->
None
:
if
self
.
cache_dtype
==
"auto"
:
...
...
@@ -1041,6 +1134,7 @@ class LoadFormat(str, enum.Enum):
GGUF
=
"gguf"
BITSANDBYTES
=
"bitsandbytes"
MISTRAL
=
"mistral"
RUNAI_STREAMER
=
"runai_streamer"
@
dataclass
...
...
@@ -1977,7 +2071,7 @@ class LoRAConfig:
model_config
.
quantization
)
def
verify_with_scheduler_config
(
self
,
scheduler_config
:
SchedulerConfig
):
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
if
scheduler_config
.
chunked_prefill_enabled
:
logger
.
warning
(
"LoRA with chunked prefill is still experimental "
...
...
@@ -2155,6 +2249,17 @@ def _get_and_verify_dtype(
else
:
torch_dtype
=
config_dtype
if
(
current_platform
.
is_cpu
()
and
current_platform
.
get_cpu_architecture
()
==
interface
.
CpuArchEnum
.
POWERPC
and
(
config_dtype
==
torch
.
float16
or
config_dtype
==
torch
.
float32
)):
logger
.
info
(
"For POWERPC, we cast models to bfloat16 instead of "
"using float16 by default. Float16 is not currently "
"supported for POWERPC."
)
torch_dtype
=
torch
.
bfloat16
if
current_platform
.
is_hpu
()
and
config_dtype
==
torch
.
float16
:
logger
.
info
(
"For HPU, we cast models to bfloat16 instead of"
...
...
@@ -3165,7 +3270,7 @@ class VllmConfig:
f
"enable_prefix_caching=
{
self
.
cache_config
.
enable_prefix_caching
}
, "
f
"chunked_prefill_enabled=
{
self
.
scheduler_config
.
chunked_prefill_enabled
}
, "
# noqa
f
"use_async_output_proc=
{
self
.
model_config
.
use_async_output_proc
}
, "
f
"
mm_cache
_preprocessor=
{
self
.
model_config
.
mm_cache
_preprocessor
!
r
}
, "
# noqa
f
"
disable_mm
_preprocessor
_cache
=
{
self
.
model_config
.
disable_mm
_preprocessor
_cache
!
r
}
, "
# noqa
f
"mm_processor_kwargs=
{
self
.
model_config
.
mm_processor_kwargs
}
, "
f
"pooler_config=
{
self
.
model_config
.
pooler_config
!
r
}
, "
f
"compilation_config=
{
self
.
compilation_config
!
r
}
"
)
...
...
vllm/core/evictor.py
View file @
96ae75ad
...
...
@@ -13,7 +13,7 @@ class EvictionPolicy(enum.Enum):
class
Evictor
(
ABC
):
"""The Evictor subclasses should be used by the BlockAllocator class to
handle eviction of freed
PhysicalToken
Blocks.
handle eviction of freed Blocks.
"""
@
abstractmethod
...
...
@@ -70,7 +70,7 @@ class BlockMetaData:
class
LRUEvictor
(
Evictor
):
"""Evicts in a least-recently-used order using the last_accessed timestamp
that's recorded in the
PhysicalToken
Block. If there are multiple blocks with
that's recorded in the Block. If there are multiple blocks with
the same last_accessed time, then the one with the largest num_hashed_tokens
will be evicted. If two blocks each have the lowest last_accessed time and
highest num_hashed_tokens value, then one will be chose arbitrarily
...
...
vllm/engine/arg_utils.py
View file @
96ae75ad
...
...
@@ -141,7 +141,7 @@ class EngineArgs:
tokenizer_pool_extra_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]]
=
None
mm_processor_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
mm_cache
_preprocessor
:
bool
=
False
disable_mm
_preprocessor
_cache
:
bool
=
False
enable_lora
:
bool
=
False
enable_lora_bias
:
bool
=
False
max_loras
:
int
=
1
...
...
@@ -200,6 +200,8 @@ class EngineArgs:
kv_transfer_config
:
Optional
[
KVTransferConfig
]
=
None
generation_config
:
Optional
[
str
]
=
None
def
__post_init__
(
self
):
if
not
self
.
tokenizer
:
self
.
tokenizer
=
self
.
model
...
...
@@ -208,6 +210,7 @@ class EngineArgs:
# by user.
if
self
.
enable_prefix_caching
is
None
:
self
.
enable_prefix_caching
=
bool
(
envs
.
VLLM_USE_V1
)
# Override max_num_seqs if it's not set by user.
if
self
.
max_num_seqs
is
None
:
self
.
max_num_seqs
=
256
if
not
envs
.
VLLM_USE_V1
else
1024
...
...
@@ -316,6 +319,8 @@ class EngineArgs:
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples '
'section for more information.
\n
'
'* "runai_streamer" will load the Safetensors weights using Run:ai'
'Model Streamer
\n
'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.
\n
'
)
parser
.
add_argument
(
...
...
@@ -371,7 +376,7 @@ class EngineArgs:
choices
=
[
'outlines'
,
'lm-format-enforcer'
,
'xgrammar'
],
help
=
'Which engine will be used for guided decoding'
' (JSON schema / regex etc) by default. Currently support '
'https://github.com/outlines-dev/outlines,'
'https://github.com/outlines-dev/outlines,
'
'https://github.com/mlc-ai/xgrammar, and '
'https://github.com/noamgat/lm-format-enforcer.'
' Can be overridden per request via guided_decoding_backend'
...
...
@@ -426,10 +431,12 @@ class EngineArgs:
parser
.
add_argument
(
'--block-size'
,
type
=
int
,
default
=
EngineArgs
.
block_size
,
choices
=
[
8
,
16
,
32
],
choices
=
[
8
,
16
,
32
,
64
,
128
],
help
=
'Token block size for contiguous chunks of '
'tokens. This is ignored on neuron devices and '
'set to max-model-len'
)
'set to max-model-len. On CUDA devices, '
'only block sizes up to 32 are supported. '
'On HPU devices, block size defaults to 128.'
)
parser
.
add_argument
(
"--enable-prefix-caching"
,
...
...
@@ -606,11 +613,10 @@ class EngineArgs:
help
=
(
'Overrides for the multimodal input mapping/processing, '
'e.g., image processor. For example: {"num_crops": 4}.'
))
parser
.
add_argument
(
'--
mm-cache
-preprocessor'
,
'--
disable-mm
-preprocessor
-cache
'
,
action
=
'store_true'
,
help
=
'If true, then enables caching of the multi-modal '
'preprocessor/mapper. Otherwise, the mapper executes each time'
', and for better performance consider enabling frontend process.'
)
help
=
'If true, then disables caching of the multi-modal '
'preprocessor/mapper. (not recommended)'
)
# LoRA related configs
parser
.
add_argument
(
'--enable-lora'
,
...
...
@@ -957,6 +963,16 @@ class EngineArgs:
default
=
"auto"
,
help
=
'The worker class to use for distributed execution.'
)
parser
.
add_argument
(
"--generation-config"
,
type
=
nullable_str
,
default
=
None
,
help
=
"The folder path to the generation config. "
"Defaults to None, will use the default generation config in vLLM. "
"If set to 'auto', the generation config will be automatically "
"loaded from model. If set to a folder path, the generation config "
"will be loaded from the specified folder path."
)
return
parser
@
classmethod
...
...
@@ -997,10 +1013,11 @@ class EngineArgs:
use_async_output_proc
=
not
self
.
disable_async_output_proc
,
config_format
=
self
.
config_format
,
mm_processor_kwargs
=
self
.
mm_processor_kwargs
,
mm_cache
_preprocessor
=
self
.
mm_cache
_preprocessor
,
disable_mm
_preprocessor
_cache
=
self
.
disable_mm
_preprocessor
_cache
,
override_neuron_config
=
self
.
override_neuron_config
,
override_pooler_config
=
self
.
override_pooler_config
,
logits_processor_pattern
=
self
.
logits_processor_pattern
)
logits_processor_pattern
=
self
.
logits_processor_pattern
,
generation_config
=
self
.
generation_config
)
def
create_load_config
(
self
)
->
LoadConfig
:
return
LoadConfig
(
...
...
@@ -1043,11 +1060,11 @@ class EngineArgs:
device_config
=
DeviceConfig
(
device
=
self
.
device
)
model_config
=
self
.
create_model_config
()
if
model_config
.
is_multimodal_model
:
if
self
.
enable_prefix_caching
:
logger
.
warning
(
"--enable-prefix-caching is currently not
"
"supported for multimodal models and
has been disabled."
)
if
(
model_config
.
is_multimodal_model
and
not
envs
.
VLLM_USE_V1
and
self
.
enable_prefix_caching
)
:
logger
.
warning
(
"--enable-prefix-caching is currently not "
"supported for multimodal models in v0 and
"
"
has been disabled."
)
self
.
enable_prefix_caching
=
False
cache_config
=
CacheConfig
(
...
...
@@ -1149,7 +1166,7 @@ class EngineArgs:
num_speculative_heads
=
self
.
num_speculative_heads
)
# Reminder: Please update docs/source/usage/compatibility_matrix.
rst
# Reminder: Please update docs/source/usage/compatibility_matrix.
md
# If the feature combo become valid
if
self
.
num_scheduler_steps
>
1
:
if
speculative_config
is
not
None
:
...
...
@@ -1269,11 +1286,14 @@ class EngineArgs:
# When no user override, set the default values based on the usage
# context.
# TODO(woosuk): Tune the default values for different hardware.
if
self
.
max_num_batched_tokens
is
None
:
if
usage_context
==
UsageContext
.
LLM_CLASS
:
self
.
max_num_batched_tokens
=
8192
elif
usage_context
==
UsageContext
.
OPENAI_API_SERVER
:
self
.
max_num_batched_tokens
=
2048
default_max_num_batched_tokens
=
{
UsageContext
.
LLM_CLASS
:
8192
,
UsageContext
.
OPENAI_API_SERVER
:
2048
,
}
if
(
self
.
max_num_batched_tokens
is
None
and
usage_context
in
default_max_num_batched_tokens
):
self
.
max_num_batched_tokens
=
default_max_num_batched_tokens
[
usage_context
]
logger
.
warning
(
"Setting max_num_batched_tokens to %d for %s usage context."
,
self
.
max_num_batched_tokens
,
usage_context
.
value
)
...
...
@@ -1283,9 +1303,6 @@ class EngineArgs:
Override the EngineConfig's configs based on the usage context for V1.
"""
assert
envs
.
VLLM_USE_V1
,
"V1 is not enabled"
if
engine_config
.
model_config
.
is_multimodal_model
:
# TODO (ywang96): Enable APC by default when VLM supports it.
assert
not
engine_config
.
cache_config
.
enable_prefix_caching
@
dataclass
...
...
vllm/engine/async_llm_engine.py
View file @
96ae75ad
...
...
@@ -1256,3 +1256,10 @@ class AsyncLLMEngine(EngineClient):
self
.
engine
.
model_executor
.
stop_profile
()
else
:
self
.
engine
.
model_executor
.
_run_workers
(
"stop_profile"
)
# TODO(v1): Remove this class proxy when V1 goes default.
if
envs
.
VLLM_USE_V1
:
from
vllm.v1.engine.async_llm
import
AsyncLLM
AsyncLLMEngine
=
AsyncLLM
# type: ignore
vllm/engine/llm_engine.py
View file @
96ae75ad
...
...
@@ -6,8 +6,8 @@ from collections import deque
from
contextlib
import
contextmanager
from
dataclasses
import
dataclass
from
functools
import
partial
from
typing
import
(
TYPE_CHECKING
,
Any
,
Callable
,
ClassVar
,
Deque
,
Dict
,
Iterable
,
List
,
Mapping
,
NamedTuple
,
Optional
)
from
typing
import
(
TYPE_CHECKING
,
Callable
,
ClassVar
,
Deque
,
Dict
,
Iterable
,
List
,
Mapping
,
NamedTuple
,
Optional
)
from
typing
import
Sequence
as
GenericSequence
from
typing
import
Set
,
Type
,
Union
,
cast
,
overload
...
...
@@ -53,7 +53,6 @@ from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
SequenceGroupOutput
,
SequenceStatus
,
CompletionSequenceGroupOutput
,
VLLM_INVALID_TOKEN_ID
)
from
vllm.tracing
import
(
SpanAttributes
,
SpanKind
,
extract_trace_context
,
init_tracer
)
from
vllm.transformers_utils.config
import
try_get_generation_config
from
vllm.transformers_utils.detokenizer
import
Detokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer_group
import
(
...
...
@@ -66,20 +65,6 @@ from vllm.version import __version__ as VLLM_VERSION
logger
=
init_logger
(
__name__
)
_LOCAL_LOGGING_INTERVAL_SEC
=
5
def
_load_generation_config_dict
(
model_config
:
ModelConfig
)
->
Dict
[
str
,
Any
]:
config
=
try_get_generation_config
(
model_config
.
model
,
trust_remote_code
=
model_config
.
trust_remote_code
,
revision
=
model_config
.
revision
,
)
if
config
is
None
:
return
{}
return
config
.
to_diff_dict
()
_G
=
TypeVar
(
"_G"
,
bound
=
BaseTokenizerGroup
,
default
=
BaseTokenizerGroup
)
_O
=
TypeVar
(
"_O"
,
RequestOutput
,
PoolingRequestOutput
)
...
...
@@ -149,7 +134,7 @@ class LLMEngine:
and the :class:`AsyncLLMEngine` class wraps this class for online serving.
The config arguments are derived from :class:`~vllm.EngineArgs`. (See
:ref:`engine
_
args`)
:ref:`engine
-
args`)
Args:
model_config: The configuration related to the LLM model.
...
...
@@ -275,8 +260,8 @@ class LLMEngine:
return
tokenizer_group
.
get_lora_tokenizer
(
sequence
.
lora_request
)
self
.
seq_counter
=
Counter
()
self
.
generation_config_fields
=
_load_generation_config_dict
(
self
.
model_config
)
self
.
generation_config_fields
=
(
self
.
model_config
.
try_get_generation_config
()
)
self
.
input_preprocessor
=
InputPreprocessor
(
self
.
model_config
,
self
.
tokenizer
,
...
...
Prev
1
…
9
10
11
12
13
14
15
16
17
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment