Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a99300bd
Commit
a99300bd
authored
Sep 09, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc1' into v0.10.2rc1-dev
parents
cc3e01c7
5438967f
Changes
512
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1468 additions
and
316 deletions
+1468
-316
vllm/_custom_ops.py
vllm/_custom_ops.py
+202
-47
vllm/assets/image.py
vllm/assets/image.py
+1
-1
vllm/attention/__init__.py
vllm/attention/__init__.py
+0
-1
vllm/attention/backends/abstract.py
vllm/attention/backends/abstract.py
+5
-8
vllm/attention/backends/differential_flash_attn.py
vllm/attention/backends/differential_flash_attn.py
+15
-5
vllm/attention/backends/dual_chunk_flash_attn.py
vllm/attention/backends/dual_chunk_flash_attn.py
+2
-1
vllm/attention/backends/flash_attn.py
vllm/attention/backends/flash_attn.py
+5
-3
vllm/attention/backends/mla/common.py
vllm/attention/backends/mla/common.py
+11
-6
vllm/attention/backends/rocm_flash_attn.py
vllm/attention/backends/rocm_flash_attn.py
+15
-10
vllm/attention/backends/utils.py
vllm/attention/backends/utils.py
+1
-1
vllm/attention/backends/xformers.py
vllm/attention/backends/xformers.py
+8
-7
vllm/attention/layer.py
vllm/attention/layer.py
+17
-7
vllm/attention/layers/chunked_local_attention.py
vllm/attention/layers/chunked_local_attention.py
+16
-13
vllm/attention/layers/encoder_only_attention.py
vllm/attention/layers/encoder_only_attention.py
+86
-0
vllm/attention/ops/flashmla.py
vllm/attention/ops/flashmla.py
+24
-19
vllm/beam_search.py
vllm/beam_search.py
+1
-1
vllm/benchmarks/datasets.py
vllm/benchmarks/datasets.py
+803
-96
vllm/benchmarks/lib/endpoint_request_func.py
vllm/benchmarks/lib/endpoint_request_func.py
+62
-6
vllm/benchmarks/lib/utils.py
vllm/benchmarks/lib/utils.py
+6
-1
vllm/benchmarks/serve.py
vllm/benchmarks/serve.py
+188
-83
No files found.
Too many changes to show.
To preserve performance only
512 of 512+
files are displayed.
Plain diff
Email patch
vllm/_custom_ops.py
View file @
a99300bd
...
@@ -790,15 +790,7 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
...
@@ -790,15 +790,7 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
# torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)
# torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)
# # marlin
# marlin_24
# def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
# b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int,
# size_n: int, size_k: int) -> torch.Tensor:
# return torch.ops._C.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m,
# size_n, size_k)
# # marlin_24
# def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
# def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
# b_meta: torch.Tensor, b_scales: torch.Tensor,
# b_meta: torch.Tensor, b_scales: torch.Tensor,
# workspace: torch.Tensor, b_q_type: ScalarType,
# workspace: torch.Tensor, b_q_type: ScalarType,
...
@@ -840,25 +832,6 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
...
@@ -840,25 +832,6 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
# is_zp_float: bool = False) -> torch.Tensor:
# is_zp_float: bool = False) -> torch.Tensor:
# return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)
# return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)
# @register_fake("_C::marlin_qqq_gemm")
# def _marlin_qqq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
# s_tok: torch.Tensor, s_ch: torch.Tensor,
# s_group: torch.Tensor, workspace: torch.Tensor,
# size_m: torch.SymInt, size_n: torch.SymInt,
# size_k: torch.SymInt) -> torch.Tensor:
# return torch.empty((size_m, size_n),
# dtype=torch.float16,
# device=a.device)
# @register_fake("_C::marlin_gemm")
# def _marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
# b_scales: torch.Tensor, workspace: torch.Tensor,
# size_m: torch.SymInt, size_n: torch.SymInt,
# size_k: torch.SymInt) -> torch.Tensor:
# return torch.empty((size_m, size_n),
# dtype=torch.float16,
# device=a.device)
# @register_fake("_C::awq_dequantize")
# @register_fake("_C::awq_dequantize")
# def _awq_dequantize_fake(qweight: torch.Tensor, scales: torch.Tensor,
# def _awq_dequantize_fake(qweight: torch.Tensor, scales: torch.Tensor,
# zeros: torch.Tensor, split_k_iters: torch.SymInt,
# zeros: torch.Tensor, split_k_iters: torch.SymInt,
...
@@ -904,6 +877,30 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
...
@@ -904,6 +877,30 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
# return torch.empty_like(b_q_weight,
# return torch.empty_like(b_q_weight,
# memory_format=torch.contiguous_format)
# memory_format=torch.contiguous_format)
# @register_fake("_C::cutlass_w4a8_mm")
# def cutlass_w4a8_mm_fake(
# a: torch.Tensor,
# # b_q Should be the tensor returned by cutlass_encode_and_reorder_int4b
# b_q: torch.Tensor,
# b_group_scales: torch.Tensor,
# b_group_size: int,
# b_channel_scales: torch.Tensor,
# a_token_scales: torch.Tensor,
# out_type: Optional[torch.dtype] = None,
# maybe_schedule: Optional[str] = None) -> torch.Tensor:
# m = a.size(0)
# n = b_q.size(1)
# out_dtype = out_type if out_type is not None else torch.bfloat16
# return torch.empty((m, n), device=a.device, dtype=out_dtype)
# @register_fake("_C::cutlass_pack_scale_fp8")
# def cutlass_pack_scale_fp8_fake(scales: torch.Tensor) -> torch.Tensor:
# return torch.empty_like(scales, memory_format=torch.contiguous_format)
# @register_fake("_C::cutlass_encode_and_reorder_int4b")
# def cutlass_encode_and_reorder_int4b_fake(b: torch.Tensor) -> torch.Tensor:
# return torch.empty_like(b, memory_format=torch.contiguous_format)
# if hasattr(torch.ops._C, "allspark_w8a16_gemm"):
# if hasattr(torch.ops._C, "allspark_w8a16_gemm"):
...
@@ -920,7 +917,6 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
...
@@ -920,7 +917,6 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
# m = a.size(0)
# m = a.size(0)
# return torch.empty((m, n), device=a.device, dtype=a.dtype)
# return torch.empty((m, n), device=a.device, dtype=a.dtype)
if
hasattr
(
torch
.
ops
.
_C
,
"ggml_dequantize"
):
if
hasattr
(
torch
.
ops
.
_C
,
"ggml_dequantize"
):
@
register_fake
(
"_C::ggml_dequantize"
)
@
register_fake
(
"_C::ggml_dequantize"
)
...
@@ -1291,6 +1287,28 @@ def get_cutlass_moe_mm_data(topk_ids: torch.Tensor,
...
@@ -1291,6 +1287,28 @@ def get_cutlass_moe_mm_data(topk_ids: torch.Tensor,
blockscale_offsets
)
blockscale_offsets
)
def
get_cutlass_moe_mm_problem_sizes
(
topk_ids
:
torch
.
Tensor
,
problem_sizes1
:
torch
.
Tensor
,
problem_sizes2
:
torch
.
Tensor
,
num_experts
:
int
,
n
:
int
,
k
:
int
,
blockscale_offsets
:
Optional
[
torch
.
Tensor
]
=
None
):
"""
Compute only the per-expert problem sizes needed by the two grouped matrix
multiplications used in CUTLASS-based fused MoE.
The function takes in topk_ids (token→expert mapping) and computes:
- problem_sizes1, problem_sizes2: M×N×K sizes of each expert's
multiplication for the two grouped MMs
used in the fused MoE operation.
"""
return
torch
.
ops
.
_C
.
get_cutlass_moe_mm_problem_sizes
(
topk_ids
,
problem_sizes1
,
problem_sizes2
,
num_experts
,
n
,
k
,
blockscale_offsets
)
def
shuffle_rows
(
input_tensor
:
torch
.
Tensor
,
dst2src_map
:
torch
.
Tensor
):
def
shuffle_rows
(
input_tensor
:
torch
.
Tensor
,
dst2src_map
:
torch
.
Tensor
):
"""
"""
Shuffle and expand the input tensor according to the dst2src_map and store the result in output_tensor.
Shuffle and expand the input tensor according to the dst2src_map and store the result in output_tensor.
...
@@ -1484,6 +1502,30 @@ def machete_prepack_B(
...
@@ -1484,6 +1502,30 @@ def machete_prepack_B(
group_scales_type
)
group_scales_type
)
# CUTLASS W4A8
def
cutlass_w4a8_mm
(
a
:
torch
.
Tensor
,
# b_q Should be the tensor returned by cutlass_encode_and_reorder_int4b
b_q
:
torch
.
Tensor
,
b_group_scales
:
torch
.
Tensor
,
b_group_size
:
int
,
b_channel_scales
:
torch
.
Tensor
,
a_token_scales
:
torch
.
Tensor
,
out_type
:
Optional
[
torch
.
dtype
]
=
None
,
maybe_schedule
:
Optional
[
str
]
=
None
)
->
torch
.
Tensor
:
return
torch
.
ops
.
_C
.
cutlass_w4a8_mm
(
a
,
b_q
,
b_group_scales
,
b_group_size
,
b_channel_scales
,
a_token_scales
,
out_type
,
maybe_schedule
)
def
cutlass_pack_scale_fp8
(
scales
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
torch
.
ops
.
_C
.
cutlass_pack_scale_fp8
(
scales
)
def
cutlass_encode_and_reorder_int4b
(
b
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
torch
.
ops
.
_C
.
cutlass_encode_and_reorder_int4b
(
b
)
if
hasattr
(
torch
.
ops
.
_C
,
"permute_cols"
):
if
hasattr
(
torch
.
ops
.
_C
,
"permute_cols"
):
@
register_fake
(
"_C::permute_cols"
)
@
register_fake
(
"_C::permute_cols"
)
...
@@ -1773,15 +1815,6 @@ def scaled_int8_quant(
...
@@ -1773,15 +1815,6 @@ def scaled_int8_quant(
return
output
,
input_scales
,
input_azp
return
output
,
input_scales
,
input_azp
# qqq ops
def
marlin_qqq_gemm
(
a
:
torch
.
Tensor
,
b_q_weight
:
torch
.
Tensor
,
s_tok
:
torch
.
Tensor
,
s_ch
:
torch
.
Tensor
,
s_group
:
torch
.
Tensor
,
workspace
:
torch
.
Tensor
,
size_m
:
int
,
size_n
:
int
,
size_k
:
int
)
->
torch
.
Tensor
:
return
torch
.
ops
.
_C
.
marlin_qqq_gemm
(
a
,
b_q_weight
,
s_tok
,
s_ch
,
s_group
,
workspace
,
size_m
,
size_n
,
size_k
)
# gguf
# gguf
def
ggml_dequantize
(
W
:
torch
.
Tensor
,
quant_type
:
int
,
m
:
int
,
n
:
int
,
def
ggml_dequantize
(
W
:
torch
.
Tensor
,
quant_type
:
int
,
m
:
int
,
n
:
int
,
dtype
:
Optional
[
torch
.
dtype
])
->
torch
.
Tensor
:
dtype
:
Optional
[
torch
.
dtype
])
->
torch
.
Tensor
:
...
@@ -1918,6 +1951,17 @@ def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
...
@@ -1918,6 +1951,17 @@ def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
gating_output
)
gating_output
)
def
grouped_topk
(
scores
:
torch
.
Tensor
,
scores_with_bias
:
torch
.
Tensor
,
num_expert_group
:
int
,
topk_group
:
int
,
topk
:
int
,
renormalize
:
bool
,
routed_scaling_factor
:
float
):
if
not
current_platform
.
is_cuda
():
raise
NotImplementedError
(
"The fused grouped_topk kernel is only "
"available on CUDA platforms"
)
return
torch
.
ops
.
_moe_C
.
grouped_topk
(
scores
,
scores_with_bias
,
num_expert_group
,
topk_group
,
topk
,
renormalize
,
routed_scaling_factor
)
def
moe_wna16_marlin_gemm
(
input
:
torch
.
Tensor
,
output
:
Optional
[
torch
.
Tensor
],
def
moe_wna16_marlin_gemm
(
input
:
torch
.
Tensor
,
output
:
Optional
[
torch
.
Tensor
],
b_qweight
:
torch
.
Tensor
,
b_qweight
:
torch
.
Tensor
,
b_bias
:
Optional
[
torch
.
Tensor
],
b_bias
:
Optional
[
torch
.
Tensor
],
...
@@ -2045,6 +2089,20 @@ def concat_and_cache_mla(
...
@@ -2045,6 +2089,20 @@ def concat_and_cache_mla(
scale
)
scale
)
def
cp_fused_concat_and_cache_mla
(
kv_c
:
torch
.
Tensor
,
k_pe
:
torch
.
Tensor
,
cp_local_token_select_indices
:
torch
.
Tensor
,
kv_cache
:
torch
.
Tensor
,
slot_mapping
:
torch
.
Tensor
,
kv_cache_dtype
:
str
,
scale
:
torch
.
Tensor
,
)
->
None
:
torch
.
ops
.
_C_cache_ops
.
cp_fused_concat_and_cache_mla
(
kv_c
,
k_pe
,
cp_local_token_select_indices
,
kv_cache
,
slot_mapping
,
kv_cache_dtype
,
scale
)
def
copy_blocks
(
key_caches
:
list
[
torch
.
Tensor
],
def
copy_blocks
(
key_caches
:
list
[
torch
.
Tensor
],
value_caches
:
list
[
torch
.
Tensor
],
value_caches
:
list
[
torch
.
Tensor
],
block_mapping
:
torch
.
Tensor
)
->
None
:
block_mapping
:
torch
.
Tensor
)
->
None
:
...
@@ -2068,14 +2126,28 @@ def convert_fp8(output: torch.Tensor,
...
@@ -2068,14 +2126,28 @@ def convert_fp8(output: torch.Tensor,
torch
.
ops
.
_C_cache_ops
.
convert_fp8
(
output
,
input
,
scale
,
kv_dtype
)
torch
.
ops
.
_C_cache_ops
.
convert_fp8
(
output
,
input
,
scale
,
kv_dtype
)
def
gather_cache
(
src_cache
:
torch
.
Tensor
,
def
gather_and_maybe_dequant_cache
(
dst
:
torch
.
Tensor
,
src_cache
:
torch
.
Tensor
,
block_table
:
torch
.
Tensor
,
dst
:
torch
.
Tensor
,
cu_seq_lens
:
torch
.
Tensor
,
block_table
:
torch
.
Tensor
,
batch_size
:
int
,
cu_seq_lens
:
torch
.
Tensor
,
seq_starts
:
Optional
[
torch
.
Tensor
]
=
None
)
->
None
:
batch_size
:
int
,
torch
.
ops
.
_C_cache_ops
.
gather_cache
(
src_cache
,
dst
,
block_table
,
kv_cache_dtype
:
str
,
cu_seq_lens
,
batch_size
,
seq_starts
)
scale
:
torch
.
Tensor
,
seq_starts
:
Optional
[
torch
.
Tensor
]
=
None
)
->
None
:
torch
.
ops
.
_C_cache_ops
.
gather_and_maybe_dequant_cache
(
src_cache
,
dst
,
block_table
,
cu_seq_lens
,
batch_size
,
kv_cache_dtype
,
scale
,
seq_starts
)
def
cp_gather_cache
(
src_cache
:
torch
.
Tensor
,
dst
:
torch
.
Tensor
,
block_table
:
torch
.
Tensor
,
cu_seq_lens
:
torch
.
Tensor
,
batch_size
:
int
,
seq_starts
:
Optional
[
torch
.
Tensor
]
=
None
)
->
None
:
torch
.
ops
.
_C_cache_ops
.
cp_gather_cache
(
src_cache
,
dst
,
block_table
,
cu_seq_lens
,
batch_size
,
seq_starts
)
def
get_device_attribute
(
attribute
:
int
,
device
:
int
)
->
int
:
def
get_device_attribute
(
attribute
:
int
,
device
:
int
)
->
int
:
...
@@ -2378,9 +2450,92 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"):
...
@@ -2378,9 +2450,92 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"):
N
=
mat2
.
size
(
0
)
N
=
mat2
.
size
(
0
)
return
torch
.
empty
((
M
,
N
),
dtype
=
out_dtype
)
return
torch
.
empty
((
M
,
N
),
dtype
=
out_dtype
)
class
CPUDNNLGEMMHandler
:
def
__init__
(
self
)
->
None
:
self
.
handler
:
Optional
[
int
]
=
None
self
.
n
=
-
1
self
.
k
=
-
1
def
__del__
(
self
):
if
self
.
handler
is
not
None
:
torch
.
ops
.
_C
.
release_dnnl_matmul_handler
(
self
.
handler
)
def
create_onednn_scaled_mm
(
weight
:
torch
.
Tensor
,
# [K, N]
weight_scales
:
torch
.
Tensor
,
output_type
:
torch
.
dtype
,
dynamic_quant
:
bool
,
use_azp
:
bool
,
primitive_cache_size
:
int
=
128
,
)
->
CPUDNNLGEMMHandler
:
handler
=
CPUDNNLGEMMHandler
()
handler
.
k
,
handler
.
n
=
weight
.
size
()
handler
.
handler
=
torch
.
ops
.
_C
.
create_onednn_scaled_mm_handler
(
weight
,
weight_scales
,
output_type
,
dynamic_quant
,
use_azp
,
primitive_cache_size
)
return
handler
def
onednn_scaled_int8_quant
(
input
:
torch
.
Tensor
,
scale
:
Optional
[
torch
.
Tensor
]
=
None
,
azp
:
Optional
[
torch
.
Tensor
]
=
None
,
symmetric
:
bool
=
True
):
"""
Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.
Args:
input: The input tensor to be quantized to int8.
scale: Optional scaling factor for the int8 quantization.
When not provided, we invoke dynamic-per-token quantization.
azp: Optional zero-point for the int8 quantization.
Must be provided for asymmetric quantization if `scale` is provided.
symmetric: Whether to use symmetric quantization (scale only, azp ignored).
Returns:
tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp.
"""
output
=
torch
.
empty_like
(
input
,
dtype
=
torch
.
int8
)
token_num
=
input
.
numel
()
//
input
.
shape
[
-
1
]
input
=
input
.
view
((
token_num
,
input
.
shape
[
-
1
]))
if
scale
is
not
None
:
# static-per-tensor quantization.
assert
symmetric
==
(
azp
is
None
),
"azp must only be provided for asymmetric quantization."
torch
.
ops
.
_C
.
static_scaled_int8_quant
(
output
,
input
,
scale
,
azp
)
return
output
,
scale
,
azp
# dynamic-per-token quantization.
input_scales
=
torch
.
empty
((
token_num
,
1
),
device
=
input
.
device
,
dtype
=
torch
.
float32
)
input_azp
=
None
if
symmetric
else
torch
.
empty_like
(
input_scales
,
dtype
=
torch
.
int32
)
torch
.
ops
.
_C
.
dynamic_scaled_int8_quant
(
output
,
input
,
input_scales
,
input_azp
)
return
output
,
input_scales
,
input_azp
def
onednn_scaled_mm
(
dnnl_handler
:
CPUDNNLGEMMHandler
,
x
:
torch
.
Tensor
,
output
:
torch
.
Tensor
,
input_scale
:
Optional
[
torch
.
Tensor
],
input_zp
:
Optional
[
torch
.
Tensor
],
input_zp_adj
:
Optional
[
torch
.
Tensor
],
bias
:
Optional
[
torch
.
Tensor
],
)
->
torch
.
Tensor
:
torch
.
ops
.
_C
.
onednn_scaled_mm
(
output
,
x
,
input_scale
,
input_zp
,
input_zp_adj
,
bias
,
dnnl_handler
.
handler
)
return
output
direct_register_custom_op
(
direct_register_custom_op
(
op_name
=
"awq_gemm"
,
op_name
=
"awq_gemm"
,
op_func
=
awq_gemm
,
op_func
=
awq_gemm
,
mutates_args
=
[],
mutates_args
=
[],
fake_impl
=
awq_gemm_fake
,
fake_impl
=
awq_gemm_fake
,
)
)
\ No newline at end of file
vllm/assets/image.py
View file @
a99300bd
...
@@ -11,7 +11,7 @@ from .base import get_vllm_public_assets
...
@@ -11,7 +11,7 @@ from .base import get_vllm_public_assets
VLM_IMAGES_DIR
=
"vision_model_images"
VLM_IMAGES_DIR
=
"vision_model_images"
ImageAssetName
=
Literal
[
"stop_sign"
,
"cherry_blossom"
]
ImageAssetName
=
Literal
[
"stop_sign"
,
"cherry_blossom"
,
"hato"
]
@
dataclass
(
frozen
=
True
)
@
dataclass
(
frozen
=
True
)
...
...
vllm/attention/__init__.py
View file @
a99300bd
...
@@ -14,7 +14,6 @@ __all__ = [
...
@@ -14,7 +14,6 @@ __all__ = [
"AttentionMetadata"
,
"AttentionMetadata"
,
"AttentionType"
,
"AttentionType"
,
"AttentionMetadataBuilder"
,
"AttentionMetadataBuilder"
,
"Attention"
,
"AttentionState"
,
"AttentionState"
,
"get_attn_backend"
,
"get_attn_backend"
,
]
]
vllm/attention/backends/abstract.py
View file @
a99300bd
...
@@ -9,8 +9,7 @@ from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional,
...
@@ -9,8 +9,7 @@ from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional,
import
torch
import
torch
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
QuantKey
GroupShape
)
from
vllm.multimodal
import
MultiModalPlaceholderMap
from
vllm.multimodal
import
MultiModalPlaceholderMap
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
...
@@ -285,20 +284,17 @@ class AttentionImpl(ABC, Generic[T]):
...
@@ -285,20 +284,17 @@ class AttentionImpl(ABC, Generic[T]):
attn_metadata
:
T
,
attn_metadata
:
T
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_block_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
raise
NotImplementedError
raise
NotImplementedError
def
fused_output_quant_supported
(
self
,
dtype
:
torch
.
dtype
,
static
:
bool
,
def
fused_output_quant_supported
(
self
,
quant_key
:
QuantKey
):
group_shape
:
GroupShape
):
"""
"""
Does this attention implementation support fused output quantization.
Does this attention implementation support fused output quantization.
This is used by the AttnFusionPass to only fuse output quantization
This is used by the AttnFusionPass to only fuse output quantization
onto implementations that support it.
onto implementations that support it.
TODO(luka) merge parameters into QuantDescriptor
:param quant_key: QuantKey object that describes the quantization op
:param dtype: quantized dtype
:param static: static or dynamic quantization
:param group_shape: quant group shape.
:return: is fusion supported for this type of quantization
:return: is fusion supported for this type of quantization
"""
"""
return
False
return
False
...
@@ -317,6 +313,7 @@ class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
...
@@ -317,6 +313,7 @@ class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
attn_metadata
:
T
,
attn_metadata
:
T
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_block_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
raise
NotImplementedError
raise
NotImplementedError
...
...
vllm/attention/backends/differential_flash_attn.py
View file @
a99300bd
...
@@ -800,23 +800,33 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
...
@@ -800,23 +800,33 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
attn_metadata
:
DifferentialFlashAttentionMetadata
,
attn_metadata
:
DifferentialFlashAttentionMetadata
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_block_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
"""Forward pass with FlashAttention.
"""Forward pass with FlashAttention.
Args:
Args:
query: shape = [num_tokens, num_heads, head_size]
layer: Attention layer instance.
key: shape = [num_tokens, num_kv_heads, head_size]
q: Query tensor with shape = [num_tokens, num_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
k: Key tensor with shape = [num_tokens, num_kv_heads, head_size]
output: shape = [num_tokens, num_heads, head_size]
v: Value tensor with shape = [num_tokens, num_kv_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size, num_kv_heads, head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
for profiling run.
attn_metadata: Metadata for attention.
attn_metadata: Metadata for attention.
output: Output tensor with shape [num_tokens, num_heads, head_size]
output_scale: Optional output scale tensor.
output_block_scale: Optional output block scale tensor.
NOTE: It in-place updates the output tensor.
NOTE: It in-place updates the output tensor.
NOTE: FP8 quantization, flash-attn expect the size of
NOTE: FP8 quantization, flash-attn expect the size of
{q,k,v}_descale to be (num_sequences, num_kv_heads).
{q,k,v}_descale to be (num_sequences, num_kv_heads).
We use torch's .expand() to avoid duplicating values
We use torch's .expand() to avoid duplicating values
"""
"""
if
output_scale
is
not
None
or
output_block_scale
is
not
None
:
raise
NotImplementedError
(
"fused output quantization is not yet supported"
" for DifferentialFlashAttentionImpl"
)
if
self
.
lambda_full
is
None
:
if
self
.
lambda_full
is
None
:
self
.
lambda_init
=
self
.
differential_flash_attention_config
[
self
.
lambda_init
=
self
.
differential_flash_attention_config
[
"lambda_init"
]
"lambda_init"
]
...
...
vllm/attention/backends/dual_chunk_flash_attn.py
View file @
a99300bd
...
@@ -376,6 +376,7 @@ class DualChunkFlashAttentionImpl(FlashAttentionImpl):
...
@@ -376,6 +376,7 @@ class DualChunkFlashAttentionImpl(FlashAttentionImpl):
attn_metadata
:
DualChunkFlashAttentionMetadata
,
attn_metadata
:
DualChunkFlashAttentionMetadata
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_block_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
"""Forward pass with DualChunkFlashAttention.
"""Forward pass with DualChunkFlashAttention.
Args:
Args:
...
@@ -391,7 +392,7 @@ class DualChunkFlashAttentionImpl(FlashAttentionImpl):
...
@@ -391,7 +392,7 @@ class DualChunkFlashAttentionImpl(FlashAttentionImpl):
"""
"""
assert
output
is
None
,
"Output tensor not supported for DualChunk"
assert
output
is
None
,
"Output tensor not supported for DualChunk"
if
output_scale
is
not
None
:
if
output_scale
is
not
None
or
output_block_scale
is
not
None
:
raise
NotImplementedError
(
raise
NotImplementedError
(
"fused output quantization is not yet supported"
"fused output quantization is not yet supported"
" for FlashAttentionImpl"
)
" for FlashAttentionImpl"
)
...
...
vllm/attention/backends/flash_attn.py
View file @
a99300bd
...
@@ -603,6 +603,7 @@ class FlashAttentionImpl(AttentionImpl):
...
@@ -603,6 +603,7 @@ class FlashAttentionImpl(AttentionImpl):
attn_metadata
:
FlashAttentionMetadata
,
attn_metadata
:
FlashAttentionMetadata
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_block_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
"""Forward pass with FlashAttention.
"""Forward pass with FlashAttention.
...
@@ -611,7 +612,8 @@ class FlashAttentionImpl(AttentionImpl):
...
@@ -611,7 +612,8 @@ class FlashAttentionImpl(AttentionImpl):
key: shape = [num_tokens, num_kv_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
output: shape = [num_tokens, num_heads, head_size]
output: shape = [num_tokens, num_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size, num_kv_heads, head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
for profiling run.
attn_metadata: Metadata for attention.
attn_metadata: Metadata for attention.
...
@@ -622,7 +624,7 @@ class FlashAttentionImpl(AttentionImpl):
...
@@ -622,7 +624,7 @@ class FlashAttentionImpl(AttentionImpl):
"""
"""
assert
output
is
not
None
,
"Output tensor must be provided."
assert
output
is
not
None
,
"Output tensor must be provided."
if
output_scale
is
not
None
:
if
output_scale
is
not
None
or
output_block_scale
is
not
None
:
raise
NotImplementedError
(
raise
NotImplementedError
(
"fused output quantization is not yet supported"
"fused output quantization is not yet supported"
" for FlashAttentionImpl"
)
" for FlashAttentionImpl"
)
...
@@ -925,7 +927,7 @@ class FlashAttentionImpl(AttentionImpl):
...
@@ -925,7 +927,7 @@ class FlashAttentionImpl(AttentionImpl):
def
_get_query_key_seq_metadata
(
def
_get_query_key_seq_metadata
(
attn_metadata
,
attn_metadata
:
FlashAttentionMetadata
,
is_prompt
:
bool
,
is_prompt
:
bool
,
attn_type
:
str
,
attn_type
:
str
,
)
->
tuple
:
)
->
tuple
:
...
...
vllm/attention/backends/mla/common.py
View file @
a99300bd
...
@@ -837,8 +837,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
...
@@ -837,8 +837,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
self
.
context_chunk_workspace_size
//
num_prefills_with_context
self
.
context_chunk_workspace_size
//
num_prefills_with_context
# align max_context_chunk to page_size by rounding down,
# align max_context_chunk to page_size by rounding down,
# currently the `gather_cache` kernel cannot
handle
# currently the `gather_
and_maybe_dequant_
cache` kernel cannot
# `context_chunk_starts` that are not aligned to page_size
#
handle
`context_chunk_starts` that are not aligned to page_size
max_context_chunk
=
round_down
(
max_context_chunk
,
self
.
page_size
)
max_context_chunk
=
round_down
(
max_context_chunk
,
self
.
page_size
)
assert
max_context_chunk
>
0
assert
max_context_chunk
>
0
num_chunks
=
cdiv
(
context_lens_tensor
.
max
(),
max_context_chunk
)
num_chunks
=
cdiv
(
context_lens_tensor
.
max
(),
max_context_chunk
)
...
@@ -1090,6 +1090,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
...
@@ -1090,6 +1090,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
q
:
torch
.
Tensor
,
q
:
torch
.
Tensor
,
kv_c_and_k_pe_cache
:
torch
.
Tensor
,
kv_c_and_k_pe_cache
:
torch
.
Tensor
,
attn_metadata
:
MLACommonMetadata
,
attn_metadata
:
MLACommonMetadata
,
k_scale
:
torch
.
Tensor
,
):
):
prefill_metadata
=
attn_metadata
.
prefill_metadata
prefill_metadata
=
attn_metadata
.
prefill_metadata
assert
prefill_metadata
is
not
None
assert
prefill_metadata
is
not
None
...
@@ -1111,12 +1112,14 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
...
@@ -1111,12 +1112,14 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
for
i
in
range
(
iters
):
for
i
in
range
(
iters
):
toks
=
prefill_metadata
.
context_chunk_seq_tot
[
i
]
toks
=
prefill_metadata
.
context_chunk_seq_tot
[
i
]
ops
.
gather_cache
(
ops
.
gather_
and_maybe_dequant_
cache
(
src_cache
=
kv_c_and_k_pe_cache
,
src_cache
=
kv_c_and_k_pe_cache
,
dst
=
workspace
,
dst
=
workspace
,
block_table
=
prefill_metadata
.
block_tables
,
block_table
=
prefill_metadata
.
block_tables
,
cu_seq_lens
=
prefill_metadata
.
context_chunk_cu_seq_lens
[
i
],
cu_seq_lens
=
prefill_metadata
.
context_chunk_cu_seq_lens
[
i
],
batch_size
=
prefill_metadata
.
num_prefills
,
batch_size
=
prefill_metadata
.
num_prefills
,
kv_cache_dtype
=
self
.
kv_cache_dtype
,
scale
=
k_scale
,
seq_starts
=
prefill_metadata
.
context_chunk_starts
[
i
],
seq_starts
=
prefill_metadata
.
context_chunk_starts
[
i
],
)
)
...
@@ -1173,6 +1176,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
...
@@ -1173,6 +1176,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
k_pe
:
torch
.
Tensor
,
k_pe
:
torch
.
Tensor
,
kv_c_and_k_pe_cache
:
torch
.
Tensor
,
kv_c_and_k_pe_cache
:
torch
.
Tensor
,
attn_metadata
:
MLACommonMetadata
,
attn_metadata
:
MLACommonMetadata
,
k_scale
:
torch
.
Tensor
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
prefill_metadata
=
attn_metadata
.
prefill_metadata
prefill_metadata
=
attn_metadata
.
prefill_metadata
...
@@ -1208,7 +1212,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
...
@@ -1208,7 +1212,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
# ROCm flash_attn_varlen_func will return 3 objects instead of 2
# ROCm flash_attn_varlen_func will return 3 objects instead of 2
suffix_output
,
suffix_lse
=
output
suffix_output
,
suffix_lse
=
output
context_output
,
context_lse
=
self
.
_compute_prefill_context
(
\
context_output
,
context_lse
=
self
.
_compute_prefill_context
(
\
q
,
kv_c_and_k_pe_cache
,
attn_metadata
)
q
,
kv_c_and_k_pe_cache
,
attn_metadata
,
k_scale
)
output
=
torch
.
empty_like
(
suffix_output
)
output
=
torch
.
empty_like
(
suffix_output
)
merge_attn_states
(
merge_attn_states
(
...
@@ -1245,12 +1249,13 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
...
@@ -1245,12 +1249,13 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
attn_metadata
:
T
,
attn_metadata
:
T
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_block_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
if
output
is
not
None
:
if
output
is
not
None
:
raise
NotImplementedError
(
raise
NotImplementedError
(
"output is not yet supported for MLAImplBase"
)
"output is not yet supported for MLAImplBase"
)
if
output_scale
is
not
None
:
if
output_scale
is
not
None
or
output_block_scale
is
not
None
:
raise
NotImplementedError
(
raise
NotImplementedError
(
"fused output quantization is not yet supported"
"fused output quantization is not yet supported"
" for MLAImplBase"
)
" for MLAImplBase"
)
...
@@ -1298,7 +1303,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
...
@@ -1298,7 +1303,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
if
has_prefill
:
if
has_prefill
:
output
[:
num_prefill_tokens
]
=
self
.
_forward_prefill
(
output
[:
num_prefill_tokens
]
=
self
.
_forward_prefill
(
prefill_q
,
prefill_k_c_normed
,
prefill_k_pe
,
kv_cache
,
prefill_q
,
prefill_k_c_normed
,
prefill_k_pe
,
kv_cache
,
attn_metadata
)
attn_metadata
,
layer
.
_k_scale
)
if
has_decode
:
if
has_decode
:
decode_q_nope
,
decode_q_pe
=
decode_q
.
split
(
decode_q_nope
,
decode_q_pe
=
decode_q
.
split
(
...
...
vllm/attention/backends/rocm_flash_attn.py
View file @
a99300bd
...
@@ -23,7 +23,7 @@ from vllm.config import get_current_vllm_config
...
@@ -23,7 +23,7 @@ from vllm.config import get_current_vllm_config
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
GroupShape
)
QuantKey
,
kFp8StaticTensorSym
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils
import
SUPPORT_TC
,
gpuname
from
vllm.utils
import
SUPPORT_TC
,
gpuname
...
@@ -549,11 +549,9 @@ class ROCmFlashAttentionImpl(AttentionImpl):
...
@@ -549,11 +549,9 @@ class ROCmFlashAttentionImpl(AttentionImpl):
head_dim
).
reshape
(
tokens
,
n_kv_heads
*
n_rep
,
head_dim
).
reshape
(
tokens
,
n_kv_heads
*
n_rep
,
head_dim
))
head_dim
))
def
fused_output_quant_supported
(
self
,
dtype
:
torch
.
dtype
,
static
:
bool
,
def
fused_output_quant_supported
(
self
,
quant_key
:
QuantKey
):
group_shape
:
GroupShape
):
if
self
.
use_triton_flash_attn
:
if
self
.
use_triton_flash_attn
:
return
dtype
==
current_platform
.
fp8_dtype
(
return
quant_key
==
kFp8StaticTensorSym
)
and
static
and
group_shape
==
GroupShape
.
PER_TENSOR
# Only supported in the Triton backend
# Only supported in the Triton backend
return
False
return
False
...
@@ -568,6 +566,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
...
@@ -568,6 +566,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
attn_metadata
:
ROCmFlashAttentionMetadata
,
attn_metadata
:
ROCmFlashAttentionMetadata
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_block_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
"""Forward pass with FlashAttention and PagedAttention.
"""Forward pass with FlashAttention and PagedAttention.
...
@@ -605,17 +604,18 @@ class ROCmFlashAttentionImpl(AttentionImpl):
...
@@ -605,17 +604,18 @@ class ROCmFlashAttentionImpl(AttentionImpl):
use prefill sequence attributes
use prefill sequence attributes
Args:
Args:
layer: Attention layer instance.
query: shape = [num_tokens, num_heads * head_size]
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size * num_kv_heads * head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
for profiling run.
attn_metadata: Metadata for attention.
attn_metadata: Metadata for attention.
attn_type: Select attention type, between encoder attention,
output: Optional output tensor.
decoder self-attention, or encoder/decoder cross-
output_scale: Optional output scale tensor.
attention. Defaults to decoder self-attention,
output_block_scale: Optional output block scale tensor.
which is the vLLM default generally
Returns:
Returns:
shape = [num_tokens, num_heads * head_size]
shape = [num_tokens, num_heads * head_size]
"""
"""
...
@@ -626,6 +626,11 @@ class ROCmFlashAttentionImpl(AttentionImpl):
...
@@ -626,6 +626,11 @@ class ROCmFlashAttentionImpl(AttentionImpl):
"fused output quantization only supported for Triton"
"fused output quantization only supported for Triton"
" implementation in ROCMFlashAttentionImpl for now"
)
" implementation in ROCMFlashAttentionImpl for now"
)
if
output_block_scale
is
not
None
:
raise
NotImplementedError
(
"fused nvfp4 output quantization is not supported"
" for ROCMFlashAttentionImpl"
)
query
=
query
.
view
(
-
1
,
self
.
num_heads
,
self
.
head_size
)
query
=
query
.
view
(
-
1
,
self
.
num_heads
,
self
.
head_size
)
if
key
is
not
None
:
if
key
is
not
None
:
assert
value
is
not
None
assert
value
is
not
None
...
...
vllm/attention/backends/utils.py
View file @
a99300bd
...
@@ -585,7 +585,7 @@ def get_num_prefill_decode_query_kv_tokens(
...
@@ -585,7 +585,7 @@ def get_num_prefill_decode_query_kv_tokens(
Raises:
Raises:
AssertionError: If the number of encoder tokens in `attn_metadata`
AssertionError: If the number of encoder tokens in `attn_metadata`
is `None` when required for the calculations.
is `None` when required for the calculations.
"""
"""
num_prefill_query_tokens
=
0
num_prefill_query_tokens
=
0
num_decode_query_tokens
=
0
num_decode_query_tokens
=
0
...
...
vllm/attention/backends/xformers.py
View file @
a99300bd
...
@@ -439,6 +439,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
...
@@ -439,6 +439,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
attn_metadata
:
"XFormersMetadata"
,
attn_metadata
:
"XFormersMetadata"
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_block_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
"""Forward pass with xFormers and PagedAttention.
"""Forward pass with xFormers and PagedAttention.
...
@@ -477,21 +478,22 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
...
@@ -477,21 +478,22 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
max_encoder_seq_len)
max_encoder_seq_len)
Args:
Args:
layer: Attention layer instance.
query: shape = [num_tokens, num_heads * head_size]
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size * num_kv_heads * head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
for profiling run.
attn_metadata: Metadata for attention.
attn_metadata: Metadata for attention.
attn_type: Select attention type, between encoder attention,
output: Optional output tensor.
decoder self-attention, or encoder/decoder cross-
output_scale: Optional output scale tensor.
attention. Defaults to decoder self-attention,
output_block_scale: Optional output block scale tensor.
which is the vLLM default generally
Returns:
Returns:
shape = [num_tokens, num_heads * head_size]
shape = [num_tokens, num_heads * head_size]
"""
"""
if
output_scale
is
not
None
:
if
output_scale
is
not
None
or
output_block_scale
is
not
None
:
raise
NotImplementedError
(
raise
NotImplementedError
(
"fused output quantization is not yet supported"
"fused output quantization is not yet supported"
" for XFormersImpl"
)
" for XFormersImpl"
)
...
@@ -654,7 +656,6 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
...
@@ -654,7 +656,6 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
for API spec.
for API spec.
Args:
Args:
output: shape = [num_prefill_tokens, num_heads, head_size]
query: shape = [num_prefill_tokens, num_heads, head_size]
query: shape = [num_prefill_tokens, num_heads, head_size]
key: shape = [num_prefill_tokens, num_kv_heads, head_size]
key: shape = [num_prefill_tokens, num_kv_heads, head_size]
value: shape = [num_prefill_tokens, num_kv_heads, head_size]
value: shape = [num_prefill_tokens, num_kv_heads, head_size]
...
...
vllm/attention/layer.py
View file @
a99300bd
...
@@ -18,6 +18,7 @@ from vllm.distributed.kv_transfer import (get_kv_transfer_group,
...
@@ -18,6 +18,7 @@ from vllm.distributed.kv_transfer import (get_kv_transfer_group,
is_v1_kv_transfer_group
)
is_v1_kv_transfer_group
)
from
vllm.forward_context
import
ForwardContext
,
get_forward_context
from
vllm.forward_context
import
ForwardContext
,
get_forward_context
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention_layer_base
import
AttentionLayerBase
from
vllm.model_executor.layers.linear
import
UnquantizedLinearMethod
from
vllm.model_executor.layers.linear
import
UnquantizedLinearMethod
from
vllm.model_executor.layers.quantization.base_config
import
(
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
QuantizationConfig
)
...
@@ -54,7 +55,7 @@ def check_xformers_availability():
...
@@ -54,7 +55,7 @@ def check_xformers_availability():
return
USE_XFORMERS_OPS
return
USE_XFORMERS_OPS
class
Attention
(
nn
.
Module
):
class
Attention
(
nn
.
Module
,
AttentionLayerBase
):
"""Attention layer.
"""Attention layer.
This class takes query, key, and value tensors as input. The input tensors
This class takes query, key, and value tensors as input. The input tensors
...
@@ -128,11 +129,17 @@ class Attention(nn.Module):
...
@@ -128,11 +129,17 @@ class Attention(nn.Module):
self
.
_q_scale
=
torch
.
tensor
(
1.0
,
dtype
=
torch
.
float32
)
self
.
_q_scale
=
torch
.
tensor
(
1.0
,
dtype
=
torch
.
float32
)
self
.
_prob_scale
=
torch
.
tensor
(
1.0
,
dtype
=
torch
.
float32
)
self
.
_prob_scale
=
torch
.
tensor
(
1.0
,
dtype
=
torch
.
float32
)
# We also keep the float32 versions of k/v_scale for attention
# We also keep q/k/v_scale on host (cpu) memory for attention
# backends that don't support tensors (Flashinfer)
# backends that require the scales to be on host instead of on device.
# e.g. Flashinfer
self
.
_q_scale_float
=
1.0
self
.
_k_scale_float
=
1.0
self
.
_k_scale_float
=
1.0
self
.
_v_scale_float
=
1.0
self
.
_v_scale_float
=
1.0
# The output scale on host memory. This should be the input scale of
# the quant op after this attention layer.
self
.
_o_scale_float
:
Optional
[
float
]
=
None
self
.
use_mla
=
use_mla
self
.
use_mla
=
use_mla
self
.
num_heads
=
num_heads
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
head_size
=
head_size
...
@@ -183,8 +190,7 @@ class Attention(nn.Module):
...
@@ -183,8 +190,7 @@ class Attention(nn.Module):
# torch.compile works by registering the attention as one giant
# torch.compile works by registering the attention as one giant
# opaque custom op. For other platforms, we directly call them
# opaque custom op. For other platforms, we directly call them
# and let torch.compile handle them.
# and let torch.compile handle them.
self
.
use_direct_call
=
not
current_platform
.
is_cuda_alike
(
self
.
use_direct_call
=
not
current_platform
.
opaque_attention_op
()
)
and
not
current_platform
.
is_cpu
()
self
.
use_output
=
self
.
attn_backend
.
accept_output_buffer
self
.
use_output
=
self
.
attn_backend
.
accept_output_buffer
compilation_config
=
get_current_vllm_config
().
compilation_config
compilation_config
=
get_current_vllm_config
().
compilation_config
...
@@ -291,6 +297,7 @@ class Attention(nn.Module):
...
@@ -291,6 +297,7 @@ class Attention(nn.Module):
self
.
_q_scale
.
copy_
(
torch
.
abs
(
query
).
max
()
/
self
.
q_range
)
self
.
_q_scale
.
copy_
(
torch
.
abs
(
query
).
max
()
/
self
.
q_range
)
self
.
_k_scale
.
copy_
(
torch
.
abs
(
key
).
max
()
/
self
.
k_range
)
self
.
_k_scale
.
copy_
(
torch
.
abs
(
key
).
max
()
/
self
.
k_range
)
self
.
_v_scale
.
copy_
(
torch
.
abs
(
value
).
max
()
/
self
.
v_range
)
self
.
_v_scale
.
copy_
(
torch
.
abs
(
value
).
max
()
/
self
.
v_range
)
self
.
_q_scale_float
=
self
.
_q_scale
.
item
()
self
.
_k_scale_float
=
self
.
_k_scale
.
item
()
self
.
_k_scale_float
=
self
.
_k_scale
.
item
()
self
.
_v_scale_float
=
self
.
_v_scale
.
item
()
self
.
_v_scale_float
=
self
.
_v_scale
.
item
()
# We only calculate the scales once
# We only calculate the scales once
...
@@ -488,6 +495,7 @@ def unified_attention_with_output(
...
@@ -488,6 +495,7 @@ def unified_attention_with_output(
output
:
torch
.
Tensor
,
output
:
torch
.
Tensor
,
layer_name
:
str
,
layer_name
:
str
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_block_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
None
:
)
->
None
:
wait_for_kv_layer_from_connector
(
layer_name
)
wait_for_kv_layer_from_connector
(
layer_name
)
forward_context
:
ForwardContext
=
get_forward_context
()
forward_context
:
ForwardContext
=
get_forward_context
()
...
@@ -503,7 +511,8 @@ def unified_attention_with_output(
...
@@ -503,7 +511,8 @@ def unified_attention_with_output(
kv_cache
,
kv_cache
,
attn_metadata
,
attn_metadata
,
output
=
output
,
output
=
output
,
output_scale
=
output_scale
)
output_scale
=
output_scale
,
output_block_scale
=
output_block_scale
)
maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
...
@@ -515,6 +524,7 @@ def unified_attention_with_output_fake(
...
@@ -515,6 +524,7 @@ def unified_attention_with_output_fake(
output
:
torch
.
Tensor
,
output
:
torch
.
Tensor
,
layer_name
:
str
,
layer_name
:
str
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_block_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
None
:
)
->
None
:
return
return
...
@@ -522,7 +532,7 @@ def unified_attention_with_output_fake(
...
@@ -522,7 +532,7 @@ def unified_attention_with_output_fake(
direct_register_custom_op
(
direct_register_custom_op
(
op_name
=
"unified_attention_with_output"
,
op_name
=
"unified_attention_with_output"
,
op_func
=
unified_attention_with_output
,
op_func
=
unified_attention_with_output
,
mutates_args
=
[
"output"
],
mutates_args
=
[
"output"
,
"output_block_scale"
],
fake_impl
=
unified_attention_with_output_fake
,
fake_impl
=
unified_attention_with_output_fake
,
dispatch_key
=
current_platform
.
dispatch_key
,
dispatch_key
=
current_platform
.
dispatch_key
,
)
)
vllm/attention/layers/chunked_local_attention.py
View file @
a99300bd
...
@@ -6,12 +6,13 @@ from typing import List, Optional
...
@@ -6,12 +6,13 @@ from typing import List, Optional
import
torch
import
torch
from
vllm
import
envs
from
vllm
import
envs
from
vllm.attention.backends.abstract
import
AttentionBackend
from
vllm.attention.backends.abstract
import
(
AttentionBackend
,
AttentionMetadata
)
from
vllm.attention.selector
import
get_attn_backend
from
vllm.attention.selector
import
get_attn_backend
from
vllm.config
import
CacheConfig
,
QuantizationConfig
from
vllm.config
import
CacheConfig
,
QuantizationConfig
from
vllm.v1.attention.backends.utils
import
(
from
vllm.v1.attention.backends.utils
import
(
CommonAttentionMetadata
,
make_local_attention_virtual_batches
,
CommonAttentionMetadata
,
make_local_attention_virtual_batches
,
subclass_attention_backend
,
subclass_attention_metadata_builder
)
subclass_attention_backend
)
from
..layer
import
Attention
from
..layer
import
Attention
...
@@ -24,21 +25,23 @@ def create_chunked_local_attention_backend(
...
@@ -24,21 +25,23 @@ def create_chunked_local_attention_backend(
)
->
type
[
AttentionBackend
]:
)
->
type
[
AttentionBackend
]:
prefix
=
f
"ChunkedLocalAttention_
{
attention_chunk_size
}
_
{
block_size
}
_"
prefix
=
f
"ChunkedLocalAttention_
{
attention_chunk_size
}
_
{
block_size
}
_"
def
build_preprocess_fn
(
cm
:
CommonAttentionMetadata
):
underlying_builder
=
underlying_attn_backend
.
get_builder_cls
()
return
make_local_attention_virtual_batches
(
attention_chunk_size
,
cm
,
block_size
)
class
ChunkedLocalAttentionBuilder
(
underlying_builder
):
# type: ignore
def
build
(
self
,
common_prefix_len
:
int
,
common_attn_metadata
:
CommonAttentionMetadata
,
fast_build
:
bool
=
False
)
->
AttentionMetadata
:
common_attn_metadata
=
make_local_attention_virtual_batches
(
attention_chunk_size
,
common_attn_metadata
,
block_size
)
return
super
().
build
(
common_prefix_len
,
common_attn_metadata
,
fast_build
)
# Dynamically create a new attention backend that wraps the
# underlying attention backend but applies
# `make_local_attention_virtual_batches` before calling `build(...)`
builder_cls
=
subclass_attention_metadata_builder
(
name_prefix
=
prefix
,
builder_cls
=
underlying_attn_backend
.
get_builder_cls
(),
build_preprocess_fn
=
build_preprocess_fn
)
attn_backend
=
subclass_attention_backend
(
attn_backend
=
subclass_attention_backend
(
name_prefix
=
prefix
,
name_prefix
=
prefix
,
attention_backend_cls
=
underlying_attn_backend
,
attention_backend_cls
=
underlying_attn_backend
,
builder_cls
=
b
uilder
_cls
)
builder_cls
=
ChunkedLocalAttentionB
uilder
)
return
attn_backend
return
attn_backend
...
...
vllm/attention/layers/encoder_only_attention.py
0 → 100644
View file @
a99300bd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
functools
from
copy
import
copy
from
typing
import
Optional
import
torch
from
vllm
import
envs
from
vllm.attention.backends.abstract
import
(
AttentionBackend
,
AttentionMetadata
,
AttentionType
)
from
vllm.attention.layer
import
Attention
from
vllm.attention.selector
import
get_attn_backend
from
vllm.config
import
CacheConfig
from
vllm.v1.attention.backends.utils
import
(
CommonAttentionMetadata
,
subclass_attention_backend
)
@functools.lru_cache
def create_encoder_only_attention_backend(
    underlying_attn_backend: AttentionBackend,
) -> type[AttentionBackend]:
    """Wrap an attention backend so its metadata is built as non-causal.

    The returned backend is produced by `subclass_attention_backend` from
    `underlying_attn_backend`, with the metadata builder replaced by a
    subclass that clears the `causal` flag before delegating to the
    underlying builder. Results are memoized per `underlying_attn_backend`
    by `functools.lru_cache`.

    Args:
        underlying_attn_backend: The backend whose builder class is extended.

    Returns:
        The value returned by `subclass_attention_backend` for the
        "EncoderOnlyAttention_" name prefix.
    """
    base_builder_cls = underlying_attn_backend.get_builder_cls()

    class EncoderOnlyAttentionBuilder(base_builder_cls):  # type: ignore

        def build(
            self,
            common_prefix_len: int,
            common_attn_metadata: CommonAttentionMetadata,
            fast_build: bool = False,
        ) -> AttentionMetadata:
            # Work on a shallow copy so the caller's metadata object keeps
            # its own `causal` value.
            non_causal_metadata = copy(common_attn_metadata)
            non_causal_metadata.causal = False
            return super().build(common_prefix_len, non_causal_metadata,
                                 fast_build)

    return subclass_attention_backend(
        name_prefix="EncoderOnlyAttention_",
        attention_backend_cls=underlying_attn_backend,
        builder_cls=EncoderOnlyAttentionBuilder,
    )
class EncoderOnlyAttention(Attention):
    """
    Attention layer fixed to `AttentionType.ENCODER_ONLY`.

    Encoder attention is a special case that doesn't need a KV Cache.
    """

    def __init__(self,
                 num_heads: int,
                 head_size: int,
                 scale: float,
                 cache_config: Optional[CacheConfig] = None,
                 attn_type: Optional[str] = None,
                 **kwargs):
        """Select the backend and initialise the parent `Attention` layer.

        Args:
            num_heads: Forwarded to `Attention.__init__`.
            head_size: Forwarded to `Attention.__init__`; also used to pick
                the attention backend when `envs.VLLM_USE_V1` is set.
            scale: Forwarded to `Attention.__init__`.
            cache_config: Optional cache configuration. When omitted, the
                backend is selected with a cache dtype of "auto" and a
                block size of 16.
            attn_type: If given, must equal `AttentionType.ENCODER_ONLY`.
            **kwargs: Forwarded unchanged to `Attention.__init__`.
        """
        default_dtype = torch.get_default_dtype()

        if cache_config is None:
            kv_cache_dtype, block_size = "auto", 16
        else:
            kv_cache_dtype = cache_config.cache_dtype
            block_size = cache_config.block_size

        # in v0 encoder only attention is handled inside the backends,
        # so no wrapped backend is passed to the parent in that case.
        attn_backend = None
        if envs.VLLM_USE_V1:
            attn_backend = create_encoder_only_attention_backend(
                get_attn_backend(head_size, default_dtype, kv_cache_dtype,
                                 block_size))

        assert attn_type is None or attn_type == AttentionType.ENCODER_ONLY, \
            "EncoderOnlyAttention only supports AttentionType.ENCODER_ONLY"

        super().__init__(
            num_heads=num_heads,
            head_size=head_size,
            scale=scale,
            cache_config=cache_config,
            attn_backend=attn_backend,
            attn_type=AttentionType.ENCODER_ONLY,
            **kwargs,
        )
vllm/attention/ops/flashmla.py
View file @
a99300bd
...
@@ -75,8 +75,8 @@ def flash_mla_with_kvcache(
...
@@ -75,8 +75,8 @@ def flash_mla_with_kvcache(
num_splits
:
torch
.
Tensor
,
num_splits
:
torch
.
Tensor
,
softmax_scale
:
Optional
[
float
]
=
None
,
softmax_scale
:
Optional
[
float
]
=
None
,
causal
:
bool
=
False
,
causal
:
bool
=
False
,
k_
scale
=
None
,
de
scale
_q
:
Optional
[
torch
.
Tensor
]
=
None
,
kv_cache_dtype
=
"auto"
,
descale_k
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""
"""
Arguments:
Arguments:
...
@@ -91,6 +91,8 @@ def flash_mla_with_kvcache(
...
@@ -91,6 +91,8 @@ def flash_mla_with_kvcache(
softmax_scale: float. The scaling of QK^T before applying softmax.
softmax_scale: float. The scaling of QK^T before applying softmax.
Default to 1 / sqrt(head_dim).
Default to 1 / sqrt(head_dim).
causal: bool. Whether to apply causal attention mask.
causal: bool. Whether to apply causal attention mask.
descale_q: (batch_size), torch.float32. Descaling factors for Q.
descale_k: (batch_size), torch.float32. Descaling factors for K.
Return:
Return:
out: (batch_size, seq_len_q, num_heads_q, head_dim_v).
out: (batch_size, seq_len_q, num_heads_q, head_dim_v).
...
@@ -99,22 +101,22 @@ def flash_mla_with_kvcache(
...
@@ -99,22 +101,22 @@ def flash_mla_with_kvcache(
if
softmax_scale
is
None
:
if
softmax_scale
is
None
:
softmax_scale
=
q
.
shape
[
-
1
]
**
(
-
0.5
)
softmax_scale
=
q
.
shape
[
-
1
]
**
(
-
0.5
)
if
current_platform
.
is_rocm
():
if
current_platform
.
is_rocm
():
if
kv_cache_dtype
==
"fp8"
:
#
if kv_cache_dtype == "fp8":
out
,
softmax_lse
=
flash_mla_cuda
.
fwd_kvcache_quantization_mla
(
#
out, softmax_lse = flash_mla_cuda.fwd_kvcache_quantization_mla(
q
,
#
q,
k_cache
,
#
k_cache,
None
,
#
None,
head_dim_v
,
#
head_dim_v,
cache_seqlens
,
#
cache_seqlens,
block_table
,
#
block_table,
softmax_scale
,
#
softmax_scale,
causal
,
#
causal,
tile_scheduler_metadata
,
#
tile_scheduler_metadata,
num_splits
,
#
num_splits,
k_scale
,
#
k_scale,
"fp8_e4m3"
,
#
"fp8_e4m3",
)
#
)
return
out
,
softmax_lse
#
return out, softmax_lse
out
,
softmax_lse
=
flash_mla_cuda
.
fwd_kvcache_mla
(
out
,
softmax_lse
=
flash_mla_cuda
.
fwd_kvcache_mla
(
q
,
q
,
k_cache
,
k_cache
,
...
@@ -126,12 +128,13 @@ def flash_mla_with_kvcache(
...
@@ -126,12 +128,13 @@ def flash_mla_with_kvcache(
causal
,
causal
,
tile_scheduler_metadata
,
tile_scheduler_metadata
,
num_splits
,
num_splits
,
# descale_q,
# descale_k,
)
)
else
:
else
:
out
,
softmax_lse
=
torch
.
ops
.
_flashmla_C
.
fwd_kvcache_mla
(
out
,
softmax_lse
=
torch
.
ops
.
_flashmla_C
.
fwd_kvcache_mla
(
q
,
q
,
k_cache
,
k_cache
,
None
,
head_dim_v
,
head_dim_v
,
cache_seqlens
,
cache_seqlens
,
block_table
,
block_table
,
...
@@ -139,6 +142,8 @@ def flash_mla_with_kvcache(
...
@@ -139,6 +142,8 @@ def flash_mla_with_kvcache(
causal
,
causal
,
tile_scheduler_metadata
,
tile_scheduler_metadata
,
num_splits
,
num_splits
,
descale_q
,
descale_k
,
)
)
return
out
,
softmax_lse
return
out
,
softmax_lse
...
...
vllm/beam_search.py
View file @
a99300bd
...
@@ -18,7 +18,7 @@ class BeamSearchSequence:
...
@@ -18,7 +18,7 @@ class BeamSearchSequence:
The text field is optional and will only be filled when the sequence is
The text field is optional and will only be filled when the sequence is
about to be returned to the user.
about to be returned to the user.
"""
"""
# The tokens include
s
the prompt.
# The tokens include the prompt.
tokens
:
list
[
int
]
tokens
:
list
[
int
]
logprobs
:
list
[
dict
[
int
,
Logprob
]]
logprobs
:
list
[
dict
[
int
,
Logprob
]]
lora_request
:
Optional
[
LoRARequest
]
=
None
lora_request
:
Optional
[
LoRARequest
]
=
None
...
...
vllm/benchmarks/datasets.py
View file @
a99300bd
...
@@ -11,17 +11,21 @@ generation. Supported dataset types include:
...
@@ -11,17 +11,21 @@ generation. Supported dataset types include:
- HuggingFace
- HuggingFace
- VisionArena
- VisionArena
"""
"""
import
ast
import
base64
import
base64
import
io
import
io
import
json
import
json
import
logging
import
logging
import
math
import
random
import
random
from
abc
import
ABC
,
abstractmethod
from
abc
import
ABC
,
abstractmethod
from
collections.abc
import
Mapping
from
collections.abc
import
Iterator
,
Mapping
from
contextlib
import
suppress
from
copy
import
deepcopy
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
functools
import
cache
from
functools
import
cache
from
io
import
BytesIO
from
io
import
BytesIO
from
typing
import
Any
,
Callable
,
Optional
,
Union
from
typing
import
Any
,
Callable
,
Optional
,
Union
,
cast
import
numpy
as
np
import
numpy
as
np
from
PIL
import
Image
from
PIL
import
Image
...
@@ -69,13 +73,14 @@ class SampleRequest:
...
@@ -69,13 +73,14 @@ class SampleRequest:
Represents a single inference request for benchmarking.
Represents a single inference request for benchmarking.
"""
"""
prompt
:
Union
[
str
,
Any
]
prompt
:
Union
[
str
,
list
[
str
]
]
prompt_len
:
int
prompt_len
:
int
expected_output_len
:
int
expected_output_len
:
int
multi_modal_data
:
Optional
[
multi_modal_data
:
Optional
[
Union
[
MultiModalDataDict
,
dict
,
list
[
dict
]]
Union
[
MultiModalDataDict
,
dict
,
list
[
dict
]]
]
=
None
]
=
None
lora_request
:
Optional
[
LoRARequest
]
=
None
lora_request
:
Optional
[
LoRARequest
]
=
None
request_id
:
Optional
[
str
]
=
None
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
...
@@ -112,7 +117,9 @@ class BenchmarkDataset(ABC):
...
@@ -112,7 +117,9 @@ class BenchmarkDataset(ABC):
def
apply_multimodal_chat_transformation
(
def
apply_multimodal_chat_transformation
(
self
,
self
,
prompt
:
str
,
prompt
:
str
,
mm_content
:
Optional
[
MultiModalDataDict
]
=
None
)
->
list
[
dict
]:
mm_content
:
Optional
[
Union
[
MultiModalDataDict
,
dict
,
list
[
dict
]]
]
=
None
)
->
list
[
dict
]:
"""
"""
Transform a prompt and optional multimodal content into a chat format.
Transform a prompt and optional multimodal content into a chat format.
This method is used for chat models that expect a specific conversation
This method is used for chat models that expect a specific conversation
...
@@ -120,7 +127,15 @@ class BenchmarkDataset(ABC):
...
@@ -120,7 +127,15 @@ class BenchmarkDataset(ABC):
"""
"""
content
=
[{
"text"
:
prompt
,
"type"
:
"text"
}]
content
=
[{
"text"
:
prompt
,
"type"
:
"text"
}]
if
mm_content
is
not
None
:
if
mm_content
is
not
None
:
content
.
append
(
mm_content
)
if
isinstance
(
mm_content
,
list
):
content
.
extend
(
cast
(
list
[
dict
[
str
,
Any
]],
mm_content
))
elif
isinstance
(
mm_content
,
dict
):
content
.
append
(
mm_content
)
else
:
raise
TypeError
(
"Could not process multimodal content of type: "
+
f
"
{
type
(
mm_content
)
}
"
)
return
[{
"role"
:
"user"
,
"content"
:
content
}]
return
[{
"role"
:
"user"
,
"content"
:
content
}]
def
load_data
(
self
)
->
None
:
def
load_data
(
self
)
->
None
:
...
@@ -183,7 +198,8 @@ class BenchmarkDataset(ABC):
...
@@ -183,7 +198,8 @@ class BenchmarkDataset(ABC):
@
abstractmethod
@
abstractmethod
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
)
->
list
[
SampleRequest
]:
num_requests
:
int
,
request_id_prefix
:
str
=
""
)
->
list
[
SampleRequest
]:
"""
"""
Abstract method to generate sample requests from the dataset.
Abstract method to generate sample requests from the dataset.
...
@@ -194,6 +210,8 @@ class BenchmarkDataset(ABC):
...
@@ -194,6 +210,8 @@ class BenchmarkDataset(ABC):
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
for processing the dataset's text.
for processing the dataset's text.
num_requests (int): The number of sample requests to generate.
num_requests (int): The number of sample requests to generate.
request_id_prefix (str) The prefix of request_id.
Returns:
Returns:
list[SampleRequest]: A list of sample requests generated from the
list[SampleRequest]: A list of sample requests generated from the
...
@@ -201,8 +219,12 @@ class BenchmarkDataset(ABC):
...
@@ -201,8 +219,12 @@ class BenchmarkDataset(ABC):
"""
"""
raise
NotImplementedError
(
"sample must be implemented in subclasses."
)
raise
NotImplementedError
(
"sample must be implemented in subclasses."
)
def
maybe_oversample_requests
(
self
,
requests
:
list
[
SampleRequest
],
def
maybe_oversample_requests
(
num_requests
:
int
)
->
None
:
self
,
requests
:
list
[
SampleRequest
],
num_requests
:
int
,
request_id_prefix
:
str
=
""
,
)
->
None
:
"""
"""
Oversamples the list of requests if its size is less than the desired
Oversamples the list of requests if its size is less than the desired
number.
number.
...
@@ -211,11 +233,17 @@ class BenchmarkDataset(ABC):
...
@@ -211,11 +233,17 @@ class BenchmarkDataset(ABC):
requests (List[SampleRequest]): The current list of sampled
requests (List[SampleRequest]): The current list of sampled
requests.
requests.
num_requests (int): The target number of requests.
num_requests (int): The target number of requests.
request_id_prefix (str) The prefix of the request ids.
"""
"""
if
len
(
requests
)
<
num_requests
:
if
len
(
requests
)
<
num_requests
:
random
.
seed
(
self
.
random_seed
)
random
.
seed
(
self
.
random_seed
)
additional
=
random
.
choices
(
requests
,
additional
=
deepcopy
(
k
=
num_requests
-
len
(
requests
))
random
.
choices
(
requests
,
k
=
num_requests
-
len
(
requests
))
)
for
i
in
range
(
len
(
additional
)):
req
=
additional
[
i
]
req
.
request_id
=
request_id_prefix
+
str
(
len
(
requests
)
+
i
)
requests
.
extend
(
additional
)
requests
.
extend
(
additional
)
logger
.
info
(
"Oversampled requests to reach %d total samples."
,
logger
.
info
(
"Oversampled requests to reach %d total samples."
,
num_requests
)
num_requests
)
...
@@ -266,7 +294,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
...
@@ -266,7 +294,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
"""
"""
Process a single image input and return a multimedia content dictionary.
Process a single image input and return a multimedia content dictionary.
Supports th
ree
input types:
Supports th
e following
input types:
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
containing raw image data. - Loads the bytes as a PIL.Image.Image.
containing raw image data. - Loads the bytes as a PIL.Image.Image.
...
@@ -306,94 +334,592 @@ def process_image(image: Any) -> Mapping[str, Any]:
...
@@ -306,94 +334,592 @@ def process_image(image: Any) -> Mapping[str, Any]:
" or str or dictionary with raw image bytes."
)
" or str or dictionary with raw image bytes."
)
def
process_video
(
video
:
Any
)
->
Mapping
[
str
,
Any
]:
"""
Process a single video input and return a multimedia content dictionary.
Supports the following input types:
1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
containing raw video data.
2. String input: - Treats the string as a URL or local file path. -
Prepends "file://" if the string doesn't start with "http://" or
"file://". - Returns a dictionary with the image URL.
Raises:
ValueError: If the input is not a supported type.
"""
if
isinstance
(
video
,
dict
)
and
'bytes'
in
video
:
video_bytes
=
video
[
'bytes'
]
video_base64
=
base64
.
b64encode
(
video_bytes
).
decode
(
"utf-8"
)
return
{
"type"
:
"video_url"
,
"video_url"
:
{
"url"
:
f
"data:video/mp4;base64,
{
video_base64
}
"
},
}
if
isinstance
(
video
,
str
):
video_url
=
(
video
if
video
.
startswith
(
(
"http://"
,
"file://"
))
else
f
"file://
{
video
}
"
)
return
{
"type"
:
"video_url"
,
"video_url"
:
{
"url"
:
video_url
}}
raise
ValueError
(
f
"Invalid video input
{
video
}
. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."
# noqa: E501
)
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# Random Dataset Implementation (Synthetic Data)
# Random Dataset Implementation (Synthetic Data)
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
class
RandomDataset
(
BenchmarkDataset
):
class
RandomDataset
(
BenchmarkDataset
):
"""
Synthetic text-only dataset for serving/throughput benchmarks.
Strategy:
- Sample input/output token lengths per request from integer-uniform ranges
around configured means (controlled by range_ratio).
- Prepend a fixed random prefix of length prefix_len.
- Generate the remaining tokens as a reproducible sequence:
(offset + index + arange(input_len)) % vocab_size.
- Decode then re-encode/truncate to ensure prompt token counts match.
- Uses numpy.default_rng seeded with random_seed for reproducible sampling.
"""
# Default values copied from benchmark_serving.py for the random dataset.
# Default values copied from benchmark_serving.py for the random dataset.
DEFAULT_PREFIX_LEN
=
0
DEFAULT_PREFIX_LEN
=
0
DEFAULT_RANGE_RATIO
=
0.0
DEFAULT_RANGE_RATIO
=
0.0
DEFAULT_INPUT_LEN
=
1024
DEFAULT_INPUT_LEN
=
1024
DEFAULT_OUTPUT_LEN
=
128
DEFAULT_OUTPUT_LEN
=
128
def
__init__
(
def
__init__
(
self
,
**
kwargs
)
->
None
:
self
,
**
kwargs
,
)
->
None
:
super
().
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
random
.
seed
(
self
.
random_seed
)
# Use numpy's default_rng for deterministic sampling
np
.
random
.
seed
(
self
.
random_seed
)
# Do not use random.seed() or np.random.seed() elsewhere in this class.
# This ensures that the RNG is isolated from global RNG state.
self
.
_rng
=
np
.
random
.
default_rng
(
self
.
random_seed
)
def
sample
(
def
sample
(
self
,
self
,
tokenizer
:
PreTrainedTokenizerBase
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
num_requests
:
int
,
request_id_prefix
:
str
=
""
,
prefix_len
:
int
=
DEFAULT_PREFIX_LEN
,
prefix_len
:
int
=
DEFAULT_PREFIX_LEN
,
range_ratio
:
float
=
DEFAULT_RANGE_RATIO
,
range_ratio
:
float
=
DEFAULT_RANGE_RATIO
,
input_len
:
int
=
DEFAULT_INPUT_LEN
,
input_len
:
int
=
DEFAULT_INPUT_LEN
,
output_len
:
int
=
DEFAULT_OUTPUT_LEN
,
output_len
:
int
=
DEFAULT_OUTPUT_LEN
,
batchsize
:
int
=
1
,
**
kwargs
,
**
kwargs
,
)
->
list
[
SampleRequest
]:
)
->
list
[
SampleRequest
]:
# Enforce range_ratio < 1
assert
range_ratio
<
1.0
,
(
input_lens
,
output_lens
,
offsets
=
self
.
get_sampling_params
(
"random_range_ratio must be < 1.0 to ensure a valid sampling range"
num_requests
,
range_ratio
,
input_len
,
output_len
,
tokenizer
)
)
# Generate prefix once
prefix_token_ids
=
self
.
get_prefix
(
tokenizer
,
prefix_len
)
vocab_size
=
tokenizer
.
vocab_size
vocab_size
=
tokenizer
.
vocab_size
num_special_tokens
=
tokenizer
.
num_special_tokens_to_add
()
real_input_len
=
input_len
-
num_special_tokens
prefix_token_ids
=
(
np
.
random
.
randint
(
requests
=
[]
0
,
vocab_size
,
size
=
prefix_len
).
tolist
()
if
prefix_len
>
0
else
[])
for
i
in
range
(
num_requests
):
prompt
,
total_input_len
=
self
.
generate_token_sequence
(
tokenizer
=
tokenizer
,
prefix_token_ids
=
prefix_token_ids
,
prefix_len
=
prefix_len
,
vocab_size
=
vocab_size
,
input_len
=
int
(
input_lens
[
i
]),
offset
=
int
(
offsets
[
i
]),
index
=
i
,
)
requests
.
append
(
SampleRequest
(
prompt
=
prompt
,
prompt_len
=
total_input_len
,
expected_output_len
=
int
(
output_lens
[
i
]),
request_id
=
request_id_prefix
+
str
(
i
),
)
)
# only used for embeddings benchmark.
if
batchsize
>
1
:
batch_requests
=
[]
# Create batched requests
for
i
in
range
(
0
,
num_requests
,
batchsize
):
batch
=
requests
[
i
:
i
+
batchsize
]
batch_requests
.
append
(
SampleRequest
(
prompt
=
[
req
.
prompt
for
req
in
batch
],
prompt_len
=
sum
(
req
.
prompt_len
for
req
in
batch
),
expected_output_len
=
0
,
request_id
=
request_id_prefix
+
str
(
i
//
batchsize
),
)
)
requests
=
batch_requests
return
requests
def
get_prefix
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
prefix_len
:
int
)
->
list
[
int
]:
"""
Get the prefix for the dataset.
"""
return
(
self
.
_rng
.
integers
(
0
,
tokenizer
.
vocab_size
,
size
=
prefix_len
).
tolist
()
if
prefix_len
>
0
else
[]
)
# New sampling logic: [X * (1 - b), X * (1 + b)]
def
get_sampling_params
(
input_low
=
int
(
real_input_len
*
(
1
-
range_ratio
))
self
,
input_high
=
int
(
real_input_len
*
(
1
+
range_ratio
))
num_requests
:
int
,
output_low
=
int
(
output_len
*
(
1
-
range_ratio
))
range_ratio
:
float
,
output_high
=
int
(
output_len
*
(
1
+
range_ratio
))
input_len
:
int
,
output_len
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
)
->
tuple
[
np
.
ndarray
,
np
.
ndarray
,
np
.
ndarray
]:
"""
Get the sampling parameters for the dataset.
"""
# Enforce range_ratio < 1
if
not
(
0.0
<=
range_ratio
<
1.0
):
raise
ValueError
(
"range_ratio must be in [0, 1)."
)
num_special_tokens
=
int
(
tokenizer
.
num_special_tokens_to_add
())
real_input_len
=
max
(
0
,
int
(
input_len
)
-
num_special_tokens
)
# Bounds use floor for low and ceil for high
input_low
=
math
.
floor
(
real_input_len
*
(
1
-
range_ratio
))
input_high
=
math
.
ceil
(
real_input_len
*
(
1
+
range_ratio
))
output_low
=
math
.
floor
(
output_len
*
(
1
-
range_ratio
))
output_high
=
math
.
ceil
(
output_len
*
(
1
+
range_ratio
))
# Ensure the lower bound for output length is at least 1 to
# prevent sampling 0 tokens.
output_low
=
max
(
output_low
,
1
)
if
input_low
>
input_high
:
raise
ValueError
(
"Invalid input sampling interval: "
f
"low=
{
input_low
}
> high=
{
input_high
}
"
)
if
output_low
>
output_high
:
raise
ValueError
(
"Invalid output sampling interval: "
f
"low=
{
output_low
}
> high=
{
output_high
}
"
)
# Add logging for debugging
logger
.
info
(
logger
.
info
(
"Sampling input_len from [%s, %s] and output_len from [%s, %s]"
,
"Sampling input_len from [%s, %s] and output_len from [%s, %s]"
,
input_low
,
input_high
,
output_low
,
output_high
)
input_low
,
input_high
,
output_low
,
output_high
,
)
input_lens
=
np
.
random
.
randint
(
input_low
,
input_lens
=
self
.
_rng
.
integers
(
input_low
,
input_high
+
1
,
input_high
+
1
,
size
=
num_requests
)
size
=
num_requests
)
output_lens
=
self
.
_rng
.
integers
(
output_low
,
output_high
+
1
,
output_lens
=
np
.
random
.
randint
(
output_low
,
size
=
num_requests
)
output_high
+
1
,
offsets
=
self
.
_rng
.
integers
(
0
,
tokenizer
.
vocab_size
,
size
=
num_requests
)
size
=
num_requests
)
offsets
=
np
.
random
.
randint
(
0
,
vocab_size
,
size
=
num_reques
ts
)
return
input_lens
,
output_lens
,
offse
ts
requests
=
[]
def
generate_token_sequence
(
self
,
*
,
tokenizer
:
PreTrainedTokenizerBase
,
prefix_token_ids
:
list
[
int
],
prefix_len
:
int
,
vocab_size
:
int
,
input_len
:
int
,
offset
:
int
,
index
:
int
,
)
->
tuple
[
str
,
int
]:
"""
Returns (prompt, total_input_len).
NOTE: After decoding the prompt we have to encode and decode it again.
This is done because in some cases N consecutive tokens
give a string tokenized into != N number of tokens.
For example for GPT2Tokenizer:
[6880, 6881] -> ['Ġcalls', 'here'] ->
[1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
To avoid uncontrolled change of the prompt length,
the encoded sequence is truncated before being decode again.
"""
# Build the inner sequence by sampling sequentially from the vocab
inner_seq
=
((
offset
+
index
+
np
.
arange
(
input_len
))
%
vocab_size
).
tolist
()
token_sequence
=
prefix_token_ids
+
inner_seq
# Decode, then re-encode and truncate to preserve token count invariants
prompt
=
tokenizer
.
decode
(
token_sequence
)
total_input_len
=
prefix_len
+
int
(
input_len
)
re_encoded_sequence
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
False
)[:
total_input_len
]
prompt
=
tokenizer
.
decode
(
re_encoded_sequence
)
total_input_len
=
len
(
re_encoded_sequence
)
return
prompt
,
total_input_len
# -----------------------------------------------------------------------------
# MultiModalDataset Implementation
# -----------------------------------------------------------------------------
class
RandomMultiModalDataset
(
RandomDataset
):
"""
Synthetic multimodal dataset (text + images) that extends RandomDataset.
Status:
- Images: supported via synthetic RGB data.
- Video: not yet supported (TODO: implement video generation method).
- Audio: not yet supported.
Sampling overview:
1) Number of items per request is sampled uniformly from the integer range
[floor(n·(1−r)), ceil(n·(1+r))], where n is the base count and r is
`num_mm_items_range_ratio` in [0, 1]. r=0 keeps it fixed; r=1 allows 0.
The maximum is further clamped to the sum of per-modality limits.
2) Each item’s modality and shape is sampled from `bucket_config`, a dict
mapping (height, width, num_frames) → probability. We treat
`num_frames`=1 as image and and `num_frames` > 1 as video.
Entries with zero probability are removed and the rest are renormalized
to sum to 1.
3) Per-modality hard caps are enforced via `limit_mm_per_prompt`.
When a modality reaches its cap, all of its buckets are excluded and the
remaining probabilities are renormalized.
Example bucket configuration:
{(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.1}
- Two image buckets (`num_frames`=1) and one video bucket
(`num_frames`=16).
OBS.: Only image sampling is supported for now.
"""
IS_MULTIMODAL
=
True
# NOTE: video sampling is WIP. Setting it to 0.
DEFAULT_LIMIT_MM_PER_PROMPT
=
{
"image"
:
255
,
"video"
:
0
}
DEFAULT_BASE_ITEMS_PER_REQUEST
=
1
DEFAULT_NUM_MM_ITEMS_RANGE_RATIO
=
0.0
DEFAULT_MM_ITEM_BUCKET_CONFIG
=
{
(
256
,
256
,
1
):
0.5
,
(
720
,
1280
,
1
):
0.5
,
(
720
,
1280
,
16
):
0.0
,
}
DEFAULT_ENABLE_MULTIMODAL_CHAT
=
False
def
__init__
(
self
,
**
kwargs
)
->
None
:
super
().
__init__
(
**
kwargs
)
def
generate_synthetic_image
(
self
,
width
:
int
,
height
:
int
)
->
Image
.
Image
:
"""Generate synthetic PIL image with random RGB values.
NOTE: iid pixel sampling results in worst-case compression
(good for stressing I/O), but very unlike real photos.
We could consider a “low-freq” mode (e.g., noise blur)
to emulate network realism instead of max stress.
"""
random_pixels
=
self
.
_rng
.
integers
(
0
,
256
,
(
height
,
width
,
3
),
dtype
=
np
.
uint8
,
)
return
Image
.
fromarray
(
random_pixels
)
def
generate_synthetic_video
(
self
,
width
:
int
,
height
:
int
,
num_frames
:
int
)
->
Any
:
"""Generate synthetic video with random values.
TODO: Finish this method.
"""
raise
NotImplementedError
(
"Video sampling is WIP."
)
def
map_config_to_modality
(
self
,
config
:
tuple
[
int
,
int
,
int
])
->
str
:
"""Map the configuration to the modality."""
if
config
[
-
1
]
==
1
:
return
"image"
elif
config
[
-
1
]
>
1
:
return
"video"
else
:
raise
ValueError
(
f
"Invalid multimodal item configuration:
{
config
}
"
)
def
normalize_bucket_config
(
self
,
bucket_config
:
dict
[
tuple
[
int
,
int
,
int
],
float
])
->
dict
[
tuple
[
int
,
int
,
int
],
float
]:
"""
Remove zero probability entries
and normalize the bucket config to sum to 1.
"""
# Raise error if value is negative
if
any
(
v
<
0
for
v
in
bucket_config
.
values
()):
raise
ValueError
(
"Bucket config values must be non-negative."
)
# Remove zero probability entries
bucket_config
=
{
k
:
v
for
k
,
v
in
bucket_config
.
items
()
if
v
>
0
}
# if bucket config is empty, raise error
if
not
bucket_config
:
raise
ValueError
(
"Got invalid bucket config. "
"Bucket config values must be non-zero."
)
# Normalize the remaining bucket config to sum to 1
total
=
sum
(
bucket_config
.
values
())
return
{
k
:
v
/
total
for
k
,
v
in
bucket_config
.
items
()}
def
generate_mm_item
(
self
,
mm_item_config
:
tuple
[
int
,
int
,
int
],
)
->
Mapping
[
str
,
Any
]:
"""
Create synthetic images and videos and
apply process_image/process_video respectively.
This follows the OpenAI API chat completions
https://github.com/openai/openai-python
"""
if
self
.
map_config_to_modality
(
mm_item_config
)
==
"image"
:
return
process_image
(
self
.
generate_synthetic_image
(
mm_item_config
[
1
],
mm_item_config
[
0
]))
elif
self
.
map_config_to_modality
(
mm_item_config
)
==
"video"
:
return
process_video
(
self
.
generate_synthetic_video
(
mm_item_config
[
1
],
mm_item_config
[
0
],
mm_item_config
[
2
]))
else
:
raise
ValueError
(
f
"Invalid multimodal item configuration: "
f
"
{
mm_item_config
}
"
)
def
get_mm_item_sampling_params
(
self
,
base_items_per_request
:
int
,
num_mm_items_range_ratio
:
float
,
limit_mm_per_prompt
:
dict
[
str
,
int
],
bucket_config
:
dict
[
tuple
[
int
,
int
,
int
],
float
],
)
->
tuple
[
int
,
int
,
dict
[
str
,
int
],
dict
[
tuple
[
int
,
int
,
int
],
float
]]:
"""
Get the sampling parameters for the multimodal items.
"""
# Enforce num_mm_items_range_ratio <= 1
if
not
(
0.0
<=
num_mm_items_range_ratio
<=
1.0
):
raise
ValueError
(
"num_mm_items_range_ratio must be in [0, 1]."
)
# Ensure modalities to sample are in limit_mm_per_prompt
for
k
,
v
in
bucket_config
.
items
():
# get modality from bucket config
modality
=
self
.
map_config_to_modality
(
k
)
if
modality
not
in
limit_mm_per_prompt
:
raise
ValueError
(
f
"Modality
{
modality
}
is not in "
f
"limit_mm_per_prompt: "
f
"
{
limit_mm_per_prompt
.
keys
()
}
"
)
# Remove zero probability entries
# and normalize bucket config to sum to 1
bucket_config
=
self
.
normalize_bucket_config
(
bucket_config
)
logger
.
info
(
"Normalized bucket config: %s"
,
bucket_config
,
)
# Only consider limit per prompt for modalities in bucket config
allowed_modalities
=
{
self
.
map_config_to_modality
(
cfg
)
for
cfg
in
bucket_config
}
limit_mm_per_prompt
=
{
k
:
v
for
k
,
v
in
limit_mm_per_prompt
.
items
()
if
k
in
allowed_modalities
}
if
not
limit_mm_per_prompt
:
raise
ValueError
(
"No valid limits for modalities present in "
"bucket_config."
)
logger
.
info
(
"Updated mm-limit-per-prompt: %s"
,
limit_mm_per_prompt
,
)
# Get max and min num mm items and ensure
# it is at most the sum of limit_mm_per_prompt for all modalities
max_num_mm_items
=
min
(
sum
(
limit_mm_per_prompt
.
values
()),
math
.
ceil
(
base_items_per_request
*
(
1
+
num_mm_items_range_ratio
))
)
# Ensure min num mm items is at least 0
min_num_mm_items
=
max
(
0
,
math
.
floor
(
base_items_per_request
*
(
1
-
num_mm_items_range_ratio
))
)
# Raise error if min num mm items is greater than max num mm items
if
min_num_mm_items
>
max_num_mm_items
:
raise
ValueError
(
f
"Min num mm items is greater than max mm items: "
f
"
{
min_num_mm_items
}
>
{
max_num_mm_items
}
"
)
logger
.
info
(
"Sampling number of multimodal items from [%s, %s]"
,
min_num_mm_items
,
max_num_mm_items
,
)
return
(
min_num_mm_items
,
max_num_mm_items
,
limit_mm_per_prompt
,
bucket_config
,
)
def
get_mm_item_iterator
(
self
,
min_num_mm_items
:
int
,
max_num_mm_items
:
int
,
bucket_config
:
dict
[
tuple
[
int
,
int
,
int
],
float
],
limit_mm_per_prompt
:
dict
[
str
,
int
],
)
->
Iterator
[
tuple
[
int
,
int
,
int
]]:
"""
Iterator over the multimodal items for each request
whose size is between min_num_mm_items and max_num_mm_items.
Loop over the bucket config and sample a multimodal item.
Loop until the number of multimodal items sampled is equal to
request_num_mm_items or limit of multimodal items per prompt
for all modalities is reached.
Note:
- This function operates on a per-request shallow copy of
`bucket_config` (tuple->float). The original dict passed to
`sample` is not mutated. If this ever changes, a test
is implemented and will fail.
"""
# Get the number of multimodal items to sample
request_num_mm_items
=
int
(
self
.
_rng
.
integers
(
min_num_mm_items
,
max_num_mm_items
+
1
)
)
# If request_num_mm_items is 0, yield an empty iterator
if
request_num_mm_items
==
0
:
return
# Initialize modality counters
modality_counter
=
{
self
.
map_config_to_modality
(
k
):
0
for
k
in
bucket_config
}
# Copy the bucket config to avoid modifying the original
bucket_config_copy
=
bucket_config
.
copy
()
# Loop over the number of multimodal items to sample
while
sum
(
modality_counter
.
values
())
<
request_num_mm_items
:
# Sample a multimodal item config
mm_item_config
=
self
.
_rng
.
choice
(
list
(
bucket_config_copy
.
keys
()),
p
=
list
(
bucket_config_copy
.
values
()))
modality
=
self
.
map_config_to_modality
(
mm_item_config
)
# Check that modality count is less than limit per prompt
if
modality_counter
[
modality
]
<
limit_mm_per_prompt
[
modality
]:
modality_counter
[
modality
]
+=
1
yield
(
mm_item_config
)
else
:
# If the counter is greater than the limit per prompt
# set all multimodal items of this modality to 0
for
k
,
v
in
bucket_config_copy
.
items
():
if
self
.
map_config_to_modality
(
k
)
==
modality
:
bucket_config_copy
[
k
]
=
0
# If all configs are 0, break the loop
# This should not happen as request_num_mm_items is at most
# the sum of limit_mm_per_prompt for all modalities
if
all
(
v
==
0
for
v
in
bucket_config_copy
.
values
()):
logger
.
warning
(
"Exhausted all multimodal items "
"of modality %s"
,
modality
)
break
# Renormalize the bucket config
bucket_config_copy
=
self
.
normalize_bucket_config
(
bucket_config_copy
)
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
request_id_prefix
:
str
=
""
,
prefix_len
:
int
=
RandomDataset
.
DEFAULT_PREFIX_LEN
,
range_ratio
:
float
=
RandomDataset
.
DEFAULT_RANGE_RATIO
,
input_len
:
int
=
RandomDataset
.
DEFAULT_INPUT_LEN
,
output_len
:
int
=
RandomDataset
.
DEFAULT_OUTPUT_LEN
,
limit_mm_per_prompt
:
dict
[
str
,
int
]
=
DEFAULT_LIMIT_MM_PER_PROMPT
,
base_items_per_request
:
int
=
DEFAULT_BASE_ITEMS_PER_REQUEST
,
num_mm_items_range_ratio
:
float
=
DEFAULT_NUM_MM_ITEMS_RANGE_RATIO
,
bucket_config
:
dict
[
tuple
[
int
,
int
,
int
],
float
]
=
DEFAULT_MM_ITEM_BUCKET_CONFIG
,
enable_multimodal_chat
:
bool
=
DEFAULT_ENABLE_MULTIMODAL_CHAT
,
**
kwargs
,
)
->
list
[
SampleRequest
]:
# NOTE: Video sampling is WIP. Raise error if video is in bucket config
# and probability is non-zero.
if
any
(
self
.
map_config_to_modality
(
cfg
)
==
"video"
and
p
>
0
for
cfg
,
p
in
bucket_config
.
items
()):
raise
NotImplementedError
(
"Video sampling not implemented; "
"set its probability to 0."
)
# Get the sampling parameters for the dataset
input_lens
,
output_lens
,
offsets
=
self
.
get_sampling_params
(
num_requests
,
range_ratio
,
input_len
,
output_len
,
tokenizer
)
(
min_num_mm_items
,
max_num_mm_items
,
limit_mm_per_prompt
,
bucket_config
,
)
=
self
.
get_mm_item_sampling_params
(
base_items_per_request
,
num_mm_items_range_ratio
,
limit_mm_per_prompt
,
bucket_config
,
)
# Generate prefix once
prefix_token_ids
=
self
.
get_prefix
(
tokenizer
,
prefix_len
)
vocab_size
=
tokenizer
.
vocab_size
# Add synthetic multimodal items to each request
mm_requests
=
[]
for
i
in
range
(
num_requests
):
for
i
in
range
(
num_requests
):
inner_seq
=
((
offsets
[
i
]
+
i
+
np
.
arange
(
input_lens
[
i
]))
%
prompt
,
total_input_len
=
self
.
generate_token_sequence
(
vocab_size
).
tolist
()
tokenizer
=
tokenizer
,
token_sequence
=
prefix_token_ids
+
inner_seq
prefix_token_ids
=
prefix_token_ids
,
prompt
=
tokenizer
.
decode
(
token_sequence
)
prefix_len
=
prefix_len
,
# After decoding the prompt we have to encode and decode it again.
vocab_size
=
vocab_size
,
# This is done because in some cases N consecutive tokens
input_len
=
int
(
input_lens
[
i
]),
# give a string tokenized into != N number of tokens.
offset
=
int
(
offsets
[
i
]),
# For example for GPT2Tokenizer:
index
=
i
,
# [6880, 6881] -> ['Ġcalls', 'here'] ->
)
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
# Get multimodal item iterator for a given request
# To avoid uncontrolled change of the prompt length,
mm_item_iterator
=
self
.
get_mm_item_iterator
(
# the encoded sequence is truncated before being decode again.
min_num_mm_items
,
total_input_len
=
prefix_len
+
int
(
input_lens
[
i
])
max_num_mm_items
,
re_encoded_sequence
=
tokenizer
.
encode
(
bucket_config
,
prompt
,
add_special_tokens
=
False
)[:
total_input_len
]
limit_mm_per_prompt
,
prompt
=
tokenizer
.
decode
(
re_encoded_sequence
)
)
total_input_len
=
len
(
re_encoded_sequence
)
requests
.
append
(
mm_content
=
cast
(
list
[
dict
[
str
,
Any
]],
[
SampleRequest
(
self
.
generate_mm_item
(
mm_item_config
)
for
mm_item_config
in
mm_item_iterator
])
if
enable_multimodal_chat
:
# NOTE: For now this option is only provided for completeness
# given that the serve.py benchmark currently does not use it.
mm_chat_prompt
:
Any
=
prompt
mm_chat_prompt
=
self
.
apply_multimodal_chat_transformation
(
prompt
,
mm_content
)
sample_request
=
SampleRequest
(
prompt
=
mm_chat_prompt
,
prompt_len
=
total_input_len
,
expected_output_len
=
int
(
output_lens
[
i
]),
multi_modal_data
=
None
,
request_id
=
request_id_prefix
+
str
(
i
),
)
else
:
sample_request
=
SampleRequest
(
prompt
=
prompt
,
prompt
=
prompt
,
prompt_len
=
total_input_len
,
prompt_len
=
total_input_len
,
expected_output_len
=
int
(
output_lens
[
i
]),
expected_output_len
=
int
(
output_lens
[
i
]),
))
multi_modal_data
=
mm_content
,
return
requests
request_id
=
request_id_prefix
+
str
(
i
),
)
mm_requests
.
append
(
sample_request
)
return
mm_requests
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# ShareGPT Dataset Implementation
# ShareGPT Dataset Implementation
...
@@ -432,9 +958,11 @@ class ShareGPTDataset(BenchmarkDataset):
...
@@ -432,9 +958,11 @@ class ShareGPTDataset(BenchmarkDataset):
max_loras
:
Optional
[
int
]
=
None
,
max_loras
:
Optional
[
int
]
=
None
,
output_len
:
Optional
[
int
]
=
None
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
**
kwargs
,
)
->
list
:
)
->
list
:
samples
:
list
=
[]
samples
:
list
=
[]
ind
=
0
for
entry
in
self
.
data
:
for
entry
in
self
.
data
:
if
len
(
samples
)
>=
num_requests
:
if
len
(
samples
)
>=
num_requests
:
break
break
...
@@ -455,9 +983,10 @@ class ShareGPTDataset(BenchmarkDataset):
...
@@ -455,9 +983,10 @@ class ShareGPTDataset(BenchmarkDataset):
skip_min_output_len_check
=
output_len
skip_min_output_len_check
=
output_len
is
not
None
):
is
not
None
):
continue
continue
# TODO: Also support ShareGPT4Video.
if
image_path
:
=
entry
.
get
(
"image"
):
if
image_path
:
=
entry
.
get
(
"image"
):
mm_content
=
process_image
(
image_path
)
mm_content
=
process_image
(
image_path
)
elif
video_path
:
=
entry
.
get
(
"video"
):
mm_content
=
process_video
(
video_path
)
else
:
else
:
mm_content
=
None
mm_content
=
None
if
enable_multimodal_chat
:
if
enable_multimodal_chat
:
...
@@ -470,8 +999,10 @@ class ShareGPTDataset(BenchmarkDataset):
...
@@ -470,8 +999,10 @@ class ShareGPTDataset(BenchmarkDataset):
expected_output_len
=
new_output_len
,
expected_output_len
=
new_output_len
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
multi_modal_data
=
mm_content
,
multi_modal_data
=
mm_content
,
request_id
=
request_id_prefix
+
str
(
ind
),
))
))
self
.
maybe_oversample_requests
(
samples
,
num_requests
)
ind
+=
1
self
.
maybe_oversample_requests
(
samples
,
num_requests
,
request_id_prefix
)
return
samples
return
samples
...
@@ -488,8 +1019,8 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
...
@@ -488,8 +1019,8 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
type
=
str
,
type
=
str
,
default
=
"random"
,
default
=
"random"
,
choices
=
[
choices
=
[
"sharegpt"
,
"burstgpt"
,
"sonnet"
,
"random"
,
"
hf"
,
"custom
"
,
"sharegpt"
,
"burstgpt"
,
"sonnet"
,
"random"
,
"
random-mm"
,
"hf
"
,
"prefix_repetition"
"custom"
,
"prefix_repetition"
],
],
help
=
"Name of the dataset to benchmark on."
,
help
=
"Name of the dataset to benchmark on."
,
)
)
...
@@ -589,6 +1120,103 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
...
@@ -589,6 +1120,103 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
"context length sampled from [input_len * (1 - range_ratio), "
"context length sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]."
),
"input_len * (1 + range_ratio)]."
),
)
)
random_group
.
add_argument
(
"--random-batch-size"
,
type
=
int
,
default
=
1
,
help
=
(
"Batch size for random sampling. "
"Only used for embeddings benchmark."
),
)
# random multimodal dataset options
random_mm_group
=
parser
.
add_argument_group
(
"random multimodal dataset options extended from random dataset"
)
random_mm_group
.
add_argument
(
"--random-mm-base-items-per-request"
,
type
=
int
,
default
=
RandomMultiModalDataset
.
DEFAULT_BASE_ITEMS_PER_REQUEST
,
help
=
(
"Base number of multimodal items per request for random-mm. "
"Actual per-request count is sampled around this base using "
"--random-mm-num-mm-items-range-ratio."
),
)
random_mm_group
.
add_argument
(
"--random-mm-num-mm-items-range-ratio"
,
type
=
float
,
default
=
RandomMultiModalDataset
.
DEFAULT_NUM_MM_ITEMS_RANGE_RATIO
,
help
=
(
"Range ratio r in [0, 1] for sampling items per request. "
"We sample uniformly from the closed integer range "
"[floor(n*(1-r)), ceil(n*(1+r))] "
"where n is the base items per request. "
"r=0 keeps it fixed; r=1 allows 0 items. The maximum is clamped "
"to the sum of per-modality limits from "
"--random-mm-limit-mm-per-prompt. "
"An error is raised if the computed min exceeds the max."
),
)
random_mm_group
.
add_argument
(
"--random-mm-limit-mm-per-prompt"
,
type
=
json
.
loads
,
default
=
RandomMultiModalDataset
.
DEFAULT_LIMIT_MM_PER_PROMPT
,
help
=
(
"Per-modality hard caps for items attached per request, e.g. "
"'{
\"
image
\"
: 3,
\"
video
\"
: 0}'. The sampled per-request item "
"count is clamped to the sum of these limits. When a modality "
"reaches its cap, its buckets are excluded and probabilities are "
"renormalized."
"OBS.: Only image sampling is supported for now."
),
)
def
_parse_mm_bucket_config
(
v
:
object
)
->
dict
[
tuple
[
int
,
int
,
int
],
float
]:
# If already a dict (e.g., programmatic call), normalize keys
def
normalize
(
d
:
dict
)
->
dict
[
tuple
[
int
,
int
,
int
],
float
]:
out
:
dict
[
tuple
[
int
,
int
,
int
],
float
]
=
{}
for
k
,
val
in
d
.
items
():
key
=
k
if
isinstance
(
key
,
str
):
with
suppress
(
Exception
):
key
=
ast
.
literal_eval
(
key
)
if
not
(
isinstance
(
key
,
tuple
)
and
len
(
key
)
==
3
and
all
(
isinstance
(
x
,
int
)
for
x
in
key
)):
raise
ValueError
(
f
"Invalid bucket key
{
k
!
r
}
. Expected tuple (H, W, T)."
)
out
[(
int
(
key
[
0
]),
int
(
key
[
1
]),
int
(
key
[
2
]))]
=
float
(
val
)
return
out
if
isinstance
(
v
,
dict
):
return
normalize
(
v
)
if
isinstance
(
v
,
str
):
# Python literal (supports tuple keys)
parsed
=
ast
.
literal_eval
(
v
)
if
not
isinstance
(
parsed
,
dict
):
raise
ValueError
(
"Bucket config must parse to a dict."
)
return
normalize
(
parsed
)
raise
ValueError
(
"Unsupported value for --random-mm-bucket-config."
)
random_mm_group
.
add_argument
(
"--random-mm-bucket-config"
,
type
=
_parse_mm_bucket_config
,
default
=
RandomMultiModalDataset
.
DEFAULT_MM_ITEM_BUCKET_CONFIG
,
help
=
(
"The bucket config is a dictionary mapping a multimodal item"
"sampling configuration to a probability."
"Currently allows for 2 modalities: images and videos. "
"An bucket key is a tuple of (height, width, num_frames)"
"The value is the probability of sampling that specific item. "
"Example: "
"--random-mm-bucket-config "
"{(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.10} "
"First item: images with resolution 256x256 w.p. 0.5"
"Second item: images with resolution 720x1280 w.p. 0.4 "
"Third item: videos with resolution 720x1280 and 16 frames w.p. 0.1"
"OBS.: If the probabilities do not sum to 1, they are normalized."
"OBS bis.: Only image sampling is supported for now."
),
)
hf_group
=
parser
.
add_argument_group
(
"hf dataset options"
)
hf_group
=
parser
.
add_argument_group
(
"hf dataset options"
)
hf_group
.
add_argument
(
"--hf-subset"
,
hf_group
.
add_argument
(
"--hf-subset"
,
...
@@ -647,6 +1275,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
...
@@ -647,6 +1275,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
output_len
=
args
.
custom_output_len
,
output_len
=
args
.
custom_output_len
,
skip_chat_template
=
args
.
custom_skip_chat_template
,
skip_chat_template
=
args
.
custom_skip_chat_template
,
request_id_prefix
=
args
.
request_id_prefix
,
)
)
elif
args
.
dataset_name
==
"sonnet"
:
elif
args
.
dataset_name
==
"sonnet"
:
...
@@ -660,6 +1289,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
...
@@ -660,6 +1289,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
prefix_len
=
args
.
sonnet_prefix_len
,
prefix_len
=
args
.
sonnet_prefix_len
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
return_prompt_formatted
=
False
,
return_prompt_formatted
=
False
,
request_id_prefix
=
args
.
request_id_prefix
,
)
)
else
:
else
:
assert
tokenizer
.
chat_template
or
tokenizer
.
default_chat_template
,
(
assert
tokenizer
.
chat_template
or
tokenizer
.
default_chat_template
,
(
...
@@ -671,6 +1301,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
...
@@ -671,6 +1301,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
prefix_len
=
args
.
sonnet_prefix_len
,
prefix_len
=
args
.
sonnet_prefix_len
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
return_prompt_formatted
=
True
,
return_prompt_formatted
=
True
,
request_id_prefix
=
args
.
request_id_prefix
,
)
)
elif
args
.
dataset_name
==
"hf"
:
elif
args
.
dataset_name
==
"hf"
:
...
@@ -716,10 +1347,11 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
...
@@ -716,10 +1347,11 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
"openai-chat"
,
"openai-chat"
,
"openai-audio"
,
"openai-audio"
,
]:
]:
# multi-modal benchmark is only available on OpenAI Chat backend.
# multi-modal benchmark is only available on OpenAI Chat
# endpoint-type.
raise
ValueError
(
raise
ValueError
(
"Multi-modal content is only supported on 'openai-chat' and "
"Multi-modal content is only supported on 'openai-chat' and "
"'openai-audio'
backend
."
)
"'openai-audio'
endpoint-type
."
)
input_requests
=
dataset_class
(
input_requests
=
dataset_class
(
dataset_path
=
args
.
dataset_path
,
dataset_path
=
args
.
dataset_path
,
dataset_subset
=
args
.
hf_subset
,
dataset_subset
=
args
.
hf_subset
,
...
@@ -730,31 +1362,54 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
...
@@ -730,31 +1362,54 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
num_requests
=
args
.
num_prompts
,
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
output_len
=
args
.
hf_output_len
,
output_len
=
args
.
hf_output_len
,
request_id_prefix
=
args
.
request_id_prefix
,
)
)
else
:
else
:
# For datasets that follow a similar structure, use a mapping.
# For datasets that follow a similar structure, use a mapping.
dataset_mapping
=
{
dataset_mapping
=
{
"sharegpt"
:
"sharegpt"
:
lambda
:
ShareGPTDataset
(
lambda
:
ShareGPTDataset
(
random_seed
=
args
.
seed
,
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
dataset_path
=
args
.
dataset_path
).
sample
(
).
sample
(
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
num_requests
=
args
.
num_prompts
,
output_len
=
args
.
sharegpt_output_len
,
output_len
=
args
.
sharegpt_output_len
,
),
request_id_prefix
=
args
.
request_id_prefix
,
"burstgpt"
:
),
lambda
:
BurstGPTDataset
(
random_seed
=
args
.
seed
,
"burstgpt"
:
lambda
:
BurstGPTDataset
(
dataset_path
=
args
.
dataset_path
).
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
),
).
sample
(
"random"
:
tokenizer
=
tokenizer
,
lambda
:
RandomDataset
(
random_seed
=
args
.
seed
,
num_requests
=
args
.
num_prompts
,
dataset_path
=
args
.
dataset_path
).
sample
(
request_id_prefix
=
args
.
request_id_prefix
,
),
"random"
:
lambda
:
RandomDataset
(
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
).
sample
(
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
num_requests
=
args
.
num_prompts
,
prefix_len
=
args
.
random_prefix_len
,
prefix_len
=
args
.
random_prefix_len
,
input_len
=
args
.
random_input_len
,
input_len
=
args
.
random_input_len
,
output_len
=
args
.
random_output_len
,
output_len
=
args
.
random_output_len
,
range_ratio
=
args
.
random_range_ratio
,
range_ratio
=
args
.
random_range_ratio
,
request_id_prefix
=
args
.
request_id_prefix
,
batchsize
=
args
.
random_batch_size
,
),
"random-mm"
:
lambda
:
RandomMultiModalDataset
(
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
).
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
prefix_len
=
args
.
random_prefix_len
,
range_ratio
=
args
.
random_range_ratio
,
input_len
=
args
.
random_input_len
,
output_len
=
args
.
random_output_len
,
base_items_per_request
=
args
.
random_mm_base_items_per_request
,
limit_mm_per_prompt
=
args
.
random_mm_limit_mm_per_prompt
,
num_mm_items_range_ratio
=
args
.
random_mm_num_mm_items_range_ratio
,
bucket_config
=
args
.
random_mm_bucket_config
,
request_id_prefix
=
args
.
request_id_prefix
,
),
),
"prefix_repetition"
:
"prefix_repetition"
:
lambda
:
PrefixRepetitionRandomDataset
(
lambda
:
PrefixRepetitionRandomDataset
(
...
@@ -766,10 +1421,18 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
...
@@ -766,10 +1421,18 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
suffix_len
=
args
.
prefix_repetition_suffix_len
,
suffix_len
=
args
.
prefix_repetition_suffix_len
,
num_prefixes
=
args
.
prefix_repetition_num_prefixes
,
num_prefixes
=
args
.
prefix_repetition_num_prefixes
,
output_len
=
args
.
prefix_repetition_output_len
,
output_len
=
args
.
prefix_repetition_output_len
,
request_id_prefix
=
args
.
request_id_prefix
,
),
),
}
}
try
:
try
:
# Enforce endpoint compatibility for multimodal datasets.
if
args
.
dataset_name
==
"random-mm"
and
args
.
endpoint_type
not
in
[
"openai-chat"
]:
raise
ValueError
(
"Multi-modal content (images) is only supported on "
"'openai-chat' backend."
)
input_requests
=
dataset_mapping
[
args
.
dataset_name
]()
input_requests
=
dataset_mapping
[
args
.
dataset_name
]()
except
KeyError
as
err
:
except
KeyError
as
err
:
raise
ValueError
(
f
"Unknown dataset:
{
args
.
dataset_name
}
"
)
from
err
raise
ValueError
(
f
"Unknown dataset:
{
args
.
dataset_name
}
"
)
from
err
...
@@ -839,10 +1502,11 @@ class CustomDataset(BenchmarkDataset):
...
@@ -839,10 +1502,11 @@ class CustomDataset(BenchmarkDataset):
output_len
:
Optional
[
int
]
=
None
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
enable_multimodal_chat
:
bool
=
False
,
skip_chat_template
:
bool
=
False
,
skip_chat_template
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
**
kwargs
,
)
->
list
:
)
->
list
:
sampled_requests
=
[]
sampled_requests
=
[]
for
item
in
self
.
data
:
for
i
,
item
in
enumerate
(
self
.
data
)
:
if
len
(
sampled_requests
)
>=
num_requests
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
break
prompt
=
item
[
"prompt"
]
prompt
=
item
[
"prompt"
]
...
@@ -864,8 +1528,10 @@ class CustomDataset(BenchmarkDataset):
...
@@ -864,8 +1528,10 @@ class CustomDataset(BenchmarkDataset):
prompt
=
prompt
,
prompt
=
prompt
,
prompt_len
=
prompt_len
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
expected_output_len
=
output_len
,
request_id
=
request_id_prefix
+
str
(
i
),
))
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
return
sampled_requests
...
@@ -909,6 +1575,7 @@ class SonnetDataset(BenchmarkDataset):
...
@@ -909,6 +1575,7 @@ class SonnetDataset(BenchmarkDataset):
input_len
:
int
=
DEFAULT_INPUT_LEN
,
input_len
:
int
=
DEFAULT_INPUT_LEN
,
output_len
:
int
=
DEFAULT_OUTPUT_LEN
,
output_len
:
int
=
DEFAULT_OUTPUT_LEN
,
return_prompt_formatted
:
bool
=
False
,
return_prompt_formatted
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
**
kwargs
,
)
->
list
:
)
->
list
:
# Calculate average token length for a poem line.
# Calculate average token length for a poem line.
...
@@ -934,6 +1601,7 @@ class SonnetDataset(BenchmarkDataset):
...
@@ -934,6 +1601,7 @@ class SonnetDataset(BenchmarkDataset):
prefix_lines
=
self
.
data
[:
num_prefix_lines
]
prefix_lines
=
self
.
data
[:
num_prefix_lines
]
samples
=
[]
samples
=
[]
ind
=
0
while
len
(
samples
)
<
num_requests
:
while
len
(
samples
)
<
num_requests
:
extra_lines
=
random
.
choices
(
self
.
data
,
extra_lines
=
random
.
choices
(
self
.
data
,
k
=
num_input_lines
-
num_prefix_lines
)
k
=
num_input_lines
-
num_prefix_lines
)
...
@@ -949,7 +1617,9 @@ class SonnetDataset(BenchmarkDataset):
...
@@ -949,7 +1617,9 @@ class SonnetDataset(BenchmarkDataset):
if
return_prompt_formatted
else
prompt
,
if
return_prompt_formatted
else
prompt
,
prompt_len
=
prompt_len
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
expected_output_len
=
output_len
,
request_id
=
request_id_prefix
+
str
(
ind
),
))
))
ind
+=
1
return
samples
return
samples
...
@@ -1000,6 +1670,7 @@ class BurstGPTDataset(BenchmarkDataset):
...
@@ -1000,6 +1670,7 @@ class BurstGPTDataset(BenchmarkDataset):
num_requests
:
int
,
num_requests
:
int
,
max_loras
:
Optional
[
int
]
=
None
,
max_loras
:
Optional
[
int
]
=
None
,
lora_path
:
Optional
[
str
]
=
None
,
lora_path
:
Optional
[
str
]
=
None
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
**
kwargs
,
)
->
list
[
SampleRequest
]:
)
->
list
[
SampleRequest
]:
samples
=
[]
samples
=
[]
...
@@ -1020,6 +1691,7 @@ class BurstGPTDataset(BenchmarkDataset):
...
@@ -1020,6 +1691,7 @@ class BurstGPTDataset(BenchmarkDataset):
prompt_len
=
input_len
,
prompt_len
=
input_len
,
expected_output_len
=
output_len
,
expected_output_len
=
output_len
,
lora_request
=
lora_req
,
lora_request
=
lora_req
,
request_id
=
request_id_prefix
+
str
(
i
),
))
))
return
samples
return
samples
...
@@ -1075,11 +1747,13 @@ class ConversationDataset(HuggingFaceDataset):
...
@@ -1075,11 +1747,13 @@ class ConversationDataset(HuggingFaceDataset):
num_requests
:
int
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
)
->
list
:
**
kwargs
)
->
list
:
# Filter examples with at least 2 conversations
# Filter examples with at least 2 conversations
filtered_data
=
self
.
data
.
filter
(
filtered_data
=
self
.
data
.
filter
(
lambda
x
:
len
(
x
[
"conversations"
])
>=
2
)
lambda
x
:
len
(
x
[
"conversations"
])
>=
2
)
sampled_requests
=
[]
sampled_requests
=
[]
ind
=
0
dynamic_output
=
output_len
is
None
dynamic_output
=
output_len
is
None
for
item
in
filtered_data
:
for
item
in
filtered_data
:
...
@@ -1111,8 +1785,11 @@ class ConversationDataset(HuggingFaceDataset):
...
@@ -1111,8 +1785,11 @@ class ConversationDataset(HuggingFaceDataset):
prompt_len
=
prompt_len
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
mm_content
,
multi_modal_data
=
mm_content
,
request_id
=
request_id_prefix
+
str
(
ind
),
))
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
ind
+=
1
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
return
sampled_requests
...
@@ -1141,12 +1818,13 @@ class VisionArenaDataset(HuggingFaceDataset):
...
@@ -1141,12 +1818,13 @@ class VisionArenaDataset(HuggingFaceDataset):
num_requests
:
int
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
**
kwargs
,
)
->
list
:
)
->
list
:
output_len
=
(
output_len
output_len
=
(
output_len
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
)
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
)
sampled_requests
=
[]
sampled_requests
=
[]
for
item
in
self
.
data
:
for
i
,
item
in
enumerate
(
self
.
data
)
:
if
len
(
sampled_requests
)
>=
num_requests
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
break
parser_fn
=
self
.
SUPPORTED_DATASET_PATHS
.
get
(
self
.
dataset_path
)
parser_fn
=
self
.
SUPPORTED_DATASET_PATHS
.
get
(
self
.
dataset_path
)
...
@@ -1168,8 +1846,10 @@ class VisionArenaDataset(HuggingFaceDataset):
...
@@ -1168,8 +1846,10 @@ class VisionArenaDataset(HuggingFaceDataset):
prompt_len
=
prompt_len
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
mm_content
,
multi_modal_data
=
mm_content
,
request_id
=
request_id_prefix
+
str
(
i
),
))
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
return
sampled_requests
...
@@ -1198,15 +1878,18 @@ class InstructCoderDataset(HuggingFaceDataset):
...
@@ -1198,15 +1878,18 @@ class InstructCoderDataset(HuggingFaceDataset):
num_requests
:
int
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
)
->
list
:
**
kwargs
)
->
list
:
output_len
=
(
output_len
output_len
=
(
output_len
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
)
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
)
sampled_requests
=
[]
sampled_requests
=
[]
for
item
in
self
.
data
:
for
i
,
item
in
enumerate
(
self
.
data
)
:
if
len
(
sampled_requests
)
>=
num_requests
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
break
prompt
=
f
"
{
item
[
'input'
]
}
\n\n
{
item
[
'instruction'
]
}
Just output
\
prompt
=
(
the code, do not include any explanation."
f
"
{
item
[
'input'
]
}
\n\n
{
item
[
'instruction'
]
}
Just output "
"the code, do not include any explanation."
)
# apply template
# apply template
prompt
=
tokenizer
.
apply_chat_template
(
prompt
=
tokenizer
.
apply_chat_template
(
...
@@ -1224,8 +1907,10 @@ class InstructCoderDataset(HuggingFaceDataset):
...
@@ -1224,8 +1907,10 @@ class InstructCoderDataset(HuggingFaceDataset):
prompt
=
prompt
,
prompt
=
prompt
,
prompt_len
=
prompt_len
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
expected_output_len
=
output_len
,
request_id
=
request_id_prefix
+
str
(
i
),
))
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
return
sampled_requests
...
@@ -1255,13 +1940,14 @@ class MTBenchDataset(HuggingFaceDataset):
...
@@ -1255,13 +1940,14 @@ class MTBenchDataset(HuggingFaceDataset):
num_requests
:
int
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
**
kwargs
,
)
->
list
:
)
->
list
:
output_len
=
(
output_len
output_len
=
(
output_len
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
)
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
)
sampled_requests
=
[]
sampled_requests
=
[]
for
item
in
self
.
data
:
for
i
,
item
in
enumerate
(
self
.
data
)
:
if
len
(
sampled_requests
)
>=
num_requests
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
break
prompt
=
item
[
"turns"
][
0
]
prompt
=
item
[
"turns"
][
0
]
...
@@ -1282,8 +1968,10 @@ class MTBenchDataset(HuggingFaceDataset):
...
@@ -1282,8 +1968,10 @@ class MTBenchDataset(HuggingFaceDataset):
prompt
=
prompt
,
prompt
=
prompt
,
prompt_len
=
prompt_len
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
expected_output_len
=
output_len
,
request_id
=
request_id_prefix
+
str
(
i
),
))
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
return
sampled_requests
...
@@ -1305,8 +1993,10 @@ class AIMODataset(HuggingFaceDataset):
...
@@ -1305,8 +1993,10 @@ class AIMODataset(HuggingFaceDataset):
tokenizer
:
PreTrainedTokenizerBase
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
output_len
:
Optional
[
int
]
=
None
,
request_id_prefix
:
str
=
""
,
**
kwargs
)
->
list
:
**
kwargs
)
->
list
:
sampled_requests
=
[]
sampled_requests
=
[]
ind
=
0
dynamic_output
=
output_len
is
None
dynamic_output
=
output_len
is
None
for
item
in
self
.
data
:
for
item
in
self
.
data
:
...
@@ -1331,8 +2021,12 @@ class AIMODataset(HuggingFaceDataset):
...
@@ -1331,8 +2021,12 @@ class AIMODataset(HuggingFaceDataset):
prompt_len
=
prompt_len
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
None
,
multi_modal_data
=
None
,
request_id
=
request_id_prefix
+
str
(
ind
),
))
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
ind
+=
1
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
return
sampled_requests
...
@@ -1403,13 +2097,14 @@ class NextEditPredictionDataset(HuggingFaceDataset):
...
@@ -1403,13 +2097,14 @@ class NextEditPredictionDataset(HuggingFaceDataset):
}
}
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
request_id_prefix
:
str
=
""
,
**
kwargs
):
**
kwargs
):
formatting_prompt_func
=
self
.
MAPPING_PROMPT_FUNCS
.
get
(
formatting_prompt_func
=
self
.
MAPPING_PROMPT_FUNCS
.
get
(
self
.
dataset_path
)
self
.
dataset_path
)
if
formatting_prompt_func
is
None
:
if
formatting_prompt_func
is
None
:
raise
ValueError
(
f
"Unsupported dataset path:
{
self
.
dataset_path
}
"
)
raise
ValueError
(
f
"Unsupported dataset path:
{
self
.
dataset_path
}
"
)
samples
=
[]
samples
=
[]
for
sample
in
self
.
data
:
for
i
,
sample
in
enumerate
(
self
.
data
)
:
sample
=
formatting_prompt_func
(
sample
)
sample
=
formatting_prompt_func
(
sample
)
samples
.
append
(
samples
.
append
(
SampleRequest
(
SampleRequest
(
...
@@ -1417,10 +2112,11 @@ class NextEditPredictionDataset(HuggingFaceDataset):
...
@@ -1417,10 +2112,11 @@ class NextEditPredictionDataset(HuggingFaceDataset):
prompt_len
=
len
(
tokenizer
(
sample
[
"prompt"
]).
input_ids
),
prompt_len
=
len
(
tokenizer
(
sample
[
"prompt"
]).
input_ids
),
expected_output_len
=
len
(
expected_output_len
=
len
(
tokenizer
(
sample
[
"expected_output"
]).
input_ids
),
tokenizer
(
sample
[
"expected_output"
]).
input_ids
),
request_id
=
request_id_prefix
+
str
(
i
),
))
))
if
len
(
samples
)
>=
num_requests
:
if
len
(
samples
)
>=
num_requests
:
break
break
self
.
maybe_oversample_requests
(
samples
,
num_requests
)
self
.
maybe_oversample_requests
(
samples
,
num_requests
,
request_id_prefix
)
return
samples
return
samples
...
@@ -1470,6 +2166,7 @@ class ASRDataset(HuggingFaceDataset):
...
@@ -1470,6 +2166,7 @@ class ASRDataset(HuggingFaceDataset):
tokenizer
:
PreTrainedTokenizerBase
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
output_len
:
Optional
[
int
]
=
None
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
**
kwargs
,
)
->
list
:
)
->
list
:
output_len
=
(
output_len
output_len
=
(
output_len
...
@@ -1477,6 +2174,7 @@ class ASRDataset(HuggingFaceDataset):
...
@@ -1477,6 +2174,7 @@ class ASRDataset(HuggingFaceDataset):
prompt
=
ASRDataset
.
TRANSCRIPTION_PREAMBLE
prompt
=
ASRDataset
.
TRANSCRIPTION_PREAMBLE
prompt_len
=
len
(
tokenizer
(
prompt
).
input_ids
)
prompt_len
=
len
(
tokenizer
(
prompt
).
input_ids
)
sampled_requests
=
[]
sampled_requests
=
[]
ind
=
0
skipped
=
0
skipped
=
0
for
item
in
self
.
data
:
for
item
in
self
.
data
:
if
len
(
sampled_requests
)
>=
num_requests
:
if
len
(
sampled_requests
)
>=
num_requests
:
...
@@ -1496,7 +2194,9 @@ class ASRDataset(HuggingFaceDataset):
...
@@ -1496,7 +2194,9 @@ class ASRDataset(HuggingFaceDataset):
prompt_len
=
prompt_len
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
mm_content
,
multi_modal_data
=
mm_content
,
request_id
=
request_id_prefix
+
str
(
ind
),
))
))
ind
+=
1
if
skipped
:
if
skipped
:
logger
.
warning
(
logger
.
warning
(
"%d samples discarded from dataset due to"
"%d samples discarded from dataset due to"
...
@@ -1504,7 +2204,8 @@ class ASRDataset(HuggingFaceDataset):
...
@@ -1504,7 +2204,8 @@ class ASRDataset(HuggingFaceDataset):
" what Whisper supports."
,
" what Whisper supports."
,
skipped
,
skipped
,
)
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
return
sampled_requests
...
@@ -1541,11 +2242,13 @@ class MLPerfDataset(HuggingFaceDataset):
...
@@ -1541,11 +2242,13 @@ class MLPerfDataset(HuggingFaceDataset):
tokenizer
:
PreTrainedTokenizerBase
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
output_len
:
Optional
[
int
]
=
None
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
**
kwargs
,
)
->
list
[
SampleRequest
]:
)
->
list
[
SampleRequest
]:
# Force dynamic output length based on reference completion.
# Force dynamic output length based on reference completion.
dynamic_output
=
output_len
is
None
dynamic_output
=
output_len
is
None
sampled_requests
:
list
[
SampleRequest
]
=
[]
sampled_requests
:
list
[
SampleRequest
]
=
[]
ind
=
0
for
item
in
self
.
data
:
for
item
in
self
.
data
:
if
len
(
sampled_requests
)
>=
num_requests
:
if
len
(
sampled_requests
)
>=
num_requests
:
...
@@ -1580,10 +2283,13 @@ class MLPerfDataset(HuggingFaceDataset):
...
@@ -1580,10 +2283,13 @@ class MLPerfDataset(HuggingFaceDataset):
prompt
=
prompt_formatted
,
prompt
=
prompt_formatted
,
prompt_len
=
prompt_len
,
prompt_len
=
prompt_len
,
expected_output_len
=
expected_output_len
,
expected_output_len
=
expected_output_len
,
request_id
=
request_id_prefix
+
str
(
ind
),
)
)
)
)
ind
+=
1
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
return
sampled_requests
...
@@ -1616,6 +2322,7 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
...
@@ -1616,6 +2322,7 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
suffix_len
:
int
=
DEFAULT_SUFFIX_LEN
,
suffix_len
:
int
=
DEFAULT_SUFFIX_LEN
,
num_prefixes
:
int
=
DEFAULT_NUM_PREFIXES
,
num_prefixes
:
int
=
DEFAULT_NUM_PREFIXES
,
output_len
:
int
=
DEFAULT_OUTPUT_LEN
,
output_len
:
int
=
DEFAULT_OUTPUT_LEN
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
**
kwargs
,
)
->
list
[
SampleRequest
]:
)
->
list
[
SampleRequest
]:
vocab_size
=
tokenizer
.
vocab_size
vocab_size
=
tokenizer
.
vocab_size
...
...
vllm/benchmarks/lib/endpoint_request_func.py
View file @
a99300bd
...
@@ -9,7 +9,7 @@ import sys
...
@@ -9,7 +9,7 @@ import sys
import
time
import
time
import
traceback
import
traceback
from
dataclasses
import
dataclass
,
field
from
dataclasses
import
dataclass
,
field
from
typing
import
Optional
from
typing
import
Optional
,
Union
import
aiohttp
import
aiohttp
from
tqdm.asyncio
import
tqdm
from
tqdm.asyncio
import
tqdm
...
@@ -28,9 +28,10 @@ class RequestFuncInput:
...
@@ -28,9 +28,10 @@ class RequestFuncInput:
model_name
:
Optional
[
str
]
=
None
model_name
:
Optional
[
str
]
=
None
logprobs
:
Optional
[
int
]
=
None
logprobs
:
Optional
[
int
]
=
None
extra_body
:
Optional
[
dict
]
=
None
extra_body
:
Optional
[
dict
]
=
None
multi_modal_content
:
Optional
[
dict
|
list
[
dict
]]
=
None
multi_modal_content
:
Optional
[
Union
[
dict
,
list
[
dict
]]
]
=
None
ignore_eos
:
bool
=
False
ignore_eos
:
bool
=
False
language
:
Optional
[
str
]
=
None
language
:
Optional
[
str
]
=
None
request_id
:
Optional
[
str
]
=
None
@
dataclass
@
dataclass
...
@@ -68,8 +69,8 @@ async def async_request_openai_completions(
...
@@ -68,8 +69,8 @@ async def async_request_openai_completions(
),
"OpenAI Completions API URL must end with 'completions' or 'profile'."
),
"OpenAI Completions API URL must end with 'completions' or 'profile'."
payload
=
{
payload
=
{
"model"
:
request_func_input
.
model_name
\
"model"
:
request_func_input
.
model_name
if
request_func_input
.
model_name
else
request_func_input
.
model
,
if
request_func_input
.
model_name
else
request_func_input
.
model
,
"prompt"
:
request_func_input
.
prompt
,
"prompt"
:
request_func_input
.
prompt
,
"temperature"
:
0.0
,
"temperature"
:
0.0
,
"repetition_penalty"
:
1.0
,
"repetition_penalty"
:
1.0
,
...
@@ -87,6 +88,8 @@ async def async_request_openai_completions(
...
@@ -87,6 +88,8 @@ async def async_request_openai_completions(
headers
=
{
headers
=
{
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
}
}
if
request_func_input
.
request_id
:
headers
[
"x-request-id"
]
=
request_func_input
.
request_id
output
=
RequestFuncOutput
()
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
output
.
prompt_len
=
request_func_input
.
prompt_len
...
@@ -132,7 +135,7 @@ async def async_request_openai_completions(
...
@@ -132,7 +135,7 @@ async def async_request_openai_completions(
# Decoding phase
# Decoding phase
else
:
else
:
output
.
itl
.
append
(
timestamp
-
output
.
itl
.
append
(
timestamp
-
most_recent_timestamp
)
most_recent_timestamp
)
most_recent_timestamp
=
timestamp
most_recent_timestamp
=
timestamp
generated_text
+=
text
or
""
generated_text
+=
text
or
""
...
@@ -210,6 +213,8 @@ async def async_request_openai_chat_completions(
...
@@ -210,6 +213,8 @@ async def async_request_openai_chat_completions(
"Content-Type"
:
"application/json"
,
"Content-Type"
:
"application/json"
,
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
}
}
if
request_func_input
.
request_id
:
headers
[
"x-request-id"
]
=
request_func_input
.
request_id
output
=
RequestFuncOutput
()
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
output
.
prompt_len
=
request_func_input
.
prompt_len
...
@@ -249,7 +254,7 @@ async def async_request_openai_chat_completions(
...
@@ -249,7 +254,7 @@ async def async_request_openai_chat_completions(
# Decoding phase
# Decoding phase
else
:
else
:
output
.
itl
.
append
(
timestamp
-
output
.
itl
.
append
(
timestamp
-
most_recent_timestamp
)
most_recent_timestamp
)
generated_text
+=
content
or
""
generated_text
+=
content
or
""
elif
usage
:
=
data
.
get
(
"usage"
):
elif
usage
:
=
data
.
get
(
"usage"
):
...
@@ -311,6 +316,8 @@ async def async_request_openai_audio(
...
@@ -311,6 +316,8 @@ async def async_request_openai_audio(
headers
=
{
headers
=
{
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
}
}
if
request_func_input
.
request_id
:
headers
[
"x-request-id"
]
=
request_func_input
.
request_id
# Send audio file
# Send audio file
def
to_bytes
(
y
,
sr
):
def
to_bytes
(
y
,
sr
):
...
@@ -387,12 +394,61 @@ async def async_request_openai_audio(
...
@@ -387,12 +394,61 @@ async def async_request_openai_audio(
return
output
return
output
async def async_request_openai_embeddings(
    request_func_input: RequestFuncInput,
    session: aiohttp.ClientSession,
    pbar: Optional[tqdm] = None,
):
    """Send one request to an OpenAI-compatible embeddings endpoint.

    Args:
        request_func_input: Request description; this function reads
            ``api_url``, ``model``, ``model_name``, ``prompt`` and
            ``request_id`` from it.
        session: Client session used to issue the POST request.
        pbar: Optional progress bar, advanced by one when the request
            finishes (successfully or not).

    Returns:
        A ``RequestFuncOutput`` whose ``success``, ``latency``,
        ``prompt_len`` and ``error`` fields describe the outcome.

    Raises:
        AssertionError: If ``api_url`` does not end with ``embeddings``.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith(
        "embeddings"
    ), "OpenAI Embeddings API URL must end with 'embeddings'."

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
    }
    # Forward the per-request id like the other request functions do, so
    # that server-side logs can be correlated with benchmark requests.
    if request_func_input.request_id:
        headers["x-request-id"] = request_func_input.request_id

    payload = {
        # Prefer the served model name when one is given, mirroring the
        # completions request payload.
        "model": request_func_input.model_name
        if request_func_input.model_name else request_func_input.model,
        "input": request_func_input.prompt,
    }

    output = RequestFuncOutput()
    st = time.perf_counter()
    try:
        async with session.post(url=api_url, headers=headers,
                                json=payload) as response:
            if response.status == 200:
                # Latency is taken when the response status is available,
                # before the body is decoded.
                output.latency = time.perf_counter() - st
                data = await response.json()
                output.success = True
                output.generated_text = ""
                # Tolerate a missing or null "usage" object instead of
                # turning a successful request into a failure.
                usage = data.get("usage") or {}
                output.prompt_len = usage.get("prompt_tokens", 0)
            else:
                output.success = False
                output.error = response.reason or ""
    except Exception as e:
        # Best effort: record the failure on the output rather than raising
        # out of the request function.
        output.success = False
        output.error = str(e)

    if pbar:
        pbar.update(1)
    return output
# TODO: Add more request functions for different API protocols.
# TODO: Add more request functions for different API protocols.
ASYNC_REQUEST_FUNCS
=
{
ASYNC_REQUEST_FUNCS
=
{
"vllm"
:
async_request_openai_completions
,
"vllm"
:
async_request_openai_completions
,
"openai"
:
async_request_openai_completions
,
"openai"
:
async_request_openai_completions
,
"openai-chat"
:
async_request_openai_chat_completions
,
"openai-chat"
:
async_request_openai_chat_completions
,
"openai-audio"
:
async_request_openai_audio
,
"openai-audio"
:
async_request_openai_audio
,
"openai-embeddings"
:
async_request_openai_embeddings
,
}
}
OPENAI_COMPATIBLE_BACKENDS
=
[
OPENAI_COMPATIBLE_BACKENDS
=
[
...
...
vllm/benchmarks/lib/utils.py
View file @
a99300bd
...
@@ -54,7 +54,12 @@ class InfEncoder(json.JSONEncoder):
...
@@ -54,7 +54,12 @@ class InfEncoder(json.JSONEncoder):
def
clear_inf
(
self
,
o
:
Any
):
def
clear_inf
(
self
,
o
:
Any
):
if
isinstance
(
o
,
dict
):
if
isinstance
(
o
,
dict
):
return
{
k
:
self
.
clear_inf
(
v
)
for
k
,
v
in
o
.
items
()}
return
{
str
(
k
)
if
not
isinstance
(
k
,
(
str
,
int
,
float
,
bool
,
type
(
None
)))
else
k
:
self
.
clear_inf
(
v
)
for
k
,
v
in
o
.
items
()
}
elif
isinstance
(
o
,
list
):
elif
isinstance
(
o
,
list
):
return
[
self
.
clear_inf
(
v
)
for
v
in
o
]
return
[
self
.
clear_inf
(
v
)
for
v
in
o
]
elif
isinstance
(
o
,
float
)
and
math
.
isinf
(
o
):
elif
isinstance
(
o
,
float
)
and
math
.
isinf
(
o
):
...
...
vllm/benchmarks/serve.py
View file @
a99300bd
...
@@ -4,7 +4,7 @@ r"""Benchmark online serving throughput.
...
@@ -4,7 +4,7 @@ r"""Benchmark online serving throughput.
On the server side, run one of the following commands
On the server side, run one of the following commands
to launch the vLLM OpenAI API server:
to launch the vLLM OpenAI API server:
vllm serve <your_model> <engine arguments>
vllm serve <your_model> <engine arguments>
On the client side, run:
On the client side, run:
vllm bench serve \
vllm bench serve \
...
@@ -26,6 +26,7 @@ import warnings
...
@@ -26,6 +26,7 @@ import warnings
from
collections.abc
import
AsyncGenerator
,
Iterable
from
collections.abc
import
AsyncGenerator
,
Iterable
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
datetime
import
datetime
from
datetime
import
datetime
from
enum
import
Enum
from
typing
import
Any
,
Literal
,
Optional
from
typing
import
Any
,
Literal
,
Optional
import
aiohttp
import
aiohttp
...
@@ -46,6 +47,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
...
@@ -46,6 +47,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
MILLISECONDS_TO_SECONDS_CONVERSION
=
1000
MILLISECONDS_TO_SECONDS_CONVERSION
=
1000
class TaskType(Enum):
    """Kinds of benchmark task distinguished by this module."""

    # Text-generation requests.
    GENERATION = "generation"

    # Embedding requests.
    EMBEDDING = "embedding"
@
dataclass
@
dataclass
class
BenchmarkMetrics
:
class
BenchmarkMetrics
:
completed
:
int
completed
:
int
...
@@ -75,6 +81,16 @@ class BenchmarkMetrics:
...
@@ -75,6 +81,16 @@ class BenchmarkMetrics:
std_e2el_ms
:
float
std_e2el_ms
:
float
percentiles_e2el_ms
:
list
[
tuple
[
float
,
float
]]
percentiles_e2el_ms
:
list
[
tuple
[
float
,
float
]]
@dataclass
class EmbedBenchmarkMetrics:
    """Aggregate metrics for an embeddings benchmark run.

    NOTE(review): field meanings are inferred from their names and from the
    sibling ``BenchmarkMetrics`` dataclass; "e2el" presumably means
    end-to-end latency -- confirm against the code that fills this in.
    """

    # Request and input totals.
    completed: int
    total_input: int
    # Throughput figures.
    request_throughput: float
    total_token_throughput: float
    # Latency statistics, in milliseconds per the ``_ms`` suffix.
    mean_e2el_ms: float
    std_e2el_ms: float
    median_e2el_ms: float
    # Annotated like ``BenchmarkMetrics.percentiles_e2el_ms`` rather than as
    # a bare float; presumably (percentile, value) pairs -- confirm.
    percentiles_e2el_ms: list[tuple[float, float]]
def
_get_current_request_rate
(
def
_get_current_request_rate
(
ramp_up_strategy
:
Optional
[
Literal
[
"linear"
,
"exponential"
]],
ramp_up_strategy
:
Optional
[
Literal
[
"linear"
,
"exponential"
]],
...
@@ -146,11 +162,11 @@ async def get_request(
...
@@ -146,11 +162,11 @@ async def get_request(
delay_ts
=
[]
delay_ts
=
[]
for
request_index
,
request
in
enumerate
(
input_requests
):
for
request_index
,
request
in
enumerate
(
input_requests
):
current_request_rate
=
_get_current_request_rate
(
ramp_up_strategy
,
current_request_rate
=
_get_current_request_rate
(
ramp_up_strategy
,
ramp_up_start_rps
,
ramp_up_start_rps
,
ramp_up_end_rps
,
ramp_up_end_rps
,
request_index
,
request_index
,
total_requests
,
total_requests
,
request_rate
)
request_rate
)
request_rates
.
append
(
current_request_rate
)
request_rates
.
append
(
current_request_rate
)
if
current_request_rate
==
float
(
"inf"
):
if
current_request_rate
==
float
(
"inf"
):
delay_ts
.
append
(
0
)
delay_ts
.
append
(
0
)
...
@@ -160,7 +176,7 @@ async def get_request(
...
@@ -160,7 +176,7 @@ async def get_request(
# Sample the request interval from the gamma distribution.
# Sample the request interval from the gamma distribution.
# If burstiness is 1, it follows exponential distribution.
# If burstiness is 1, it follows exponential distribution.
delay_ts
.
append
(
np
.
random
.
gamma
(
shape
=
burstiness
,
scale
=
theta
))
delay_ts
.
append
(
np
.
random
.
gamma
(
shape
=
burstiness
,
scale
=
theta
))
# Calculate the cumulative delay time from the first sent out requests.
# Calculate the cumulative delay time from the first sent out requests.
for
i
in
range
(
1
,
len
(
delay_ts
)):
for
i
in
range
(
1
,
len
(
delay_ts
)):
delay_ts
[
i
]
+=
delay_ts
[
i
-
1
]
delay_ts
[
i
]
+=
delay_ts
[
i
-
1
]
...
@@ -170,11 +186,11 @@ async def get_request(
...
@@ -170,11 +186,11 @@ async def get_request(
# logic would re-scale delay time to ensure the final delay_ts
# logic would re-scale delay time to ensure the final delay_ts
# align with target_total_delay_s.
# align with target_total_delay_s.
#
#
# NOTE: If we simply accumulate the random delta values
# NOTE: If we simply accumulate the random delta values
# from the gamma distribution, their sum would have 1-2% gap
# from the gamma distribution, their sum would have 1-2% gap
# from target_total_delay_s. The purpose of the following logic is to
# from target_total_delay_s. The purpose of the following logic is to
# close the gap for stabilizing the throughput data
# close the gap for stabilizing the throughput data
# from different random seeds.
# from different random seeds.
target_total_delay_s
=
total_requests
/
request_rate
target_total_delay_s
=
total_requests
/
request_rate
normalize_factor
=
target_total_delay_s
/
delay_ts
[
-
1
]
normalize_factor
=
target_total_delay_s
/
delay_ts
[
-
1
]
delay_ts
=
[
delay
*
normalize_factor
for
delay
in
delay_ts
]
delay_ts
=
[
delay
*
normalize_factor
for
delay
in
delay_ts
]
...
@@ -189,6 +205,51 @@ async def get_request(
...
@@ -189,6 +205,51 @@ async def get_request(
yield
request
,
request_rates
[
request_index
]
yield
request
,
request_rates
[
request_index
]
def calculate_metrics_for_embeddings(
    outputs: list[RequestFuncOutput],
    dur_s: float,
    selected_percentiles: list[float],
) -> EmbedBenchmarkMetrics:
    """Summarize the results of an embedding benchmark run.

    Only outputs flagged as successful contribute to the request count,
    the input-token total and the end-to-end latency statistics.

    Args:
        outputs: The per-request results of the run.
        dur_s: The duration of the benchmark; divisor for both
            throughput figures.
        selected_percentiles: The end-to-end latency percentiles to report.

    Returns:
        The populated benchmark metrics.
    """
    succeeded = [output for output in outputs if output.success]
    completed = len(succeeded)
    total_input = sum(output.prompt_len for output in succeeded)
    e2els: list[float] = [output.latency for output in succeeded]

    if not succeeded:
        warnings.warn(
            "All requests failed. This is likely due to a misconfiguration "
            "on the benchmark arguments.",
            stacklevel=2)

    # With no successful request the statistics are computed over the
    # scalar 0 instead of an empty list, so every latency figure is 0.
    latencies = e2els or 0

    # Each latency statistic is scaled by 1000 to fill the *_ms fields.
    percentile_pairs = [(p, np.percentile(latencies, p) * 1000)
                        for p in selected_percentiles]

    return EmbedBenchmarkMetrics(
        completed=completed,
        total_input=total_input,
        request_throughput=completed / dur_s,
        total_token_throughput=total_input / dur_s,
        mean_e2el_ms=np.mean(latencies) * 1000,
        std_e2el_ms=np.std(latencies) * 1000,
        median_e2el_ms=np.median(latencies) * 1000,
        percentiles_e2el_ms=percentile_pairs,
    )
def
calculate_metrics
(
def
calculate_metrics
(
input_requests
:
list
[
SampleRequest
],
input_requests
:
list
[
SampleRequest
],
outputs
:
list
[
RequestFuncOutput
],
outputs
:
list
[
RequestFuncOutput
],
...
@@ -334,8 +395,16 @@ async def benchmark(
...
@@ -334,8 +395,16 @@ async def benchmark(
ramp_up_end_rps
:
Optional
[
int
]
=
None
,
ramp_up_end_rps
:
Optional
[
int
]
=
None
,
ready_check_timeout_sec
:
int
=
600
,
ready_check_timeout_sec
:
int
=
600
,
):
):
task_type
=
(
TaskType
.
EMBEDDING
if
api_url
.
endswith
(
"/v1/embeddings"
)
else
TaskType
.
GENERATION
)
if
endpoint_type
in
ASYNC_REQUEST_FUNCS
:
if
endpoint_type
in
ASYNC_REQUEST_FUNCS
:
request_func
=
ASYNC_REQUEST_FUNCS
[
endpoint_type
]
if
task_type
==
TaskType
.
EMBEDDING
:
request_func
=
ASYNC_REQUEST_FUNCS
[
"openai-embeddings"
]
else
:
request_func
=
ASYNC_REQUEST_FUNCS
[
endpoint_type
]
else
:
else
:
raise
ValueError
(
f
"Unknown endpoint_type:
{
endpoint_type
}
"
)
raise
ValueError
(
f
"Unknown endpoint_type:
{
endpoint_type
}
"
)
...
@@ -421,8 +490,8 @@ async def benchmark(
...
@@ -421,8 +490,8 @@ async def benchmark(
if
profile_output
.
success
:
if
profile_output
.
success
:
print
(
"Profiler started"
)
print
(
"Profiler started"
)
distribution
=
(
"Poisson process"
if
burstiness
==
1.0
distribution
=
(
"Poisson process"
if
burstiness
==
1.0
else
"Gamma distribution"
)
else
"Gamma distribution"
)
if
ramp_up_strategy
is
not
None
:
if
ramp_up_strategy
is
not
None
:
print
(
f
"Traffic ramp-up strategy:
{
ramp_up_strategy
}
."
)
print
(
f
"Traffic ramp-up strategy:
{
ramp_up_strategy
}
."
)
...
@@ -449,7 +518,7 @@ async def benchmark(
...
@@ -449,7 +518,7 @@ async def benchmark(
session
=
session
,
session
=
session
,
pbar
=
pbar
)
pbar
=
pbar
)
async
with
semaphore
:
async
with
semaphore
:
return
await
request_func
(
request_func_input
=
request_func_input
,
return
await
request_func
(
request_func_input
=
request_func_input
,
session
=
session
,
session
=
session
,
pbar
=
pbar
)
pbar
=
pbar
)
...
@@ -478,11 +547,12 @@ async def benchmark(
...
@@ -478,11 +547,12 @@ async def benchmark(
"timestamp"
:
timestamp
"timestamp"
:
timestamp
})
})
last_int_rps
=
current_int_rps
last_int_rps
=
current_int_rps
prompt
,
prompt_len
,
output_len
,
mm_content
=
(
prompt
,
prompt_len
,
output_len
,
mm_content
,
request_id
=
(
request
.
prompt
,
request
.
prompt
,
request
.
prompt_len
,
request
.
prompt_len
,
request
.
expected_output_len
,
request
.
expected_output_len
,
request
.
multi_modal_data
,
request
.
multi_modal_data
,
request
.
request_id
,
)
)
req_model_id
,
req_model_name
=
model_id
,
model_name
req_model_id
,
req_model_name
=
model_id
,
model_name
if
lora_modules
:
if
lora_modules
:
...
@@ -498,7 +568,8 @@ async def benchmark(
...
@@ -498,7 +568,8 @@ async def benchmark(
logprobs
=
logprobs
,
logprobs
=
logprobs
,
multi_modal_content
=
mm_content
,
multi_modal_content
=
mm_content
,
ignore_eos
=
ignore_eos
,
ignore_eos
=
ignore_eos
,
extra_body
=
extra_body
)
extra_body
=
extra_body
,
request_id
=
request_id
,)
tasks
.
append
(
tasks
.
append
(
asyncio
.
create_task
(
asyncio
.
create_task
(
limited_request_func
(
request_func_input
=
request_func_input
,
limited_request_func
(
request_func_input
=
request_func_input
,
...
@@ -511,14 +582,22 @@ async def benchmark(
...
@@ -511,14 +582,22 @@ async def benchmark(
benchmark_duration
=
time
.
perf_counter
()
-
benchmark_start_time
benchmark_duration
=
time
.
perf_counter
()
-
benchmark_start_time
metrics
,
actual_output_lens
=
calculate_metrics
(
if
task_type
==
TaskType
.
GENERATION
:
input_requests
=
input_requests
,
metrics
,
actual_output_lens
=
calculate_metrics
(
outputs
=
outputs
,
input_requests
=
input_requests
,
dur_s
=
benchmark_duration
,
outputs
=
outputs
,
tokenizer
=
tokenizer
,
dur_s
=
benchmark_duration
,
selected_percentiles
=
selected_percentiles
,
tokenizer
=
tokenizer
,
goodput_config_dict
=
goodput_config_dict
,
selected_percentiles
=
selected_percentiles
,
)
goodput_config_dict
=
goodput_config_dict
,
)
else
:
metrics
=
calculate_metrics_for_embeddings
(
outputs
=
outputs
,
dur_s
=
benchmark_duration
,
selected_percentiles
=
selected_percentiles
,
)
actual_output_lens
=
0
print
(
"{s:{c}^{n}}"
.
format
(
s
=
' Serving Benchmark Result '
,
n
=
50
,
c
=
'='
))
print
(
"{s:{c}^{n}}"
.
format
(
s
=
' Serving Benchmark Result '
,
n
=
50
,
c
=
'='
))
print
(
"{:<40} {:<10}"
.
format
(
"Successful requests:"
,
metrics
.
completed
))
print
(
"{:<40} {:<10}"
.
format
(
"Successful requests:"
,
metrics
.
completed
))
...
@@ -527,39 +606,55 @@ async def benchmark(
...
@@ -527,39 +606,55 @@ async def benchmark(
max_concurrency
))
max_concurrency
))
if
request_rate
!=
float
(
'inf'
):
if
request_rate
!=
float
(
'inf'
):
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request rate configured (RPS):"
,
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request rate configured (RPS):"
,
request_rate
))
request_rate
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Benchmark duration (s):"
,
print
(
"{:<40} {:<10.2f}"
.
format
(
"Benchmark duration (s):"
,
benchmark_duration
))
benchmark_duration
))
print
(
"{:<40} {:<10}"
.
format
(
"Total input tokens:"
,
metrics
.
total_input
))
print
(
"{:<40} {:<10}"
.
format
(
"Total input tokens:"
,
metrics
.
total_input
))
print
(
"{:<40} {:<10}"
.
format
(
"Total generated tokens:"
,
if
isinstance
(
metrics
,
BenchmarkMetrics
):
metrics
.
total_output
))
print
(
"{:<40} {:<10}"
.
format
(
"Total generated tokens:"
,
metrics
.
total_output
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request throughput (req/s):"
,
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request throughput (req/s):"
,
metrics
.
request_throughput
))
metrics
.
request_throughput
))
if
goodput_config_dict
:
if
goodput_config_dict
:
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request goodput (req/s):"
,
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request goodput (req/s):"
,
metrics
.
request_goodput
))
metrics
.
request_goodput
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Output token throughput (tok/s):"
,
if
isinstance
(
metrics
,
BenchmarkMetrics
):
metrics
.
output_throughput
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Output token throughput (tok/s):"
,
metrics
.
output_throughput
)
)
print
(
"{:<40} {:<10.2f}"
.
format
(
"Total Token throughput (tok/s):"
,
print
(
"{:<40} {:<10.2f}"
.
format
(
"Total Token throughput (tok/s):"
,
metrics
.
total_token_throughput
))
metrics
.
total_token_throughput
))
result
=
{
if
isinstance
(
metrics
,
BenchmarkMetrics
):
"duration"
:
benchmark_duration
,
result
=
{
"completed"
:
metrics
.
completed
,
"duration"
:
benchmark_duration
,
"total_input_tokens"
:
metrics
.
total_input
,
"completed"
:
metrics
.
completed
,
"total_output_tokens"
:
metrics
.
total_output
,
"total_input_tokens"
:
metrics
.
total_input
,
"request_throughput"
:
metrics
.
request_throughput
,
"total_output_tokens"
:
metrics
.
total_output
,
"request_goodput"
:
"request_throughput"
:
metrics
.
request_throughput
,
metrics
.
request_goodput
if
goodput_config_dict
else
None
,
"request_goodput"
:
"output_throughput"
:
metrics
.
output_throughput
,
metrics
.
request_goodput
if
goodput_config_dict
else
None
,
"total_token_throughput"
:
metrics
.
total_token_throughput
,
"output_throughput"
:
metrics
.
output_throughput
,
"input_lens"
:
[
output
.
prompt_len
for
output
in
outputs
],
"total_token_throughput"
:
metrics
.
total_token_throughput
,
"output_lens"
:
actual_output_lens
,
"input_lens"
:
[
output
.
prompt_len
for
output
in
outputs
],
"ttfts"
:
[
output
.
ttft
for
output
in
outputs
],
"output_lens"
:
actual_output_lens
,
"itls"
:
[
output
.
itl
for
output
in
outputs
],
"ttfts"
:
[
output
.
ttft
for
output
in
outputs
],
"generated_texts"
:
[
output
.
generated_text
for
output
in
outputs
],
"itls"
:
[
output
.
itl
for
output
in
outputs
],
"errors"
:
[
output
.
error
for
output
in
outputs
],
"generated_texts"
:
[
output
.
generated_text
for
output
in
outputs
],
}
"errors"
:
[
output
.
error
for
output
in
outputs
],
}
else
:
result
=
{
"duration"
:
benchmark_duration
,
"completed"
:
metrics
.
completed
,
"total_input_tokens"
:
metrics
.
total_input
,
"request_throughput"
:
metrics
.
request_throughput
,
"total_token_throughput"
:
metrics
.
total_token_throughput
,
"input_lens"
:
[
output
.
prompt_len
for
output
in
outputs
],
"errors"
:
[
output
.
error
for
output
in
outputs
],
}
if
rps_change_events
:
if
rps_change_events
:
result
[
"rps_change_events"
]
=
rps_change_events
result
[
"rps_change_events"
]
=
rps_change_events
...
@@ -596,10 +691,11 @@ async def benchmark(
...
@@ -596,10 +691,11 @@ async def benchmark(
value
))
value
))
result
[
f
"p
{
p_word
}
_
{
metric_attribute_name
}
_ms"
]
=
value
result
[
f
"p
{
p_word
}
_
{
metric_attribute_name
}
_ms"
]
=
value
process_one_metric
(
"ttft"
,
"TTFT"
,
"Time to First Token"
)
if
task_type
==
TaskType
.
GENERATION
:
process_one_metric
(
"tpot"
,
"TPOT"
,
process_one_metric
(
"ttft"
,
"TTFT"
,
"Time to First Token"
)
"Time per Output Token (excl. 1st token)"
)
process_one_metric
(
process_one_metric
(
"itl"
,
"ITL"
,
"Inter-token Latency"
)
"tpot"
,
"TPOT"
,
"Time per Output Token (excl. 1st token)"
)
process_one_metric
(
"itl"
,
"ITL"
,
"Inter-token Latency"
)
process_one_metric
(
"e2el"
,
"E2EL"
,
"End-to-end Latency"
)
process_one_metric
(
"e2el"
,
"E2EL"
,
"End-to-end Latency"
)
print
(
"="
*
50
)
print
(
"="
*
50
)
...
@@ -730,7 +826,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
...
@@ -730,7 +826,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
"initiated, this argument will control how many are actually allowed "
"initiated, this argument will control how many are actually allowed "
"to execute at a time. This means that when used in combination, the "
"to execute at a time. This means that when used in combination, the "
"actual request rate may be lower than specified with --request-rate, "
"actual request rate may be lower than specified with --request-rate, "
"if the server is not processing requests fast enough to keep up."
)
"if the server is not processing requests fast enough to keep up."
,
)
parser
.
add_argument
(
parser
.
add_argument
(
"--model"
,
"--model"
,
...
@@ -741,8 +838,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
...
@@ -741,8 +838,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
parser
.
add_argument
(
parser
.
add_argument
(
"--tokenizer"
,
"--tokenizer"
,
type
=
str
,
type
=
str
,
help
=
help
=
"Name or path of the tokenizer, if not using the default tokenizer."
,
# noqa: E501
"Name or path of the tokenizer, if not using the default tokenizer."
,
# noqa: E501
)
)
parser
.
add_argument
(
"--use-beam-search"
,
action
=
"store_true"
)
parser
.
add_argument
(
"--use-beam-search"
,
action
=
"store_true"
)
parser
.
add_argument
(
parser
.
add_argument
(
...
@@ -865,6 +961,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
...
@@ -865,6 +961,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"and the blog: https://hao-ai-lab.github.io/blogs/distserve"
,
"and the blog: https://hao-ai-lab.github.io/blogs/distserve"
,
)
)
parser
.
add_argument
(
"--request-id-prefix"
,
type
=
str
,
required
=
False
,
default
=
"benchmark-serving"
,
help
=
"Specify the prefix of request id."
,
)
sampling_group
=
parser
.
add_argument_group
(
"sampling parameters"
)
sampling_group
=
parser
.
add_argument_group
(
"sampling parameters"
)
sampling_group
.
add_argument
(
sampling_group
.
add_argument
(
...
@@ -958,6 +1062,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
...
@@ -958,6 +1062,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
def
main
(
args
:
argparse
.
Namespace
)
->
dict
[
str
,
Any
]:
def
main
(
args
:
argparse
.
Namespace
)
->
dict
[
str
,
Any
]:
return
asyncio
.
run
(
main_async
(
args
))
return
asyncio
.
run
(
main_async
(
args
))
async
def
main_async
(
args
:
argparse
.
Namespace
)
->
dict
[
str
,
Any
]:
async
def
main_async
(
args
:
argparse
.
Namespace
)
->
dict
[
str
,
Any
]:
print
(
args
)
print
(
args
)
random
.
seed
(
args
.
seed
)
random
.
seed
(
args
.
seed
)
...
@@ -1036,32 +1141,32 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
...
@@ -1036,32 +1141,32 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
gc
.
freeze
()
gc
.
freeze
()
benchmark_result
=
await
benchmark
(
benchmark_result
=
await
benchmark
(
endpoint_type
=
args
.
endpoint_type
,
endpoint_type
=
args
.
endpoint_type
,
api_url
=
api_url
,
api_url
=
api_url
,
base_url
=
base_url
,
base_url
=
base_url
,
model_id
=
model_id
,
model_id
=
model_id
,
model_name
=
model_name
,
model_name
=
model_name
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
input_requests
=
input_requests
,
input_requests
=
input_requests
,
logprobs
=
args
.
logprobs
,
logprobs
=
args
.
logprobs
,
request_rate
=
args
.
request_rate
,
request_rate
=
args
.
request_rate
,
burstiness
=
args
.
burstiness
,
burstiness
=
args
.
burstiness
,
disable_tqdm
=
args
.
disable_tqdm
,
disable_tqdm
=
args
.
disable_tqdm
,
profile
=
args
.
profile
,
profile
=
args
.
profile
,
selected_percentile_metrics
=
args
.
percentile_metrics
.
split
(
","
),
selected_percentile_metrics
=
args
.
percentile_metrics
.
split
(
","
),
selected_percentiles
=
[
selected_percentiles
=
[
float
(
p
)
for
p
in
args
.
metric_percentiles
.
split
(
","
)
float
(
p
)
for
p
in
args
.
metric_percentiles
.
split
(
","
)
],
],
ignore_eos
=
args
.
ignore_eos
,
ignore_eos
=
args
.
ignore_eos
,
goodput_config_dict
=
goodput_config_dict
,
goodput_config_dict
=
goodput_config_dict
,
max_concurrency
=
args
.
max_concurrency
,
max_concurrency
=
args
.
max_concurrency
,
lora_modules
=
args
.
lora_modules
,
lora_modules
=
args
.
lora_modules
,
extra_body
=
sampling_params
,
extra_body
=
sampling_params
,
ramp_up_strategy
=
args
.
ramp_up_strategy
,
ramp_up_strategy
=
args
.
ramp_up_strategy
,
ramp_up_start_rps
=
args
.
ramp_up_start_rps
,
ramp_up_start_rps
=
args
.
ramp_up_start_rps
,
ramp_up_end_rps
=
args
.
ramp_up_end_rps
,
ramp_up_end_rps
=
args
.
ramp_up_end_rps
,
ready_check_timeout_sec
=
args
.
ready_check_timeout_sec
,
ready_check_timeout_sec
=
args
.
ready_check_timeout_sec
,
)
)
# Save config and results to json
# Save config and results to json
result_json
:
dict
[
str
,
Any
]
=
{}
result_json
:
dict
[
str
,
Any
]
=
{}
...
@@ -1088,7 +1193,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
...
@@ -1088,7 +1193,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
# Traffic
# Traffic
result_json
[
"request_rate"
]
=
(
args
.
request_rate
if
args
.
request_rate
result_json
[
"request_rate"
]
=
(
args
.
request_rate
if
args
.
request_rate
<
float
(
"inf"
)
else
"inf"
)
<
float
(
"inf"
)
else
"inf"
)
result_json
[
"burstiness"
]
=
args
.
burstiness
result_json
[
"burstiness"
]
=
args
.
burstiness
result_json
[
"max_concurrency"
]
=
args
.
max_concurrency
result_json
[
"max_concurrency"
]
=
args
.
max_concurrency
...
@@ -1122,7 +1227,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
...
@@ -1122,7 +1227,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
if
args
.
max_concurrency
is
not
None
else
""
)
if
args
.
max_concurrency
is
not
None
else
""
)
label
=
label
or
endpoint_type
label
=
label
or
endpoint_type
if
args
.
ramp_up_strategy
is
not
None
:
if
args
.
ramp_up_strategy
is
not
None
:
file_name
=
f
"
{
label
}
-ramp-up-
{
args
.
ramp_up_strategy
}
-
{
args
.
ramp_up_start_rps
}
qps-
{
args
.
ramp_up_end_rps
}
qps
{
max_concurrency_str
}
-
{
base_model_id
}
-
{
current_dt
}
.json"
# noqa
file_name
=
f
"
{
label
}
-ramp-up-
{
args
.
ramp_up_strategy
}
-
{
args
.
ramp_up_start_rps
}
qps-
{
args
.
ramp_up_end_rps
}
qps
{
max_concurrency_str
}
-
{
base_model_id
}
-
{
current_dt
}
.json"
# noqa
else
:
else
:
file_name
=
f
"
{
label
}
-
{
args
.
request_rate
}
qps
{
max_concurrency_str
}
-
{
base_model_id
}
-
{
current_dt
}
.json"
# noqa
file_name
=
f
"
{
label
}
-
{
args
.
request_rate
}
qps
{
max_concurrency_str
}
-
{
base_model_id
}
-
{
current_dt
}
.json"
# noqa
if
args
.
result_filename
:
if
args
.
result_filename
:
...
@@ -1139,4 +1244,4 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
...
@@ -1139,4 +1244,4 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
json
.
dump
(
result_json
,
outfile
)
json
.
dump
(
result_json
,
outfile
)
save_to_pytorch_benchmark_format
(
args
,
result_json
,
file_name
)
save_to_pytorch_benchmark_format
(
args
,
result_json
,
file_name
)
return
result_json
return
result_json
\ No newline at end of file
Prev
1
…
15
16
17
18
19
20
21
22
23
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment