Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
752c6ade
Unverified
Commit
752c6ade
authored
Jul 19, 2025
by
Woosuk Kwon
Committed by
GitHub
Jul 19, 2025
Browse files
[V0 Deprecation] Deprecate BlockSparse Attention & Phi3-Small (#21217)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
881e3cbe
Changes
38
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
24 additions
and
1030 deletions
+24
-1030
vllm/attention/ops/blocksparse_attention/interface.py
vllm/attention/ops/blocksparse_attention/interface.py
+0
-239
vllm/attention/ops/blocksparse_attention/utils.py
vllm/attention/ops/blocksparse_attention/utils.py
+0
-246
vllm/attention/selector.py
vllm/attention/selector.py
+0
-9
vllm/model_executor/models/phi3_small.py
vllm/model_executor/models/phi3_small.py
+0
-465
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+0
-1
vllm/platforms/interface.py
vllm/platforms/interface.py
+0
-1
vllm/v1/attention/backends/cpu_attn.py
vllm/v1/attention/backends/cpu_attn.py
+1
-5
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+1
-5
vllm/v1/attention/backends/flashinfer.py
vllm/v1/attention/backends/flashinfer.py
+1
-2
vllm/v1/attention/backends/flex_attention.py
vllm/v1/attention/backends/flex_attention.py
+1
-6
vllm/v1/attention/backends/mla/common.py
vllm/v1/attention/backends/mla/common.py
+1
-2
vllm/v1/attention/backends/mla/cutlass_mla.py
vllm/v1/attention/backends/mla/cutlass_mla.py
+4
-8
vllm/v1/attention/backends/mla/flashmla.py
vllm/v1/attention/backends/mla/flashmla.py
+4
-8
vllm/v1/attention/backends/mla/rocm_aiter_mla.py
vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+4
-8
vllm/v1/attention/backends/mla/triton_mla.py
vllm/v1/attention/backends/mla/triton_mla.py
+4
-8
vllm/v1/attention/backends/pallas.py
vllm/v1/attention/backends/pallas.py
+1
-7
vllm/v1/attention/backends/rocm_aiter_fa.py
vllm/v1/attention/backends/rocm_aiter_fa.py
+1
-5
vllm/v1/attention/backends/triton_attn.py
vllm/v1/attention/backends/triton_attn.py
+1
-5
No files found.
vllm/attention/ops/blocksparse_attention/interface.py
deleted
100644 → 0
View file @
881e3cbe
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
import
torch
from
vllm.platforms
import
current_platform
from
.utils
import
(
dense_to_crow_col
,
get_head_sliding_step
,
get_sparse_attn_mask
)
IS_COMPUTE_8_OR_ABOVE
=
current_platform
.
has_device_capability
(
80
)
if
IS_COMPUTE_8_OR_ABOVE
:
from
.blocksparse_attention_kernel
import
blocksparse_flash_attn_varlen_fwd
class LocalStridedBlockSparseAttn(torch.nn.Module):
    """Block-sparse attention with a local window plus vertical strides.

    The sparse layout is built once in ``__init__`` through
    ``get_sparse_attn_mask`` and reused on every call.  ``forward``
    dispatches either to the Triton varlen kernel (``varlen_attn``) or to a
    padded ``scaled_dot_product_attention`` fallback (``spda``), depending on
    ``use_spda``.
    """

    def __init__(
        self,
        n_heads,
        max_seqlen,
        local_blocks,
        vert_stride,
        block_size,
        device=None,
        dtype=None,
        homo_head=False,
        active_head_range=None,
        q_block_size=None,
        use_spda=None,
    ):
        """Build the sparse layout (and the dense mask when SDPA is used).

        :param n_heads: number of attention heads the pattern is built for.
        :param max_seqlen: sequence length used for both axes of the pattern.
        :param local_blocks: number of local blocks kept per query block.
        :param vert_stride: stride of the vertical (strided) blocks.
        :param block_size: sparse block size.
        :param device: target device; ``None`` selects the current CUDA
            device on CUDA-alike platforms and ``"cpu"`` otherwise.
        :param dtype: mask dtype; ``None`` selects bfloat16 when the device
            has compute capability >= 8 or is CPU, otherwise half.
        :param homo_head: whether all heads share the same pattern.
        :param active_head_range: optional ``(start, end)`` tuple selecting
            a slice of heads (only applied when ``homo_head`` is false).
        :param q_block_size: optional query block size; must be a multiple
            of ``block_size`` when it differs from it.
        :param use_spda: force (or forbid) the SDPA fallback; ``None``
            enables it on ROCm, CPU, or compute capability < 8.
        :raises ValueError: if ``q_block_size`` is smaller than
            ``block_size``.
        """
        super().__init__()
        if use_spda is None:
            use_spda = (current_platform.is_rocm()
                        or current_platform.is_cpu()
                        or not IS_COMPUTE_8_OR_ABOVE)
        # Compare against None explicitly: a device index of 0 is falsy and
        # would otherwise be silently replaced by the default device.
        if device is None:
            device = (torch.cuda.current_device()
                      if current_platform.is_cuda_alike() else "cpu")
        device = torch.device(device)
        # NOTE: vllm CPU backend support BF16 instead of FP16.
        if dtype is None:
            dtype = (torch.bfloat16 if IS_COMPUTE_8_OR_ABOVE
                     or device.type == "cpu" else torch.half)

        self.n_heads = n_heads
        self.max_seqlen = max_seqlen
        self.local_blocks = local_blocks
        self.vert_stride = vert_stride
        self.use_spda = use_spda
        self.dtype = dtype
        self.device = device
        self.block_size = block_size
        self.q_block_size = q_block_size
        self.homo_head = homo_head
        self.active_head_range = active_head_range
        self.head_sliding_step = get_head_sliding_step(n_heads, vert_stride,
                                                       homo_head)

        sparse_layout, sparse_pattern, self.dense_attn_mask = (
            self.get_attn_pattern(dtype, device))

        if q_block_size is not None and q_block_size != block_size:
            if q_block_size > block_size:
                assert q_block_size % block_size == 0
                # Merge consecutive query blocks: a merged block is active
                # wherever any of its constituent blocks was active.
                blocks_to_merge = q_block_size // block_size
                shape = sparse_pattern.shape
                sparse_pattern = sparse_pattern.view(shape[0], -1,
                                                     blocks_to_merge,
                                                     shape[-1])
                sparse_pattern = sparse_pattern.sum(2)
                sparse_layout = dense_to_crow_col(sparse_pattern)
            else:
                raise ValueError(
                    "Does not support smaller q_block_size. It will be slower."
                )

        self.sparse_layout = sparse_layout

    def get_attn_pattern(self, dtype, device):
        """Return ``(sparse_layout, sparse_pattern, dense_attn_mask)``.

        The dense mask is only materialised when ``self.use_spda`` is true
        (it is requested via ``return_dense``) and uses the additive "bias"
        encoding.  When heads are not homogeneous and ``active_head_range``
        is set, the layout (and the dense mask, if any) is sliced to that
        head range.
        """
        sparse_layout, sparse_pattern, dense_attn_mask = get_sparse_attn_mask(
            self.n_heads,
            self.max_seqlen,
            self.max_seqlen,
            dtype,
            device,
            block_size=self.block_size,
            local_blocks=self.local_blocks,
            vert_stride=self.vert_stride,
            homo_head=self.homo_head,
            return_dense=self.use_spda,
            dense_mask_type="bias",
        )
        if (not self.homo_head) and (self.active_head_range is not None):
            assert isinstance(self.active_head_range, tuple)
            assert (len(self.active_head_range) == 2)
            h_start, h_end = self.active_head_range
            sparse_layout = tuple(x[h_start:h_end] for x in sparse_layout)
            if self.use_spda:
                dense_attn_mask = dense_attn_mask[h_start:h_end]
        return sparse_layout, sparse_pattern, dense_attn_mask

    def varlen_attn(self,
                    q,
                    k,
                    v,
                    cu_seqlens_k,
                    cu_seqlens_q=None,
                    sm_scale=None):
        """
        q, k, v: shape = (num_tokens, num_heads_q/kv, head_size).
        Support grouped attention, with `q[:, i*r:(i*r + r)]`
        is correspondent to `k[:, i]`, where `r` is the q/k ratio.
        cu_seqlens_k: shape=(batch_size + 1,),
        indicating segment of samples,
        e.g., `k[cu_seqlens_k[i]:cu_seqlens_k[i+1]]` is k of sample i
        cu_seqlens_q: shape=(batch_size + 1, ).
        Default None: same as cu_seqlens_k for prefilling or
        [0, 1, .., batch_size] for decoding.
        The only case you need to specify is when q is a mix of
        prefilling and decoding.
        sm_scale: softmax scale, default to 1/sqrt(head_size).

        return: tensor of shape as q.
        """
        # The message used to be a single literal with a backslash
        # continuation, which embedded the source indentation in the text.
        assert (
            IS_COMPUTE_8_OR_ABOVE
        ), ("Requires compute capability of 8 or above (Ampere or newer) "
            "to use Triton kernel.")

        sm_scale = sm_scale or 1.0 / math.sqrt(q.size(-1))

        return blocksparse_flash_attn_varlen_fwd(
            q,
            k,
            v,
            cu_seqlens_k,
            cu_seqlens_q,
            sm_scale,
            self.sparse_layout,
            block_size=self.block_size,
            q_block_size=self.q_block_size,
            max_seqlen=self.max_seqlen,
        )

    @staticmethod
    def transpose_and_pad(x, cu_seqlens, maxlen, head_repeats=1):
        """
        :param x: (total_tokens, n_heads, head_size)
        :return: (batch, n_heads * head_repeats, maxlen, head_size)

        Positions past each sequence's length are zero-filled so that
        masked-out padding can never inject NaN/Inf into the attention
        output.
        """
        x_padded = x.new_zeros(
            len(cu_seqlens) - 1, x.size(1), head_repeats, maxlen, x.size(2))
        bounds = cu_seqlens.cpu().tolist()
        for i, (s, e) in enumerate(zip(bounds[:-1], bounds[1:])):
            # (len, heads, dim) -> (heads, 1, len, dim); the singleton axis
            # broadcasts over head_repeats.
            x_padded[i, :, :, :e - s].copy_(x[s:e].transpose(0,
                                                            1).unsqueeze(1))
        return x_padded.flatten(1, 2)

    @staticmethod
    def transpose_and_unpad(x_padded, cu_seqlens):
        """
        :param x_padded: (batch, n_heads, length, head_size)
        :return: (total_tokens, n_heads, head_size)
        """
        bounds = cu_seqlens.cpu().tolist()
        total_n_tokens = bounds[-1]
        x = x_padded.new_empty(total_n_tokens, x_padded.size(1),
                               x_padded.size(3))
        for i, (s, e) in enumerate(zip(bounds[:-1], bounds[1:])):
            x[s:e].copy_(x_padded[i, :, :e - s].transpose(0, 1))
        return x

    def spda(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None):
        """For CPU, V100 or other older GPUs.
        NOTE: torch SPDA supports nested tensor,
        but seems extremely slow. Choose to pad instead.
        """
        assert (cu_seqlens_q is None or
                (cu_seqlens_q
                 == cu_seqlens_k).all()), "Can only handle prompt with SPDA."
        assert q.size(0) == k.size(0), "can only handle prompt with SPDA."

        assert q.size(1) % k.size(1) == 0
        q_k_ratio = q.size(1) // k.size(1)
        sm_scale = sm_scale or 1.0 / math.sqrt(q.size(-1))
        cu_seqlens = cu_seqlens_k.cpu()
        # Plain int so it can be used directly as a size and slice bound.
        maxlen = int((cu_seqlens[1:] - cu_seqlens[:-1]).max())

        # Rebuild the cached dense mask if the query dtype/device changed.
        if (self.dense_attn_mask.dtype != q.dtype
                or self.dense_attn_mask.device != q.device):
            _, _, self.dense_attn_mask = self.get_attn_pattern(
                q.dtype, q.device)
        attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen]

        q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1)
        k2, v2 = (self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
                  for x in [k, v])
        spda_output = torch.nn.functional.scaled_dot_product_attention(
            q2, k2, v2, attn_mask=attn_mask, scale=sm_scale)
        return self.transpose_and_unpad(spda_output, cu_seqlens)

    def forward(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None):
        """Dispatch to `varlen_attn` (Ampere or newer) or
        `self.spda`(cpu, Volta, Turing or older)based on
        the type of device used and cuda compute capability.

        q, k, v: shape = (num_tokens, num_heads_q/kv, head_size).
        Support grouped attention, with `q[:, i*r:(i*r + r)]`
        is correspondent to `k[:, i]`, where `r` is the q/k ratio.
        cu_seqlens_k: shape=(batch_size + 1,), indicating segment of samples,
        e.g., `k[cu_seqlens_k[i]:cu_seqlens_k[i+1]]` is k of sample i
        cu_seqlens_q: shape=(batch_size + 1, ).
        Default None: same as cu_seqlens_k for prefilling or
        [0, 1, .., batch_size] for decoding.
        The only case you need to specify
        is when q is a mix of prefilling
        and decoding.
        sm_scale: softmax scale, default to 1/sqrt(head_size).

        return: tensor of shape as q.
        """
        assert k.dim() == 3
        if self.use_spda:
            return self.spda(
                q,
                k,
                v,
                cu_seqlens_k,
                cu_seqlens_q=cu_seqlens_q,
                sm_scale=sm_scale,
            )
        return self.varlen_attn(q,
                                k,
                                v,
                                cu_seqlens_k,
                                cu_seqlens_q=cu_seqlens_q,
                                sm_scale=sm_scale)
vllm/attention/ops/blocksparse_attention/utils.py
deleted
100644 → 0
View file @
881e3cbe
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Helper functions for 3D sparse pattern
# These functions are not optimized and are very inefficient.
# Avoid calling them too frequently, or use a cache mechanism.
from
functools
import
lru_cache
import
numpy
as
np
import
torch
from
vllm.triton_utils
import
triton
class csr_matrix:
    """Simple implementation of CSR matrix conversion without scipy.

    This replaced scipy.sparse.csr_matrix() previously used.

    Attributes set by the constructor:
      shape   -- shape of the dense input, ``(rows, cols)``.
      data    -- the non-zero values in row-major order.
      indices -- column index of every entry in ``data``.
      indptr  -- ``rows + 1`` offsets; row ``i`` owns
                 ``indices[indptr[i]:indptr[i + 1]]``.
    """

    def __init__(self, input_array):
        """Convert a dense 2-D NumPy array to CSR form.

        :param input_array: 2-D ``np.ndarray``; every truthy (non-zero)
            element becomes a stored entry.
        :raises ValueError: if the input is not a 2-D NumPy array.
        """
        if not isinstance(input_array, np.ndarray):
            raise ValueError("Input must be a NumPy array")
        if input_array.ndim != 2:
            raise ValueError("Input must be a 2D NumPy array")

        self.shape = input_array.shape
        rows = self.shape[0]

        # np.nonzero walks the array in row-major order, which is exactly
        # the order the CSR arrays need.
        nz_rows, nz_cols = np.nonzero(input_array)

        # Per-row entry counts -> cumulative row offsets.
        indptr = np.zeros(rows + 1, dtype=np.int64)
        np.cumsum(np.bincount(nz_rows, minlength=rows), out=indptr[1:])

        self.data = input_array[nz_rows, nz_cols]
        # Always an integer array, even when there are no non-zero entries.
        self.indices = nz_cols.astype(np.int64, copy=False)
        self.indptr = indptr
def dense_to_crow_col(x: torch.Tensor):
    """Convert a 2D/3D dense tensor into CSR row-pointer / column tensors.

    Each 2-D slice is converted on the CPU via ``csr_matrix``.  Column
    index rows are right-padded with -1 so that all slices share the same
    width.  The results are moved back to the device of ``x``; for a 2-D
    input the leading batch axis is dropped again.
    """
    source_device = x.device
    pad_value = -1
    n_dims = x.dim()
    assert n_dims in (2, 3)
    batched = x[None] if n_dims == 2 else x

    csr_list = [csr_matrix(mat.bool().cpu().numpy()) for mat in batched]

    row_ptrs = torch.vstack([torch.from_numpy(m.indptr) for m in csr_list])

    col_rows = [torch.from_numpy(m.indices) for m in csr_list]
    width = max(len(row) for row in col_rows)
    padded_rows = []
    for row in col_rows:
        filler = row.new_full((width - row.shape[0], ), pad_value)
        padded_rows.append(torch.cat([row, filler]))
    col_idx = torch.vstack(padded_rows)

    if n_dims == 2:
        row_ptrs, col_idx = row_ptrs[0], col_idx[0]
    return row_ptrs.to(source_device), col_idx.to(source_device)
def crow_col_to_dense(crows: torch.Tensor,
                      cols: torch.Tensor,
                      dtype: torch.dtype = torch.float16):
    """Expand CSR row pointers / column indices into a dense 0/1 tensor.

    1-D inputs are treated as a single matrix and yield a 2-D result;
    2-D inputs are treated as a batch and yield a 3-D result.  The number
    of dense columns is ``cols.max() + 1``.  The scatter runs on the CPU
    and the result is moved to the original device of ``crows``.
    """
    single = crows.dim() == 1
    if single:
        crows, cols = crows[None], cols[None]

    target_device = crows.device
    # faster in cpu
    crows, cols = crows.cpu(), cols.cpu()

    n_mats = crows.shape[0]
    n_rows = crows.shape[1] - 1
    dense_shape = (n_mats, n_rows, cols.max() + 1)
    dense = torch.zeros(dense_shape, dtype=dtype)

    for m in range(n_mats):
        for r in range(n_rows):
            start, stop = crows[m, r], crows[m, r + 1]
            dense[m, r, cols[m, start:stop]] = 1

    result = dense[0] if single else dense
    return result.to(target_device)
def dense_to_ccol_row(x: torch.Tensor):
    """Like ``dense_to_crow_col`` but for CSC: convert the transpose."""
    transposed = x.transpose(-2, -1)
    return dense_to_crow_col(transposed)
def ccol_row_to_dense(ccol: torch.Tensor,
                      rows: torch.Tensor,
                      dtype: torch.dtype = torch.float16):
    """Expand CSC column pointers / row indices into a dense tensor.

    The CSC arrays are expanded with ``crow_col_to_dense`` (which yields
    the transposed matrix) and the last two axes are swapped back.

    ``transpose(-2, -1)`` is used instead of ``permute(0, 2, 1)`` so that
    the un-batched case (1-D ``ccol``, 2-D dense result) works as well as
    the batched one.
    """
    dense_t = crow_col_to_dense(ccol, rows, dtype)
    return dense_t.transpose(-2, -1).contiguous()
def _get_sparse_attn_mask_homo_head(
    q_len: int,
    max_seqlen: int,
    dtype: torch.dtype,
    device: torch.device,
    block_size: int = 128,
    local_blocks: int = 4,
    vert_stride: int = 4,
    return_dense: bool = False,
):
    """Build the single-head local + vertical-strided sparse pattern.

    :return: a tuple of 3:
        - tuple of crow_indices, col_indices representation
            of CSR format (for the last ``cdiv(q_len, block_size)``
            query-block rows).
        - block dense mask
        - all token dense mask (be aware that it can be
            OOM if it is too big) if `return_dense==True`,
            otherwise, None
    """
    with torch.no_grad():
        n_blocks = triton.cdiv(max_seqlen, block_size)
        block_ids = torch.arange(n_blocks)
        q_pos = block_ids[:, None]
        k_pos = torch.arange(n_blocks)[None]

        # Key blocks kept because they sit on the vertical stride.
        on_stride = (torch.arange(n_blocks) + 1) % vert_stride == 0
        causal = q_pos >= k_pos
        within_window = q_pos - k_pos < local_blocks
        keep = causal & (within_window | on_stride)
        block_mask_dense = keep.to(device).to(dtype)

        n_q_blocks = triton.cdiv(q_len, block_size)
        tail_rows = block_mask_dense[-n_q_blocks:].contiguous()
        csr_layout = dense_to_crow_col(tail_rows)

    if not return_dense:
        return (
            csr_layout,
            block_mask_dense,
            None,
        )

    # Expand every block to block_size x block_size tokens, then apply the
    # token-level causal mask.
    token_mask = torch.kron(
        block_mask_dense,
        block_mask_dense.new_ones((block_size, block_size)),
    )
    causal_mask = torch.tril(torch.ones(
        max_seqlen, max_seqlen)).type_as(token_mask)[-q_len:]
    token_mask = token_mask[-q_len:, :max_seqlen] * causal_mask
    return (
        csr_layout,
        block_mask_dense,
        token_mask,
    )
def binary_mask_to_bias(mask_dense: torch.Tensor):
    """Turn a 0/1 keep-mask into an additive bias (0 keep, -inf skip).

    The input tensor is left untouched; a new tensor is returned.
    """
    bias = 1 - mask_dense
    skipped = bias.bool()
    bias.masked_fill_(skipped, -torch.inf)
    return bias
def get_head_sliding_step(n_heads: int,
                          vert_stride: int,
                          homo_head: bool = False):
    """Return the per-head offset applied to the vertical stride.

    Homogeneous heads share one pattern, so the step is 0.  Otherwise it is
    ``vert_stride / n_heads`` truncated to an int, with a floor of 1.
    """
    step = 0 if homo_head else max(1, int(vert_stride / n_heads))
    return step
@lru_cache
def get_sparse_attn_mask(
    n_heads: int,
    q_len: int,
    max_seqlen: int,
    dtype: torch.dtype,
    device: torch.device,
    block_size: int = 64,
    local_blocks: int = 4,
    vert_stride: int = 4,
    homo_head: bool = True,
    return_dense: bool = False,
    dense_mask_type: str = "binary",
):
    """Build the local + vertical-strided block-sparse attention pattern.

    Results are memoised with ``lru_cache``, so all arguments must be
    hashable and callers receive shared tensor objects on cache hits.

    With ``homo_head`` the single-head pattern from
    ``_get_sparse_attn_mask_homo_head`` is expanded across ``n_heads``.
    Otherwise each head ``h`` shifts its vertical stride by
    ``h * get_head_sliding_step(n_heads, vert_stride)`` blocks.

    :param dense_mask_type: "binary" (0 for skip token, 1 for others)
        or "bias" (-inf for skip token, 0 for others)
    :return: a tuple of 3:
        - tuple of crow_indices, col_indices representation
            of CSR format.
        - block dense mask
        - all token dense mask (be aware that it can be OOM if it
            is too big) if `return_dense==True`, otherwise, None
    """
    assert dense_mask_type in ("binary", "bias")
    if homo_head:
        with torch.no_grad():
            (crow, col), block_mask_dense, mask_dense = (
                _get_sparse_attn_mask_homo_head(
                    q_len,
                    max_seqlen,
                    dtype,
                    device,
                    block_size,
                    local_blocks,
                    vert_stride,
                    return_dense,
                ))
            # Broadcast the shared single-head layout over all heads.
            crow = crow[None].expand(n_heads, crow.shape[0])
            col = col[None].expand(n_heads, col.shape[0])
            if return_dense:
                mask_dense = mask_dense[None].expand(n_heads,
                                                     *mask_dense.shape)
                if dense_mask_type == "bias":
                    mask_dense = binary_mask_to_bias(mask_dense)
            return (crow, col), block_mask_dense, mask_dense

    with torch.no_grad():
        num_blocks = triton.cdiv(max_seqlen, block_size)
        q_pos = torch.arange(num_blocks)[None, :, None]
        k_pos = torch.arange(num_blocks)[None, None]
        head_sliding_step = get_head_sliding_step(n_heads, vert_stride)
        # One strided-column selector per head, offset by the sliding step.
        mask_vert_strided = [
            (torch.arange(num_blocks) + h * head_sliding_step + 1) %
            vert_stride == 0 for h in range(n_heads)
        ]
        mask_vert_strided = torch.vstack(mask_vert_strided).unsqueeze(1)
        # Keep causal blocks that are either in the local window or on the
        # head's vertical stride.
        block_mask_dense = (((q_pos >= k_pos)
                             & ((q_pos - k_pos < local_blocks)
                                | mask_vert_strided)).to(device).to(dtype))
        num_blocks_q = triton.cdiv(q_len, block_size)
        block_mask_dense_output = block_mask_dense[:, -num_blocks_q:]
    if return_dense:
        # Expand each block to block_size x block_size tokens, then apply
        # the token-level causal mask.
        mask_dense = torch.kron(
            block_mask_dense,
            block_mask_dense.new_ones((block_size, block_size)),
        )
        causal_mask = torch.tril(torch.ones(
            max_seqlen, max_seqlen)).type_as(mask_dense)[-q_len:]
        mask_dense = mask_dense[..., -q_len:, :max_seqlen] * causal_mask[None]
        if dense_mask_type == "bias":
            mask_dense = binary_mask_to_bias(mask_dense)

        return (
            dense_to_crow_col(block_mask_dense_output),
            block_mask_dense,
            mask_dense,
        )
    else:
        return (
            dense_to_crow_col(block_mask_dense_output),
            block_mask_dense,
            None,
        )
vllm/attention/selector.py
View file @
752c6ade
...
...
@@ -143,7 +143,6 @@ def get_attn_backend(
kv_cache_dtype
:
Optional
[
str
],
block_size
:
int
,
is_attention_free
:
bool
,
is_blocksparse
:
bool
=
False
,
use_mla
:
bool
=
False
,
)
->
type
[
AttentionBackend
]:
"""Selects which attention backend to use and lazily imports it."""
...
...
@@ -157,7 +156,6 @@ def get_attn_backend(
kv_cache_dtype
=
kv_cache_dtype
,
block_size
=
block_size
,
is_attention_free
=
is_attention_free
,
is_blocksparse
=
is_blocksparse
,
use_v1
=
envs
.
VLLM_USE_V1
,
use_mla
=
use_mla
,
)
...
...
@@ -170,16 +168,9 @@ def _cached_get_attn_backend(
kv_cache_dtype
:
Optional
[
str
],
block_size
:
int
,
is_attention_free
:
bool
,
is_blocksparse
:
bool
=
False
,
use_v1
:
bool
=
False
,
use_mla
:
bool
=
False
,
)
->
type
[
AttentionBackend
]:
if
is_blocksparse
:
logger
.
info
(
"Using BlocksparseFlashAttention backend."
)
from
vllm.attention.backends.blocksparse_attn
import
(
BlocksparseFlashAttentionBackend
)
return
BlocksparseFlashAttentionBackend
# If there are no attention layers (e.g. we are running Mamba),
# use the placeholder NO_ATTENTION
if
is_attention_free
:
...
...
vllm/model_executor/models/phi3_small.py
deleted
100644 → 0
View file @
881e3cbe
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
from
collections.abc
import
Iterable
from
typing
import
Optional
,
Union
import
torch
from
torch
import
nn
from
transformers.configuration_utils
import
PretrainedConfig
from
vllm.attention
import
Attention
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
)
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
QKVParallelLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
)
def load_column_parallel_weight(param: torch.nn.Parameter,
                                loaded_weight: torch.Tensor):
    """Copy this tensor-parallel rank's contiguous row shard into ``param``.

    The full checkpoint weight is split along dim 0 into ``tp`` equal
    shards; the shard at index ``rank`` is copied in place.
    """
    world_size = get_tensor_model_parallel_world_size()
    rank = get_tensor_model_parallel_rank()

    shard_rows = param.size(0)
    assert shard_rows * world_size == loaded_weight.size(0)

    start = rank * shard_rows
    stop = (rank + 1) * shard_rows
    shard = loaded_weight[start:stop]

    assert param.shape == shard.shape
    param.data.copy_(shard)
class HeadMajorQKVParallelLinear(QKVParallelLinear):
    """QKV projection whose weights are loaded as one contiguous row shard.

    Overrides the parent's ``weight_loader`` and delegates to
    ``load_column_parallel_weight``.
    """

    def weight_loader(self, param: torch.nn.Parameter,
                      loaded_weight: torch.Tensor):
        """Copy this rank's row shard of ``loaded_weight`` into ``param``."""
        return load_column_parallel_weight(param, loaded_weight)
class HeadMajorColumnParallelLinear(MergedColumnParallelLinear):
    """Merged column-parallel linear loaded as one contiguous row shard.

    Overrides the parent's ``weight_loader`` and delegates to
    ``load_column_parallel_weight``.
    """

    def weight_loader(self, param: torch.nn.Parameter,
                      loaded_weight: torch.Tensor):
        """Copy this rank's row shard of ``loaded_weight`` into ``param``."""
        return load_column_parallel_weight(param, loaded_weight)
@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
def quick_gelu(x):
    """Sigmoid approximation of GELU: ``x * sigmoid(1.702 * x)``."""
    return x * torch.sigmoid(1.702 * x)
@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
def gegelu(input, limit: Optional[float] = None):
    """Gated quick-GELU over interleaved channels.

    Even channels of the last dim feed the GELU branch, odd channels the
    linear branch; the result is ``quick_gelu(even) * (odd + 1)``.  When
    ``limit`` is given, finite values are clamped first (GELU branch from
    above only, linear branch to ``[-limit, limit]``); infinite values are
    passed through unchanged.
    """
    gelu_part = input[..., ::2]
    linear_part = input[..., 1::2]

    if limit is not None:
        clamped_gelu = gelu_part.clamp(min=None, max=limit)
        gelu_part = torch.where(torch.isinf(gelu_part), gelu_part,
                                clamped_gelu)
        clamped_linear = linear_part.clamp(min=-limit, max=limit)
        linear_part = torch.where(torch.isinf(linear_part), linear_part,
                                  clamped_linear)

    return quick_gelu(gelu_part) * (linear_part + 1)
class Phi3SmallMLP(nn.Module):
    """Feed-forward block: fused up projection -> gegelu -> down projection."""

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        """Create the projections from ``config``.

        Requires ``config.hidden_act == "gegelu"``.
        """
        super().__init__()
        self.config = config
        assert (self.config.hidden_act == "gegelu"
                ), "Only `gegelu` is supported for the 4.7 series of models .."
        self.hidden_size = config.hidden_size
        # NOTE(review): stored but not passed to `gegelu` in `forward`, so
        # no clamping is applied here -- confirm whether that is intended.
        self.gegelu_limit = config.gegelu_limit
        self.intermediate_size = config.intermediate_size

        # A single projection producing 2 * intermediate_size features,
        # which `gegelu` then splits into its two branches.
        self.up_proj = HeadMajorColumnParallelLinear(
            self.hidden_size,
            2 * [self.intermediate_size],
            bias=True,
            quant_config=quant_config,
        )
        self.down_proj = RowParallelLinear(
            self.intermediate_size,
            self.hidden_size,
            bias=True,
            quant_config=quant_config,
        )

    def forward(self, x):
        """Apply up projection, gegelu and down projection to ``x``."""
        # The linear layers return a tuple; only the first item is used.
        gate_up, _ = self.up_proj(x)
        x = gegelu(gate_up)
        x, _ = self.down_proj(x)
        return x
class Phi3SmallSelfAttention(nn.Module):
    """Self-attention layer with rotary embeddings and optional
    block-sparse attention parameters taken from the HF config."""

    def __init__(
        self,
        config: PretrainedConfig,
        layer_idx: int,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        """Set up projections, rotary embedding and the attention backend.

        ``layer_idx`` decides (together with
        ``config.dense_attention_every_n_layers``, if present) whether this
        layer uses dense attention or receives block-sparse parameters.
        """
        super().__init__()
        self.layer_idx = layer_idx
        self.config = config
        self.sparse_block_size = config.blocksparse_block_size
        self.homo_heads = config.blocksparse_homo_head_pattern
        self.local_blocks = config.blocksparse_num_local_blocks
        self.vert_stride = config.blocksparse_vert_stride

        assert (config.blocksparse_block_size ==
                config.blocksparse_triton_kernel_block_size)

        self.hidden_size = config.hidden_size
        # Number of Query Heads
        self.num_heads = config.num_attention_heads

        self.head_dim = self.hidden_size // self.num_heads
        self.tp_size = get_tensor_model_parallel_world_size()
        # Number of total Key Value Heads before tensor parallel
        self.num_key_value_heads = config.num_key_value_heads
        self.num_q_per_kv = self.num_heads // self.num_key_value_heads
        if self.tp_size > 1:
            assert self.num_key_value_heads % self.tp_size == 0
        self.num_kv_heads_per_partition = max(
            1, self.num_key_value_heads // self.tp_size)
        self.num_heads_per_partition = self.num_heads // self.tp_size

        self.max_position_embeddings = config.max_position_embeddings
        self.rope_embedding_base = config.rope_embedding_base
        self.rope_position_scale = config.rope_position_scale
        self.is_causal = True

        # Softmax scale is 1 / norm_factor: muP scaling divides head_dim by
        # the configured multiplier, otherwise the usual sqrt(head_dim).
        norm_factor = None
        if config.mup_use_scaling:
            norm_factor = self.head_dim / config.mup_attn_multiplier
        else:
            norm_factor = math.sqrt(self.head_dim)
        self.scale = 1 / norm_factor

        self.query_key_value = HeadMajorQKVParallelLinear(
            self.hidden_size,
            self.head_dim,
            self.num_heads,
            self.num_key_value_heads,
            bias=True,
            quant_config=quant_config,
        )

        self.dense = RowParallelLinear(self.hidden_size,
                                       self.hidden_size,
                                       bias=True,
                                       quant_config=quant_config)

        if getattr(self.config, "rope_scaling", None) is not None:
            rope_scaling = self.config.rope_scaling
            # NOTE: this mutates config.rope_scaling in place, replacing
            # list values with tuples and filling in a missing "factor".
            for key in rope_scaling:
                if isinstance(rope_scaling[key], list):
                    rope_scaling[key] = tuple(rope_scaling[key])

            if "factor" not in rope_scaling:
                rope_scaling["factor"] = self.rope_position_scale
        else:
            rope_scaling = {
                "rope_type": "linear",
                "factor": self.rope_position_scale,
            }

        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=self.max_position_embeddings,
            base=self.rope_embedding_base,
            rope_scaling=rope_scaling,
        )

        # blocksparse params
        self.blocksparse_block_size = config.blocksparse_block_size
        self.blocksparse_num_local_blocks = config.blocksparse_num_local_blocks
        self.blocksparse_vert_stride = config.blocksparse_vert_stride

        # Dense attention on every n-th layer (1-based) when the config
        # defines `dense_attention_every_n_layers`.
        use_dense_attn = (getattr(self.config,
                                  "dense_attention_every_n_layers", None)
                          and (self.layer_idx + 1) %
                          self.config.dense_attention_every_n_layers == 0)

        bs_params = None
        if not use_dense_attn:
            bs_params = {
                'max_seqlen': self.max_position_embeddings,
                'num_heads': self.num_heads_per_partition,
                "num_kv_heads": self.num_kv_heads_per_partition,
                "block_size": self.sparse_block_size,
                "local_blocks": self.local_blocks,
                "vert_stride": self.vert_stride,
                "homo_head": self.homo_heads
            }

        self.attn = Attention(self.num_heads_per_partition,
                              self.head_dim,
                              self.scale,
                              num_kv_heads=self.num_kv_heads_per_partition,
                              cache_config=cache_config,
                              quant_config=quant_config,
                              blocksparse_params=bs_params,
                              prefix=f"{prefix}.attn")

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Project to q/k/v, apply rotary embedding and attention, and
        return the output of the ``dense`` projection."""
        qkv, _ = self.query_key_value(hidden_states)

        # Regroup the fused projection so that each KV group holds its
        # num_q_per_kv query heads followed by one key and one value head.
        qkv = qkv.view(qkv.shape[:-1] +
                       (-1, (self.num_q_per_kv + 2), self.head_dim))
        q, k, v = qkv.split([self.num_q_per_kv, 1, 1], dim=-2)

        # NOTE: this is required by RotaryEmbed, which indeed does not have to
        # TODO: allow 3D QK for rotary forward
        q = q.reshape(-1, self.head_dim * self.num_heads_per_partition)
        k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)
        v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)

        q, k = self.rotary_emb(positions, q, k)
        attn_output = self.attn(q, k, v)
        output, _ = self.dense(attn_output)

        return output
class Phi3SmallDecoderLayer(nn.Module):
    """Pre-LayerNorm decoder layer: self-attention and MLP, each wrapped in
    a residual connection."""

    def __init__(
        self,
        config: PretrainedConfig,
        layer_idx: int,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        """Create the attention, MLP and the two LayerNorm modules."""
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = Phi3SmallSelfAttention(config,
                                                layer_idx,
                                                cache_config=cache_config,
                                                quant_config=quant_config,
                                                prefix=f"{prefix}.self_attn")
        self.mlp = Phi3SmallMLP(config, quant_config)

        self.input_layernorm = nn.LayerNorm(config.hidden_size,
                                            eps=config.layer_norm_epsilon)
        self.post_attention_layernorm = nn.LayerNorm(
            config.hidden_size, eps=config.layer_norm_epsilon)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Run the attention and MLP sub-blocks with residual additions."""
        # Attention sub-block.
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
        )
        hidden_states = residual + hidden_states

        # MLP sub-block.
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states
class Phi3SmallModel(nn.Module):
    """Token embedding, the stack of decoder layers and the final
    LayerNorm, with pipeline-parallel aware forward and weight loading."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the embedding, this rank's layers and the final norm."""
        super().__init__()

        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        self.config = config
        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
                                                   config.hidden_size)
        self.mup_embedding_multiplier = config.mup_embedding_multiplier
        # The layer index is parsed from the last component of the prefix
        # that `make_layers` passes to the factory.
        self.start_layer, self.end_layer, self.layers = make_layers(
            config.num_hidden_layers,
            lambda prefix: Phi3SmallDecoderLayer(config,
                                                 int(prefix.split('.')[-1]),
                                                 cache_config,
                                                 quant_config,
                                                 prefix=prefix),
            prefix=f"{prefix}.layers")

        self.final_layernorm = nn.LayerNorm(config.hidden_size,
                                            eps=config.layer_norm_epsilon)
        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(["hidden_states"],
                                                    config.hidden_size))

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return the token embeddings for ``input_ids``."""
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.LongTensor,
        positions: Optional[torch.LongTensor],
        intermediate_tensors: Optional[IntermediateTensors],
        inputs_embeds: Optional[torch.Tensor],
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run this pipeline stage.

        The first rank starts from ``inputs_embeds`` (or the embedded
        ``input_ids``) scaled by the muP embedding multiplier when it is
        positive; later ranks start from ``intermediate_tensors``.  Ranks
        other than the last return ``IntermediateTensors``; the last rank
        returns the final-normed hidden states.
        """
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.get_input_embeddings(input_ids)
            if (self.mup_embedding_multiplier is not None
                    and self.mup_embedding_multiplier > 0.0):
                hidden_states = hidden_states * self.mup_embedding_multiplier
        else:
            assert intermediate_tensors
            hidden_states = intermediate_tensors["hidden_states"]
        for layer in self.layers[self.start_layer:self.end_layer]:
            hidden_states = layer(positions, hidden_states)
        if not get_pp_group().is_last_rank:
            return IntermediateTensors({"hidden_states": hidden_states})
        hidden_states = self.final_layernorm(hidden_states)
        return hidden_states

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        """Load ``(name, tensor)`` pairs and return the names loaded.

        Skips ``.bias`` entries that have no matching parameter and entries
        that ``is_pp_missing_parameter`` reports as missing on this rank.
        """
        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            if name.endswith(".bias") and name not in params_dict:
                continue
            if is_pp_missing_parameter(name, self):
                continue
            param = params_dict[name]
            # Parameters may carry their own loader; fall back to the
            # default one otherwise.
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params
class Phi3SmallForCausalLM(nn.Module, SupportsPP):
    """Causal LM wrapper: ``Phi3SmallModel`` plus the LM head and logits
    post-processing (dummy-token masking and muP width scaling)."""

    _tied_weights_keys = ["lm_head.weight"]

    # Checkpoint entries ending in "rotary_emb.inv_freq" are mapped to None.
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_suffix={"rotary_emb.inv_freq": None})

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the backbone, the LM head and the logits processor."""
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        self.config = config
        self.quant_config = quant_config
        self.model = Phi3SmallModel(vllm_config=vllm_config,
                                    prefix=maybe_prefix(prefix, "model"))
        self.vocab_size = config.vocab_size
        self.mup_width_multiplier = config.mup_width_multiplier
        self.lm_head = ParallelLMHead(
            self.vocab_size,
            config.hidden_size,
            org_num_embeddings=config.vocab_size,
            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
            quant_config=quant_config,
        )
        if self.config.tie_word_embeddings:
            self.lm_head.weight = self.model.embed_tokens.weight
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

        # tokens in tiktoken but not used
        if hasattr(config, 'dummy_token_indices'):
            device = self.lm_head.weight.device
            self.register_buffer('dummy_token_indices',
                                 torch.LongTensor(
                                     config.dummy_token_indices).to(device),
                                 persistent=False)
        else:
            self.dummy_token_indices = None

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return the backbone's token embeddings for ``input_ids``."""
        return self.model.get_input_embeddings(input_ids)

    def set_input_embeddings(self, value):
        """Replace the backbone's embedding module."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the LM head."""
        return self.lm_head

    def set_output_embeddings(self, value):
        """Replace the LM head."""
        self.lm_head = value

    def set_decoder(self, decoder):
        """Replace the backbone model."""
        self.model = decoder

    def get_decoder(self):
        """Return the backbone model."""
        return self.model

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        """Compute logits, mask dummy tokens and apply muP width scaling.

        Returns ``None`` unchanged when the logits processor yields no
        logits, instead of failing on the division below.
        """
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        if logits is None:
            return None
        if self.dummy_token_indices is not None:
            # Tokens that exist in the tokenizer but are never used must
            # not be sampled.
            logits.index_fill_(-1, self.dummy_token_indices, -torch.inf)
        logits = logits / self.mup_width_multiplier
        return logits

    def forward(
        self,
        input_ids: torch.LongTensor,
        positions: Optional[torch.LongTensor],
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the backbone and return its output unchanged."""
        output_hidden_states = self.model(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )
        return output_hidden_states

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        """Load weights via ``AutoWeightsLoader``.

        ``lm_head.weight`` is skipped when word embeddings are tied, since
        the head then shares the embedding weight.
        """
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(["lm_head.weight"]
                           if self.config.tie_word_embeddings else None))
        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
vllm/model_executor/models/registry.py
View file @
752c6ade
...
...
@@ -110,7 +110,6 @@ _TEXT_GENERATION_MODELS = {
"PersimmonForCausalLM"
:
(
"persimmon"
,
"PersimmonForCausalLM"
),
"PhiForCausalLM"
:
(
"phi"
,
"PhiForCausalLM"
),
"Phi3ForCausalLM"
:
(
"phi3"
,
"Phi3ForCausalLM"
),
"Phi3SmallForCausalLM"
:
(
"phi3_small"
,
"Phi3SmallForCausalLM"
),
"PhiMoEForCausalLM"
:
(
"phimoe"
,
"PhiMoEForCausalLM"
),
"Phi4FlashForCausalLM"
:
(
"phi4flash"
,
"Phi4FlashForCausalLM"
),
"Plamo2ForCausalLM"
:
(
"plamo2"
,
"Plamo2ForCausalLM"
),
...
...
vllm/platforms/interface.py
View file @
752c6ade
...
...
@@ -57,7 +57,6 @@ class _Backend(enum.Enum):
PALLAS
=
enum
.
auto
()
PALLAS_VLLM_V1
=
enum
.
auto
()
IPEX
=
enum
.
auto
()
BLOCK_SPARSE_FLASH_ATTN
=
enum
.
auto
()
DUAL_CHUNK_FLASH_ATTN
=
enum
.
auto
()
DIFFERENTIAL_FLASH_ATTN
=
enum
.
auto
()
NO_ATTENTION
=
enum
.
auto
()
...
...
vllm/v1/attention/backends/cpu_attn.py
View file @
752c6ade
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
typing
import
Any
,
Optional
from
typing
import
Optional
import
numpy
as
np
import
torch
...
...
@@ -443,7 +443,6 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
...
...
@@ -451,9 +450,6 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
)
->
None
:
if
kv_sharing_target_layer_name
is
not
None
:
raise
NotImplementedError
(
"KV sharing is not supported in V0."
)
if
blocksparse_params
is
not
None
:
raise
ValueError
(
"Torch SPDA does not support block-sparse attention."
)
if
logits_soft_cap
is
not
None
:
logger
.
warning_once
(
"Torch SPDA does not support logits soft cap. "
"Outputs may be slightly off."
)
...
...
vllm/v1/attention/backends/flash_attn.py
View file @
752c6ade
...
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer with FlashAttention."""
from
dataclasses
import
dataclass
from
typing
import
Any
,
ClassVar
,
Optional
from
typing
import
ClassVar
,
Optional
import
numpy
as
np
import
torch
...
...
@@ -349,15 +349,11 @@ class FlashAttentionImpl(AttentionImpl):
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
use_irope
:
bool
=
False
,
)
->
None
:
if
blocksparse_params
is
not
None
:
raise
ValueError
(
"FlashAttention does not support block-sparse attention."
)
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
...
...
vllm/v1/attention/backends/flashinfer.py
View file @
752c6ade
...
...
@@ -4,7 +4,7 @@
from
__future__
import
annotations
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Any
,
Optional
from
typing
import
TYPE_CHECKING
,
Optional
import
torch
from
flashinfer
import
(
BatchDecodeWithPagedKVCacheWrapper
,
...
...
@@ -490,7 +490,6 @@ class FlashInferImpl(AttentionImpl):
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
int
]
=
None
,
...
...
vllm/v1/attention/backends/flex_attention.py
View file @
752c6ade
...
...
@@ -3,7 +3,7 @@
"""Attention layer with FlashAttention."""
from
collections
import
defaultdict
from
dataclasses
import
dataclass
from
typing
import
Any
,
Optional
from
typing
import
Optional
import
torch
from
torch.nn.attention.flex_attention
import
(
BlockMask
,
_mask_mod_signature
,
...
...
@@ -342,15 +342,10 @@ class FlexAttentionImpl(AttentionImpl):
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
)
->
None
:
if
blocksparse_params
is
not
None
:
# TODO we should support this :think
raise
ValueError
(
"FlashAttention does not support block-sparse attention."
)
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
...
...
vllm/v1/attention/backends/mla/common.py
View file @
752c6ade
...
...
@@ -190,7 +190,7 @@ return curr_o @ W_O
import
functools
from
abc
import
abstractmethod
from
dataclasses
import
dataclass
,
field
from
typing
import
TYPE_CHECKING
,
Any
,
Generic
,
Optional
,
TypeVar
,
Union
from
typing
import
TYPE_CHECKING
,
Generic
,
Optional
,
TypeVar
,
Union
import
torch
...
...
@@ -754,7 +754,6 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]],
logits_soft_cap
:
Optional
[
float
],
attn_type
:
str
,
kv_sharing_target_layer_name
:
Optional
[
str
],
...
...
vllm/v1/attention/backends/mla/cutlass_mla.py
View file @
752c6ade
...
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
from
typing
import
Any
,
Optional
from
typing
import
Optional
import
torch
...
...
@@ -74,7 +74,6 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]],
logits_soft_cap
:
Optional
[
float
],
attn_type
:
str
,
kv_sharing_target_layer_name
:
Optional
[
str
],
...
...
@@ -82,17 +81,14 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
**
mla_args
)
->
None
:
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
blocksparse_params
,
logits_soft_cap
,
attn_type
,
logits_soft_cap
,
attn_type
,
kv_sharing_target_layer_name
,
**
mla_args
)
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
blocksparse_params
,
logits_soft_cap
]
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
logits_soft_cap
]
if
any
(
unsupported_features
):
raise
NotImplementedError
(
"CutlassMLAImpl does not support one of the following: "
"alibi_slopes, sliding_window, blocksparse_params, "
"logits_soft_cap"
)
"alibi_slopes, sliding_window, logits_soft_cap"
)
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
vllm/v1/attention/backends/mla/flashmla.py
View file @
752c6ade
...
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
typing
import
Any
,
ClassVar
,
Optional
from
typing
import
ClassVar
,
Optional
import
torch
...
...
@@ -119,7 +119,6 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]],
logits_soft_cap
:
Optional
[
float
],
attn_type
:
str
,
kv_sharing_target_layer_name
:
Optional
[
str
],
...
...
@@ -127,20 +126,17 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
**
mla_args
)
->
None
:
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
blocksparse_params
,
logits_soft_cap
,
attn_type
,
logits_soft_cap
,
attn_type
,
kv_sharing_target_layer_name
,
**
mla_args
)
assert
is_flashmla_supported
(),
\
"FlashMLA is not supported on this device"
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
blocksparse_params
,
logits_soft_cap
]
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
logits_soft_cap
]
if
any
(
unsupported_features
):
raise
NotImplementedError
(
"FlashMLAImpl does not support one of the following: "
"alibi_slopes, sliding_window, blocksparse_params, "
"logits_soft_cap"
)
"alibi_slopes, sliding_window, logits_soft_cap"
)
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
vllm/v1/attention/backends/mla/rocm_aiter_mla.py
View file @
752c6ade
...
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
typing
import
Any
,
ClassVar
,
Optional
from
typing
import
ClassVar
,
Optional
import
torch
...
...
@@ -167,7 +167,6 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]],
logits_soft_cap
:
Optional
[
float
],
attn_type
:
str
,
kv_sharing_target_layer_name
:
Optional
[
str
],
...
...
@@ -175,20 +174,17 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
**
mla_args
)
->
None
:
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
blocksparse_params
,
logits_soft_cap
,
attn_type
,
logits_soft_cap
,
attn_type
,
kv_sharing_target_layer_name
,
**
mla_args
)
assert
(
num_heads
==
16
or
num_heads
==
128
),
(
f
"Aiter MLA only supports 16 or 128 number of heads.
\n
"
f
"Provided
{
num_heads
}
number of heads.
\n
"
"Try adjusting tensor_parallel_size value."
)
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
blocksparse_params
,
logits_soft_cap
]
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
logits_soft_cap
]
if
any
(
unsupported_features
):
raise
NotImplementedError
(
"Aiter MLA does not support one of the following: "
"alibi_slopes, sliding_window, blocksparse_params, "
"logits_soft_cap"
)
"alibi_slopes, sliding_window, logits_soft_cap"
)
from
aiter
import
flash_attn_varlen_func
self
.
flash_attn_varlen_func
=
flash_attn_varlen_func
...
...
vllm/v1/attention/backends/mla/triton_mla.py
View file @
752c6ade
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Any
,
Optional
from
typing
import
Optional
import
torch
...
...
@@ -42,7 +42,6 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]],
logits_soft_cap
:
Optional
[
float
],
attn_type
:
str
,
kv_sharing_target_layer_name
:
Optional
[
str
],
...
...
@@ -50,17 +49,14 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
**
mla_args
)
->
None
:
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
blocksparse_params
,
logits_soft_cap
,
attn_type
,
logits_soft_cap
,
attn_type
,
kv_sharing_target_layer_name
,
**
mla_args
)
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
blocksparse_params
,
logits_soft_cap
]
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
logits_soft_cap
]
if
any
(
unsupported_features
):
raise
NotImplementedError
(
"TritonMLAImpl does not support one of the following: "
"alibi_slopes, sliding_window, blocksparse_params, "
"logits_soft_cap"
)
"alibi_slopes, sliding_window, logits_soft_cap"
)
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
vllm/v1/attention/backends/pallas.py
View file @
752c6ade
...
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
typing
import
Any
,
Optional
from
typing
import
Optional
import
torch
import
torch_xla.core.xla_builder
as
xb
...
...
@@ -132,7 +132,6 @@ class PallasAttentionBackendImpl(AttentionImpl):
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
int
]
=
None
,
...
...
@@ -142,9 +141,6 @@ class PallasAttentionBackendImpl(AttentionImpl):
logger
.
warning_once
(
"Using irope in Pallas is not supported yet, it will fall back "
"to global attention for long context."
)
if
blocksparse_params
is
not
None
:
raise
ValueError
(
"Paged attention Pallas kernel does "
"not support block-sparse attention."
)
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
...
...
@@ -158,8 +154,6 @@ class PallasAttentionBackendImpl(AttentionImpl):
raise
NotImplementedError
(
"Alibi slopes is not supported."
)
if
kv_cache_dtype
!=
"auto"
:
raise
NotImplementedError
(
"FP8 KV cache dtype is not supported."
)
if
blocksparse_params
is
not
None
:
raise
NotImplementedError
(
"Blocksparse is not supported."
)
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
vllm/v1/attention/backends/rocm_aiter_fa.py
View file @
752c6ade
...
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer with AiterFlashAttention."""
from
dataclasses
import
dataclass
from
typing
import
Any
,
Optional
from
typing
import
Optional
import
torch
...
...
@@ -334,15 +334,11 @@ class AiterFlashAttentionImpl(AttentionImpl):
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
int
]
=
None
,
use_irope
:
bool
=
False
,
)
->
None
:
if
blocksparse_params
is
not
None
:
raise
ValueError
(
"AiterFlashAttention does not support block-sparse attention."
)
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
...
...
vllm/v1/attention/backends/triton_attn.py
View file @
752c6ade
...
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer with PagedAttention and Triton prefix prefill."""
from
dataclasses
import
dataclass
from
typing
import
Any
,
ClassVar
,
Optional
from
typing
import
ClassVar
,
Optional
import
torch
...
...
@@ -205,15 +205,11 @@ class TritonAttentionImpl(AttentionImpl):
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
int
]
=
None
,
use_irope
:
bool
=
False
,
)
->
None
:
if
blocksparse_params
is
not
None
:
raise
ValueError
(
"TritonAttention does not support block-sparse attention."
)
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment