Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
752c6ade
Unverified
Commit
752c6ade
authored
Jul 19, 2025
by
Woosuk Kwon
Committed by
GitHub
Jul 19, 2025
Browse files
[V0 Deprecation] Deprecate BlockSparse Attention & Phi3-Small (#21217)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
881e3cbe
Changes
38
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
24 additions
and
1030 deletions
+24
-1030
vllm/attention/ops/blocksparse_attention/interface.py
vllm/attention/ops/blocksparse_attention/interface.py
+0
-239
vllm/attention/ops/blocksparse_attention/utils.py
vllm/attention/ops/blocksparse_attention/utils.py
+0
-246
vllm/attention/selector.py
vllm/attention/selector.py
+0
-9
vllm/model_executor/models/phi3_small.py
vllm/model_executor/models/phi3_small.py
+0
-465
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+0
-1
vllm/platforms/interface.py
vllm/platforms/interface.py
+0
-1
vllm/v1/attention/backends/cpu_attn.py
vllm/v1/attention/backends/cpu_attn.py
+1
-5
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+1
-5
vllm/v1/attention/backends/flashinfer.py
vllm/v1/attention/backends/flashinfer.py
+1
-2
vllm/v1/attention/backends/flex_attention.py
vllm/v1/attention/backends/flex_attention.py
+1
-6
vllm/v1/attention/backends/mla/common.py
vllm/v1/attention/backends/mla/common.py
+1
-2
vllm/v1/attention/backends/mla/cutlass_mla.py
vllm/v1/attention/backends/mla/cutlass_mla.py
+4
-8
vllm/v1/attention/backends/mla/flashmla.py
vllm/v1/attention/backends/mla/flashmla.py
+4
-8
vllm/v1/attention/backends/mla/rocm_aiter_mla.py
vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+4
-8
vllm/v1/attention/backends/mla/triton_mla.py
vllm/v1/attention/backends/mla/triton_mla.py
+4
-8
vllm/v1/attention/backends/pallas.py
vllm/v1/attention/backends/pallas.py
+1
-7
vllm/v1/attention/backends/rocm_aiter_fa.py
vllm/v1/attention/backends/rocm_aiter_fa.py
+1
-5
vllm/v1/attention/backends/triton_attn.py
vllm/v1/attention/backends/triton_attn.py
+1
-5
No files found.
vllm/attention/ops/blocksparse_attention/interface.py
deleted
100644 → 0
View file @
881e3cbe
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
import
torch
from
vllm.platforms
import
current_platform
from
.utils
import
(
dense_to_crow_col
,
get_head_sliding_step
,
get_sparse_attn_mask
)
# True when the current platform reports device capability >= 80.
IS_COMPUTE_8_OR_ABOVE = current_platform.has_device_capability(80)

if IS_COMPUTE_8_OR_ABOVE:
    # The kernel is imported only when the capability check passes.
    from .blocksparse_attention_kernel import blocksparse_flash_attn_varlen_fwd
class LocalStridedBlockSparseAttn(torch.nn.Module):
    """Block-sparse attention using a local + vertical-stride block pattern.

    The sparse layout is computed once in ``__init__`` through
    ``get_sparse_attn_mask``. ``forward`` then dispatches either to the
    Triton varlen kernel (``varlen_attn``) or to a padded
    ``scaled_dot_product_attention`` fallback (``spda``), depending on
    ``use_spda``.
    """

    def __init__(
        self,
        n_heads,
        max_seqlen,
        local_blocks,
        vert_stride,
        block_size,
        device=None,
        dtype=None,
        homo_head=False,
        active_head_range=None,
        q_block_size=None,
        use_spda=None,
    ):
        """Store the pattern parameters and precompute the sparse layout.

        :param n_heads: number of heads passed to ``get_sparse_attn_mask``.
        :param max_seqlen: sequence length the pattern is built for.
        :param local_blocks: local-window size, in blocks.
        :param vert_stride: stride of the vertical blocks.
        :param block_size: block size of the sparse pattern.
        :param device: target device; defaults to the current CUDA device
            on CUDA-like platforms, otherwise ``"cpu"``.
        :param dtype: mask dtype; defaults to bfloat16 when
            ``IS_COMPUTE_8_OR_ABOVE`` or on CPU, otherwise half.
        :param homo_head: forwarded to the mask helpers as ``homo_head``.
        :param active_head_range: optional ``(start, end)`` tuple used to
            slice the layout when ``homo_head`` is False.
        :param q_block_size: optional query block size; must be a multiple
            of ``block_size`` when it differs from it.
        :param use_spda: force the SDPA fallback; when None it is enabled on
            ROCm, on CPU, or when ``IS_COMPUTE_8_OR_ABOVE`` is False.
        :raises ValueError: if ``q_block_size`` is smaller than
            ``block_size``.
        """
        super().__init__()
        if use_spda is None:
            use_spda = (current_platform.is_rocm()
                        or current_platform.is_cpu()
                        or not IS_COMPUTE_8_OR_ABOVE)
        device = device or (torch.cuda.current_device()
                            if current_platform.is_cuda_alike() else "cpu")
        device = torch.device(device)
        # NOTE: vllm CPU backend support BF16 instead of FP16.
        dtype = dtype or (torch.bfloat16 if IS_COMPUTE_8_OR_ABOVE
                          or device.type == "cpu" else torch.half)

        self.n_heads = n_heads
        self.max_seqlen = max_seqlen
        self.local_blocks = local_blocks
        self.vert_stride = vert_stride
        self.use_spda = use_spda
        self.dtype = dtype
        self.device = device
        self.block_size = block_size
        self.q_block_size = q_block_size
        self.homo_head = homo_head
        self.active_head_range = active_head_range
        self.head_sliding_step = get_head_sliding_step(n_heads, vert_stride,
                                                       homo_head)

        sparse_layout, sparse_pattern, self.dense_attn_mask = (
            self.get_attn_pattern(dtype, device))

        if q_block_size is not None and q_block_size != block_size:
            if q_block_size > block_size:
                assert q_block_size % block_size == 0
                # Merge consecutive query blocks and rebuild the CSR layout.
                blocks_to_merge = q_block_size // block_size
                shape = sparse_pattern.shape
                sparse_pattern = sparse_pattern.view(shape[0], -1,
                                                     blocks_to_merge,
                                                     shape[-1])
                sparse_pattern = sparse_pattern.sum(2)
                sparse_layout = dense_to_crow_col(sparse_pattern)
            else:
                raise ValueError(
                    "Does not support smaller q_block_size. It will be slower."
                )

        self.sparse_layout = sparse_layout

    def get_attn_pattern(self, dtype, device):
        """Build the sparse layout, block pattern and optional dense mask.

        The dense mask is only requested when ``self.use_spda`` is set and is
        produced in "bias" form. When ``homo_head`` is False and an
        ``active_head_range`` is given, the layout (and the dense mask, if
        present) is sliced to that head range.

        :return: ``(sparse_layout, sparse_pattern, dense_attn_mask)``.
        """
        sparse_layout, sparse_pattern, dense_attn_mask = get_sparse_attn_mask(
            self.n_heads,
            self.max_seqlen,
            self.max_seqlen,
            dtype,
            device,
            block_size=self.block_size,
            local_blocks=self.local_blocks,
            vert_stride=self.vert_stride,
            homo_head=self.homo_head,
            return_dense=self.use_spda,
            dense_mask_type="bias",
        )
        if (not self.homo_head) and (self.active_head_range is not None):
            assert isinstance(self.active_head_range, tuple)
            assert (len(self.active_head_range) == 2)
            h_start, h_end = self.active_head_range
            sparse_layout = tuple(x[h_start:h_end] for x in sparse_layout)
            if self.use_spda:
                dense_attn_mask = dense_attn_mask[h_start:h_end]
        return sparse_layout, sparse_pattern, dense_attn_mask

    def varlen_attn(self,
                    q,
                    k,
                    v,
                    cu_seqlens_k,
                    cu_seqlens_q=None,
                    sm_scale=None):
        """
        q, k, v: shape = (num_tokens, num_heads_q/kv, head_size).
        Support grouped attention, with `q[:, i*r:(i*r + r)]`
        is correspondent to `k[:, i]`, where `r` is the q/k ratio.
        cu_seqlens_k: shape=(batch_size + 1,),
        indicating segment of samples,
        e.g., `k[cu_seqlens_k[i]:cu_seqlens_k[i+1]]` is k of sample i
        cu_seqlens_q: shape=(batch_size + 1, ).
        Default None: same as cu_seqlens_k for prefilling or
        [0, 1, .., batch_size] for decoding.
        The only case you need to specify is when q is a mix of
        prefilling and decoding.
        sm_scale: softmax scale, default to 1/sqrt(head_size).

        return: tensor of shape as q.
        """
        # Implicit string concatenation keeps source indentation out of the
        # message (a backslash continuation inside the literal would not).
        assert IS_COMPUTE_8_OR_ABOVE, (
            "Requires compute capability of 8 or above (Ampere or newer) to "
            "use Triton kernel.")

        sm_scale = sm_scale or 1.0 / math.sqrt(q.size(-1))

        return blocksparse_flash_attn_varlen_fwd(
            q,
            k,
            v,
            cu_seqlens_k,
            cu_seqlens_q,
            sm_scale,
            self.sparse_layout,
            block_size=self.block_size,
            q_block_size=self.q_block_size,
            max_seqlen=self.max_seqlen,
        )

    @staticmethod
    def transpose_and_pad(x, cu_seqlens, maxlen, head_repeats=1):
        """
        :param x: (total_tokens, n_heads, head_size)
        :return: (batch, n_heads, length, head_size)

        Positions beyond each sequence's length are zero-filled. They must
        not be left uninitialized: stale NaN/Inf values in padded key/value
        slots would turn into NaN in the attention output even when their
        softmax weight is exactly 0.
        """
        x_padded = x.new_zeros(
            len(cu_seqlens) - 1, x.size(1), head_repeats, maxlen, x.size(2))
        cu_seqlens = cu_seqlens.cpu()
        for i, (s, e) in enumerate(zip(cu_seqlens[:-1], cu_seqlens[1:])):
            # (len, heads, dim) -> (heads, 1, len, dim), broadcast over the
            # head_repeats axis by copy_.
            x_padded[i, :, :, :e - s].copy_(x[s:e].transpose(0,
                                                             1).unsqueeze(1))
        return x_padded.flatten(1, 2)

    @staticmethod
    def transpose_and_unpad(x_padded, cu_seqlens):
        """
        :param x_padded: (batch, n_heads, length, head_size)
        :return: (total_tokens, n_heads, head_size)
        """
        cu_seqlens = cu_seqlens.cpu()
        total_n_tokens = cu_seqlens[-1]
        # Every row in [0, total_n_tokens) is overwritten by the loop below
        # as long as the segments in cu_seqlens are contiguous from 0.
        x = x_padded.new_empty(total_n_tokens, x_padded.size(1),
                               x_padded.size(3))
        for i, (s, e) in enumerate(zip(cu_seqlens[:-1], cu_seqlens[1:])):
            x[s:e].copy_(x_padded[i, :, :e - s].transpose(0, 1))
        return x

    def spda(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None):
        """For CPU, V100 or other older GPUs.
        NOTE: torch SPDA supports nested tensor,
        but seems extremely slow. Choose to pad instead.
        """
        assert (cu_seqlens_q is None or
                (cu_seqlens_q
                 == cu_seqlens_k).all()), "Can only handle prompt with SPDA."
        assert q.size(0) == k.size(0), "can only handle prompt with SPDA."

        assert q.size(1) % k.size(1) == 0
        q_k_ratio = q.size(1) // k.size(1)
        sm_scale = sm_scale or 1.0 / math.sqrt(q.size(-1))
        cu_seqlens = cu_seqlens_k.cpu()
        maxlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()

        # Rebuild the cached dense mask if it does not match the query's
        # dtype or device.
        if (self.dense_attn_mask.dtype != q.dtype
                or self.dense_attn_mask.device != q.device):
            _, _, self.dense_attn_mask = self.get_attn_pattern(
                q.dtype, q.device)
        attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen]

        q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1)
        k2, v2 = (self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
                  for x in [k, v])
        spda_output = torch.nn.functional.scaled_dot_product_attention(
            q2, k2, v2, attn_mask=attn_mask, scale=sm_scale)
        return self.transpose_and_unpad(spda_output, cu_seqlens)

    def forward(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None):
        """Dispatch to `varlen_attn` (Ampere or newer) or
        `self.spda`(cpu, Volta, Turing or older)based on
        the type of device used and cuda compute capability.

        q, k, v: shape = (num_tokens, num_heads_q/kv, head_size).
        Support grouped attention, with `q[:, i*r:(i*r + r)]`
        is correspondent to `k[:, i]`, where `r` is the q/k ratio.
        cu_seqlens_k: shape=(batch_size + 1,), indicating segment of samples,
        e.g., `k[cu_seqlens_k[i]:cu_seqlens_k[i+1]]` is k of sample i
        cu_seqlens_q: shape=(batch_size + 1, ).
        Default None: same as cu_seqlens_k for prefilling or
        [0, 1, .., batch_size] for decoding.
        The only case you need to specify
        is when q is a mix of prefilling
        and decoding.
        sm_scale: softmax scale, default to 1/sqrt(head_size).

        return: tensor of shape as q.
        """
        assert k.dim() == 3
        if self.use_spda:
            return self.spda(
                q,
                k,
                v,
                cu_seqlens_k,
                cu_seqlens_q=cu_seqlens_q,
                sm_scale=sm_scale,
            )
        return self.varlen_attn(q,
                                k,
                                v,
                                cu_seqlens_k,
                                cu_seqlens_q=cu_seqlens_q,
                                sm_scale=sm_scale)
vllm/attention/ops/blocksparse_attention/utils.py
deleted
100644 → 0
View file @
881e3cbe
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Helper functions for 3D sparse pattern
# These functions are not optimized and are very inefficient.
# Avoid calling them too frequently, or use a caching mechanism.
from
functools
import
lru_cache
import
numpy
as
np
import
torch
from
vllm.triton_utils
import
triton
class csr_matrix:
    """Simple implementation of CSR matrix conversion without scipy.

    This replaced scipy.sparse.csr_matrix() previously used.

    Attributes set by the constructor:
      - ``shape``: shape of the input array.
      - ``data``: the truthy entries, in row-major order.
      - ``indices``: int64 column index of each entry in ``data``.
      - ``indptr``: int64 row pointers of length ``rows + 1``.
    """

    def __init__(self, input_array):
        """Convert a 2-D NumPy array to CSR components.

        :param input_array: 2-D ``np.ndarray``; truthy entries are kept.
        :raises ValueError: if ``input_array`` is not an ``np.ndarray`` or
            is not 2-D.
        """
        if not isinstance(input_array, np.ndarray):
            raise ValueError("Input must be a NumPy array")

        self.shape = input_array.shape
        # Unpacking enforces a 2-D input (ValueError otherwise).
        rows, _ = self.shape

        # Vectorized scan; np.nonzero walks the mask in row-major order,
        # the same order a row-by-row, column-by-column loop would visit.
        nonzero = input_array.astype(bool)
        indptr = np.zeros(rows + 1, dtype=np.int64)
        np.cumsum(nonzero.sum(axis=1), out=indptr[1:])

        self.data = input_array[nonzero]
        # Explicit int64 so an all-zero input still yields integer indices.
        self.indices = np.nonzero(nonzero)[1].astype(np.int64)
        self.indptr = indptr
def dense_to_crow_col(x: torch.Tensor):
    """Convert a dense 2D/3D tensor into CSR row-pointer/column-index form.

    A 2D input is processed as a single-slice 3D input and the leading axis
    is dropped again before returning. Each slice along the first axis is
    cast to bool, moved to CPU and converted with ``csr_matrix``.

    NOTE: column indices are right-padded with -1 so that all slices share
    the same length.

    :param x: dense tensor with 2 or 3 dimensions.
    :return: tuple ``(crow_indices, col_indices)`` on ``x.device``.
    """
    orig_device = x.device
    n_dims = x.dim()
    assert n_dims in (2, 3)
    slices = x[None] if n_dims == 2 else x

    csr_slices = [csr_matrix(s.bool().cpu().numpy()) for s in slices]
    row_ptrs = torch.vstack([torch.from_numpy(m.indptr) for m in csr_slices])

    col_lists = [torch.from_numpy(m.indices) for m in csr_slices]
    widest = max(len(c) for c in col_lists)
    padded = []
    for c in col_lists:
        filler = c.new_full((widest - c.shape[0], ), -1)
        padded.append(torch.cat([c, filler]))
    col_idx = torch.vstack(padded)

    if n_dims == 2:
        row_ptrs, col_idx = row_ptrs[0], col_idx[0]
    return row_ptrs.to(orig_device), col_idx.to(orig_device)
def crow_col_to_dense(crows: torch.Tensor,
                      cols: torch.Tensor,
                      dtype: torch.dtype = torch.float16):
    """Expand CSR row pointers and column indices into a dense 0/1 tensor.

    1-D inputs are treated as a single slice and the result is returned
    without the leading axis. The last dimension of the output is
    ``cols.max() + 1``.

    :param crows: row pointers, shape ``(rows + 1,)`` or ``(n, rows + 1)``.
    :param cols: column indices, 1-D or 2-D to match ``crows``.
    :param dtype: dtype of the dense output.
    :return: dense tensor on the device of ``crows``.
    """
    single = crows.dim() == 1
    if single:
        crows, cols = crows[None], cols[None]
    target_device = crows.device
    # The element-wise fill is done on CPU ("faster in cpu") and the result
    # is moved back at the end.
    crows_cpu = crows.cpu()
    cols_cpu = cols.cpu()

    n_slices = crows_cpu.shape[0]
    n_rows = crows_cpu.shape[1] - 1
    dense = torch.zeros((n_slices, n_rows, cols_cpu.max() + 1), dtype=dtype)
    for s in range(n_slices):
        row_ptr = crows_cpu[s]
        slice_cols = cols_cpu[s]
        for r in range(n_rows):
            active = slice_cols[row_ptr[r]:row_ptr[r + 1]]
            dense[s, r, active] = 1

    if single:
        dense = dense[0]
    return dense.to(target_device)
def dense_to_ccol_row(x: torch.Tensor):
    """Like ``dense_to_crow_col`` but produces the CSC form.

    Implemented by swapping the last two axes of ``x`` and reusing the CSR
    conversion.
    """
    return dense_to_crow_col(x.transpose(-2, -1))
def ccol_row_to_dense(ccol: torch.Tensor,
                      rows: torch.Tensor,
                      dtype: torch.dtype = torch.float16):
    """Expand CSC column pointers and row indices into a dense tensor.

    The CSR expansion is reused and its last two axes are swapped.
    ``transpose(-2, -1)`` is used so that both the 3-D result and the 2-D
    result produced for 1-D inputs are handled; a fixed
    ``permute(0, 2, 1)`` would raise on the 2-D case.

    :param ccol: column pointers (1-D or 2-D).
    :param rows: row indices matching ``ccol``.
    :param dtype: dtype of the dense output.
    :return: contiguous dense tensor.
    """
    dense_t = crow_col_to_dense(ccol, rows, dtype)
    return dense_t.transpose(-2, -1).contiguous()
def _get_sparse_attn_mask_homo_head(
    q_len: int,
    max_seqlen: int,
    dtype: torch.dtype,
    device: torch.device,
    block_size: int = 128,
    local_blocks: int = 4,
    vert_stride: int = 4,
    return_dense: bool = False,
):
    """Build the sparse attention pattern shared by all heads.

    A block (q, k) is kept when ``q >= k`` and it is either within
    ``local_blocks`` of the diagonal or lies on a column whose
    ``(index + 1)`` is a multiple of ``vert_stride``.

    :return: a tuple of 3:
        - tuple of crow_indices, col_indices representation
            of CSR format, for the last ``cdiv(q_len, block_size)``
            block rows.
        - block dense mask
        - all token dense mask (be aware that it can be
            OOM if it is too big) if `return_dense==True`,
            otherwise, None
    """
    with torch.no_grad():
        n_blocks = triton.cdiv(max_seqlen, block_size)
        block_ids = torch.arange(n_blocks)
        q_idx = block_ids[:, None]
        k_idx = block_ids[None]
        on_stride = (block_ids + 1) % vert_stride == 0
        is_local = q_idx - k_idx < local_blocks
        keep = (q_idx >= k_idx) & (is_local | on_stride)
        block_mask_dense = keep.to(device).to(dtype)

        n_blocks_q = triton.cdiv(q_len, block_size)
        csr_layout = dense_to_crow_col(
            block_mask_dense[-n_blocks_q:].contiguous())

    mask_dense = None
    if return_dense:
        # Expand each block to block_size x block_size tokens, then apply
        # the token-level causal mask to the last q_len rows.
        token_mask = torch.kron(
            block_mask_dense,
            block_mask_dense.new_ones((block_size, block_size)),
        )
        causal = torch.tril(torch.ones(
            max_seqlen, max_seqlen)).type_as(token_mask)[-q_len:]
        mask_dense = token_mask[-q_len:, :max_seqlen] * causal
    return (csr_layout, block_mask_dense, mask_dense)
def binary_mask_to_bias(mask_dense: torch.Tensor):
    """Turn a binary keep-mask into an additive bias.

    Computes ``1 - mask_dense`` into a new tensor and replaces every
    non-zero entry of it with ``-inf``; entries where the mask was 1 become
    0. The input tensor is not modified.
    """
    bias = 1 - mask_dense
    skipped = bias.bool()
    bias.masked_fill_(skipped, -torch.inf)
    return bias
def get_head_sliding_step(n_heads: int,
                          vert_stride: int,
                          homo_head: bool = False):
    """Return the per-head offset applied to the vertical stride pattern.

    :return: 0 when ``homo_head`` is truthy; otherwise
        ``int(vert_stride / n_heads)``, but never less than 1.
    """
    if not homo_head:
        step = int(vert_stride / n_heads)
        return step if step > 1 else 1
    return 0
@lru_cache
def get_sparse_attn_mask(
    n_heads: int,
    q_len: int,
    max_seqlen: int,
    dtype: torch.dtype,
    device: torch.device,
    block_size: int = 64,
    local_blocks: int = 4,
    vert_stride: int = 4,
    homo_head: bool = True,
    return_dense: bool = False,
    dense_mask_type: str = "binary",
):
    """Build the (optionally per-head) block-sparse attention pattern.

    Results are memoized with ``lru_cache``, so all arguments must be
    hashable.

    :param dense_mask_type: "binary" (0 for skip token, 1 for others)
        or "bias" (-inf for skip token, 0 for others)
    :return: a tuple of 3:
        - tuple of crow_indices, col_indices representation
            of CSR format.
        - block dense mask
        - all token dense mask (be aware that it can be OOM if it
            is too big) if `return_dense==True`, otherwise, None
    """
    assert dense_mask_type in ("binary", "bias")
    want_bias = dense_mask_type == "bias"

    if homo_head:
        # One pattern shared by every head: compute it once and expand.
        with torch.no_grad():
            (crow, col), block_mask_dense, mask_dense = (
                _get_sparse_attn_mask_homo_head(
                    q_len,
                    max_seqlen,
                    dtype,
                    device,
                    block_size,
                    local_blocks,
                    vert_stride,
                    return_dense,
                ))
            crow = crow[None].expand(n_heads, crow.shape[0])
            col = col[None].expand(n_heads, col.shape[0])
            if return_dense:
                mask_dense = mask_dense[None].expand(n_heads,
                                                     *mask_dense.shape)
                if want_bias:
                    mask_dense = binary_mask_to_bias(mask_dense)
            return (crow, col), block_mask_dense, mask_dense

    # Per-head pattern: each head's vertical stride is shifted by
    # h * head_sliding_step.
    with torch.no_grad():
        n_blocks = triton.cdiv(max_seqlen, block_size)
        q_idx = torch.arange(n_blocks)[None, :, None]
        k_idx = torch.arange(n_blocks)[None, None]
        step = get_head_sliding_step(n_heads, vert_stride)
        per_head_stride = [
            (torch.arange(n_blocks) + h * step + 1) % vert_stride == 0
            for h in range(n_heads)
        ]
        on_stride = torch.vstack(per_head_stride).unsqueeze(1)
        is_local = q_idx - k_idx < local_blocks
        keep = (q_idx >= k_idx) & (is_local | on_stride)
        block_mask_dense = keep.to(device).to(dtype)
        n_blocks_q = triton.cdiv(q_len, block_size)
        block_rows = block_mask_dense[:, -n_blocks_q:]

    if not return_dense:
        return (
            dense_to_crow_col(block_rows),
            block_mask_dense,
            None,
        )

    token_mask = torch.kron(
        block_mask_dense,
        block_mask_dense.new_ones((block_size, block_size)),
    )
    causal = torch.tril(torch.ones(
        max_seqlen, max_seqlen)).type_as(token_mask)[-q_len:]
    mask_dense = token_mask[..., -q_len:, :max_seqlen] * causal[None]
    if want_bias:
        mask_dense = binary_mask_to_bias(mask_dense)
    return (
        dense_to_crow_col(block_rows),
        block_mask_dense,
        mask_dense,
    )
vllm/attention/selector.py
View file @
752c6ade
...
@@ -143,7 +143,6 @@ def get_attn_backend(
...
@@ -143,7 +143,6 @@ def get_attn_backend(
kv_cache_dtype
:
Optional
[
str
],
kv_cache_dtype
:
Optional
[
str
],
block_size
:
int
,
block_size
:
int
,
is_attention_free
:
bool
,
is_attention_free
:
bool
,
is_blocksparse
:
bool
=
False
,
use_mla
:
bool
=
False
,
use_mla
:
bool
=
False
,
)
->
type
[
AttentionBackend
]:
)
->
type
[
AttentionBackend
]:
"""Selects which attention backend to use and lazily imports it."""
"""Selects which attention backend to use and lazily imports it."""
...
@@ -157,7 +156,6 @@ def get_attn_backend(
...
@@ -157,7 +156,6 @@ def get_attn_backend(
kv_cache_dtype
=
kv_cache_dtype
,
kv_cache_dtype
=
kv_cache_dtype
,
block_size
=
block_size
,
block_size
=
block_size
,
is_attention_free
=
is_attention_free
,
is_attention_free
=
is_attention_free
,
is_blocksparse
=
is_blocksparse
,
use_v1
=
envs
.
VLLM_USE_V1
,
use_v1
=
envs
.
VLLM_USE_V1
,
use_mla
=
use_mla
,
use_mla
=
use_mla
,
)
)
...
@@ -170,16 +168,9 @@ def _cached_get_attn_backend(
...
@@ -170,16 +168,9 @@ def _cached_get_attn_backend(
kv_cache_dtype
:
Optional
[
str
],
kv_cache_dtype
:
Optional
[
str
],
block_size
:
int
,
block_size
:
int
,
is_attention_free
:
bool
,
is_attention_free
:
bool
,
is_blocksparse
:
bool
=
False
,
use_v1
:
bool
=
False
,
use_v1
:
bool
=
False
,
use_mla
:
bool
=
False
,
use_mla
:
bool
=
False
,
)
->
type
[
AttentionBackend
]:
)
->
type
[
AttentionBackend
]:
if
is_blocksparse
:
logger
.
info
(
"Using BlocksparseFlashAttention backend."
)
from
vllm.attention.backends.blocksparse_attn
import
(
BlocksparseFlashAttentionBackend
)
return
BlocksparseFlashAttentionBackend
# If there are no attention layers (e.g. we are running Mamba),
# If there are no attention layers (e.g. we are running Mamba),
# use the placeholder NO_ATTENTION
# use the placeholder NO_ATTENTION
if
is_attention_free
:
if
is_attention_free
:
...
...
vllm/model_executor/models/phi3_small.py
deleted
100644 → 0
View file @
881e3cbe
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
from
collections.abc
import
Iterable
from
typing
import
Optional
,
Union
import
torch
from
torch
import
nn
from
transformers.configuration_utils
import
PretrainedConfig
from
vllm.attention
import
Attention
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
)
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
QKVParallelLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
)
def load_column_parallel_weight(param: torch.nn.Parameter,
                                loaded_weight: torch.Tensor):
    """Copy this rank's contiguous slice of ``loaded_weight`` into ``param``.

    ``loaded_weight`` must have exactly ``tp_world_size`` times as many rows
    (dim 0) as ``param``; rank ``r`` takes rows
    ``[r * n, (r + 1) * n)`` where ``n = param.size(0)``.
    """
    world_size = get_tensor_model_parallel_world_size()
    rank = get_tensor_model_parallel_rank()
    assert param.size(0) * world_size == loaded_weight.size(0)

    first_row = rank * param.size(0)
    last_row = (rank + 1) * param.size(0)
    shard = loaded_weight[first_row:last_row]
    assert param.shape == shard.shape
    param.data.copy_(shard)
class HeadMajorQKVParallelLinear(QKVParallelLinear):
    """``QKVParallelLinear`` whose weights are loaded with
    ``load_column_parallel_weight``."""

    def weight_loader(self, param: torch.nn.Parameter,
                      loaded_weight: torch.Tensor):
        """Load ``loaded_weight`` into ``param`` via the shared helper."""
        outcome = load_column_parallel_weight(param, loaded_weight)
        return outcome
class HeadMajorColumnParallelLinear(MergedColumnParallelLinear):
    """``MergedColumnParallelLinear`` whose weights are loaded with
    ``load_column_parallel_weight``."""

    def weight_loader(self, param: torch.nn.Parameter,
                      loaded_weight: torch.Tensor):
        """Load ``loaded_weight`` into ``param`` via the shared helper."""
        outcome = load_column_parallel_weight(param, loaded_weight)
        return outcome
@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
def quick_gelu(x):
    """Return ``x * sigmoid(1.702 * x)``."""
    gate = torch.sigmoid(1.702 * x)
    return x * gate
@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
def gegelu(input, limit: Optional[float] = None):
    """Gated activation over interleaved channels of the last dimension.

    Even-indexed channels feed ``quick_gelu``; odd-indexed channels form the
    linear gate. The result is ``quick_gelu(a_gelu) * (a_linear + 1)``.

    :param input: tensor whose last dimension interleaves the two halves.
    :param limit: when given, finite values are clamped: the gelu half from
        above to ``limit``, the linear half to ``[-limit, limit]``.
        Infinite values are passed through unchanged.
    """
    gelu_half = input[..., ::2]
    linear_half = input[..., 1::2]
    if limit is not None:
        gelu_clamped = gelu_half.clamp(max=limit)
        gelu_half = torch.where(torch.isinf(gelu_half), gelu_half,
                                gelu_clamped)
        linear_clamped = linear_half.clamp(min=-limit, max=limit)
        linear_half = torch.where(torch.isinf(linear_half), linear_half,
                                  linear_clamped)
    return quick_gelu(gelu_half) * (linear_half + 1)
class Phi3SmallMLP(nn.Module):
    """Feed-forward block: up projection, ``gegelu``, down projection."""

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        """Create the up/down projections from ``config``.

        ``config.hidden_act`` must be ``"gegelu"``.
        """
        super().__init__()
        self.config = config
        assert (self.config.hidden_act == "gegelu"
                ), "Only `gegelu` is supported for the 4.7 series of models .."
        self.intermediate_size = config.intermediate_size
        self.hidden_size = config.hidden_size
        # NOTE(review): gegelu_limit is stored here but forward() calls
        # gegelu() without passing it, so no clamping is applied - confirm
        # whether that is intended.
        self.gegelu_limit = config.gegelu_limit

        # The up projection produces two outputs of intermediate_size each;
        # gegelu() consumes them and returns intermediate_size channels.
        self.up_proj = HeadMajorColumnParallelLinear(
            self.hidden_size,
            2 * [self.intermediate_size],
            bias=True,
            quant_config=quant_config,
        )
        self.down_proj = RowParallelLinear(
            self.intermediate_size,
            self.hidden_size,
            bias=True,
            quant_config=quant_config,
        )

    def forward(self, x):
        """Apply up projection, gegelu and down projection to ``x``."""
        projected, _ = self.up_proj(x)
        activated = gegelu(projected)
        result, _ = self.down_proj(activated)
        return result
class Phi3SmallSelfAttention(nn.Module):
    """Self-attention layer that passes block-sparse parameters to
    ``Attention`` unless the layer is selected for dense attention."""

    def __init__(
        self,
        config: PretrainedConfig,
        layer_idx: int,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        """Read head/rope/blocksparse settings from ``config`` and build the
        QKV projection, output projection, rotary embedding and attention.

        ``layer_idx`` is used (1-based) to decide whether this layer uses
        dense attention, via ``config.dense_attention_every_n_layers``.
        """
        super().__init__()
        self.layer_idx = layer_idx
        self.config = config
        self.sparse_block_size = config.blocksparse_block_size
        self.homo_heads = config.blocksparse_homo_head_pattern
        self.local_blocks = config.blocksparse_num_local_blocks
        self.vert_stride = config.blocksparse_vert_stride

        assert (config.blocksparse_block_size ==
                config.blocksparse_triton_kernel_block_size)

        self.hidden_size = config.hidden_size
        # Number of Query Heads
        self.num_heads = config.num_attention_heads

        self.head_dim = self.hidden_size // self.num_heads
        self.tp_size = get_tensor_model_parallel_world_size()
        # Number of total Key Value Heads before tensor parallel
        self.num_key_value_heads = config.num_key_value_heads
        self.num_q_per_kv = self.num_heads // self.num_key_value_heads
        if self.tp_size > 1:
            assert self.num_key_value_heads % self.tp_size == 0
        self.num_kv_heads_per_partition = max(
            1, self.num_key_value_heads // self.tp_size)
        self.num_heads_per_partition = self.num_heads // self.tp_size

        self.max_position_embeddings = config.max_position_embeddings
        self.rope_embedding_base = config.rope_embedding_base
        self.rope_position_scale = config.rope_position_scale
        self.is_causal = True

        # Softmax scale is 1 / norm_factor: mup_attn_multiplier / head_dim
        # with muP scaling, otherwise 1 / sqrt(head_dim).
        norm_factor = None
        if config.mup_use_scaling:
            norm_factor = self.head_dim / config.mup_attn_multiplier
        else:
            norm_factor = math.sqrt(self.head_dim)
        self.scale = 1 / norm_factor

        self.query_key_value = HeadMajorQKVParallelLinear(
            self.hidden_size,
            self.head_dim,
            self.num_heads,
            self.num_key_value_heads,
            bias=True,
            quant_config=quant_config,
        )

        self.dense = RowParallelLinear(self.hidden_size,
                                       self.hidden_size,
                                       bias=True,
                                       quant_config=quant_config)

        if getattr(self.config, "rope_scaling", None) is not None:
            # NOTE(review): this edits config.rope_scaling in place (lists
            # become tuples, "factor" may be added), so the change is
            # visible to every other user of the same config object.
            rope_scaling = self.config.rope_scaling
            for key in rope_scaling:
                if isinstance(rope_scaling[key], list):
                    rope_scaling[key] = tuple(rope_scaling[key])

            if "factor" not in rope_scaling:
                rope_scaling["factor"] = self.rope_position_scale
        else:
            # No rope_scaling in the config: fall back to linear scaling
            # by rope_position_scale.
            rope_scaling = {
                "rope_type": "linear",
                "factor": self.rope_position_scale,
            }

        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=self.max_position_embeddings,
            base=self.rope_embedding_base,
            rope_scaling=rope_scaling,
        )

        # blocksparse params
        self.blocksparse_block_size = config.blocksparse_block_size
        self.blocksparse_num_local_blocks = config.blocksparse_num_local_blocks
        self.blocksparse_vert_stride = config.blocksparse_vert_stride

        # Dense attention is used on every n-th layer (1-based) when
        # dense_attention_every_n_layers is set to a truthy value.
        use_dense_attn = (getattr(self.config,
                                  "dense_attention_every_n_layers", None)
                          and (self.layer_idx + 1) %
                          self.config.dense_attention_every_n_layers == 0)

        # bs_params stays None for dense layers.
        bs_params = None
        if not use_dense_attn:
            bs_params = {
                'max_seqlen': self.max_position_embeddings,
                'num_heads': self.num_heads_per_partition,
                "num_kv_heads": self.num_kv_heads_per_partition,
                "block_size": self.sparse_block_size,
                "local_blocks": self.local_blocks,
                "vert_stride": self.vert_stride,
                "homo_head": self.homo_heads
            }

        self.attn = Attention(self.num_heads_per_partition,
                              self.head_dim,
                              self.scale,
                              num_kv_heads=self.num_kv_heads_per_partition,
                              cache_config=cache_config,
                              quant_config=quant_config,
                              blocksparse_params=bs_params,
                              prefix=f"{prefix}.attn")

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Project to QKV, apply rotary embedding, attend, and project out.

        Returns the first element of the output projection's result.
        """
        qkv, _ = self.query_key_value(hidden_states)

        # Regroup the last dimension as (groups, num_q_per_kv + 2, head_dim)
        # so each group holds its queries followed by one key and one value.
        qkv = qkv.view(qkv.shape[:-1] +
                       (-1, (self.num_q_per_kv + 2), self.head_dim))
        q, k, v = qkv.split([self.num_q_per_kv, 1, 1], dim=-2)

        # NOTE: this is required by RotaryEmbed, which indeed does not have to
        # TODO: allow 3D QK for rotary forward
        q = q.reshape(-1, self.head_dim * self.num_heads_per_partition)
        k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)
        v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)

        q, k = self.rotary_emb(positions, q, k)
        attn_output = self.attn(q, k, v)
        output, _ = self.dense(attn_output)

        return output
class Phi3SmallDecoderLayer(nn.Module):
    """Pre-LayerNorm decoder layer: attention and MLP, each wrapped in a
    residual connection."""

    def __init__(
        self,
        config: PretrainedConfig,
        layer_idx: int,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        """Build the attention block, the MLP and the two LayerNorms."""
        super().__init__()
        width = config.hidden_size
        ln_eps = config.layer_norm_epsilon

        self.hidden_size = width
        self.self_attn = Phi3SmallSelfAttention(
            config,
            layer_idx,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
        )
        self.mlp = Phi3SmallMLP(config, quant_config)
        self.input_layernorm = nn.LayerNorm(width, eps=ln_eps)
        self.post_attention_layernorm = nn.LayerNorm(width, eps=ln_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Return ``hidden_states`` after the attention and MLP sub-layers."""
        # Attention sub-layer with residual connection.
        attn_out = self.self_attn(
            positions=positions,
            hidden_states=self.input_layernorm(hidden_states),
        )
        after_attn = hidden_states + attn_out

        # MLP sub-layer with residual connection.
        mlp_out = self.mlp(self.post_attention_layernorm(after_attn))
        return after_attn + mlp_out
class Phi3SmallModel(nn.Module):
    """Token embedding, decoder-layer stack and final LayerNorm, with
    pipeline-parallel entry/exit handling in ``forward``."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the embedding, the decoder layers and the final norm from
        ``vllm_config``."""
        super().__init__()

        hf_config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        def build_layer(prefix):
            # The last dotted component of the layer prefix is parsed as
            # the layer index. The parameter keeps the name `prefix` so the
            # factory accepts the same call forms as before.
            layer_idx = int(prefix.split('.')[-1])
            return Phi3SmallDecoderLayer(hf_config,
                                         layer_idx,
                                         cache_config,
                                         quant_config,
                                         prefix=prefix)

        self.config = hf_config
        self.embed_tokens = VocabParallelEmbedding(hf_config.vocab_size,
                                                   hf_config.hidden_size)
        self.mup_embedding_multiplier = hf_config.mup_embedding_multiplier
        self.start_layer, self.end_layer, self.layers = make_layers(
            hf_config.num_hidden_layers,
            build_layer,
            prefix=f"{prefix}.layers")

        self.final_layernorm = nn.LayerNorm(hf_config.hidden_size,
                                            eps=hf_config.layer_norm_epsilon)
        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(["hidden_states"],
                                                    hf_config.hidden_size))

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Look up the embeddings for ``input_ids``."""
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.LongTensor,
        positions: Optional[torch.LongTensor],
        intermediate_tensors: Optional[IntermediateTensors],
        inputs_embeds: Optional[torch.Tensor],
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run this rank's slice of the decoder stack.

        On the first pipeline rank the input comes from ``inputs_embeds``
        or the embedding table (scaled by ``mup_embedding_multiplier`` when
        that is set and positive); on other ranks it comes from
        ``intermediate_tensors``. Non-last ranks return
        ``IntermediateTensors``; the last rank returns the normalized
        hidden states.
        """
        if not get_pp_group().is_first_rank:
            assert intermediate_tensors
            hidden_states = intermediate_tensors["hidden_states"]
        else:
            hidden_states = (inputs_embeds if inputs_embeds is not None else
                             self.get_input_embeddings(input_ids))
            if (self.mup_embedding_multiplier is not None
                    and self.mup_embedding_multiplier > 0.0):
                hidden_states = hidden_states * self.mup_embedding_multiplier

        local_layers = self.layers[self.start_layer:self.end_layer]
        for decoder_layer in local_layers:
            hidden_states = decoder_layer(positions, hidden_states)

        if get_pp_group().is_last_rank:
            return self.final_layernorm(hidden_states)
        return IntermediateTensors({"hidden_states": hidden_states})

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        """Load ``weights`` into matching parameters.

        Unknown ``.bias`` entries and parameters reported missing by
        ``is_pp_missing_parameter`` are skipped.

        :return: names of the parameters that were loaded.
        """
        known_params = dict(self.named_parameters())
        loaded: set[str] = set()
        for name, tensor in weights:
            unknown_bias = name.endswith(".bias") and name not in known_params
            if unknown_bias or is_pp_missing_parameter(name, self):
                continue
            target = known_params[name]
            loader = getattr(target, "weight_loader", default_weight_loader)
            loader(target, tensor)
            loaded.add(name)
        return loaded
class Phi3SmallForCausalLM(nn.Module, SupportsPP):
    """Causal-LM wrapper: ``Phi3SmallModel`` plus LM head and logits
    post-processing."""

    _tied_weights_keys = ["lm_head.weight"]

    # Checkpoint names ending in "rotary_emb.inv_freq" are mapped to None.
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_suffix={"rotary_emb.inv_freq": None})

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the model, LM head and logits processor from
        ``vllm_config``."""
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        self.config = config
        self.quant_config = quant_config
        self.model = Phi3SmallModel(vllm_config=vllm_config,
                                    prefix=maybe_prefix(prefix, "model"))
        self.vocab_size = config.vocab_size
        self.mup_width_multiplier = config.mup_width_multiplier
        self.lm_head = ParallelLMHead(
            self.vocab_size,
            config.hidden_size,
            org_num_embeddings=config.vocab_size,
            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
            quant_config=quant_config,
        )
        # With tied embeddings the LM head shares the embedding weight.
        if self.config.tie_word_embeddings:
            self.lm_head.weight = self.model.embed_tokens.weight
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

        # tokens in tiktoken but not used
        if hasattr(config, 'dummy_token_indices'):
            device = self.lm_head.weight.device
            # Non-persistent buffer: not saved in the state dict.
            self.register_buffer('dummy_token_indices',
                                 torch.LongTensor(
                                     config.dummy_token_indices).to(device),
                                 persistent=False)
        else:
            self.dummy_token_indices = None

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Delegate the embedding lookup to the wrapped model."""
        return self.model.get_input_embeddings(input_ids)

    def set_input_embeddings(self, value):
        """Replace the wrapped model's ``embed_tokens``."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the LM head."""
        return self.lm_head

    def set_output_embeddings(self, value):
        """Replace the LM head."""
        self.lm_head = value

    def set_decoder(self, decoder):
        """Replace the wrapped model."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped model."""
        return self.model

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        """Compute logits, mask dummy tokens with ``-inf`` and divide by
        ``mup_width_multiplier``."""
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        if self.dummy_token_indices is not None and logits is not None:
            # In-place fill along the last dimension.
            logits.index_fill_(-1, self.dummy_token_indices, -torch.inf)
        # NOTE(review): the nesting of this division relative to the `if`
        # above could not be confirmed from the flattened source. If it is
        # unconditional, a None result from logits_processor would raise
        # here - confirm against the original file.
        logits = logits / self.mup_width_multiplier
        return logits

    def forward(
        self,
        input_ids: torch.LongTensor,
        positions: Optional[torch.LongTensor],
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the wrapped model and return its output unchanged."""
        output_hidden_states = self.model(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )
        # Self-assignment: has no effect.
        output_hidden_states = output_hidden_states
        return output_hidden_states

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        """Load weights through ``AutoWeightsLoader``.

        ``lm_head.weight`` is skipped when word embeddings are tied.
        """
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(["lm_head.weight"]
                           if self.config.tie_word_embeddings else None))
        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
vllm/model_executor/models/registry.py
View file @
752c6ade
...
@@ -110,7 +110,6 @@ _TEXT_GENERATION_MODELS = {
...
@@ -110,7 +110,6 @@ _TEXT_GENERATION_MODELS = {
"PersimmonForCausalLM"
:
(
"persimmon"
,
"PersimmonForCausalLM"
),
"PersimmonForCausalLM"
:
(
"persimmon"
,
"PersimmonForCausalLM"
),
"PhiForCausalLM"
:
(
"phi"
,
"PhiForCausalLM"
),
"PhiForCausalLM"
:
(
"phi"
,
"PhiForCausalLM"
),
"Phi3ForCausalLM"
:
(
"phi3"
,
"Phi3ForCausalLM"
),
"Phi3ForCausalLM"
:
(
"phi3"
,
"Phi3ForCausalLM"
),
"Phi3SmallForCausalLM"
:
(
"phi3_small"
,
"Phi3SmallForCausalLM"
),
"PhiMoEForCausalLM"
:
(
"phimoe"
,
"PhiMoEForCausalLM"
),
"PhiMoEForCausalLM"
:
(
"phimoe"
,
"PhiMoEForCausalLM"
),
"Phi4FlashForCausalLM"
:
(
"phi4flash"
,
"Phi4FlashForCausalLM"
),
"Phi4FlashForCausalLM"
:
(
"phi4flash"
,
"Phi4FlashForCausalLM"
),
"Plamo2ForCausalLM"
:
(
"plamo2"
,
"Plamo2ForCausalLM"
),
"Plamo2ForCausalLM"
:
(
"plamo2"
,
"Plamo2ForCausalLM"
),
...
...
vllm/platforms/interface.py
View file @
752c6ade
...
@@ -57,7 +57,6 @@ class _Backend(enum.Enum):
...
@@ -57,7 +57,6 @@ class _Backend(enum.Enum):
PALLAS
=
enum
.
auto
()
PALLAS
=
enum
.
auto
()
PALLAS_VLLM_V1
=
enum
.
auto
()
PALLAS_VLLM_V1
=
enum
.
auto
()
IPEX
=
enum
.
auto
()
IPEX
=
enum
.
auto
()
BLOCK_SPARSE_FLASH_ATTN
=
enum
.
auto
()
DUAL_CHUNK_FLASH_ATTN
=
enum
.
auto
()
DUAL_CHUNK_FLASH_ATTN
=
enum
.
auto
()
DIFFERENTIAL_FLASH_ATTN
=
enum
.
auto
()
DIFFERENTIAL_FLASH_ATTN
=
enum
.
auto
()
NO_ATTENTION
=
enum
.
auto
()
NO_ATTENTION
=
enum
.
auto
()
...
...
vllm/v1/attention/backends/cpu_attn.py
View file @
752c6ade
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Any
,
Optional
from
typing
import
Optional
import
numpy
as
np
import
numpy
as
np
import
torch
import
torch
...
@@ -443,7 +443,6 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
...
@@ -443,7 +443,6 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
alibi_slopes
:
Optional
[
list
[
float
]],
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
attn_type
:
str
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
...
@@ -451,9 +450,6 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
...
@@ -451,9 +450,6 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
)
->
None
:
)
->
None
:
if
kv_sharing_target_layer_name
is
not
None
:
if
kv_sharing_target_layer_name
is
not
None
:
raise
NotImplementedError
(
"KV sharing is not supported in V0."
)
raise
NotImplementedError
(
"KV sharing is not supported in V0."
)
if
blocksparse_params
is
not
None
:
raise
ValueError
(
"Torch SPDA does not support block-sparse attention."
)
if
logits_soft_cap
is
not
None
:
if
logits_soft_cap
is
not
None
:
logger
.
warning_once
(
"Torch SPDA does not support logits soft cap. "
logger
.
warning_once
(
"Torch SPDA does not support logits soft cap. "
"Outputs may be slightly off."
)
"Outputs may be slightly off."
)
...
...
vllm/v1/attention/backends/flash_attn.py
View file @
752c6ade
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer with FlashAttention."""
"""Attention layer with FlashAttention."""
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Any
,
ClassVar
,
Optional
from
typing
import
ClassVar
,
Optional
import
numpy
as
np
import
numpy
as
np
import
torch
import
torch
...
@@ -349,15 +349,11 @@ class FlashAttentionImpl(AttentionImpl):
...
@@ -349,15 +349,11 @@ class FlashAttentionImpl(AttentionImpl):
alibi_slopes
:
Optional
[
list
[
float
]],
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
use_irope
:
bool
=
False
,
use_irope
:
bool
=
False
,
)
->
None
:
)
->
None
:
if
blocksparse_params
is
not
None
:
raise
ValueError
(
"FlashAttention does not support block-sparse attention."
)
self
.
num_heads
=
num_heads
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
self
.
scale
=
float
(
scale
)
...
...
vllm/v1/attention/backends/flashinfer.py
View file @
752c6ade
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
from
__future__
import
annotations
from
__future__
import
annotations
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Any
,
Optional
from
typing
import
TYPE_CHECKING
,
Optional
import
torch
import
torch
from
flashinfer
import
(
BatchDecodeWithPagedKVCacheWrapper
,
from
flashinfer
import
(
BatchDecodeWithPagedKVCacheWrapper
,
...
@@ -490,7 +490,6 @@ class FlashInferImpl(AttentionImpl):
...
@@ -490,7 +490,6 @@ class FlashInferImpl(AttentionImpl):
alibi_slopes
:
Optional
[
list
[
float
]],
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
int
]
=
None
,
kv_sharing_target_layer_name
:
Optional
[
int
]
=
None
,
...
...
vllm/v1/attention/backends/flex_attention.py
View file @
752c6ade
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
"""Attention layer with FlashAttention."""
"""Attention layer with FlashAttention."""
from
collections
import
defaultdict
from
collections
import
defaultdict
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Any
,
Optional
from
typing
import
Optional
import
torch
import
torch
from
torch.nn.attention.flex_attention
import
(
BlockMask
,
_mask_mod_signature
,
from
torch.nn.attention.flex_attention
import
(
BlockMask
,
_mask_mod_signature
,
...
@@ -342,15 +342,10 @@ class FlexAttentionImpl(AttentionImpl):
...
@@ -342,15 +342,10 @@ class FlexAttentionImpl(AttentionImpl):
alibi_slopes
:
Optional
[
list
[
float
]],
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
)
->
None
:
)
->
None
:
if
blocksparse_params
is
not
None
:
# TODO we should support this :think
raise
ValueError
(
"FlashAttention does not support block-sparse attention."
)
self
.
num_heads
=
num_heads
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
self
.
scale
=
float
(
scale
)
...
...
vllm/v1/attention/backends/mla/common.py
View file @
752c6ade
...
@@ -190,7 +190,7 @@ return curr_o @ W_O
...
@@ -190,7 +190,7 @@ return curr_o @ W_O
import
functools
import
functools
from
abc
import
abstractmethod
from
abc
import
abstractmethod
from
dataclasses
import
dataclass
,
field
from
dataclasses
import
dataclass
,
field
from
typing
import
TYPE_CHECKING
,
Any
,
Generic
,
Optional
,
TypeVar
,
Union
from
typing
import
TYPE_CHECKING
,
Generic
,
Optional
,
TypeVar
,
Union
import
torch
import
torch
...
@@ -754,7 +754,6 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
...
@@ -754,7 +754,6 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
alibi_slopes
:
Optional
[
list
[
float
]],
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]],
logits_soft_cap
:
Optional
[
float
],
logits_soft_cap
:
Optional
[
float
],
attn_type
:
str
,
attn_type
:
str
,
kv_sharing_target_layer_name
:
Optional
[
str
],
kv_sharing_target_layer_name
:
Optional
[
str
],
...
...
vllm/v1/attention/backends/mla/cutlass_mla.py
View file @
752c6ade
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
os
from
typing
import
Any
,
Optional
from
typing
import
Optional
import
torch
import
torch
...
@@ -74,7 +74,6 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
...
@@ -74,7 +74,6 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
alibi_slopes
:
Optional
[
list
[
float
]],
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]],
logits_soft_cap
:
Optional
[
float
],
logits_soft_cap
:
Optional
[
float
],
attn_type
:
str
,
attn_type
:
str
,
kv_sharing_target_layer_name
:
Optional
[
str
],
kv_sharing_target_layer_name
:
Optional
[
str
],
...
@@ -82,17 +81,14 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
...
@@ -82,17 +81,14 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
**
mla_args
)
->
None
:
**
mla_args
)
->
None
:
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
blocksparse_params
,
logits_soft_cap
,
attn_type
,
logits_soft_cap
,
attn_type
,
kv_sharing_target_layer_name
,
**
mla_args
)
kv_sharing_target_layer_name
,
**
mla_args
)
unsupported_features
=
[
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
logits_soft_cap
]
alibi_slopes
,
sliding_window
,
blocksparse_params
,
logits_soft_cap
]
if
any
(
unsupported_features
):
if
any
(
unsupported_features
):
raise
NotImplementedError
(
raise
NotImplementedError
(
"CutlassMLAImpl does not support one of the following: "
"CutlassMLAImpl does not support one of the following: "
"alibi_slopes, sliding_window, blocksparse_params, "
"alibi_slopes, sliding_window, logits_soft_cap"
)
"logits_soft_cap"
)
if
attn_type
!=
AttentionType
.
DECODER
:
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
vllm/v1/attention/backends/mla/flashmla.py
View file @
752c6ade
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Any
,
ClassVar
,
Optional
from
typing
import
ClassVar
,
Optional
import
torch
import
torch
...
@@ -119,7 +119,6 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
...
@@ -119,7 +119,6 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
alibi_slopes
:
Optional
[
list
[
float
]],
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]],
logits_soft_cap
:
Optional
[
float
],
logits_soft_cap
:
Optional
[
float
],
attn_type
:
str
,
attn_type
:
str
,
kv_sharing_target_layer_name
:
Optional
[
str
],
kv_sharing_target_layer_name
:
Optional
[
str
],
...
@@ -127,20 +126,17 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
...
@@ -127,20 +126,17 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
**
mla_args
)
->
None
:
**
mla_args
)
->
None
:
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
blocksparse_params
,
logits_soft_cap
,
attn_type
,
logits_soft_cap
,
attn_type
,
kv_sharing_target_layer_name
,
**
mla_args
)
kv_sharing_target_layer_name
,
**
mla_args
)
assert
is_flashmla_supported
(),
\
assert
is_flashmla_supported
(),
\
"FlashMLA is not supported on this device"
"FlashMLA is not supported on this device"
unsupported_features
=
[
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
logits_soft_cap
]
alibi_slopes
,
sliding_window
,
blocksparse_params
,
logits_soft_cap
]
if
any
(
unsupported_features
):
if
any
(
unsupported_features
):
raise
NotImplementedError
(
raise
NotImplementedError
(
"FlashMLAImpl does not support one of the following: "
"FlashMLAImpl does not support one of the following: "
"alibi_slopes, sliding_window, blocksparse_params, "
"alibi_slopes, sliding_window, logits_soft_cap"
)
"logits_soft_cap"
)
if
attn_type
!=
AttentionType
.
DECODER
:
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
vllm/v1/attention/backends/mla/rocm_aiter_mla.py
View file @
752c6ade
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Any
,
ClassVar
,
Optional
from
typing
import
ClassVar
,
Optional
import
torch
import
torch
...
@@ -167,7 +167,6 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
...
@@ -167,7 +167,6 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
alibi_slopes
:
Optional
[
list
[
float
]],
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]],
logits_soft_cap
:
Optional
[
float
],
logits_soft_cap
:
Optional
[
float
],
attn_type
:
str
,
attn_type
:
str
,
kv_sharing_target_layer_name
:
Optional
[
str
],
kv_sharing_target_layer_name
:
Optional
[
str
],
...
@@ -175,20 +174,17 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
...
@@ -175,20 +174,17 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
**
mla_args
)
->
None
:
**
mla_args
)
->
None
:
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
blocksparse_params
,
logits_soft_cap
,
attn_type
,
logits_soft_cap
,
attn_type
,
kv_sharing_target_layer_name
,
**
mla_args
)
kv_sharing_target_layer_name
,
**
mla_args
)
assert
(
num_heads
==
16
or
num_heads
==
128
),
(
assert
(
num_heads
==
16
or
num_heads
==
128
),
(
f
"Aiter MLA only supports 16 or 128 number of heads.
\n
"
f
"Aiter MLA only supports 16 or 128 number of heads.
\n
"
f
"Provided
{
num_heads
}
number of heads.
\n
"
f
"Provided
{
num_heads
}
number of heads.
\n
"
"Try adjusting tensor_parallel_size value."
)
"Try adjusting tensor_parallel_size value."
)
unsupported_features
=
[
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
logits_soft_cap
]
alibi_slopes
,
sliding_window
,
blocksparse_params
,
logits_soft_cap
]
if
any
(
unsupported_features
):
if
any
(
unsupported_features
):
raise
NotImplementedError
(
raise
NotImplementedError
(
"Aiter MLA does not support one of the following: "
"Aiter MLA does not support one of the following: "
"alibi_slopes, sliding_window, blocksparse_params, "
"alibi_slopes, sliding_window, logits_soft_cap"
)
"logits_soft_cap"
)
from
aiter
import
flash_attn_varlen_func
from
aiter
import
flash_attn_varlen_func
self
.
flash_attn_varlen_func
=
flash_attn_varlen_func
self
.
flash_attn_varlen_func
=
flash_attn_varlen_func
...
...
vllm/v1/attention/backends/mla/triton_mla.py
View file @
752c6ade
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Any
,
Optional
from
typing
import
Optional
import
torch
import
torch
...
@@ -42,7 +42,6 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
...
@@ -42,7 +42,6 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
alibi_slopes
:
Optional
[
list
[
float
]],
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]],
logits_soft_cap
:
Optional
[
float
],
logits_soft_cap
:
Optional
[
float
],
attn_type
:
str
,
attn_type
:
str
,
kv_sharing_target_layer_name
:
Optional
[
str
],
kv_sharing_target_layer_name
:
Optional
[
str
],
...
@@ -50,17 +49,14 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
...
@@ -50,17 +49,14 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
**
mla_args
)
->
None
:
**
mla_args
)
->
None
:
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
blocksparse_params
,
logits_soft_cap
,
attn_type
,
logits_soft_cap
,
attn_type
,
kv_sharing_target_layer_name
,
**
mla_args
)
kv_sharing_target_layer_name
,
**
mla_args
)
unsupported_features
=
[
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
logits_soft_cap
]
alibi_slopes
,
sliding_window
,
blocksparse_params
,
logits_soft_cap
]
if
any
(
unsupported_features
):
if
any
(
unsupported_features
):
raise
NotImplementedError
(
raise
NotImplementedError
(
"TritonMLAImpl does not support one of the following: "
"TritonMLAImpl does not support one of the following: "
"alibi_slopes, sliding_window, blocksparse_params, "
"alibi_slopes, sliding_window, logits_soft_cap"
)
"logits_soft_cap"
)
if
attn_type
!=
AttentionType
.
DECODER
:
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
vllm/v1/attention/backends/pallas.py
View file @
752c6ade
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Any
,
Optional
from
typing
import
Optional
import
torch
import
torch
import
torch_xla.core.xla_builder
as
xb
import
torch_xla.core.xla_builder
as
xb
...
@@ -132,7 +132,6 @@ class PallasAttentionBackendImpl(AttentionImpl):
...
@@ -132,7 +132,6 @@ class PallasAttentionBackendImpl(AttentionImpl):
alibi_slopes
:
Optional
[
list
[
float
]],
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
attn_type
:
str
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
int
]
=
None
,
kv_sharing_target_layer_name
:
Optional
[
int
]
=
None
,
...
@@ -142,9 +141,6 @@ class PallasAttentionBackendImpl(AttentionImpl):
...
@@ -142,9 +141,6 @@ class PallasAttentionBackendImpl(AttentionImpl):
logger
.
warning_once
(
logger
.
warning_once
(
"Using irope in Pallas is not supported yet, it will fall back "
"Using irope in Pallas is not supported yet, it will fall back "
"to global attention for long context."
)
"to global attention for long context."
)
if
blocksparse_params
is
not
None
:
raise
ValueError
(
"Paged attention Pallas kernel does "
"not support block-sparse attention."
)
self
.
num_heads
=
num_heads
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
self
.
scale
=
float
(
scale
)
...
@@ -158,8 +154,6 @@ class PallasAttentionBackendImpl(AttentionImpl):
...
@@ -158,8 +154,6 @@ class PallasAttentionBackendImpl(AttentionImpl):
raise
NotImplementedError
(
"Alibi slopes is not supported."
)
raise
NotImplementedError
(
"Alibi slopes is not supported."
)
if
kv_cache_dtype
!=
"auto"
:
if
kv_cache_dtype
!=
"auto"
:
raise
NotImplementedError
(
"FP8 KV cache dtype is not supported."
)
raise
NotImplementedError
(
"FP8 KV cache dtype is not supported."
)
if
blocksparse_params
is
not
None
:
raise
NotImplementedError
(
"Blocksparse is not supported."
)
if
attn_type
!=
AttentionType
.
DECODER
:
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
vllm/v1/attention/backends/rocm_aiter_fa.py
View file @
752c6ade
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer with AiterFlashAttention."""
"""Attention layer with AiterFlashAttention."""
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Any
,
Optional
from
typing
import
Optional
import
torch
import
torch
...
@@ -334,15 +334,11 @@ class AiterFlashAttentionImpl(AttentionImpl):
...
@@ -334,15 +334,11 @@ class AiterFlashAttentionImpl(AttentionImpl):
alibi_slopes
:
Optional
[
list
[
float
]],
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
int
]
=
None
,
kv_sharing_target_layer_name
:
Optional
[
int
]
=
None
,
use_irope
:
bool
=
False
,
use_irope
:
bool
=
False
,
)
->
None
:
)
->
None
:
if
blocksparse_params
is
not
None
:
raise
ValueError
(
"AiterFlashAttention does not support block-sparse attention."
)
self
.
num_heads
=
num_heads
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
self
.
scale
=
float
(
scale
)
...
...
vllm/v1/attention/backends/triton_attn.py
View file @
752c6ade
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer with PagedAttention and Triton prefix prefill."""
"""Attention layer with PagedAttention and Triton prefix prefill."""
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Any
,
ClassVar
,
Optional
from
typing
import
ClassVar
,
Optional
import
torch
import
torch
...
@@ -205,15 +205,11 @@ class TritonAttentionImpl(AttentionImpl):
...
@@ -205,15 +205,11 @@ class TritonAttentionImpl(AttentionImpl):
alibi_slopes
:
Optional
[
list
[
float
]],
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
blocksparse_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
int
]
=
None
,
kv_sharing_target_layer_name
:
Optional
[
int
]
=
None
,
use_irope
:
bool
=
False
,
use_irope
:
bool
=
False
,
)
->
None
:
)
->
None
:
if
blocksparse_params
is
not
None
:
raise
ValueError
(
"TritonAttention does not support block-sparse attention."
)
self
.
num_heads
=
num_heads
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
self
.
scale
=
float
(
scale
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment