Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
46 additions
and
5 deletions
+46
-5
vllm/_ipex_ops.py
vllm/_ipex_ops.py
+1
-0
vllm/adapter_commons/layers.py
vllm/adapter_commons/layers.py
+1
-0
vllm/adapter_commons/models.py
vllm/adapter_commons/models.py
+1
-0
vllm/adapter_commons/request.py
vllm/adapter_commons/request.py
+1
-0
vllm/adapter_commons/utils.py
vllm/adapter_commons/utils.py
+1
-0
vllm/adapter_commons/worker_manager.py
vllm/adapter_commons/worker_manager.py
+1
-0
vllm/assets/audio.py
vllm/assets/audio.py
+1
-0
vllm/assets/base.py
vllm/assets/base.py
+1
-0
vllm/assets/image.py
vllm/assets/image.py
+1
-0
vllm/assets/video.py
vllm/assets/video.py
+1
-0
vllm/attention/__init__.py
vllm/attention/__init__.py
+1
-0
vllm/attention/backends/abstract.py
vllm/attention/backends/abstract.py
+2
-0
vllm/attention/backends/blocksparse_attn.py
vllm/attention/backends/blocksparse_attn.py
+4
-0
vllm/attention/backends/cpu_mla.py
vllm/attention/backends/cpu_mla.py
+6
-4
vllm/attention/backends/dual_chunk_flash_attn.py
vllm/attention/backends/dual_chunk_flash_attn.py
+4
-0
vllm/attention/backends/flash_attn.py
vllm/attention/backends/flash_attn.py
+4
-0
vllm/attention/backends/flashinfer.py
vllm/attention/backends/flashinfer.py
+4
-0
vllm/attention/backends/flashmla.py
vllm/attention/backends/flashmla.py
+3
-1
vllm/attention/backends/hpu_attn.py
vllm/attention/backends/hpu_attn.py
+4
-0
vllm/attention/backends/ipex_attn.py
vllm/attention/backends/ipex_attn.py
+4
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
vllm/_ipex_ops.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Optional
...
...
vllm/adapter_commons/layers.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
...
...
vllm/adapter_commons/models.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
abc
import
ABC
,
abstractmethod
from
typing
import
Any
,
Callable
,
Optional
,
TypeVar
...
...
vllm/adapter_commons/request.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
abc
import
ABC
,
abstractmethod
...
...
vllm/adapter_commons/utils.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Any
,
Callable
,
Optional
...
...
vllm/adapter_commons/worker_manager.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
abc
import
ABC
,
abstractmethod
from
typing
import
Any
,
Optional
...
...
vllm/assets/audio.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
pathlib
import
Path
...
...
vllm/assets/base.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
functools
import
lru_cache
from
pathlib
import
Path
...
...
vllm/assets/image.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
typing
import
Literal
...
...
vllm/assets/video.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
functools
import
lru_cache
...
...
vllm/attention/__init__.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.attention.backends.abstract
import
(
AttentionBackend
,
AttentionMetadata
,
...
...
vllm/attention/backends/abstract.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
abc
import
ABC
,
abstractmethod
from
contextlib
import
contextmanager
...
...
@@ -269,6 +270,7 @@ class AttentionImpl(ABC, Generic[T]):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
)
->
None
:
raise
NotImplementedError
...
...
vllm/attention/backends/blocksparse_attn.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
,
field
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Type
...
...
@@ -305,7 +306,10 @@ class BlocksparseFlashAttentionImpl(AttentionImpl):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
)
->
None
:
if
kv_sharing_target_layer_name
is
not
None
:
raise
NotImplementedError
(
"KV sharing is not supported in V0."
)
assert
blocksparse_params
is
not
None
assert
alibi_slopes
is
None
,
ValueError
(
"Alibi not support for blocksparse flash attention."
)
...
...
vllm/attention/backends/cpu_mla.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Type
...
...
@@ -177,7 +178,7 @@ class CPUMLAMetadataBuilder(AttentionMetadataBuilder[CPUMLAMetadata]):
seq_lens_tensor
=
seq_lens_tensor
,
max_query_len
=
max_query_len
,
max_kv_len
=
max_kv_len
,
query_start_loc
=
query_start_loc
,
prefill_
query_start_loc
=
query_start_loc
,
kv_start_loc
=
kv_start_loc
,
max_decode_seq_len
=
input_data
.
max_decode_seq_len
,
num_prefills
=
input_data
.
num_prefills
,
...
...
@@ -205,12 +206,13 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]],
logits_soft_cap
:
Optional
[
float
],
attn_type
:
str
,
kv_sharing_target_layer_name
:
Optional
[
str
],
# MLA Specific Arguments
**
mla_args
)
->
None
:
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
blocksparse_params
,
logits_soft_cap
,
attn_type
,
**
mla_args
)
kv_sharing_target_layer_name
,
**
mla_args
)
unsupported_features
=
[
alibi_slopes
,
sliding_window
,
blocksparse_params
,
logits_soft_cap
...
...
@@ -262,8 +264,8 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]):
key
=
k
,
value
=
v_padded
,
out
=
output
,
seqlen_q
=
prefill_metadata
.
query_start_loc
,
seqlen_k
=
prefill_metadata
.
query_start_loc
,
seqlen_q
=
prefill_metadata
.
prefill_
query_start_loc
,
seqlen_k
=
prefill_metadata
.
prefill_
query_start_loc
,
max_seqlen_q
=
prefill_metadata
.
max_query_len
,
max_seqlen_k
=
prefill_metadata
.
max_query_len
,
pdropout
=
0.0
,
...
...
vllm/attention/backends/dual_chunk_flash_attn.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer with Dual chunk flash attention and sparse attention.
"""
import
math
...
...
@@ -289,9 +290,12 @@ class DualChunkFlashAttentionImpl(FlashAttentionImpl):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
layer_idx
:
int
=
-
1
,
dual_chunk_attention_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
)
->
None
:
if
kv_sharing_target_layer_name
is
not
None
:
raise
NotImplementedError
(
"KV sharing is not supported in V0."
)
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
...
...
vllm/attention/backends/flash_attn.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer with FlashAttention."""
from
collections
import
defaultdict
from
dataclasses
import
dataclass
...
...
@@ -617,8 +618,11 @@ class FlashAttentionImpl(AttentionImpl):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
use_irope
:
bool
=
False
,
)
->
None
:
if
kv_sharing_target_layer_name
is
not
None
:
raise
NotImplementedError
(
"KV sharing is not supported in V0."
)
if
blocksparse_params
is
not
None
:
raise
ValueError
(
"FlashAttention does not support block-sparse attention."
)
...
...
vllm/attention/backends/flashinfer.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
dataclasses
import
os
...
...
@@ -935,8 +936,11 @@ class FlashInferImpl(AttentionImpl):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
use_irope
:
bool
=
False
,
)
->
None
:
if
kv_sharing_target_layer_name
is
not
None
:
raise
NotImplementedError
(
"KV sharing is not supported in V0."
)
if
use_irope
:
logger
.
warning_once
(
"Using irope in FlashInfer is not supported yet, it will fall"
...
...
vllm/attention/backends/flashmla.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
contextlib
import
contextmanager
from
dataclasses
import
dataclass
...
...
@@ -183,12 +184,13 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]],
logits_soft_cap
:
Optional
[
float
],
attn_type
:
str
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
# MLA Specific Arguments
**
mla_args
)
->
None
:
super
().
__init__
(
num_heads
,
head_size
,
scale
,
num_kv_heads
,
alibi_slopes
,
sliding_window
,
kv_cache_dtype
,
blocksparse_params
,
logits_soft_cap
,
attn_type
,
**
mla_args
)
kv_sharing_target_layer_name
,
**
mla_args
)
assert
is_flashmla_supported
(),
\
"FlashMLA is not supported on this device"
...
...
vllm/attention/backends/hpu_attn.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
###############################################################################
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
...
...
@@ -109,9 +110,12 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
max_seq_len
:
int
=
4096
,
attn_type
:
str
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
use_irope
:
bool
=
False
,
)
->
None
:
super
(
AttentionImpl
,
self
).
__init__
()
if
kv_sharing_target_layer_name
is
not
None
:
raise
NotImplementedError
(
"KV sharing is not supported in V0."
)
if
use_irope
:
logger
.
warning_once
(
"Using irope in HPU is not supported yet, it will fall back "
...
...
vllm/attention/backends/ipex_attn.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" Attention layer with torch scaled_dot_product_attention
and PagedAttention."""
from
dataclasses
import
dataclass
...
...
@@ -122,8 +123,11 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
use_irope
:
bool
=
False
,
)
->
None
:
if
kv_sharing_target_layer_name
is
not
None
:
raise
NotImplementedError
(
"KV sharing is not supported in V0."
)
if
use_irope
:
logger
.
warning_once
(
"Using irope in Ipex is not supported yet, it will fall"
...
...
Prev
1
…
37
38
39
40
41
42
43
44
45
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment