sglang · commit b4ac2b9c (unverified)

[Fix] Fix dual chunk model default behavior (#9032)

Authored by DarkSharpness on Aug 11, 2025; committed via GitHub on Aug 11, 2025
Parent: 83262dcb
Showing 2 changed files with 14 additions and 1 deletion:

  python/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py  (+1, -1)
  python/sglang/srt/model_executor/model_runner.py  (+13, -0)
python/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py

@@ -483,7 +483,7 @@ class DualChunkFlashAttentionBackend(AttentionBackend):
             ).squeeze(1)
         return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)

-    def init_cuda_graph_state(self, max_bs: int):
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
         """Initialize CUDA graph state for the attention backend.

         Args:
             ...
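The only change in this file is the extra max_num_tokens parameter on init_cuda_graph_state. As a rough sketch of why a CUDA-graph path wants both numbers, the toy backend below preallocates per-sequence metadata sized by max_bs and token-level scratch space sized by max_num_tokens. The class name, buffer names, dtypes, and the head dimension of 128 are assumptions made for illustration, not sglang's actual backend internals.

# Toy sketch only: names, shapes, and dtypes are illustrative assumptions,
# not the real DualChunkFlashAttentionBackend implementation.
import torch


class ToyGraphBackend:
    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
        """Preallocate buffers large enough for any CUDA graph replay."""
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # Per-sequence metadata scales with the maximum batch size.
        self.graph_seq_lens = torch.zeros(max_bs, dtype=torch.int32, device=device)
        # Token-level scratch space scales with the maximum token count, which
        # can be larger than the batch size when several tokens per request
        # are captured in one graph.
        self.graph_output = torch.zeros(
            max_num_tokens, 128, dtype=torch.float16, device=device
        )


backend = ToyGraphBackend()
backend.init_cuda_graph_state(max_bs=8, max_num_tokens=2048)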
python/sglang/srt/model_executor/model_runner.py

@@ -388,6 +388,19 @@ class ModelRunner:
         ):
             # override the default attention backend
             server_args.attention_backend = server_args.prefill_attention_backend
+        if (
+            getattr(self.model_config.hf_config, "dual_chunk_attention_config", None)
+            is not None
+        ):
+            if server_args.attention_backend is None:
+                server_args.attention_backend = "dual_chunk_flash_attn"
+                logger.info("Dual chunk attention is turned on by default.")
+            elif server_args.attention_backend != "dual_chunk_flash_attn":
+                raise ValueError(
+                    "Dual chunk attention is enabled, but attention backend is set to "
+                    f"{server_args.attention_backend}. Please set it to "
+                    "'dual_chunk_flash_attn'."
+                )

         if server_args.attention_backend is None:
             """
             Auto select the fastest attention backend.
             ...
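This added block is the substance of the fix: when the model's Hugging Face config carries a dual_chunk_attention_config, the attention backend now defaults to dual_chunk_flash_attn instead of falling through to the generic auto-selection below, and an explicitly mismatched backend fails fast with a clear error. A self-contained sketch of the same decision rule follows; SimpleNamespace stands in for sglang's real ServerArgs and HF config objects, which is an assumption made purely for illustration.

# Standalone sketch of the backend-selection rule added above.
# SimpleNamespace is a stand-in for sglang's ServerArgs / HF config objects.
from types import SimpleNamespace


def resolve_attention_backend(server_args, hf_config):
    if getattr(hf_config, "dual_chunk_attention_config", None) is not None:
        if server_args.attention_backend is None:
            # Dual-chunk models pick the matching backend by default.
            server_args.attention_backend = "dual_chunk_flash_attn"
        elif server_args.attention_backend != "dual_chunk_flash_attn":
            # Any other explicit choice is rejected early.
            raise ValueError(
                "Dual chunk attention is enabled, but attention backend is set to "
                f"{server_args.attention_backend}. Please set it to "
                "'dual_chunk_flash_attn'."
            )
    return server_args.attention_backend


# A dual-chunk model with no backend specified gets the default silently.
args = SimpleNamespace(attention_backend=None)
cfg = SimpleNamespace(dual_chunk_attention_config={})  # any non-None value counts
assert resolve_attention_backend(args, cfg) == "dual_chunk_flash_attn"

# An ordinary model (no dual_chunk_attention_config) is left untouched.
args = SimpleNamespace(attention_backend=None)
cfg = SimpleNamespace()
assert resolve_attention_backend(args, cfg) is None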