change / sglang / Commits / abe30c35

Commit abe30c35, authored Nov 07, 2025 by lizhigong

    Fix extend errors and precision issues; support radix cache and chunked prefill

Parent: 46da9556
Showing 3 changed files with 12 additions and 5 deletions:

    python/sglang/srt/layers/attention/dcu_mla_backend.py   +10 -3
    python/sglang/srt/managers/schedule_batch.py              +1 -1
    python/sglang/srt/model_executor/model_runner.py          +1 -1
python/sglang/srt/layers/attention/dcu_mla_backend.py

```diff
@@ -432,11 +432,18 @@ class DCUMLABackend(AttentionBackend):
         layer: "RadixAttention",
         forward_batch: ForwardBatch,
         save_kv_cache: bool = True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
         sinks=None,
     ):
-        if (
+        if save_kv_cache:
+            return self.forward_decode(
+                q, k, v, layer, forward_batch, save_kv_cache)
+        if ((
             forward_batch.forward_mode == ForwardMode.EXTEND
             or forward_batch.forward_mode == ForwardMode.DRAFT_EXTEND
-        ):
+        )
+        ):
             # flash_attn does not support fp8; extend cannot run correctly with fp8
             if not self.skip_prefill:
@@ -444,7 +451,7 @@ class DCUMLABackend(AttentionBackend):
             #     q, k, v, layer, forward_batch, save_kv_cache, sinks
             # )
             return self.flashattn_backend.forward_extend(
-                q, k, v, layer, forward_batch, save_kv_cache, sinks
+                q, k, v, layer, forward_batch, save_kv_cache, q_rope, k_rope, sinks
             )
         else:
             raise RuntimeError("skip prefill but use forward_extend")
```
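The substance of this hunk is the new routing: calls with `save_kv_cache` set are handed to `forward_decode`, and the flash-attention extend fallback now receives the MLA rope tensors `q_rope`/`k_rope` instead of dropping them. The sketch below is a hypothetical, stripped-down model of that control flow; the stub class and method bodies are mine, not sglang's, and exist only to make the branching explicit.

```python
# Hypothetical sketch of the dispatch added above -- stub types, not sglang's.
from enum import Enum, auto
from typing import Optional


class ForwardMode(Enum):
    EXTEND = auto()
    DRAFT_EXTEND = auto()
    DECODE = auto()


class MLABackendSketch:
    def __init__(self, skip_prefill: bool = False):
        self.skip_prefill = skip_prefill

    def forward_decode(self, q, k, v, save_kv_cache: bool) -> str:
        return "decode path"

    def flashattn_forward_extend(self, q, k, v,
                                 q_rope: Optional[object],
                                 k_rope: Optional[object]) -> str:
        # The rope components now ride along instead of being discarded.
        return f"extend path (rope passed: {q_rope is not None})"

    def forward_extend(self, q, k, v, mode: ForwardMode,
                       save_kv_cache: bool = True,
                       q_rope: Optional[object] = None,
                       k_rope: Optional[object] = None) -> str:
        if save_kv_cache:
            # New early-out in the commit: cache-writing calls take the
            # decode path instead of the flash-attention extend fallback.
            return self.forward_decode(q, k, v, save_kv_cache)
        if mode in (ForwardMode.EXTEND, ForwardMode.DRAFT_EXTEND):
            if not self.skip_prefill:
                return self.flashattn_forward_extend(q, k, v, q_rope, k_rope)
        raise RuntimeError("skip prefill but use forward_extend")


backend = MLABackendSketch()
print(backend.forward_extend(None, None, None, ForwardMode.EXTEND,
                             save_kv_cache=False,
                             q_rope=object(), k_rope=object()))
```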
python/sglang/srt/managers/schedule_batch.py

```diff
@@ -1618,7 +1618,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         self.seq_lens_cpu = self.seq_lens_cpu[keep_indices]
         self.orig_seq_lens = self.orig_seq_lens[keep_indices_device]
         self.out_cache_loc = None
-        self.seq_lens_sum = self.seq_lens.sum()
+        self.seq_lens_sum = self.seq_lens.sum().item()
         self.output_ids = self.output_ids[keep_indices_device]
         self.return_logprob = any(req.return_logprob for req in self.reqs)
         if self.return_logprob:
```
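The one-line change is easy to miss but meaningful: `torch.Tensor.sum()` returns a 0-dim tensor, so `self.seq_lens_sum` was previously a tensor rather than a Python int. Downstream consumers that expect a plain integer can silently propagate tensor-ness into later arithmetic, and for GPU tensors every later host-side use forces a device sync. A minimal standalone illustration (not sglang code):

```python
import torch

seq_lens = torch.tensor([5, 3, 7])

without_item = seq_lens.sum()      # tensor(15): a 0-dim torch.Tensor
with_item = seq_lens.sum().item()  # 15: a plain Python int

assert isinstance(without_item, torch.Tensor)
assert isinstance(with_item, int)
# Tensor-ness propagates into later arithmetic; the int stays an int.
assert isinstance(without_item + 1, torch.Tensor)
assert isinstance(with_item + 1, int)
```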
python/sglang/srt/model_executor/model_runner.py

```diff
@@ -174,6 +174,7 @@ MLA_ATTENTION_BACKENDS = [
 CHUNKED_PREFIX_CACHE_SUPPORTED_ATTENTION_BACKENDS = [
     "flashinfer",
     "fa3",
+    "dcu_mla",
     "fa4",
     "flashmla",
     "cutlass_mla",
@@ -2238,7 +2239,6 @@ class ModelRunner:
             and self.graph_runner
             and self.graph_runner.can_run(forward_batch)
         )
-
         if can_run_graph:
             ret = self.graph_runner.replay(
                 forward_batch,
```
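Adding "dcu_mla" to CHUNKED_PREFIX_CACHE_SUPPORTED_ATTENTION_BACKENDS is what actually switches chunked prefix caching on for this backend: the list is an allow-list that gating code consults by membership. A minimal sketch of that pattern follows; the check function is hypothetical, as the real gating lives elsewhere in model_runner.py.

```python
# Hypothetical membership check illustrating the allow-list pattern.
CHUNKED_PREFIX_CACHE_SUPPORTED_ATTENTION_BACKENDS = [
    "flashinfer",
    "fa3",
    "dcu_mla",  # newly added: enables chunked prefix cache for the DCU MLA backend
    "fa4",
    "flashmla",
    "cutlass_mla",
]


def supports_chunked_prefix_cache(attention_backend: str) -> bool:
    return attention_backend in CHUNKED_PREFIX_CACHE_SUPPORTED_ATTENTION_BACKENDS


assert supports_chunked_prefix_cache("dcu_mla")
assert not supports_chunked_prefix_cache("triton")
```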