Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
1801cd19
"docs/basic_usage/deepseek.md" did not exist on "4f077c01b8cca17993df1c2c77285dce176742c3"
Unverified
Commit
1801cd19
authored
Oct 23, 2025
by
narutolhy
Committed by
GitHub
Oct 24, 2025
Browse files
support more model in piecewise cuda graph (#11745)
parent
ffc722a6
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
25 additions
and
5 deletions
+25
-5
python/sglang/srt/layers/radix_attention.py
python/sglang/srt/layers/radix_attention.py
+5
-2
python/sglang/srt/model_executor/piecewise_cuda_graph_runner.py
.../sglang/srt/model_executor/piecewise_cuda_graph_runner.py
+8
-3
test/srt/test_piecewise_cuda_graph.py
test/srt/test_piecewise_cuda_graph.py
+12
-0
No files found.
python/sglang/srt/layers/radix_attention.py
View file @
1801cd19
...
...
@@ -142,8 +142,11 @@ def unified_attention_with_output(
ret
=
forward_batch
.
attn_backend
.
forward
(
query
,
key
,
value
,
attention_layer
,
forward_batch
,
save_kv_cache
)
assert
output
.
shape
==
ret
.
shape
output
.
copy_
(
ret
)
assert
(
output
.
numel
()
==
ret
.
numel
()
),
f
"Output tensor element mismatch:
{
output
.
numel
()
}
!=
{
ret
.
numel
()
}
"
output
.
view
(
ret
.
shape
).
copy_
(
ret
)
return
...
...
python/sglang/srt/model_executor/piecewise_cuda_graph_runner.py
View file @
1801cd19
...
...
@@ -262,8 +262,13 @@ class PiecewiseCudaGraphRunner:
def
can_run
(
self
,
forward_batch
:
ForwardBatch
):
num_tokens
=
len
(
forward_batch
.
input_ids
)
# TODO(yuwei): support return logprob
# TODO(yuwei): support return
input_ids'
logprob
if
forward_batch
.
return_logprob
:
for
start_len
,
seq_len
in
zip
(
forward_batch
.
extend_logprob_start_lens_cpu
,
forward_batch
.
extend_seq_lens_cpu
,
):
if
start_len
is
not
None
and
start_len
<
seq_len
:
return
False
if
num_tokens
<=
self
.
max_num_tokens
:
return
True
...
...
@@ -438,7 +443,7 @@ class PiecewiseCudaGraphRunner:
out_cache_loc
=
out_cache_loc
,
seq_lens_sum
=
forward_batch
.
seq_lens_sum
,
encoder_lens
=
forward_batch
.
encoder_lens
,
return_logprob
=
forward_batch
.
return_logprob
,
return_logprob
=
False
,
extend_seq_lens
=
forward_batch
.
extend_seq_lens
,
extend_prefix_lens
=
forward_batch
.
extend_prefix_lens
,
extend_start_loc
=
forward_batch
.
extend_start_loc
,
...
...
test/srt/test_piecewise_cuda_graph.py
View file @
1801cd19
...
...
@@ -44,6 +44,18 @@ class TestPiecewiseCudaGraphCorrectness(CustomTestCase):
metrics
=
run_eval
(
args
)
self
.
assertGreaterEqual
(
metrics
[
"score"
],
0.235
)
def
test_mmlu
(
self
):
args
=
SimpleNamespace
(
base_url
=
self
.
base_url
,
model
=
self
.
model
,
eval_name
=
"mmlu"
,
num_examples
=
64
,
num_threads
=
32
,
)
metrics
=
run_eval
(
args
)
self
.
assertGreaterEqual
(
metrics
[
"score"
],
0.65
)
class
TestPiecewiseCudaGraphBenchmark
(
CustomTestCase
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment