Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
60d7beda
Unverified
Commit
60d7beda
authored
Sep 15, 2025
by
Ke Bao
Committed by
GitHub
Sep 14, 2025
Browse files
Add split tile size for Triton attention (#10425)
parent
2f8ba6fe
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
18 additions
and
0 deletions
+18
-0
python/sglang/srt/layers/attention/triton_backend.py
python/sglang/srt/layers/attention/triton_backend.py
+11
-0
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+7
-0
No files found.
python/sglang/srt/layers/attention/triton_backend.py
View file @
60d7beda
...
...
@@ -94,6 +94,11 @@ class TritonAttnBackend(AttentionBackend):
"SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS"
,
"false"
)
self
.
max_kv_splits
=
model_runner
.
server_args
.
triton_attention_num_kv_splits
self
.
split_tile_size
=
model_runner
.
server_args
.
triton_attention_split_tile_size
if
self
.
split_tile_size
is
not
None
:
self
.
max_kv_splits
=
(
self
.
max_context_len
+
self
.
split_tile_size
-
1
)
//
self
.
split_tile_size
# Check arguments
assert
not
(
...
...
@@ -153,6 +158,12 @@ class TritonAttnBackend(AttentionBackend):
num_kv_splits
.
fill_
(
self
.
max_kv_splits
)
return
if
self
.
split_tile_size
is
not
None
:
num_kv_splits
[:]
=
(
seq_lens
+
self
.
split_tile_size
-
1
)
//
self
.
split_tile_size
return
if
num_seq
<
256
:
SCHEDULE_SEQ
=
256
else
:
...
...
python/sglang/srt/server_args.py
View file @
60d7beda
...
...
@@ -362,6 +362,7 @@ class ServerArgs:
enable_p2p_check
:
bool
=
False
triton_attention_reduce_in_fp32
:
bool
=
False
triton_attention_num_kv_splits
:
int
=
8
triton_attention_split_tile_size
:
Optional
[
int
]
=
None
num_continuous_decode_steps
:
int
=
1
delete_ckpt_after_loading
:
bool
=
False
enable_memory_saver
:
bool
=
False
...
...
@@ -2100,6 +2101,12 @@ class ServerArgs:
default
=
ServerArgs
.
triton_attention_num_kv_splits
,
help
=
"The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8."
,
)
parser
.
add_argument
(
"--triton-attention-split-tile-size"
,
type
=
int
,
default
=
ServerArgs
.
triton_attention_split_tile_size
,
help
=
"The size of split KV tile in flash decoding Triton kernel. Used for deterministic inference."
,
)
parser
.
add_argument
(
"--num-continuous-decode-steps"
,
type
=
int
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment