sglang / Commits / f4488e9d

Unverified commit f4488e9d, authored Oct 18, 2025 by Minglei Zhu, committed via GitHub on Oct 18, 2025
set default attention backend for deterministic inference (#11801)
Parent: e68a2b5b

Showing 2 changed files with 26 additions and 2 deletions (+26, -2):

python/sglang/srt/server_args.py   +17, -2
python/sglang/srt/utils/common.py  +9, -0
python/sglang/srt/server_args.py
```diff
@@ -44,6 +44,7 @@ from sglang.srt.utils import (
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
+    is_sm120_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
     json_list_type,
```
```diff
@@ -1411,9 +1412,23 @@ class ServerArgs:
             )

         # Check attention backend
-        if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
+        if self.attention_backend is None:
+            # User didn't specify attention backend, fallback based on GPU architecture
+            if is_sm100_supported() or is_sm120_supported():
+                # Blackwell and newer architectures
+                self.attention_backend = "flashinfer"
+            else:
+                # Hopper (SM90) and older architectures
+                self.attention_backend = "fa3"
+            logger.warning(
+                f"Attention backend not specified. Falling back to '{self.attention_backend}' for deterministic inference. "
+                f"You can explicitly set --attention-backend to one of {DETERMINISTIC_ATTENTION_BACKEND_CHOICES}."
+            )
+        elif self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
+            # User explicitly specified an incompatible attention backend
             raise ValueError(
-                f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
+                f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference, "
+                f"but you explicitly specified '{self.attention_backend}'."
             )

         # Currently, only FA3 supports radix cache. Support for other backends is in progress
```
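The new branch distinguishes an unset backend from an explicitly wrong one: when deterministic inference is enabled and --attention-backend is omitted, sglang now picks a default by GPU generation and warns, instead of raising. A minimal sketch of the selection rule in isolation (the backend list and the capability cutoffs are assumptions read off this diff, not the authoritative sglang definitions):

```python
import torch

# Assumed stand-in for sglang's DETERMINISTIC_ATTENTION_BACKEND_CHOICES
DETERMINISTIC_BACKENDS = ("flashinfer", "fa3", "triton")

def resolve_backend(requested=None):
    """Sketch of the fallback rule in the hunk above.

    Blackwell-class GPUs (compute capability major 10 or 12) default to
    "flashinfer"; Hopper (SM90) and older default to "fa3". An explicit
    but unsupported choice raises, mirroring the ValueError in the diff.
    """
    if requested is None:
        major = torch.cuda.get_device_capability()[0]
        return "flashinfer" if major in (10, 12) else "fa3"
    if requested not in DETERMINISTIC_BACKENDS:
        raise ValueError(f"'{requested}' does not support deterministic inference")
    return requested

if torch.cuda.is_available():
    print(resolve_backend())       # e.g. "fa3" on an H100 (SM90)
    print(resolve_backend("fa3"))  # explicit valid choice passes through
```

The practical effect is that enabling deterministic-inference mode no longer forces users to also pass --attention-backend; the warning in the diff records which default was chosen.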
python/sglang/srt/utils/common.py
```diff
@@ -174,6 +174,15 @@ def is_blackwell():
     return torch.cuda.get_device_capability()[0] == 10


+@lru_cache(maxsize=1)
+def is_sm120_supported(device=None) -> bool:
+    if not is_cuda_alike():
+        return False
+    return (torch.cuda.get_device_capability(device)[0] == 12) and (
+        torch.version.cuda >= "12.8"
+    )
+
+
 @lru_cache(maxsize=1)
 def is_sm100_supported(device=None) -> bool:
     if not is_cuda_alike():
```
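The new helper mirrors the existing is_sm100_supported: an lru_cache'd probe of compute capability plus a CUDA toolkit floor. A quick standalone check of the same gates (note that torch.version.cuda is a string, so the >= "12.8" comparison is lexicographic, and it is None on ROCm builds, where short-circuiting on the capability check avoids the comparison):

```python
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    print(f"device capability: sm_{major}{minor}")
    print(f"CUDA toolkit:      {torch.version.cuda}")
    # Same gate as is_sm120_supported above: capability major 12
    # (Blackwell consumer/workstation parts) plus CUDA >= 12.8.
    sm120_ok = (major == 12) and (torch.version.cuda >= "12.8")
    print(f"SM120 supported:   {sm120_ok}")
else:
    print("No CUDA device available")
```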