Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
e39628fd
Unverified
Commit
e39628fd
authored
Oct 29, 2025
by
Minglei Zhu
Committed by
GitHub
Oct 29, 2025
Browse files
[2/2] Deepseek deterministic: support deepseek v3 deterministic inference on 8 x H200 (#12095)
parent
bacb3825
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing 3 changed files with 19 additions and 0 deletions (+19 −0)
python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py
...rt/layers/moe/fused_moe_triton/fused_moe_triton_config.py
+14
-0
python/sglang/srt/models/deepseek_v2.py
python/sglang/srt/models/deepseek_v2.py
+3
-0
test/srt/test_fused_moe.py
test/srt/test_fused_moe.py
+2
-0
No files found.
python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py
View file @
e39628fd
...
...
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Optional, Tuple

 import torch
 import triton

+from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import get_device_name, is_hip

 logger = logging.getLogger(__name__)
...
...
@@ -51,6 +52,11 @@ def get_moe_configs(
     kernel on a given batch size bs, the closest batch size in the grid should
     be picked and the associated configuration chosen to invoke the kernel.
     """
+    if get_global_server_args().enable_deterministic_inference:
+        logger.warning(
+            "Deterministic inference is enabled, using default MoE kernel config."
+        )
+        return None
     # Supported Triton versions, should be sorted from the newest to the oldest
     supported_triton_versions = ["3.4.0", "3.3.1", "3.2.0", "3.1.0"]
...
...
@@ -130,6 +136,14 @@ def get_default_config(
     is_marlin: bool,
     block_shape: Optional[List[int]] = None,
 ) -> Dict[str, int]:
+    if get_global_server_args().enable_deterministic_inference:
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+        }
+        return config
     if dtype == "fp8_w8a8":
         if block_shape is None:
             config = {
...
...
python/sglang/srt/models/deepseek_v2.py
View file @
e39628fd
...
...
@@ -515,6 +515,9 @@ class MoEGate(nn.Module):
                 True,  # is_vnni
             )

+        if get_global_server_args().enable_deterministic_inference:
+            return F.linear(hidden_states, self.weight, None)
+
         # NOTE: For some unknown reason, router_gemm seems degrade accept length.
         if (
             _is_cuda
...
test/srt/test_fused_moe.py
View file @
e39628fd
...
...
@@ -193,6 +193,8 @@ class TestFusedMOE(CustomTestCase):
         dtypes = [torch.float16, torch.bfloat16]
         fp8_modes = [False, True]

+        set_global_server_args_for_scheduler(ServerArgs(model_path="dummy"))
+
         # Calculate total number of tests
         total_tests = (
             len(m_values)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment