sglang commit 2f8ba6fe (Unverified)
Authored by Cheng Wan on Sep 14, 2025; committed by GitHub on Sep 14, 2025
[Fix] MoE: fix w8a8_fp8 MoE and add tests to cover this code path (#10429)
Parent: 7ce6c10e

Showing 2 changed files with 44 additions and 8 deletions (+44, -8):

    python/sglang/srt/layers/quantization/w8a8_fp8.py    +1  -1
    test/srt/quant/test_w8a8_quantization.py             +43 -7
python/sglang/srt/layers/quantization/w8a8_fp8.py @ 2f8ba6fe
@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional
 import torch
 from torch.nn.parameter import Parameter
 
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
 from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
 from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter
 from sglang.srt.layers.quantization.base_config import (
@@ -27,7 +28,6 @@ from sglang.srt.layers.quantization.fp8_utils import (
 from sglang.srt.utils import set_weight_attrs
 
 if TYPE_CHECKING:
-    from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
     from sglang.srt.layers.moe.token_dispatcher import (
         CombineInput,
         StandardDispatchOutput,
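The one-line change above moves the MoeRunner imports out of the `if TYPE_CHECKING:` block and into module scope, which suggests the w8a8_fp8 MoE path references these names at runtime, not only in annotations. Names imported only under TYPE_CHECKING are visible to static type checkers but are never bound when the module actually runs. A minimal, self-contained sketch of that failure mode, using a stand-in class rather than the real sglang modules:

# Minimal sketch of the failure mode, not sglang code: MoeRunnerStandIn is a
# placeholder for a class that is both type-annotated and constructed at runtime.
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers (mypy, pyright), never at runtime.
    from collections import OrderedDict as MoeRunnerStandIn


def annotate_only(runner: "MoeRunnerStandIn") -> None:
    # Fine: the string annotation is not evaluated, so a TYPE_CHECKING import suffices.
    pass


def construct_at_runtime():
    # NameError: the import above was skipped at runtime, so the name is unbound.
    return MoeRunnerStandIn()


if __name__ == "__main__":
    try:
        construct_at_runtime()
    except NameError as exc:
        print(f"Runtime construction fails: {exc}")
    # Moving the import out of the TYPE_CHECKING block, as the diff does for
    # MoeRunner, MoeRunnerBackend, and MoeRunnerConfig, resolves this.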
test/srt/quant/test_w8a8_quantization.py @ 2f8ba6fe
@@ -14,23 +14,39 @@ from sglang.test.test_utils import (
 )
 
 
-class TestW8A8(CustomTestCase):
+class BaseW8A8Test(CustomTestCase):
+    model: str = None
+    quantization: str = None
+    gsm8k_accuracy_threshold: float = None
+    throughput_threshold: float = None
+
     @classmethod
     def setUpClass(cls):
-        cls.model = "neuralmagic/Meta-Llama-3-8B-Instruct-quantized.w8a8"
+        if cls is BaseW8A8Test:
+            raise unittest.SkipTest("Skip base test class")
+
         cls.base_url = DEFAULT_URL_FOR_TEST
+
+        other_args = []
+        if cls.quantization:
+            other_args.extend(["--quantization", cls.quantization])
+
         cls.process = popen_launch_server(
             cls.model,
             cls.base_url,
             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--quantization", "w8a8_int8"],
+            other_args=other_args,
         )
 
     @classmethod
     def tearDownClass(cls):
+        if cls is BaseW8A8Test:
+            return
         kill_process_tree(cls.process.pid)
 
     def test_gsm8k(self):
+        if self.gsm8k_accuracy_threshold is None:
+            self.skipTest("gsm8k_accuracy_threshold not set for this test")
         args = SimpleNamespace(
             num_shots=5,
             data_path=None,
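This hunk turns the single-model TestW8A8 into an abstract BaseW8A8Test that subclasses parameterize with model, quantization, and accuracy/throughput thresholds; the base class skips itself in setUpClass so unittest never launches a server for it. A stripped-down sketch of the same pattern, with the sglang server launch and gsm8k evaluation replaced by stubs so it runs standalone:

# Stripped-down sketch of the base-class parameterization used above; the server
# launch and gsm8k eval are stubbed, and the class names are illustrative only.
import unittest


class BaseQuantTest(unittest.TestCase):
    quantization: str = None
    accuracy_threshold: float = None

    @classmethod
    def setUpClass(cls):
        # The base class carries no configuration, so it must not run as a test.
        if cls is BaseQuantTest:
            raise unittest.SkipTest("Skip base test class")
        # A real test would launch the server here, passing
        # ["--quantization", cls.quantization] in other_args.

    def test_accuracy(self):
        if self.accuracy_threshold is None:
            self.skipTest("accuracy_threshold not set for this test")
        measured = 0.90  # stand-in for a real run_eval() result
        self.assertGreater(measured, self.accuracy_threshold)


class TestInt8Stub(BaseQuantTest):
    quantization = "w8a8_int8"
    accuracy_threshold = 0.69


class TestFp8Stub(BaseQuantTest):
    quantization = "w8a8_fp8"
    accuracy_threshold = 0.69


if __name__ == "__main__":
    unittest.main()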
@@ -42,8 +58,7 @@ class TestW8A8(CustomTestCase):
         )
         metrics = run_eval(args)
         print(metrics)
-        self.assertGreater(metrics["accuracy"], 0.69)
+        self.assertGreater(metrics["accuracy"], self.gsm8k_accuracy_threshold)
 
     def run_decode(self, max_new_tokens):
         response = requests.post(
@@ -60,15 +75,36 @@ class TestW8A8(CustomTestCase):
         return response.json()
 
     def test_throughput(self):
-        max_tokens = 256
+        max_tokens = 256
 
         tic = time.perf_counter()
         res = self.run_decode(max_tokens)
         tok = time.perf_counter()
         print(res["text"])
         throughput = max_tokens / (tok - tic)
         print(f"Throughput: {throughput} tokens/s")
-        assert throughput >= 140
+        self.assertGreaterEqual(throughput, self.throughput_threshold)
+
+
+class TestW8A8Int8(BaseW8A8Test):
+    model = "neuralmagic/Meta-Llama-3-8B-Instruct-quantized.w8a8"
+    quantization = "w8a8_int8"
+    gsm8k_accuracy_threshold = 0.69
+    throughput_threshold = 200
+
+
+class TestW8A8Fp8(BaseW8A8Test):
+    model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
+    quantization = "w8a8_fp8"
+    gsm8k_accuracy_threshold = 0.69
+    throughput_threshold = 200
+
+
+class TestW8A8Fp8MoE(BaseW8A8Test):
+    model = "RedHatAI/Qwen3-30B-A3B-FP8-dynamic"
+    quantization = "w8a8_fp8"
+    gsm8k_accuracy_threshold = 0.88
+    throughput_threshold = 180
+
 
 if __name__ == "__main__":
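test_throughput times a fixed 256-token decode against the wall clock and compares tokens per second with the subclass's throughput_threshold (200 for the dense checkpoints, 180 for the Qwen3-30B-A3B MoE model). A minimal sketch of that measurement with the HTTP decode call stubbed out; the real test posts to the launched sglang server via run_decode:

# Minimal sketch of the throughput check; run_decode_stub stands in for the
# HTTP request the real test sends to the launched sglang server.
import time


def run_decode_stub(max_new_tokens: int) -> dict:
    time.sleep(0.05)  # stand-in for the server round trip
    return {"text": "x" * max_new_tokens}


def measure_throughput(max_tokens: int = 256) -> float:
    tic = time.perf_counter()
    run_decode_stub(max_tokens)
    tok = time.perf_counter()
    # Tokens per wall-clock second, the value compared against throughput_threshold.
    return max_tokens / (tok - tic)


if __name__ == "__main__":
    print(f"Throughput: {measure_throughput():.1f} tokens/s")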