Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4fec53cf
Unverified
Commit
4fec53cf
authored
Feb 26, 2026
by
Michael Goin
Committed by
GitHub
Feb 26, 2026
Browse files
[CI] Actually run tests/kernels/quantization/test_block_fp8.py in CI (#34274)
parent
38c498b8
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
7 additions
and
9 deletions
+7
-9
.buildkite/test_areas/kernels.yaml
.buildkite/test_areas/kernels.yaml
+1
-1
tests/kernels/quantization/test_block_fp8.py
tests/kernels/quantization/test_block_fp8.py
+5
-7
vllm/utils/flashinfer.py
vllm/utils/flashinfer.py
+1
-1
No files found.
.buildkite/test_areas/kernels.yaml
View file @
4fec53cf
...
@@ -70,7 +70,7 @@ steps:
...
@@ -70,7 +70,7 @@ steps:
-
tests/kernels/moe/test_batched_deepgemm.py
-
tests/kernels/moe/test_batched_deepgemm.py
-
tests/kernels/attention/test_deepgemm_attention.py
-
tests/kernels/attention/test_deepgemm_attention.py
commands
:
commands
:
-
pytest -v -s kernels/quantization/test_block_fp8.py
-k deep_gemm
-
pytest -v -s kernels/quantization/test_block_fp8.py
-
pytest -v -s kernels/moe/test_deepgemm.py
-
pytest -v -s kernels/moe/test_deepgemm.py
-
pytest -v -s kernels/moe/test_batched_deepgemm.py
-
pytest -v -s kernels/moe/test_batched_deepgemm.py
-
pytest -v -s kernels/attention/test_deepgemm_attention.py
-
pytest -v -s kernels/attention/test_deepgemm_attention.py
...
...
tests/kernels/quantization/test_block_fp8.py
View file @
4fec53cf
...
@@ -37,13 +37,15 @@ vllm_config = VllmConfig()
...
@@ -37,13 +37,15 @@ vllm_config = VllmConfig()
# Test configurations
# Test configurations
DTYPES
=
[
torch
.
bfloat16
]
# [torch.half, torch.bfloat16, torch.float32]
DTYPES
=
[
torch
.
bfloat16
]
# [torch.half, torch.bfloat16, torch.float32]
# Quantization test configs
NUM_TOKENS
=
[
7
,
2050
]
NUM_TOKENS
=
[
7
,
2050
]
D
=
[
512
,
4096
,
5120
,
13824
]
D
=
[
512
,
4096
,
5120
,
13824
]
GROUP_SIZE
=
[
64
,
128
,
512
]
GROUP_SIZE
=
[
64
,
128
,
512
]
COLUMN_MAJOR_SCALES
=
[
True
,
False
]
COLUMN_MAJOR_SCALES
=
[
True
,
False
]
TMA_ALIGNED_SCALES
=
[
True
,
False
]
TMA_ALIGNED_SCALES
=
[
True
,
False
]
M
=
[
1
,
7
,
8
,
83
,
84
,
4096
]
# Matmul test configs
N
=
[
128
,
512
,
7168
,
7748
,
13824
]
M
=
[
1
,
7
,
8
,
83
,
4096
]
N
=
[
128
,
512
,
576
,
7168
,
13824
]
K
=
[
256
,
3884
,
4096
,
13824
,
16384
]
K
=
[
256
,
3884
,
4096
,
13824
,
16384
]
# Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
# Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
# and its hidden size is 7168.
# and its hidden size is 7168.
...
@@ -162,8 +164,6 @@ def test_w8a8_block_fp8_cutlass_matmul():
...
@@ -162,8 +164,6 @@ def test_w8a8_block_fp8_cutlass_matmul():
k_tiles
=
(
K
+
block_k
-
1
)
//
block_k
k_tiles
=
(
K
+
block_k
-
1
)
//
block_k
Bs
=
torch
.
rand
(
n_tiles
,
k_tiles
,
dtype
=
torch
.
float32
)
*
factor_for_scale
Bs
=
torch
.
rand
(
n_tiles
,
k_tiles
,
dtype
=
torch
.
float32
)
*
factor_for_scale
# Hopper requires row-major format for scales
Bs_cutlass
=
Bs
.
T
.
contiguous
()
if
current_platform
.
is_device_capability
(
90
)
else
Bs
A_fp8
,
As
=
per_token_group_quant_fp8
(
A_fp8
,
As
=
per_token_group_quant_fp8
(
A_fp32
,
block_size
[
1
],
column_major_scales
=
False
A_fp32
,
block_size
[
1
],
column_major_scales
=
False
...
@@ -174,9 +174,7 @@ def test_w8a8_block_fp8_cutlass_matmul():
...
@@ -174,9 +174,7 @@ def test_w8a8_block_fp8_cutlass_matmul():
)
)
ref_out
=
native_w8a8_block_matmul
(
A_fp8
,
B_fp8
,
As
,
Bs
,
block_size
,
out_dtype
)
ref_out
=
native_w8a8_block_matmul
(
A_fp8
,
B_fp8
,
As
,
Bs
,
block_size
,
out_dtype
)
out
=
cutlass_scaled_mm
(
out
=
cutlass_scaled_mm
(
A_fp8_cutlass
,
B_fp8
,
As_cutlass
,
Bs
,
block_size
,
out_dtype
)
A_fp8_cutlass
,
B_fp8
,
As_cutlass
,
Bs_cutlass
,
block_size
,
out_dtype
)
rel_diff
=
torch
.
mean
(
rel_diff
=
torch
.
mean
(
torch
.
abs
(
out
.
to
(
torch
.
float32
)
-
ref_out
.
to
(
torch
.
float32
))
torch
.
abs
(
out
.
to
(
torch
.
float32
)
-
ref_out
.
to
(
torch
.
float32
))
...
...
vllm/utils/flashinfer.py
View file @
4fec53cf
...
@@ -734,7 +734,7 @@ def should_use_flashinfer_for_blockscale_fp8_gemm(
...
@@ -734,7 +734,7 @@ def should_use_flashinfer_for_blockscale_fp8_gemm(
# Verify DeepGEMM N/K dims requirements
# Verify DeepGEMM N/K dims requirements
# NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul
# NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul
# test inside kernels/quatization/test_block_fp8.py
# test inside kernels/qua
n
tization/test_block_fp8.py
N_MULTIPLE
=
64
N_MULTIPLE
=
64
K_MULTIPLE
=
128
K_MULTIPLE
=
128
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment