Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
88faa466
Unverified
Commit
88faa466
authored
Aug 01, 2025
by
Michael Goin
Committed by
GitHub
Aug 01, 2025
Browse files
[CI] Initial tests for SM100 Blackwell runner (#21877)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
881e1af4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
30 additions
and
14 deletions
+30
-14
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+21
-3
tests/compile/test_fusion_all_reduce.py
tests/compile/test_fusion_all_reduce.py
+9
-6
tests/kernels/quantization/test_cutlass_scaled_mm.py
tests/kernels/quantization/test_cutlass_scaled_mm.py
+0
-5
No files found.
.buildkite/test-pipeline.yaml
View file @
88faa466
...
@@ -647,13 +647,31 @@ steps:
...
@@ -647,13 +647,31 @@ steps:
-
label
:
Blackwell Test
-
label
:
Blackwell Test
working_dir
:
"
/vllm-workspace/"
working_dir
:
"
/vllm-workspace/"
gpu
:
b200
gpu
:
b200
optional
:
true
#
optional: true
source_file_dependencies
:
source_file_dependencies
:
-
csrc/
-
csrc/quantization/fp4/
-
vllm/
-
csrc/attention/mla/
-
csrc/quantization/cutlass_w8a8/moe/
-
vllm/model_executor/layers/fused_moe/cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/compilation/fusion.py
commands
:
commands
:
-
nvidia-smi
-
nvidia-smi
-
python3 examples/offline_inference/basic/chat.py
-
python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
-
pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
-
pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py
-
pytest -v -s tests/kernels/test_cutlass_mla_decode.py
# Quantization
-
pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
-
pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
-
pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
-
pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
# Fusion
-
pytest -v -s tests/compile/test_fusion_all_reduce.py
##### 1 GPU test #####
##### 1 GPU test #####
##### multi gpus test #####
##### multi gpus test #####
...
...
tests/compile/test_fusion_all_reduce.py
View file @
88faa466
...
@@ -136,12 +136,15 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
...
@@ -136,12 +136,15 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
@
multi_gpu_test
(
num_gpus
=
2
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"test_model"
,
[
@
pytest
.
mark
.
parametrize
(
TestAllReduceRMSNormModel
,
"test_model"
,
TestAllReduceFusedAddRMSNormModel
,
[
TestAllReduceFusedAddRMSNormStaticQuantFP8Model
,
TestAllReduceRMSNormModel
,
TestAllReduceFusedAddRMSNormStaticQuantFP4Model
,
TestAllReduceFusedAddRMSNormModel
,
])
TestAllReduceFusedAddRMSNormStaticQuantFP8Model
,
# TODO: Enable with torch==2.8.0
# TestAllReduceFusedAddRMSNormStaticQuantFP4Model,
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"seq_len"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"seq_len"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
16
])
...
...
tests/kernels/quantization/test_cutlass_scaled_mm.py
View file @
88faa466
...
@@ -559,8 +559,6 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
...
@@ -559,8 +559,6 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
m_a_scales
=
m_g
if
per_act_token
else
1
m_a_scales
=
m_g
if
per_act_token
else
1
n_b_scales
=
n_g
if
per_out_ch
else
1
n_b_scales
=
n_g
if
per_out_ch
else
1
print
(
"shape:"
,
m_g
,
n_g
,
k_g
)
# Create group-specific A and B (FP8) and output (FP16/FP32)
# Create group-specific A and B (FP8) and output (FP16/FP32)
a_g
=
to_fp8
(
torch
.
randn
((
m_g
,
k_g
),
device
=
device
))
a_g
=
to_fp8
(
torch
.
randn
((
m_g
,
k_g
),
device
=
device
))
b_g
=
to_fp8
(
torch
.
randn
((
n_g
,
k_g
),
device
=
device
).
t
())
b_g
=
to_fp8
(
torch
.
randn
((
n_g
,
k_g
),
device
=
device
).
t
())
...
@@ -639,7 +637,4 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
...
@@ -639,7 +637,4 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
for
g
in
range
(
num_experts
):
for
g
in
range
(
num_experts
):
baseline
=
baseline_tensors
[
g
]
baseline
=
baseline_tensors
[
g
]
c
=
out_tensors_stacked
[
expert_offsets
[
g
]:
expert_offsets
[
g
+
1
]]
c
=
out_tensors_stacked
[
expert_offsets
[
g
]:
expert_offsets
[
g
+
1
]]
print
(
baseline
)
print
(
c
)
print
(
"*"
)
torch
.
testing
.
assert_close
(
c
,
baseline
,
rtol
=
1e-2
,
atol
=
5e-4
)
torch
.
testing
.
assert_close
(
c
,
baseline
,
rtol
=
1e-2
,
atol
=
5e-4
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment