Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
63c08894
Unverified
Commit
63c08894
authored
Feb 01, 2026
by
Roy Wang
Committed by
GitHub
Jan 31, 2026
Browse files
[Misc] Fix flashinfer related tests (#33462)
Signed-off-by:
esmeetu
<
jasonailu87@gmail.com
>
parent
1e86c802
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
9 additions
and
8 deletions
+9
-8
tests/kernels/moe/test_moe.py
tests/kernels/moe/test_moe.py
+1
-1
tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
...s/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+1
-1
tests/kernels/quantization/test_fp8_quant.py
tests/kernels/quantization/test_fp8_quant.py
+2
-2
vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
+4
-3
vllm/utils/flashinfer.py
vllm/utils/flashinfer.py
+1
-1
No files found.
tests/kernels/moe/test_moe.py
View file @
63c08894
...
...
@@ -412,7 +412,7 @@ def test_naive_block_assignment_moe(
monkeypatch
,
workspace_init
,
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
str
(
chunk_size
))
...
...
tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
View file @
63c08894
...
...
@@ -74,7 +74,7 @@ def get_ref_results(
@
pytest
.
mark
.
parametrize
(
"shape"
,
SHAPES
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"cutlass"
,
"trtllm"
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"cutlass"
,
"cudnn"
,
"trtllm"
])
@
pytest
.
mark
.
parametrize
(
"autotune"
,
[
False
,
True
])
@
torch
.
inference_mode
()
def
test_flashinfer_nvfp4_gemm
(
...
...
tests/kernels/quantization/test_fp8_quant.py
View file @
63c08894
...
...
@@ -174,7 +174,7 @@ def test_static_fp8_quant_group_2d(
f
"group_shape (
{
group_shape
[
0
]
}
,
{
group_shape
[
1
]
}
)"
)
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
ref_out
,
scale
=
scaled_quantize
(
...
...
@@ -202,7 +202,7 @@ def test_static_fp8_quant_1d_scale(
group_shape
:
tuple
[
int
,
int
],
)
->
None
:
"""Test static FP8 quantization with 1D scale (per-token or per-channel)."""
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
ref_out
,
scale_2d
=
scaled_quantize
(
...
...
vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
View file @
63c08894
...
...
@@ -154,9 +154,10 @@ def convert_to_nvfp4_linear_kernel_format(
)
layer
.
weight
=
torch
.
nn
.
Parameter
(
weight
,
requires_grad
=
False
)
layer
.
weight_scale
=
torch
.
nn
.
Parameter
(
weight_scale
,
requires_grad
=
False
)
elif
(
backend
==
NvFp4LinearBackend
.
VLLM_CUTLASS
or
backend
==
NvFp4LinearBackend
.
FLASHINFER_CUTLASS
elif
backend
in
(
NvFp4LinearBackend
.
VLLM_CUTLASS
,
NvFp4LinearBackend
.
FLASHINFER_CUTLASS
,
NvFp4LinearBackend
.
FLASHINFER_CUDNN
,
):
weight
,
weight_scale
,
weights_padding_cols
=
prepare_weights_for_nvfp4_cutlass
(
layer
.
weight
.
data
,
layer
.
weight_scale
.
data
...
...
vllm/utils/flashinfer.py
View file @
63c08894
...
...
@@ -521,7 +521,7 @@ def flashinfer_scaled_fp4_mm(
assert
a
.
stride
(
-
1
)
==
1
and
b
.
stride
(
-
1
)
==
1
assert
a
.
shape
[
1
]
==
b
.
shape
[
1
]
if
backend
==
"cutlass"
:
if
backend
in
(
"cutlass"
,
"cudnn"
)
:
block_scale_a
=
block_scale_a
.
view
(
torch
.
uint8
)
block_scale_b
=
block_scale_b
.
view
(
torch
.
uint8
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment