[Bugfix] Enable padded FP4 quantization (#25947)

Signed-off-by: Roi Koren <roik@nvidia.com>

[Bugfix] Enable padded FP4 quantization (#25947)
Signed-off-by: Roi Koren <roik@nvidia.com>
4069db3f · roikoren755 · GitHub · 0d37450e · 4069db3f · 4069db3f
Unverified commit 4069db3f, authored Oct 09, 2025 by roikoren755; committed by GitHub on Oct 09, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 1 addition and 3 deletions

vllm/_custom_ops.py +1 -1

vllm/utils/flashinfer.py +0 -2

No files found.
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1384,7 +1384,7 @@ def scaled_fp4_quant(
    rounded_m = round_up(m, 128)
    scale_n = n // block_size
    rounded_n = round_up(scale_n, 4)
-    output_scale = torch.empty(
+    output_scale = torch.zeros(
        (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
    )

--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -386,8 +386,6 @@ def flashinfer_scaled_fp4_mm(
    assert block_scale_a.ndim == 2 and block_scale_b.ndim == 2
    assert a.stride(-1) == 1 and b.stride(-1) == 1
    assert a.shape[1] == b.shape[1]
-    assert block_scale_a.shape[1] == a.shape[1] // 8
-    assert block_scale_b.shape[1] == b.shape[1] // 8
    if backend == "cutlass":
        block_scale_a = block_scale_a.view(torch.uint8)