Fix num_token_padding support for static per-tensor scaled_fp8_quant (#20188)

Signed-off-by: mgoin <mgoin64@gmail.com>

Fix num_token_padding support for static per-tensor scaled_fp8_quant (#20188)
Signed-off-by: mgoin <mgoin64@gmail.com>
a29e62ea · Michael Goin · GitHub · e53be6f0 · a29e62ea
Unverified commit a29e62ea, authored Jun 28, 2025 by Michael Goin; committed by GitHub Jun 27, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 1 addition and 2 deletions

vllm/_custom_ops.py vllm/_custom_ops.py +1 -2

No files found.
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1274,8 +1274,7 @@ def scaled_fp8_quant(
            scale = torch.zeros(1, device=input.device, dtype=torch.float32)
            torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
    else:
-        # num_token_padding not implemented for this case
+        assert scale.numel() == 1
-        assert (scale.numel() == 1 and num_token_padding is None)
        torch.ops._C.static_scaled_fp8_quant(output, input, scale)
    return output, scale