[Performance][Hopper] Avoid M dim padding to 4x for most cases (due to cuda graphs paddings) (#28492)

[Performance][Hopper] Avoid M dim padding to 4x for most cases (due to cuda graphs paddings) (#28492)

Signed-off-by: Alexander Matveev <amatveev@redhat.com>

[Performance][Hopper] Avoid M dim padding to 4x for most cases (due to cuda graphs paddings) (#28492)
[Performance][Hopper] Avoid M dim padding to 4x for most cases (due to cuda graphs paddings) (#28492)

Signed-off-by: Alexander Matveev <amatveev@redhat.com>
f76e85c2 · Alexander Matveev · GitHub · 54aecd9e · f76e85c2
Unverified commit f76e85c2, authored Nov 12, 2025 by Alexander Matveev; committed by GitHub on Nov 12, 2025
Show whitespace changes
Inline Side-by-side

Showing with 21 additions and 14 deletions

vllm/model_executor/layers/quantization/utils/fp8_utils.py vllm/model_executor/layers/quantization/utils/fp8_utils.py +21 -14

No files found.
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -115,6 +115,9 @@ def _padded_cutlass(
        dim if dim % pad_multiple == 0 else dim + pad_multiple - (dim % pad_multiple)
    )
+    has_pad = padded > dim
+    if has_pad:
        padded_shape = [padded, *qx.shape[1:]]
        padded_qx = torch.zeros(padded_shape, device=qx.device, dtype=qx.dtype)
        padded_qx[0 : qx.shape[0], ...].copy_(qx)
@@ -129,6 +132,10 @@ def _padded_cutlass(
            padded_qx, weight, padded_x_scale, weight_scale, block_size, output_dtype
        )
        return output[0 : qx.shape[0], ...]
+    else:
+        return cutlass_scaled_mm(
+            qx, weight, x_scale, weight_scale, block_size, output_dtype
+        )
 def _padded_cutlass_fake(