[Bugfix] Enforce contiguous input for dynamic_per_token FP8/INT8 quant (#19452)

Signed-off-by: mgoin <mgoin64@gmail.com>

[Bugfix] Enforce contiguous input for dynamic_per_token FP8/INT8 quant (#19452)
Signed-off-by: mgoin <mgoin64@gmail.com>
a3319f4f · Michael Goin · GitHub · 9d880f59 · a3319f4f
Unverified commit a3319f4f, authored Jun 12, 2025 by Michael Goin; committed by GitHub on Jun 12, 2025.
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 3 deletions

vllm/_custom_ops.py vllm/_custom_ops.py +3 -3

No files found.
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1270,7 +1270,7 @@ def scaled_fp8_quant(
                                device=input.device,
                                dtype=torch.float32)
            torch.ops._C.dynamic_per_token_scaled_fp8_quant(
-                output, input, scale, scale_ub)
+                output, input.contiguous(), scale, scale_ub)
        else:
            scale = torch.zeros(1, device=input.device, dtype=torch.float32)
            torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
@@ -1379,8 +1379,8 @@ def scaled_int8_quant(
                               dtype=torch.float32)
    input_azp = None if symmetric else torch.empty_like(input_scales,
                                                        dtype=torch.int32)
-    torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales,
-                                           input_azp)
+    torch.ops._C.dynamic_scaled_int8_quant(output, input.contiguous(),
+                                           input_scales, input_azp)
    return output, input_scales, input_azp