[Kernel] Disable CUTLASS kernels for fp8 (#5505)

e38042d4 · Tyler Michael Smith · GitHub · 33e3b372 · e38042d4
Unverified commit e38042d4, authored Jun 13, 2024 by Tyler Michael Smith; committed by GitHub on Jun 13, 2024.
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 1 deletion

vllm/model_executor/layers/quantization/fp8.py (+3 -1)

No files found.
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -257,7 +257,9 @@ class Fp8LinearMethod(LinearMethodBase):
        #   If dynamic, layer.input_scale is None and x_scale computed from x.
        #   If static, layer.input_scale is scalar and x_scale is input_scale.
-        if bias is None and self.cutlass_fp8_supported:
+        # Temporarily disable CUTLASS kernels due to an illegal memory access
+        #if  bias is None and self.cutlass_fp8_supported:
+        if False:
            qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
            # Fused GEMM_DQ