[Misc] Pass cutlass_fp8_supported correctly in fbgemm_fp8 (#6871)

3eeb148f · Elsa Granger · GitHub · b1366a95 · 3eeb148f
Unverified commit 3eeb148f, authored Jul 28, 2024 by Elsa Granger; committed by GitHub on Jul 28, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 11 additions and 8 deletions

vllm/model_executor/layers/quantization/fbgemm_fp8.py +11 -8

No files found.
--- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py
+++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -9,6 +9,7 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                               UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.fp8 import cutlass_fp8_supported
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
    apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -72,6 +73,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase):
    def __init__(self, quant_config: FBGEMMFp8Config):
        self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
    def create_weights(
        self,
@@ -139,11 +141,12 @@ class FBGEMMFp8LinearMethod(LinearMethodBase):
                size_k=layer.input_size_per_partition,
                bias=bias)
-        return apply_fp8_linear(input=x,
-                                weight=layer.weight,
-                                weight_scale=layer.weight_scale,
-                                input_scale=None,
-                                input_scale_ub=layer.input_scale_ub,
-                                bias=bias,
-                                cutlass_fp8_supported=True,
-                                use_per_token_if_dynamic=True)
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=None,
+            input_scale_ub=layer.input_scale_ub,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+            use_per_token_if_dynamic=True)