[Misc] Fix input_scale typing in w8a8_utils.py (#6579)

f952bbc8 · Michael Goin · GitHub · 9364f74e · f952bbc8
Unverified commit f952bbc8, authored Jul 20, 2024 by Michael Goin; committed by GitHub on Jul 20, 2024
Show whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

vllm/model_executor/layers/quantization/utils/w8a8_utils.py +2 -2

No files found.
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -104,7 +104,7 @@ def apply_fp8_linear(
    input: torch.Tensor,
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
-    input_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
    input_scale_ub: Optional[torch.Tensor] = None,
    bias: Optional[torch.Tensor] = None,
    cutlass_fp8_supported: bool = True,
@@ -192,7 +192,7 @@ def apply_int8_linear(
    input: torch.Tensor,
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
-    input_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
    bias: Optional[torch.Tensor] = None,
 ):
    # ops.scaled_int8_quant supports both dynamic and static quant.