[Bugfix] Fix FP8 torch._scaled_mm fallback for torch>2.5 with CUDA<12.4 (#10095)

Signed-off-by: mgoin <michael@neuralmagic.com>

[Bugfix] Fix FP8 torch._scaled_mm fallback for torch>2.5 with CUDA<12.4 (#10095)
Signed-off-by: mgoin <michael@neuralmagic.com>
4ab32566 · Michael Goin · GitHub · 719c1ca4 · 4ab32566
Unverified Commit 4ab32566 authored Nov 06, 2024 by Michael Goin Committed by GitHub Nov 07, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 4 deletions

vllm/model_executor/layers/quantization/utils/w8a8_utils.py vllm/model_executor/layers/quantization/utils/w8a8_utils.py +2 -4

No files found.
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -7,8 +7,7 @@ from vllm.platforms import current_platform
 # Input scaling factors are no longer optional in _scaled_mm starting
 # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
-TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() \
-            if current_platform.is_rocm() else None
+TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32)
 def cutlass_fp8_supported() -> bool:
@@ -166,8 +165,7 @@ def apply_fp8_linear(
            # Making sure the dummy tensor is on the same device as the weight
            global TORCH_DEVICE_IDENTITY
-            if (TORCH_DEVICE_IDENTITY is not None
-                    and TORCH_DEVICE_IDENTITY.device != weight.device):
+            if TORCH_DEVICE_IDENTITY.device != weight.device:
                TORCH_DEVICE_IDENTITY = TORCH_DEVICE_IDENTITY.to(weight.device)
            # GEMM