forward fix PR 14245, restore build on ROCm 6.2 (#14709)

Signed-off-by: Jeff Daily <jeff.daily@amd.com>

forward fix PR 14245, restore build on ROCm 6.2 (#14709)
Signed-off-by: Jeff Daily <jeff.daily@amd.com>
2a602b05 · Jeff Daily · GitHub · 7888e1d0 · 2a602b05
Unverified Commit 2a602b05 authored Mar 13, 2025 by Jeff Daily Committed by GitHub Mar 13, 2025
Show whitespace changes
Inline Side-by-side

Showing with 12 additions and 0 deletions

csrc/quantization/fp8/amd/quant_utils.cuh csrc/quantization/fp8/amd/quant_utils.cuh +12 -0

No files found.
--- a/csrc/quantization/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/fp8/amd/quant_utils.cuh
@@ -19,12 +19,24 @@ __device__ __forceinline__ fp8_type cvt_c10(float const r) {
  return {};
 }

+// __hip_fp8_e4m3 only exists starting in ROCm 6.3. The macro
+// HIP_FP8_TYPE_OCP comes from the hip_fp8.h header and also makes
+// its first appearance in ROCm 6.3. Since VLLM_DISPATCH_FP8_TYPES
+// on ROCm instantiates both OCP and FNUZ kernels, we need to replace
+// the new HW cvt with something reasonable that doesn't rely on the
+// ROCm 6.3 feature. This allows compiling on ROCm 6.2 or newer.
 template <>
 __device__ __forceinline__ c10::Float8_e4m3fn cvt_c10(float const r) {
+    #if HIP_FP8_TYPE_OCP
  return c10::Float8_e4m3fn(
      __hip_cvt_float_to_fp8(r, __hip_fp8_e4m3::__default_saturation,
                             __hip_fp8_e4m3::__default_interpret),
      c10::Float8_e4m3fn::from_bits());
+    #else
+  // Cast implemented by pytorch. Uses bit manipulation instead of HW cvt.
+  // HW cvt above is faster when it is available (ROCm 6.3 or newer).
+  return static_cast<c10::Float8_e4m3fn>(r);
+    #endif
 }

 template <>