Unverified Commit 23e8fa5a authored by Driss Guessous, committed by GitHub

Add the option for the macro and note (#893)

parent 3e9414f1
@@ -78,7 +78,14 @@ __forceinline__ __device__ void scale_apply_exp2(Tensor<Engine0, Layout0> &tenso
             // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
             // max * log_2(e)) This allows the compiler to use the ffma
             // instruction instead of fadd and fmul separately.
-            tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled);
+            // The following macro will disable the use of fma.
+            // See: https://github.com/pytorch/pytorch/issues/121558 for more details
+            // This macro is set in PyTorch and not FlashAttention
+            #ifdef UNFUSE_FMA
+            tensor(mi, ni) = exp2f(__fmul_rn(tensor(mi, ni), scale) - max_scaled);
+            #else
+            tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled);
+            #endif
         }
     }
 }
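For context, here is a minimal standalone sketch (not part of the patch) illustrating what the UNFUSE_FMA branch changes. The kernel name, host driver, and input values are hypothetical; only exp2f and __fmul_rn are actual CUDA math intrinsics. Under nvcc's default --fmad=true, the plain expression x * scale - max_scaled may be contracted into a single ffma, which rounds once; __fmul_rn forces the multiply to round separately, so the multiply and subtract each round on their own, matching an unfused reference as the linked PyTorch issue requires.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale_exp2_demo(const float *x, float *fused, float *unfused,
                                float scale, float max_scaled, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    // Fused path: the compiler is free to contract the multiply and subtract
    // into one ffma instruction, which rounds only once.
    fused[i] = exp2f(x[i] * scale - max_scaled);
    // Unfused path: __fmul_rn is a round-to-nearest multiply intrinsic that
    // nvcc will not contract into an fma, so each operation rounds separately.
    unfused[i] = exp2f(__fmul_rn(x[i], scale) - max_scaled);
}

int main() {
    const int n = 4;
    float hx[n] = {0.5f, 1.0f, 2.0f, 3.0f};
    float *dx, *df, *du;
    cudaMalloc(&dx, n * sizeof(float));
    cudaMalloc(&df, n * sizeof(float));
    cudaMalloc(&du, n * sizeof(float));
    cudaMemcpy(dx, hx, n * sizeof(float), cudaMemcpyHostToDevice);
    // scale = log2(e), a nonzero max_scaled so the two roundings can differ.
    scale_exp2_demo<<<1, n>>>(dx, df, du, 1.4426950408889634f, 1.0f, n);
    float hf[n], hu[n];
    cudaMemcpy(hf, df, n * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(hu, du, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i)
        printf("x=%g fused=%.9g unfused=%.9g\n", hx[i], hf[i], hu[i]);
    cudaFree(dx); cudaFree(df); cudaFree(du);
    return 0;
}

The two outputs may agree for many inputs and differ only in the last bit for others; the point of the patch is that the unfused form is deterministic with respect to a non-fused reference implementation, not that it is more accurate.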