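FLASH_MLA_BF16_TYPE控制bf16转换精度 (FLASH_MLA_BF16_TYPE selects the float-to-bf16 rounding mode at build time: 0 = round_toward_zero, 1 = round_half_ulp_truncate)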

3a477917 · zhanghj2 · 4c0bb04e · 3a477917 · 3a477917 · 3a477917
Commit 3a477917 authored Feb 24, 2026 by zhanghj2
Hide whitespace changes
Inline Side-by-side

Showing with 17 additions and 0 deletions

csrc/gfx9/decode/combine/combine.cu csrc/gfx9/decode/combine/combine.cu +5 -0

csrc/utils.h csrc/utils.h +7 -0

setup.py setup.py +5 -0

No files found.
--- a/csrc/gfx9/decode/combine/combine.cu
+++ b/csrc/gfx9/decode/combine/combine.cu
@@ -167,7 +167,12 @@ flash_fwd_mla_combine_kernel(const CombineParams params) {
        // }
        auto float2bf16 = [] (float s) -> uint16_t {
            uint32_t x32 = reinterpret_cast<uint32_t const &>(s);
+            #ifndef FLASH_MLA_BF16_TYPE
+            #define FLASH_MLA_BF16_TYPE 0
+            #endif
+            #if FLASH_MLA_BF16_TYPE == 1
            x32 += 0x8000u;
+            #endif
            return uint16_t(x32 >> 16);
        };

--- a/csrc/utils.h
+++ b/csrc/utils.h
@@ -290,7 +290,14 @@ __forceinline__ __device__ auto convert_type(Tensor<Engine, Layout> const &tenso
    #else
        {
            if constexpr (std::is_same_v<To_type, cutlass::bfloat16_t>) {
+                #ifndef FLASH_MLA_BF16_TYPE
+                #define FLASH_MLA_BF16_TYPE 0
+                #endif
+                #if FLASH_MLA_BF16_TYPE == 0
+                cutlass::NumericArrayConverter<To_type, From_type, numel, cutlass::FloatRoundStyle::round_toward_zero> convert_op;
+                #else
                cutlass::NumericArrayConverter<To_type, From_type, numel, cutlass::FloatRoundStyle::round_half_ulp_truncate> convert_op;
+                #endif
                *result_ptr = convert_op(*reinterpret_cast<const cutlass::Array<From_type, numel> *>(tensor.data()));
            } else {
                cutlass::NumericArrayConverter<To_type, From_type, numel> convert_op;

--- a/setup.py
+++ b/setup.py
@@ -19,9 +19,14 @@ def is_flag_set(flag: str) -> bool:
    return os.getenv(flag, "FALSE").lower() in ["true", "1", "y", "yes"]
 def get_features_args():
+    bf16_type = os.getenv("FLASH_MLA_BF16_TYPE", "0")
+    assert bf16_type == "0" or bf16_type == "1", "bf16_type must be 0 or 1"
+    bf16_mode_names = {"0": "round_toward_zero", "1": "round_half_ulp_truncate"}
+    print(f"Using BFloat16 rounding mode: {bf16_mode_names.get(bf16_type, 'unknown')}")
    features_args = []
    if is_flag_set("FLASH_MLA_DISABLE_FP16"):
        features_args.append("-DFLASH_MLA_DISABLE_FP16")
+    features_args.append(f"-DFLASH_MLA_BF16_TYPE={bf16_type}")
    return features_args
 def get_arch_flags():