Commit 27a13460 authored by YdrMaster's avatar YdrMaster
Browse files

issue/291/fix: 兼容 bf16 (add bf16 compatibility)


Signed-off-by: default avatarYdrMaster <ydrml@hotmail.com>
parent f0c5a569
......@@ -4,6 +4,9 @@
#define INFINIOP_CUDA_KERNEL __global__ void
#endif
#include <cuda_bf16.h>
#include <cuda_fp16.h>
// Possible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_4096 4096
......@@ -12,8 +15,9 @@
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
namespace device::cuda {
using cuda_bfloat16 = nv_bfloat16;
namespace device::cuda {
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
......@@ -45,8 +49,6 @@ indexToOffset(
}
} // namespace device::cuda
#ifdef ENABLE_NVIDIA_API
#include <cuda_fp16.h>
// exp_ overload set: single-precision flavor, forwarding to CUDA's expf.
// Overloading by type lets templated kernels call exp_ uniformly.
__forceinline__ __device__ float
exp_(const float val) {
    const float result = expf(val);
    return result;
}
......@@ -73,4 +75,3 @@ __forceinline__ __device__ __nv_bfloat16
// bf16 overload of exp_ (return type __nv_bfloat16 is declared on the
// preceding line); delegates to hexp, the device exponential for
// half/bf16 types from <cuda_bf16.h>.
exp_(const __nv_bfloat16 x) {
return hexp(x);
}
#endif
#define INFINIOP_MACA_KERNEL __global__ void
#include <maca_bf16.h>
#include <maca_fp16.h>
// Possible maximum number of threads per block for MACA architectures
// Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024
......@@ -6,6 +10,8 @@
#define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
using cuda_bfloat16 = maca_bfloat16;
namespace device::maca {
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
......@@ -39,8 +45,6 @@ indexToOffset(
}
} // namespace device::maca
#ifdef ENABLE_MACA_API
#include <maca_fp16.h>
// exp_ overload set for the MACA backend: single-precision flavor,
// delegating to expf so templated kernels can call exp_ uniformly.
__forceinline__ __device__ float
exp_(const float val) {
    const float result = expf(val);
    return result;
}
......@@ -65,4 +69,3 @@ __forceinline__ __device__ __hpcc_bfloat16;
// bf16 overload of exp_ for the MACA backend (return type __hpcc_bfloat16
// is declared on the preceding line); delegates to hexp.
// Fix: removed the stray ';' inside the parameter list
// ('const __hpcc_bfloat16; x'), which was a compile error.
exp_(const __hpcc_bfloat16 x) {
    return hexp(x);
}
#endif
......@@ -29,7 +29,7 @@ __device__ void causalSoftmaxKernel(
// 2 | * * * ... * * * |
// height: 3 col_id->
if (width + blockIdx.x >= threadIdx.x + height) {
if constexpr (std::is_same_v<Tdata, half>) {
if constexpr (std::is_same_v<Tdata, half> || std::is_same_v<Tdata, cuda_bfloat16>) {
y[col] = hexp(x[col] - max_);
} else {
y[col] = exp(x[col] - max_);
......
#include "../../../devices/maca/common_maca.h"
#include "../../../devices/maca/maca_kernel_common.h"
#include "causal_softmax_metax.h"
#include <hccub/block/block_reduce.cuh>
#include "../../../devices/maca/maca_kernel_common.h"
#include "../../../reduce/cuda/reduce.cuh"
......
#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "causal_softmax_nvidia.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment