Commit 27a13460 authored by YdrMaster's avatar YdrMaster
Browse files

issue/291/fix: 兼容 bf16


Signed-off-by: YdrMaster <ydrml@hotmail.com>
parent f0c5a569
...@@ -4,6 +4,9 @@ ...@@ -4,6 +4,9 @@
#define INFINIOP_CUDA_KERNEL __global__ void #define INFINIOP_CUDA_KERNEL __global__ void
#endif #endif
#include <cuda_bf16.h>
#include <cuda_fp16.h>
// Possible maximum number of threads per block for CUDA architectures // Possible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration // Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_4096 4096 #define CUDA_BLOCK_SIZE_4096 4096
...@@ -12,8 +15,9 @@ ...@@ -12,8 +15,9 @@
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess) #define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
namespace device::cuda { using cuda_bfloat16 = nv_bfloat16;
namespace device::cuda {
// return the memory offset of original tensor, given the flattened index of broadcasted tensor // return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t __forceinline__ __device__ __host__ size_t
indexToReducedOffset( indexToReducedOffset(
...@@ -45,8 +49,6 @@ indexToOffset( ...@@ -45,8 +49,6 @@ indexToOffset(
} }
} // namespace device::cuda } // namespace device::cuda
#ifdef ENABLE_NVIDIA_API
#include <cuda_fp16.h>
// exp_ overload for float: thin wrapper over the CUDA math library's expf.
__forceinline__ __device__ float
exp_(const float val) {
    return expf(val);
}
// exp_ overload for bfloat16: uses the hardware hexp intrinsic (SM80+ native).
__forceinline__ __device__ __nv_bfloat16
exp_(const __nv_bfloat16 x) {
    return hexp(x);
}
#endif
#define INFINIOP_MACA_KERNEL __global__ void #define INFINIOP_MACA_KERNEL __global__ void
#include <maca_bf16.h>
#include <maca_fp16.h>
// Possible maximum number of threads per block for MACA architectures // Possible maximum number of threads per block for MACA architectures
// Used for picking correct kernel launch configuration // Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024 #define MACA_BLOCK_SIZE_1024 1024
...@@ -6,6 +10,8 @@ ...@@ -6,6 +10,8 @@
#define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess) #define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
using cuda_bfloat16 = maca_bfloat16;
namespace device::maca { namespace device::maca {
// return the memory offset of original tensor, given the flattened index of broadcasted tensor // return the memory offset of original tensor, given the flattened index of broadcasted tensor
...@@ -39,8 +45,6 @@ indexToOffset( ...@@ -39,8 +45,6 @@ indexToOffset(
} }
} // namespace device::maca } // namespace device::maca
#ifdef ENABLE_MACA_API
#include <maca_fp16.h>
// exp_ overload for float on the MACA backend: wraps expf.
__forceinline__ __device__ float
exp_(const float val) {
    return expf(val);
}
// exp_ overload for the MACA bfloat16 type: wraps the hexp intrinsic.
// Fix: the committed code had stray semicolons ("__hpcc_bfloat16;" as the
// return type and "const __hpcc_bfloat16; x" in the parameter list), which
// is ill-formed C++ and cannot compile. Removing them restores the intended
// signature, matching the __nv_bfloat16 overload in the nvidia header.
__forceinline__ __device__ __hpcc_bfloat16
exp_(const __hpcc_bfloat16 x) {
    return hexp(x);
}
#endif
...@@ -29,7 +29,7 @@ __device__ void causalSoftmaxKernel( ...@@ -29,7 +29,7 @@ __device__ void causalSoftmaxKernel(
// 2 | * * * ... * * * | // 2 | * * * ... * * * |
// height: 3 col_id-> // height: 3 col_id->
if (width + blockIdx.x >= threadIdx.x + height) { if (width + blockIdx.x >= threadIdx.x + height) {
if constexpr (std::is_same_v<Tdata, half>) { if constexpr (std::is_same_v<Tdata, half> || std::is_same_v<Tdata, cuda_bfloat16>) {
y[col] = hexp(x[col] - max_); y[col] = hexp(x[col] - max_);
} else { } else {
y[col] = exp(x[col] - max_); y[col] = exp(x[col] - max_);
......
#include "../../../devices/maca/common_maca.h" #include "../../../devices/maca/common_maca.h"
#include "../../../devices/maca/maca_kernel_common.h"
#include "causal_softmax_metax.h" #include "causal_softmax_metax.h"
#include <hccub/block/block_reduce.cuh> #include <hccub/block/block_reduce.cuh>
#include "../../../devices/maca/maca_kernel_common.h"
#include "../../../reduce/cuda/reduce.cuh" #include "../../../reduce/cuda/reduce.cuh"
......
#include "../../../devices/cuda/cuda_common.cuh" #include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "causal_softmax_nvidia.cuh" #include "causal_softmax_nvidia.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include <cub/block/block_reduce.cuh> #include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh" #include "../../../reduce/cuda/reduce.cuh"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment