Commit 27a13460 authored by YdrMaster's avatar YdrMaster
Browse files

issue/291/fix: 兼容 bf16 (add bf16 compatibility)


Signed-off-by: default avatarYdrMaster <ydrml@hotmail.com>
parent f0c5a569
......@@ -4,6 +4,9 @@
#define INFINIOP_CUDA_KERNEL __global__ void
#endif
#include <cuda_bf16.h>
#include <cuda_fp16.h>
// Possible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_4096 4096
......@@ -12,8 +15,9 @@
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
namespace device::cuda {
using cuda_bfloat16 = nv_bfloat16;
namespace device::cuda {
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
......@@ -45,8 +49,6 @@ indexToOffset(
}
} // namespace device::cuda
#ifdef ENABLE_NVIDIA_API
#include <cuda_fp16.h>
// exp_ overload set: single-precision flavor, forwarding to CUDA's expf.
// Overloading by type lets templated kernels call exp_ uniformly.
__forceinline__ __device__ float
exp_(const float val) {
    const float result = expf(val);
    return result;
}
......@@ -73,4 +75,3 @@ __forceinline__ __device__ __nv_bfloat16
// bf16 overload of exp_ (return type __nv_bfloat16 is declared on the
// preceding line); delegates to hexp, the device exponential for
// half/bf16 types from <cuda_bf16.h>.
exp_(const __nv_bfloat16 x) {
return hexp(x);
}
#endif
#define INFINIOP_MACA_KERNEL __global__ void
#include <maca_bf16.h>
#include <maca_fp16.h>
// Possible maximum number of threads per block for MACA architectures
// Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024
......@@ -6,6 +10,8 @@
#define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
using cuda_bfloat16 = maca_bfloat16;
namespace device::maca {
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
......@@ -39,8 +45,6 @@ indexToOffset(
}
} // namespace device::maca
#ifdef ENABLE_MACA_API
#include <maca_fp16.h>
// exp_ overload set for the MACA backend: single-precision flavor,
// delegating to expf so templated kernels can call exp_ uniformly.
__forceinline__ __device__ float
exp_(const float val) {
    const float result = expf(val);
    return result;
}
......@@ -65,4 +69,3 @@ __forceinline__ __device__ __hpcc_bfloat16;
// bf16 overload of exp_ for the MACA backend (return type __hpcc_bfloat16
// is declared on the preceding line); delegates to hexp.
// Fix: removed the stray ';' inside the parameter list
// ('const __hpcc_bfloat16; x'), which was a compile error.
exp_(const __hpcc_bfloat16 x) {
    return hexp(x);
}
#endif
......@@ -29,7 +29,7 @@ __device__ void causalSoftmaxKernel(
// 2 | * * * ... * * * |
// height: 3 col_id->
if (width + blockIdx.x >= threadIdx.x + height) {
if constexpr (std::is_same_v<Tdata, half>) {
if constexpr (std::is_same_v<Tdata, half> || std::is_same_v<Tdata, cuda_bfloat16>) {
y[col] = hexp(x[col] - max_);
} else {
y[col] = exp(x[col] - max_);
......
#include "../../../devices/maca/common_maca.h"
#include "../../../devices/maca/maca_kernel_common.h"
#include "causal_softmax_metax.h"
#include <hccub/block/block_reduce.cuh>
#include "../../../devices/maca/maca_kernel_common.h"
#include "../../../reduce/cuda/reduce.cuh"
......
#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "causal_softmax_nvidia.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment