Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
27a13460
Commit
27a13460
authored
Jul 09, 2025
by
YdrMaster
Browse files
issue/291/fix: 兼容 bf16
Signed-off-by:
YdrMaster
<
ydrml@hotmail.com
>
parent
f0c5a569
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
14 additions
and
10 deletions
+14
-10
src/infiniop/devices/cuda/cuda_kernel_common.cuh
src/infiniop/devices/cuda/cuda_kernel_common.cuh
+5
-4
src/infiniop/devices/maca/maca_kernel_common.h
src/infiniop/devices/maca/maca_kernel_common.h
+6
-3
src/infiniop/ops/causal_softmax/cuda/kernel.cuh
src/infiniop/ops/causal_softmax/cuda/kernel.cuh
+1
-1
src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca
...finiop/ops/causal_softmax/metax/causal_softmax_metax.maca
+1
-1
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
...finiop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
+1
-1
No files found.
src/infiniop/devices/cuda/cuda_kernel_common.cuh
View file @
27a13460
...
...
@@ -4,6 +4,9 @@
#define INFINIOP_CUDA_KERNEL __global__ void
#endif
#include <cuda_bf16.h>
#include <cuda_fp16.h>
// Possible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_4096 4096
...
...
@@ -12,8 +15,9 @@
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
namespace
device
::
cuda
{
using
cuda_bfloat16
=
nv_bfloat16
;
namespace
device
::
cuda
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__
__device__
__host__
size_t
indexToReducedOffset
(
...
...
@@ -45,8 +49,6 @@ indexToOffset(
}
}
// namespace device::cuda
#ifdef ENABLE_NVIDIA_API
#include <cuda_fp16.h>
__forceinline__
__device__
float
exp_
(
const
float
val
)
{
return
expf
(
val
);
...
...
@@ -73,4 +75,3 @@ __forceinline__ __device__ __nv_bfloat16
exp_
(
const
__nv_bfloat16
x
)
{
return
hexp
(
x
);
}
#endif
src/infiniop/devices/maca/maca_kernel_common.h
View file @
27a13460
#define INFINIOP_MACA_KERNEL __global__ void
#include <maca_bf16.h>
#include <maca_fp16.h>
// Possible maximum number of threads per block for MACA architectures
// Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024
...
...
@@ -6,6 +10,8 @@
#define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
using
cuda_bfloat16
=
maca_bfloat16
;
namespace
device
::
maca
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
...
...
@@ -39,8 +45,6 @@ indexToOffset(
}
}
// namespace device::maca
#ifdef ENABLE_MACA_API
#include <maca_fp16.h>
__forceinline__
__device__
float
exp_
(
const
float
val
)
{
return
expf
(
val
);
...
...
@@ -65,4 +69,3 @@ __forceinline__ __device__ __hpcc_bfloat16;
exp_
(
const
__hpcc_bfloat16
;
x
)
{
return
hexp
(
x
);
}
#endif
src/infiniop/ops/causal_softmax/cuda/kernel.cuh
View file @
27a13460
...
...
@@ -29,7 +29,7 @@ __device__ void causalSoftmaxKernel(
// 2 | * * * ... * * * |
// height: 3 col_id->
if
(
width
+
blockIdx
.
x
>=
threadIdx
.
x
+
height
)
{
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
)
{
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
||
std
::
is_same_v
<
Tdata
,
cuda_bfloat16
>
)
{
y
[
col
]
=
hexp
(
x
[
col
]
-
max_
);
}
else
{
y
[
col
]
=
exp
(
x
[
col
]
-
max_
);
...
...
src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca
View file @
27a13460
#include "../../../devices/maca/common_maca.h"
#include "../../../devices/maca/maca_kernel_common.h"
#include "causal_softmax_metax.h"
#include <hccub/block/block_reduce.cuh>
#include "../../../devices/maca/maca_kernel_common.h"
#include "../../../reduce/cuda/reduce.cuh"
...
...
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
View file @
27a13460
#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "causal_softmax_nvidia.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment