Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
77d4ac50
Unverified
Commit
77d4ac50
authored
Nov 21, 2025
by
PanZezhong1725
Committed by
GitHub
Nov 21, 2025
Browse files
Merge pull request #655 from gongchensu/feature/fix_cuda_version
Issue/654 - Fix CUDA 13.0 compatibility issues
parents
874cc65b
9a9f0982
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
11 additions
and
5 deletions
+11
-5
src/infiniop/ops/layer_norm/cuda/kernel.cuh
src/infiniop/ops/layer_norm/cuda/kernel.cuh
+1
-1
src/infiniop/ops/lp_norm/cuda/kernel.cuh
src/infiniop/ops/lp_norm/cuda/kernel.cuh
+10
-2
src/infiniop/ops/tanh/cuda/kernel.cuh
src/infiniop/ops/tanh/cuda/kernel.cuh
+0
-2
No files found.
src/infiniop/ops/layer_norm/cuda/kernel.cuh
View file @
77d4ac50
...
...
@@ -81,7 +81,7 @@ __device__ void blockLayernormKernel(T *output, T const *input, T const *weight,
}
__shared__
float
sigma2
;
float
sigma2_block
=
BlockReduce
(
temp_storage
).
Reduce
(
sigma2_partial
,
cub
::
Sum
()
);
float
sigma2_block
=
BlockReduce
(
temp_storage
).
Sum
(
sigma2_partial
);
if
(
threadIdx
.
x
==
0
)
{
float
sigma_tmp
=
sqrt
(
sigma2_block
*
__fdividef
(
1.0
F
,
dimsize
)
+
eps
);
sigma2
=
__fdividef
(
1.0
F
,
sigma_tmp
);
...
...
src/infiniop/ops/lp_norm/cuda/kernel.cuh
View file @
77d4ac50
...
...
@@ -17,7 +17,11 @@ __device__ void blockLPNormKernel(
local_max
=
max
(
local_max
,
fabsf
((
float
)
input
[
tid
+
ind
*
stride
]));
}
__shared__
float
global_max
;
#if CUDART_VERSION >= 12090
float
max_block
=
BlockReduce
(
temp_storage
).
Reduce
(
local_max
,
::
cuda
::
maximum
());
#else
float
max_block
=
BlockReduce
(
temp_storage
).
Reduce
(
local_max
,
cub
::
Max
());
#endif
if
(
threadIdx
.
x
==
0
)
{
// must set threadIdx.x = 0 write the output to memory
global_max
=
max_block
;
}
...
...
@@ -30,7 +34,7 @@ __device__ void blockLPNormKernel(
}
__shared__
float
p_total
;
float
p_block
=
BlockReduce
(
temp_storage
).
Reduce
(
p_partial
,
cub
::
Sum
()
);
float
p_block
=
BlockReduce
(
temp_storage
).
Sum
(
p_partial
);
if
(
threadIdx
.
x
==
0
)
{
// must set threadIdx.x = 0 write the output to memory
p_total
=
powf
(
p_block
,
1.0
f
/
p
);
}
...
...
@@ -69,7 +73,11 @@ __device__ void blockLPNormStridesKernel(
local_max
=
max
(
local_max
,
fabsf
((
float
)
input
[
ind_i
+
ind
]));
}
__shared__
float
global_max
;
#if CUDART_VERSION >= 12090
float
max_block
=
BlockReduce
(
temp_storage
).
Reduce
(
local_max
,
::
cuda
::
maximum
());
#else
float
max_block
=
BlockReduce
(
temp_storage
).
Reduce
(
local_max
,
cub
::
Max
());
#endif
if
(
threadIdx
.
x
==
0
)
{
// must set threadIdx.x = 0 write the output to memory
global_max
=
max_block
;
}
...
...
@@ -82,7 +90,7 @@ __device__ void blockLPNormStridesKernel(
}
__shared__
float
p_total
;
float
p_block
=
BlockReduce
(
temp_storage
).
Reduce
(
p_partial
,
cub
::
Sum
()
);
float
p_block
=
BlockReduce
(
temp_storage
).
Sum
(
p_partial
);
if
(
threadIdx
.
x
==
0
)
{
// must set threadIdx.x = 0 write the output to memory
p_total
=
powf
(
p_block
,
1.0
f
/
p
);
}
...
...
src/infiniop/ops/tanh/cuda/kernel.cuh
View file @
77d4ac50
...
...
@@ -2,8 +2,6 @@
#define __TANH_CUDA_H__
#include <cmath>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace
op
::
tanh
::
cuda
{
typedef
struct
TanhOp
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment