Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
20488ee7
Commit
20488ee7
authored
Sep 02, 2025
by
zhangyue
Browse files
issue/418: 解决 p800 上手写算子引用 sm 上指针的报错问题
parent
b3170335
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
17 additions
and
34 deletions
+17
-34
src/infiniop/devices/kunlun/kunlun_kernel_common.h
src/infiniop/devices/kunlun/kunlun_kernel_common.h
+3
-19
src/infiniop/ops/causal_softmax/kunlun/kernel.h
src/infiniop/ops/causal_softmax/kunlun/kernel.h
+2
-2
src/infiniop/ops/rms_norm/kunlun/kernel.h
src/infiniop/ops/rms_norm/kunlun/kernel.h
+2
-2
src/infiniop/reduce/kunlun/reduce_kunlun.h
src/infiniop/reduce/kunlun/reduce_kunlun.h
+10
-11
No files found.
src/infiniop/devices/kunlun/kunlun_kernel_common.h
View file @
20488ee7
...
...
@@ -37,22 +37,6 @@ inline __device__ float lowerBitMask(int i) {
return
(
1
<<
(
i
+
1
))
-
1
;
}
/**
* @brief Load data from shared memory
* @param p: pointer to shared memory
* @return loaded value
*/
/**
 * @brief Load one element from shared memory.
 *
 * For 16-bit float types (half / bfloat16_t) the element is fetched with a
 * byte-wise __builtin_memcpy rather than a direct dereference of the
 * __shared_ptr__ pointer; every other type is dereferenced directly.
 *
 * @param p pointer into shared memory
 * @return the value stored at p
 */
template <typename T>
__device__ inline T loadsm(__shared_ptr__ const T *p) {
    constexpr bool kIsHalfLike = std::is_same<T, half>::value
                              || std::is_same<T, bfloat16_t>::value;
    T out;
    if constexpr (kIsHalfLike) {
        __builtin_memcpy(&out, p, sizeof(T));
    } else {
        out = *p;
    }
    return out;
}
// Load len data from shared memory
template
<
typename
T
>
__device__
inline
void
loadsm
(
__shared_ptr__
const
T
*
p
,
T
*
v
,
int
len
)
{
...
...
@@ -89,7 +73,7 @@ inline __device__ T atomicAdd(__shared_ptr__ T *ptr, T value) {
template
<
>
inline
__device__
half
atomicAdd
<
half
>
(
__shared_ptr__
half
*
ptr
,
half
value
)
{
ticket_lock_mix
();
__
half
old
=
loadsm
(
ptr
)
;
half
old
=
*
ptr
;
float
of
=
__half2float
(
old
);
float
vf
=
__half2float
(
value
);
float
sumf
=
of
+
vf
;
...
...
@@ -103,7 +87,7 @@ inline __device__ half atomicAdd<half>(__shared_ptr__ half *ptr, half value) {
template
<
>
inline
__device__
bfloat16_t
atomicAdd
<
bfloat16_t
>
(
__shared_ptr__
bfloat16_t
*
ptr
,
bfloat16_t
value
)
{
ticket_lock_mix
();
bfloat16_t
old
=
loadsm
(
ptr
)
;
bfloat16_t
old
=
*
ptr
;
float
of
=
__bfloat162float
(
old
);
float
vf
=
__bfloat162float
(
value
);
float
sumf
=
of
+
vf
;
...
...
@@ -122,7 +106,7 @@ inline __device__ bfloat16_t atomicAdd<bfloat16_t>(__shared_ptr__ bfloat16_t *pt
template
<
typename
T
>
inline
__device__
T
atomicMax
(
__shared_ptr__
T
*
ptr
,
T
value
)
{
ticket_lock_mix
();
T
old
=
loadsm
(
ptr
)
;
T
old
=
*
ptr
;
if
constexpr
(
std
::
is_same
<
T
,
bfloat16_t
>::
value
)
{
float
of
=
__bfloat162float
(
old
);
float
vf
=
__bfloat162float
(
value
);
...
...
src/infiniop/ops/causal_softmax/kunlun/kernel.h
View file @
20488ee7
...
...
@@ -31,7 +31,7 @@ __device__ void causalSoftmaxBlock(
// height: 3 col_id->
if
(
width
+
size_t
(
row_id
)
>=
col
+
height
)
{
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
)
{
y
[
col
]
=
hexp
(
loadsm
(
x
+
col
)
-
loadsm
(
&
max_
)
)
;
y
[
col
]
=
hexp
(
x
[
col
]
-
max_
);
}
else
if
constexpr
(
std
::
is_same_v
<
Tdata
,
bfloat16_t
>
)
{
y
[
col
]
=
__float2bfloat16
(
exp
(
__bfloat162float
(
x
[
col
])
-
__bfloat162float
(
max_
)));
}
else
{
...
...
@@ -54,7 +54,7 @@ __device__ void causalSoftmaxBlock(
// Apply softmax
for
(
size_t
col
=
core_id
();
col
<
width
;
col
+=
BLOCK_SIZE
)
{
if
(
sum_
!=
0
)
{
y
[
col
]
=
to
<
Tdata
>
(
to
<
Tcompute
>
(
loadsm
(
y
+
col
)
)
/
sum_
);
y
[
col
]
=
to
<
Tdata
>
(
to
<
Tcompute
>
(
y
[
col
]
)
/
sum_
);
}
else
{
y
[
col
]
=
Tdata
(
0
);
}
...
...
src/infiniop/ops/rms_norm/kunlun/kernel.h
View file @
20488ee7
...
...
@@ -25,8 +25,8 @@ __device__ void rmsnormBlock(
// Copy contiguous x, w into local mem (load from shared memory safely)
for
(
size_t
i
=
core_id
();
i
<
dim
;
i
+=
BLOCK_SIZE
)
{
Tdata
xi
=
loadsm
(
x
+
i
)
;
Tweight
wi
=
loadsm
(
w
+
i
)
;
Tdata
xi
=
x
[
i
]
;
Tweight
wi
=
w
[
i
]
;
y
[
i
]
=
static_cast
<
Tdata
>
(
to
<
Tcompute
>
(
xi
)
*
to
<
Tcompute
>
(
wi
)
*
rms
);
}
sync_cluster
();
...
...
src/infiniop/reduce/kunlun/reduce_kunlun.h
View file @
20488ee7
...
...
@@ -13,20 +13,20 @@ __device__ inline Tcompute sumSquared(__shared_ptr__ const Tdata *data_ptr, size
Tcompute
ss
=
0
;
for
(
size_t
i
=
core_id
();
i
<
count
;
i
+=
BLOCK_SIZE
)
{
Tdata
xi
=
loadsm
(
data_ptr
+
i
)
;
Tdata
xi
=
data_ptr
[
i
]
;
ss
+=
to
<
Tcompute
>
(
xi
)
*
to
<
Tcompute
>
(
xi
);
}
__shared__
Tcompute
temp_storage
;
if
(
core_id
()
==
0
)
{
temp_storage
=
0
;
temp_storage
=
to
<
Tcompute
>
(
0.
f
)
;
}
sync_cluster
();
atomicAdd
(
&
temp_storage
,
ss
);
sync_cluster
();
return
loadsm
(
&
temp_storage
)
;
return
temp_storage
;
}
// Sum(x) on contiguous data of length count
...
...
@@ -35,43 +35,42 @@ __device__ inline Tcompute sum(__shared_ptr__ const Tdata *data_ptr, size_t coun
Tcompute
ss
=
0
;
for
(
size_t
i
=
core_id
();
i
<
count
;
i
+=
BLOCK_SIZE
)
{
Tdata
xi
=
loadsm
(
data_ptr
+
i
)
;
Tdata
xi
=
data_ptr
[
i
]
;
ss
+=
to
<
Tcompute
>
(
xi
);
}
__shared__
Tcompute
temp_storage
;
if
(
core_id
()
==
0
)
{
temp_storage
=
0
;
temp_storage
=
to
<
Tcompute
>
(
0.
f
)
;
}
sync_cluster
();
atomicAdd
(
&
temp_storage
,
ss
);
sync_cluster
();
return
loadsm
(
&
temp_storage
)
;
return
temp_storage
;
}
// Max(x) on contiguous data of length count
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tdata
>
__device__
inline
Tdata
max
(
__shared_ptr__
const
Tdata
*
data_ptr
,
size_t
count
)
{
Tdata
max_val
=
loadsm
(
data_ptr
)
;
Tdata
max_val
=
data_ptr
[
0
]
;
for
(
size_t
i
=
core_id
();
i
<
count
;
i
+=
BLOCK_SIZE
)
{
// Tdata xi = loadsm(data_ptr + i);
Tdata
xi
=
loadsm
(
data_ptr
+
i
);
Tdata
xi
=
data_ptr
[
i
];
max_val
=
fmax
(
max_val
,
to
<
Tdata
>
(
xi
));
}
__shared__
Tdata
temp_storage
;
if
(
core_id
()
==
0
)
{
temp_storage
=
loadsm
(
data_ptr
)
;
temp_storage
=
data_ptr
[
0
]
;
}
sync_cluster
();
atomicMax
(
&
temp_storage
,
max_val
);
sync_cluster
();
return
loadsm
(
&
temp_storage
)
;
return
temp_storage
;
}
}
// namespace op::common_kunlun::reduce_op
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment