Commit 0fdaa522 authored by zhangyue
Browse files

mv atomic_add function to common_kunlun.h

parent 4d745cf9
......@@ -13,4 +13,14 @@ static inline __device__ float lowerBitMask(int i) {
return (1 << (i + 1)) - 1;
}
#endif
\ No newline at end of file
// Atomic add used by the reduce step: accumulates `value` into `*ptr`.
//
// ptr:   destination float, passed through a __shared_ptr__-qualified pointer.
// value: amount to add to the destination.
//
// The read / add / write-back sequence is repeated until REG2SM_atomic
// returns 0 for the write-back.
// NOTE(review): SM2REG_atomic / REG2SM_atomic look like a linked-load /
// conditional-store pair -- confirm against the XPU intrinsic documentation.
static inline __device__ void atomic_add(__shared_ptr__ float *ptr, float value) {
    int retry;
    do {
        // Re-read the current value on every attempt so the sum is never stale.
        float updated = SM2REG_atomic(ptr) + value;
        // Non-zero result means the write-back did not take; try again.
        retry = REG2SM_atomic(ptr, updated);
    } while (retry);
}
#endif
......@@ -26,16 +26,6 @@ static inline __device__ void elementMul(float *x, float *w, float *y, int count
}
}
// Atomic add used by the reduce step: accumulates `value` into `*ptr`.
//
// ptr:   destination float, passed through a __shared_ptr__-qualified pointer.
// value: amount to add to the destination.
//
// The read / add / write-back sequence is repeated until REG2SM_atomic
// returns 0 for the write-back.
// NOTE(review): SM2REG_atomic / REG2SM_atomic look like a linked-load /
// conditional-store pair -- confirm against the XPU intrinsic documentation.
static inline __device__ void atomic_add(__shared_ptr__ float *ptr, float value) {
    int retry;
    do {
        // Re-read the current value on every attempt so the sum is never stale.
        float updated = SM2REG_atomic(ptr) + value;
        // Non-zero result means the write-back did not take; try again.
        retry = REG2SM_atomic(ptr, updated);
    } while (retry);
}
// RmsNorm main kernel func
// kunlun2 has 8 clusters and 64 cores
// Call it by rmsnorm<<<8, 32, stream>>>()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment