"tests/git@developer.sourcefind.cn:tsoc/superbenchmark.git" did not exist on "3704a432b90277612da9b1553cda00725e60b03b"
Commit 8727bdcf authored by zhangyue's avatar zhangyue
Browse files

rename private vars

parent 30bf79f1
...@@ -32,21 +32,21 @@ private: ...@@ -32,21 +32,21 @@ private:
private: private:
TPipe pipe; TPipe pipe;
TQue<QuePosition::VECIN, BUFFER_NUM> inQue; TQue<QuePosition::VECIN, BUFFER_NUM> _in_que;
TQue<QuePosition::VECIN, BUFFER_NUM> sinQue; TQue<QuePosition::VECIN, BUFFER_NUM> _sin_que;
TQue<QuePosition::VECIN, BUFFER_NUM> cosQue; TQue<QuePosition::VECIN, BUFFER_NUM> _cos_que;
TQue<QuePosition::VECOUT, BUFFER_NUM> outQue; TQue<QuePosition::VECOUT, BUFFER_NUM> _out_que;
TBuf<TPosition::VECCALC> tmpOddBuf; TBuf<TPosition::VECCALC> _tmp_odd_buf;
TBuf<TPosition::VECCALC> tmpEvenBuf; TBuf<TPosition::VECCALC> _tmp_even_buf;
TBuf<TPosition::VECCALC> tmpOddBuf1; TBuf<TPosition::VECCALC> _tmp_odd_buf1;
TBuf<TPosition::VECCALC> tmpOddBuf2; TBuf<TPosition::VECCALC> _tmp_odd_buf2;
TBuf<TPosition::VECCALC> tmpEvenBuf1; TBuf<TPosition::VECCALC> _tmp_even_buf1;
TBuf<TPosition::VECCALC> tmpEvenBuf2; TBuf<TPosition::VECCALC> _tmp_even_buf2;
GlobalTensor<T> xGm, yGm; GlobalTensor<T> _x_gm, _y_gm;
GlobalTensor<U> pGm; GlobalTensor<U> _p_gm;
GlobalTensor<T> sinGm; GlobalTensor<T> _sin_gm;
GlobalTensor<T> cosGm; GlobalTensor<T> _cos_gm;
size_t _block_idx; size_t _block_idx;
size_t _tile_len; size_t _tile_len;
...@@ -83,57 +83,57 @@ __aicore__ inline void RoPEKernel<T, U>::init(GM_ADDR y, ...@@ -83,57 +83,57 @@ __aicore__ inline void RoPEKernel<T, U>::init(GM_ADDR y,
_block_idx = GetBlockIdx(); _block_idx = GetBlockIdx();
// Init global buffer // Init global buffer
xGm.SetGlobalBuffer((__gm__ T *)x); _x_gm.SetGlobalBuffer((__gm__ T *)x);
pGm.SetGlobalBuffer((__gm__ U *)pos); _p_gm.SetGlobalBuffer((__gm__ U *)pos);
sinGm.SetGlobalBuffer((__gm__ T *)sin); _sin_gm.SetGlobalBuffer((__gm__ T *)sin);
cosGm.SetGlobalBuffer((__gm__ T *)cos); _cos_gm.SetGlobalBuffer((__gm__ T *)cos);
yGm.SetGlobalBuffer((__gm__ T *)y); _y_gm.SetGlobalBuffer((__gm__ T *)y);
// Init Queue buffer // Init Queue buffer
pipe.InitBuffer(inQue, BUFFER_NUM, _copy_len * sizeof(T)); pipe.InitBuffer(_in_que, BUFFER_NUM, _copy_len * sizeof(T));
pipe.InitBuffer(outQue, BUFFER_NUM, _tile_len * sizeof(T)); pipe.InitBuffer(_out_que, BUFFER_NUM, _tile_len * sizeof(T));
pipe.InitBuffer(sinQue, BUFFER_NUM, _half_copy_len * sizeof(T)); pipe.InitBuffer(_sin_que, BUFFER_NUM, _half_copy_len * sizeof(T));
pipe.InitBuffer(cosQue, BUFFER_NUM, _half_copy_len * sizeof(T)); pipe.InitBuffer(_cos_que, BUFFER_NUM, _half_copy_len * sizeof(T));
pipe.InitBuffer(tmpOddBuf, _tile_len / 2 * sizeof(T)); pipe.InitBuffer(_tmp_odd_buf, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(tmpEvenBuf, _tile_len / 2 * sizeof(T)); pipe.InitBuffer(_tmp_even_buf, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(tmpOddBuf1, _tile_len / 2 * sizeof(T)); pipe.InitBuffer(_tmp_odd_buf1, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(tmpOddBuf2, _tile_len / 2 * sizeof(T)); pipe.InitBuffer(_tmp_odd_buf2, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(tmpEvenBuf1, _tile_len / 2 * sizeof(T)); pipe.InitBuffer(_tmp_even_buf1, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(tmpEvenBuf2, _tile_len / 2 * sizeof(T)); pipe.InitBuffer(_tmp_even_buf2, _tile_len / 2 * sizeof(T));
} }
template <typename T, typename U> template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyIn(size_t i) { __aicore__ inline void RoPEKernel<T, U>::copyIn(size_t i) {
LocalTensor<T> inputUb = inQue.AllocTensor<T>(); LocalTensor<T> input_ub = _in_que.AllocTensor<T>();
LocalTensor<T> sinUb = sinQue.AllocTensor<T>(); LocalTensor<T> sin_ub = _sin_que.AllocTensor<T>();
LocalTensor<T> cosUb = cosQue.AllocTensor<T>(); LocalTensor<T> cos_ub = _cos_que.AllocTensor<T>();
// Get idx of current tile in total input // Get idx of current tile in total input
auto idx = i * _st_xnt + _block_idx * _st_xnh; auto idx = i * _st_xnt + _block_idx * _st_xnh;
// Copy tile current tile into UB // Copy tile current tile into UB
DataCopy(inputUb, xGm[idx], _copy_len); DataCopy(input_ub, _x_gm[idx], _copy_len);
// Copy sin cos tile // Copy sin cos tile
auto pos_idx = pGm(i); auto pos_idx = _p_gm(i);
DataCopy(sinUb, sinGm[pos_idx * _tile_len / 2], _half_copy_len); DataCopy(sin_ub, _sin_gm[pos_idx * _tile_len / 2], _half_copy_len);
DataCopy(cosUb, cosGm[pos_idx * _tile_len / 2], _half_copy_len); DataCopy(cos_ub, _cos_gm[pos_idx * _tile_len / 2], _half_copy_len);
// Push in operands // Push in operands
inQue.EnQue(inputUb); _in_que.EnQue(input_ub);
sinQue.EnQue(sinUb); _sin_que.EnQue(sin_ub);
cosQue.EnQue(cosUb); _cos_que.EnQue(cos_ub);
} }
template <typename T, typename U> template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::compute(size_t i) { __aicore__ inline void RoPEKernel<T, U>::compute(size_t i) {
LocalTensor<T> inputUb = inQue.DeQue<T>(); LocalTensor<T> input_ub = _in_que.DeQue<T>();
LocalTensor<T> sinUb = sinQue.DeQue<T>(); LocalTensor<T> sin_ub = _sin_que.DeQue<T>();
LocalTensor<T> cosUb = cosQue.DeQue<T>(); LocalTensor<T> cos_ub = _cos_que.DeQue<T>();
LocalTensor<T> outputUb = outQue.AllocTensor<T>(); LocalTensor<T> output_ub = _out_que.AllocTensor<T>();
LocalTensor<T> tmpOdd = tmpOddBuf.Get<T>(); LocalTensor<T> tmp_odd = _tmp_odd_buf.Get<T>();
LocalTensor<T> tmpEven = tmpEvenBuf.Get<T>(); LocalTensor<T> tmp_even = _tmp_even_buf.Get<T>();
LocalTensor<T> tmpOdd1 = tmpOddBuf1.Get<T>(); LocalTensor<T> tmp_odd1 = _tmp_odd_buf1.Get<T>();
LocalTensor<T> tmpOdd2 = tmpOddBuf2.Get<T>(); LocalTensor<T> tmp_odd2 = _tmp_odd_buf2.Get<T>();
LocalTensor<T> tmpEven1 = tmpEvenBuf1.Get<T>(); LocalTensor<T> tmp_even1 = _tmp_even_buf1.Get<T>();
LocalTensor<T> tmpEven2 = tmpEvenBuf2.Get<T>(); LocalTensor<T> tmp_even2 = _tmp_even_buf2.Get<T>();
// separate odd and even bit elements // separate odd and even bit elements
uint64_t rsvdCnt = 0; uint64_t rsvdCnt = 0;
...@@ -143,43 +143,43 @@ __aicore__ inline void RoPEKernel<T, U>::compute(size_t i) { ...@@ -143,43 +143,43 @@ __aicore__ inline void RoPEKernel<T, U>::compute(size_t i) {
8, 8,
8, 8,
}; };
GatherMask<T>(tmpOdd, inputUb, 1, false, 0, gMaskParams, rsvdCnt); GatherMask<T>(tmp_odd, input_ub, 1, false, 0, gMaskParams, rsvdCnt);
GatherMask<T>(tmpEven, inputUb, 2, false, 0, gMaskParams, rsvdCnt); GatherMask<T>(tmp_even, input_ub, 2, false, 0, gMaskParams, rsvdCnt);
PipeBarrier<PIPE_V>(); PipeBarrier<PIPE_V>();
// compute odd bit elements // compute odd bit elements
// y_odd = x_odd * cos - x_even * sin // y_odd = x_odd * cos - x_even * sin
Mul<T>(tmpOdd1, tmpOdd, cosUb, _tile_len / 2); Mul<T>(tmp_odd1, tmp_odd, cos_ub, _tile_len / 2);
Mul<T>(tmpOdd2, tmpEven, sinUb, _tile_len / 2); Mul<T>(tmp_odd2, tmp_even, sin_ub, _tile_len / 2);
PipeBarrier<PIPE_V>(); PipeBarrier<PIPE_V>();
Sub<T>(tmpOdd1, tmpOdd1, tmpOdd2, _tile_len / 2); Sub<T>(tmp_odd1, tmp_odd1, tmp_odd2, _tile_len / 2);
// compute even bit elements // compute even bit elements
// y_even = x_odd * sin + x_even * cos // y_even = x_odd * sin + x_even * cos
Mul<T>(tmpEven1, tmpOdd, sinUb, _tile_len / 2); Mul<T>(tmp_even1, tmp_odd, sin_ub, _tile_len / 2);
Mul<T>(tmpEven2, tmpEven, cosUb, _tile_len / 2); Mul<T>(tmp_even2, tmp_even, cos_ub, _tile_len / 2);
PipeBarrier<PIPE_V>(); PipeBarrier<PIPE_V>();
Add<T>(tmpEven1, tmpEven1, tmpEven2, _tile_len / 2); Add<T>(tmp_even1, tmp_even1, tmp_even2, _tile_len / 2);
// combine odd and even bit elements // combine odd and even bit elements
for (uint32_t j = 0; j < _tile_len / 2; j += 1) { for (uint32_t j = 0; j < _tile_len / 2; j += 1) {
outputUb(j * 2) = tmpOdd1(j); output_ub(j * 2) = tmp_odd1(j);
outputUb(j * 2 + 1) = tmpEven1(j); output_ub(j * 2 + 1) = tmp_even1(j);
} }
outQue.EnQue<T>(outputUb); _out_que.EnQue<T>(output_ub);
inQue.FreeTensor(inputUb); _in_que.FreeTensor(input_ub);
sinQue.FreeTensor(sinUb); _sin_que.FreeTensor(sin_ub);
cosQue.FreeTensor(cosUb); _cos_que.FreeTensor(cos_ub);
} }
template <typename T, typename U> template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyOut(size_t i) { __aicore__ inline void RoPEKernel<T, U>::copyOut(size_t i) {
LocalTensor<T> outputUb = outQue.DeQue<T>(); LocalTensor<T> output_ub = _out_que.DeQue<T>();
auto idy = i * _st_ynt + _block_idx * _st_ynh; auto idy = i * _st_ynt + _block_idx * _st_ynh;
DataCopyExtParams params = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0}; DataCopyExtParams params = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
DataCopyPad(yGm[idy], outputUb, params); DataCopyPad(_y_gm[idy], output_ub, params);
outQue.FreeTensor(outputUb); _out_que.FreeTensor(output_ub);
} }
template <typename T, typename U> template <typename T, typename U>
...@@ -192,56 +192,30 @@ __aicore__ inline void RoPEKernel<T, U>::process(size_t seq_len) { ...@@ -192,56 +192,30 @@ __aicore__ inline void RoPEKernel<T, U>::process(size_t seq_len) {
} }
} }
#define ROPE_KERNEL(TYPE, POSTYPE) \ #define ROPE_KERNEL_INIT_ARGS y, x, pos, sin, cos, dhead, \
switch (POSTYPE) { \ y_stride_seqlen, y_stride_nhead, \
case INFINI_DTYPE_I8: { \ x_stride_seqlen, x_stride_nhead
RoPEKernel<TYPE, int8_t> op; \
op.init(y, x, pos, sin, cos, dhead, y_stride_seqlen, y_stride_nhead, x_stride_seqlen, x_stride_nhead); \ #define CASE_POSTYPE(POS_TYPE_ENUM, TYPE, POS_T) \
op.process(seq_len); \ case POS_TYPE_ENUM: { \
break; \ RoPEKernel<TYPE, POS_T> op; \
} \ op.init(ROPE_KERNEL_INIT_ARGS); \
case INFINI_DTYPE_I16: { \ op.process(seq_len); \
RoPEKernel<TYPE, int16_t> op; \ break; \
op.init(y, x, pos, sin, cos, dhead, y_stride_seqlen, y_stride_nhead, x_stride_seqlen, x_stride_nhead); \ }
op.process(seq_len); \
break; \ #define ROPE_KERNEL(TYPE, POSTYPE) \
} \ switch (POSTYPE) { \
case INFINI_DTYPE_I32: { \ CASE_POSTYPE(INFINI_DTYPE_I8, TYPE, int8_t) \
RoPEKernel<TYPE, int32_t> op; \ CASE_POSTYPE(INFINI_DTYPE_I16, TYPE, int16_t) \
op.init(y, x, pos, sin, cos, dhead, y_stride_seqlen, y_stride_nhead, x_stride_seqlen, x_stride_nhead); \ CASE_POSTYPE(INFINI_DTYPE_I32, TYPE, int32_t) \
op.process(seq_len); \ CASE_POSTYPE(INFINI_DTYPE_I64, TYPE, int64_t) \
break; \ CASE_POSTYPE(INFINI_DTYPE_U8, TYPE, uint8_t) \
} \ CASE_POSTYPE(INFINI_DTYPE_U16, TYPE, uint16_t) \
case INFINI_DTYPE_I64: { \ CASE_POSTYPE(INFINI_DTYPE_U32, TYPE, uint32_t) \
RoPEKernel<TYPE, int64_t> op; \ CASE_POSTYPE(INFINI_DTYPE_U64, TYPE, uint64_t) \
op.init(y, x, pos, sin, cos, dhead, y_stride_seqlen, y_stride_nhead, x_stride_seqlen, x_stride_nhead); \ default: \
op.process(seq_len); \ break; \
break; \
} \
case INFINI_DTYPE_U8: { \
RoPEKernel<TYPE, uint8_t> op; \
op.init(y, x, pos, sin, cos, dhead, y_stride_seqlen, y_stride_nhead, x_stride_seqlen, x_stride_nhead); \
op.process(seq_len); \
break; \
} \
case INFINI_DTYPE_U16: { \
RoPEKernel<TYPE, uint16_t> op; \
op.init(y, x, pos, sin, cos, dhead, y_stride_seqlen, y_stride_nhead, x_stride_seqlen, x_stride_nhead); \
op.process(seq_len); \
break; \
} \
case INFINI_DTYPE_U32: { \
RoPEKernel<TYPE, uint32_t> op; \
op.init(y, x, pos, sin, cos, dhead, y_stride_seqlen, y_stride_nhead, x_stride_seqlen, x_stride_nhead); \
op.process(seq_len); \
break; \
} \
case INFINI_DTYPE_U64: { \
RoPEKernel<TYPE, uint64_t> op; \
op.init(y, x, pos, sin, cos, dhead, y_stride_seqlen, y_stride_nhead, x_stride_seqlen, x_stride_nhead); \
op.process(seq_len); \
break; \
} \
} }
#define DEFINE_ROPE_KERNEL(KERNEL_NAME, TYPE) \ #define DEFINE_ROPE_KERNEL(KERNEL_NAME, TYPE) \
...@@ -264,6 +238,9 @@ DEFINE_ROPE_KERNEL(rope_kernel_float, float) ...@@ -264,6 +238,9 @@ DEFINE_ROPE_KERNEL(rope_kernel_float, float)
DEFINE_ROPE_KERNEL(rope_kernel_half, half) DEFINE_ROPE_KERNEL(rope_kernel_half, half)
#undef DEFINE_ROPE_KERNEL #undef DEFINE_ROPE_KERNEL
#undef ROPE_KERNEL
#undef CASE_POSTYPE
#undef ROPE_KERNEL_INIT_ARGS
extern "C" infiniStatus_t rope_kernel_launch( extern "C" infiniStatus_t rope_kernel_launch(
void *y, void *y,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment