Commit c1fa267c authored by zhangyunze's avatar zhangyunze
Browse files

feat:重构random sample ascend算子

parent b5c6c7b8
...@@ -5,7 +5,7 @@ project(Ascend_C) ...@@ -5,7 +5,7 @@ project(Ascend_C)
set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type") set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "ASCEND CANN package installation directory") set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "ASCEND CANN package installation directory")
set(RUN_MODE "npu" CACHE STRING "run mode: npu") set(RUN_MODE "npu" CACHE STRING "run mode: npu")
set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
constexpr size_t BLOCK_NUM = 8; constexpr size_t BLOCK_NUM = 8;
constexpr size_t BUFFER_NUM = 2; constexpr size_t BUFFER_NUM = 2;
constexpr size_t BYTE_ALIGN = 32; constexpr size_t BYTE_ALIGN = 32;
constexpr size_t BLOCK_LEN = 256;
template <typename T> template <typename T>
__aicore__ inline size_t alignTileLen(size_t tile_len, size_t byte_align) { __aicore__ inline size_t alignTileLen(size_t tile_len, size_t byte_align) {
......
#ifndef __ACLNN_RANDOM_SAMPLE_H__
#define __ACLNN_RANDOM_SAMPLE_H__
#include "../random_sample.h"
DESCRIPTOR(ascend)
#endif // __ACLNN_RANDOM_SAMPLE_H__
#include "random_sample_ascend.h"
// Construct a random-sample descriptor bound to the given device.
// Allocates the four aclnn tensor descriptors this object owns:
//   pDesc       - probabilities input
//   topkIdxDesc - top-k indices scratch
//   topkValDesc - top-k values scratch
//   resDesc     - sampled result output
// NOTE(review): resDesc is allocated here but ascendDestroyRandomSampleDescriptor
// does not delete it — verify it is released somewhere, else this leaks.
InfiniopRandomSampleAscendDescriptor::InfiniopRandomSampleAscendDescriptor(infiniDevice_t device_) {
device = device_;
device_id = 0; // overwritten with the handle's device id at create time
pDesc = new aclnnTensorDescriptor();
topkIdxDesc = new aclnnTensorDescriptor();
topkValDesc = new aclnnTensorDescriptor();
resDesc = new aclnnTensorDescriptor();
}
// Create and initialize a random-sample descriptor from the result/probs
// tensor descriptors.
// Returns INFINIOP_STATUS_BAD_TENSOR_SHAPE unless probs is 1-D and result
// is a single-element 1-D tensor.
infiniopStatus_t ascendCreateRandomSampleDescriptor(infiniopAscendHandle_t handle,
                                                    infiniopRandomSampleAscendDescriptor_t *desc_ptr,
                                                    infiniopTensorDescriptor_t result,
                                                    infiniopTensorDescriptor_t probs) {
    if (probs->ndim != 1) {
        return INFINIOP_STATUS_BAD_TENSOR_SHAPE;
    }
    // BUGFIX: was `result->ndim != 1 && result->shape[0] != 1`, which only
    // rejected tensors failing BOTH checks (e.g. a 1-D result with 5
    // elements passed). The result must be exactly one element.
    if (result->ndim != 1 || result->shape[0] != 1) {
        return INFINIOP_STATUS_BAD_TENSOR_SHAPE;
    }
    (*desc_ptr) = new InfiniopRandomSampleAscendDescriptor(handle->device);
    (*desc_ptr)->device_id = handle->device_id;
    CHECK_STATUS((*desc_ptr)->pDesc->fromInfiniOpTensorDescriptor(probs), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS((*desc_ptr)->resDesc->fromInfiniOpTensorDescriptor(result), INFINIOP_STATUS_SUCCESS);
    // Ascend aclnnTopk doesn't support U64 type, so the result index is
    // always emitted as signed 64-bit.
    (*desc_ptr)->resDesc->dataType = aclDataType::ACL_INT64;
    return INFINIOP_STATUS_SUCCESS;
}
// Report the workspace bytes needed by ascendRandomSample: one probs-sized
// buffer of the probs dtype (top-k values) plus one probs-sized buffer of
// int64 (top-k indices).
infiniopStatus_t ascendGetRandomSampleWorkspaceSize(infiniopRandomSampleAscendDescriptor_t desc,
                                                    uint64_t *size) {
    auto &pDesc = desc->pDesc;
    // Element count is shared by both scratch buffers; compute it once.
    auto count = numElements(pDesc->shape.data(), pDesc->ndim);
    *size = count * (aclDataTypeSize(pDesc->dataType) + infiniSizeof(infiniDtype_t::INFINI_DTYPE_I64));
    return INFINIOP_STATUS_SUCCESS;
}
// Run top-k/top-p random sampling on a 1-D probability tensor.
// Pipeline: aclnnTopk selects the top-k (or top-1 when sampling is
// disabled), then the custom AscendC kernel does softmax + inclusive-sum
// + threshold sampling. When doSample is false, aclnnTopk writes the
// argmax index straight into `result` and the kernel is skipped.
// workspace layout: [top-k values | top-k indices].
infiniopStatus_t ascendRandomSample(infiniopRandomSampleAscendDescriptor_t desc,
                                    void *workspace,
                                    uint64_t workspace_size,
                                    void *result,
                                    void const *probs,
                                    float random_val,
                                    float topp,
                                    int topk,
                                    float temperature,
                                    void *stream) {
    // Validate sampling knobs: topk must be positive, topp and random_val in [0, 1].
    if (topk <= 0 || topp < 0 || topp > 1.0) {
        return INFINIOP_STATUS_BAD_PARAM;
    }
    if (random_val < 0 || random_val > 1.0) {
        return INFINIOP_STATUS_BAD_PARAM;
    }
    auto &pDesc = desc->pDesc;
    auto &topkIdxDesc = desc->topkIdxDesc;
    auto &topkValDesc = desc->topkValDesc;
    auto ndim = static_cast<int64_t>(pDesc->ndim);
    auto voc = pDesc->shape[0];
    // Clamp topk to the vocabulary size.
    auto topk_ = topk <= voc ? topk : voc;
    // Any of these settings degenerates sampling into plain argmax.
    bool doSample = topk_ > 1 && temperature != 0 && topp != 0;
    auto topkShape = std::vector<int64_t>(pDesc->shape);
    topkShape[ndim - 1] = doSample ? topk_ : 1;
    auto topkStrides = std::vector<int64_t>(pDesc->strides);
    // Infer contiguous strides
    topkStrides[ndim - 1] = 1;
    for (int64_t i = ndim - 2; i >= 0; --i) {
        topkStrides[i] = topkStrides[i + 1] * topkShape[i + 1];
    }
    CHECK_STATUS(topkValDesc->setDescriptor(pDesc->dataType, topkShape, topkStrides), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(topkIdxDesc->setDescriptor(aclDataType::ACL_INT64, topkShape, topkStrides), INFINIOP_STATUS_SUCCESS);
    // Infer data ptr: values first, indices right after, inside `workspace`.
    auto workspaceTmp = workspace;
    auto topkValAddr = workspaceTmp;
    workspaceTmp = (void *)((uint8_t *)workspace + numElements(topkValDesc->shape.data(), topkValDesc->ndim) * aclDataTypeSize(topkValDesc->dataType));
    auto topkIdxAddr = workspaceTmp;
    auto pAddr = (void *)probs;
    // Create aclTensor
    CHECK_STATUS(pDesc->createTensor(pAddr), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(topkValDesc->createTensor(topkValAddr), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(topkIdxDesc->createTensor(topkIdxAddr), INFINIOP_STATUS_SUCCESS);
    if (!doSample) {
        // Argmax path: aclnnTopk writes the index directly into result.
        CHECK_STATUS(desc->resDesc->createTensor(result), INFINIOP_STATUS_SUCCESS);
    }
    // Do Topk calculate
    uint64_t topkWorkspaceSize = 0;
    aclOpExecutor *topkExecutor = nullptr;
    auto ret = aclnnTopkGetWorkspaceSize(pDesc->t,
                                         topkShape[ndim - 1],
                                         ndim - 1,
                                         true,
                                         true,
                                         topkValDesc->t,
                                         doSample ? topkIdxDesc->t
                                                  : desc->resDesc->t,
                                         &topkWorkspaceSize,
                                         &topkExecutor);
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclnnTopkGetWorkspaceSize failed ERROR: %d\n", ret);
              return INFINIOP_STATUS_INTERNAL_ERROR);
    void *topkWorkspace;
    CHECK_STATUS(mallocWorkspace(&topkWorkspace, topkWorkspaceSize), INFINIOP_STATUS_SUCCESS);
    ret = aclnnTopk(topkWorkspace,
                    topkWorkspaceSize,
                    topkExecutor,
                    stream);
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclnnTopk failed ERROR: %d\n", ret);
              return INFINIOP_STATUS_INTERNAL_ERROR);
    CHECK_STATUS(freeWorkspace(topkWorkspace), INFINIOP_STATUS_SUCCESS);
    if (doSample) {
        // Do softmax and topp random sample.
        // BUGFIX: was passing the raw, unclamped `topk`; the kernel must see
        // the vocabulary-clamped `topk_` that matches the top-k buffers above,
        // otherwise it reads past the valid top-k data when topk > voc.
        random_sample_do(
            pAddr,
            result,
            topkValAddr,
            topkIdxAddr,
            static_cast<int>(topk_),
            static_cast<int>(pDesc->shape[0]),
            topp,
            temperature,
            random_val,
            pDesc->dataType,
            stream);
    }
    return INFINIOP_STATUS_SUCCESS;
}
// Release every resource owned by the descriptor.
infiniopStatus_t ascendDestroyRandomSampleDescriptor(infiniopRandomSampleAscendDescriptor_t desc) {
    delete desc->pDesc;
    delete desc->topkIdxDesc;
    delete desc->topkValDesc;
    // BUGFIX: resDesc is allocated in the constructor but was never freed
    // here — every create/destroy cycle leaked one aclnnTensorDescriptor.
    delete desc->resDesc;
    delete desc;
    return INFINIOP_STATUS_SUCCESS;
}
#ifndef __RANDOM_SAMPLE_ASCEND_H__
#define __RANDOM_SAMPLE_ASCEND_H__
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "random_sample_ascend_api.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <acl/acl_rt.h>
#include <aclnnop/aclnn_topk.h>
// Per-operator state for the Ascend random-sample implementation.
struct InfiniopRandomSampleAscendDescriptor {
infiniDevice_t device; // device type this descriptor was created for
int device_id;         // Ascend device index (from the handle)
// aclnn tensor descriptors: probs input, top-k values/indices scratch,
// and the sampled-result output (forced to ACL_INT64 at create time).
aclnnTensorDescriptor_t pDesc, topkValDesc, topkIdxDesc, resDesc;
InfiniopRandomSampleAscendDescriptor(infiniDevice_t device_);
};
extern "C" void
random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr,
int32_t topk, int32_t voc, float topp, float temper,
float random, int dtype, void *stream);
#endif
#ifndef __RANDOM_SAMPLE_ASCEND_API_H__
#define __RANDOM_SAMPLE_ASCEND_API_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "infiniop/operator.h"
struct InfiniopRandomSampleAscendDescriptor;
typedef struct InfiniopRandomSampleAscendDescriptor *infiniopRandomSampleAscendDescriptor_t;
infiniopStatus_t ascendCreateRandomSampleDescriptor(infiniopAscendHandle_t handle,
infiniopRandomSampleAscendDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t results,
infiniopTensorDescriptor_t probs);
infiniopStatus_t ascendGetRandomSampleWorkspaceSize(infiniopRandomSampleAscendDescriptor_t desc,
uint64_t *size);
infiniopStatus_t ascendRandomSample(infiniopRandomSampleAscendDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *result,
void const *probs,
float random_val,
float topp,
int topk,
float temperature,
void *stream);
infiniopStatus_t ascendDestroyRandomSampleDescriptor(infiniopRandomSampleAscendDescriptor_t desc);
#endif
#include "kernel_operator.h" #include "../../../devices/ascend/ascend_kernel_common.h"
using namespace AscendC; using namespace AscendC;
const int32_t BLOCK_LEN = 256;
template <typename T> template <typename T>
class KernelRandomSample { class RandomSampleKernel {
public: public:
__aicore__ inline KernelRandomSample() {} __aicore__ inline RandomSampleKernel() {}
__aicore__ inline void Init(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr, __aicore__ inline void init(GM_ADDR probs, GM_ADDR result, GM_ADDR topk_val_addr, GM_ADDR topk_idx_addr, float random_val, float topp, int topk, float temperature, int32_t n);
GM_ADDR topkIdxAddr, int32_t topk_, int32_t voc_, __aicore__ inline void process();
float topp_, float temper_, float random_) {
topk = topk_;
voc = voc_;
topp = topp_;
invTemperature = 1.0f / temper_;
random = random_;
negMax = 0.f;
sum = 0.f;
// CumSumInfo
topkAligned = topk * sizeof(T) % 32 == 0
? topk
: (topk * sizeof(T) + 31) / 32 * 32 / sizeof(T);
vocAligned = voc * sizeof(T) % 32 == 0
? voc
: (voc * sizeof(T) + 31) / 32 * 32 / sizeof(T);
topkIdxAligned = (topk + 3) / 4 * 4;
bufferLen = topkAligned > BLOCK_LEN ? topkAligned : BLOCK_LEN;
// Set Gm
pGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(p), voc);
topkGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(topkAddr), topk);
topkIdxGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(topkIdxAddr), topk);
resGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(res), 1);
// Global input and output
pipe.InitBuffer(topkQue, 1, topkAligned * sizeof(T));
pipe.InitBuffer(topkIdxQue, 1, topkIdxAligned * sizeof(int64_t));
pipe.InitBuffer(resQue, 1, 32); // 32 bytes for aligned
pipe.InitBuffer(inBuf, BLOCK_LEN * sizeof(T));
pipe.InitBuffer(tmpBuf1, bufferLen * sizeof(T));
pipe.InitBuffer(tmpBuf2, bufferLen * sizeof(T));
pipe.InitBuffer(tmpBuf3, bufferLen * sizeof(T));
pipe.InitBuffer(softMaxOutBuf, topkAligned * sizeof(T));
pipe.InitBuffer(inclusiveSumOutBuf, topkAligned * sizeof(T));
}
__aicore__ inline void Process() {
CopyIn();
Compute();
CopyOut();
}
private: private:
// Softmax __aicore__ inline void copyIn();
__aicore__ inline void copyOut();
__aicore__ inline void compute();
__aicore__ inline void SoftMax(LocalTensor<T> &topkValIn, __aicore__ inline void SoftMax(LocalTensor<T> &topkValIn,
LocalTensor<T> &softMaxOut) { LocalTensor<T> &softMaxOut);
float invSum = 1.0f / sum;
LocalTensor<T> tmpBuffer = tmpBuf1.Get<T>();
LocalTensor<T> tmpBuffer2 = tmpBuf2.Get<T>();
LocalTensor<T> tmpBuffer3 = tmpBuf3.Get<T>();
Adds(tmpBuffer, topkValIn, static_cast<T>(negMax), topk);
Muls(tmpBuffer2, tmpBuffer, static_cast<T>(invTemperature), topk);
Exp(tmpBuffer3, tmpBuffer2, topk);
Muls(softMaxOut, tmpBuffer3, static_cast<T>(invSum), topk);
}
// Cumsum
__aicore__ inline void InclusiveSum(LocalTensor<T> &topkValIn, __aicore__ inline void InclusiveSum(LocalTensor<T> &topkValIn,
LocalTensor<T> &topkValOut) { LocalTensor<T> &topkValOut);
static constexpr CumSumConfig cumSumConfig{true, false, false};
LocalTensor<T> lastRowLocal;
CumSum<T, cumSumConfig>(topkValOut, lastRowLocal, topkValIn,
{1, static_cast<uint32_t>(topkAligned)});
}
// Random sample
__aicore__ inline void RandomSample(LocalTensor<T> &valIn, __aicore__ inline void RandomSample(LocalTensor<T> &valIn,
LocalTensor<int64_t> &Index, LocalTensor<int64_t> &Index,
LocalTensor<int64_t> &result) { LocalTensor<int64_t> &result);
int end = 0;
for (end = 0; end < topk; end++) { GlobalTensor<T> _pGM,
if (static_cast<float>(valIn(end)) >= topp) { _topk_valGM;
break; GlobalTensor<int64_t> _topk_idxGM, _resGM;
} TPipe pipe;
} TQue<QuePosition::VECIN, 1> _topk_valQue;
if (end < topk - 1) { TQue<QuePosition::VECIN, 1> _topk_idxQue;
end += 1; TQue<QuePosition::VECOUT, 1> _resQue;
} else {
end = topk; TBuf<TPosition::VECCALC> _inBuf;
TBuf<TPosition::VECCALC> _tmp1Buf;
TBuf<TPosition::VECCALC> _tmp2Buf;
TBuf<TPosition::VECCALC> _tmp3Buf;
TBuf<TPosition::VECCALC> _softmax_OutBuf;
TBuf<TPosition::VECCALC> _inclusive_sum_OutBuf;
int32_t _topk;
int32_t _voc;
float _random_val;
float _topp;
float _invTemp;
float _negMax = 0.f;
float _sum = 0.f;
int32_t _topkAligned;
int32_t _topkIdxAligned;
int32_t _vocAligned;
int32_t _bufferLen;
};
// Initialize kernel parameters, global-tensor views, and on-chip buffers
// for one sampling call.
//   probs:         [n] probability/logit buffer in global memory
//   result:        [1] int64 output index in global memory
//   topk_val_addr: [topk] top-k values (written earlier by aclnnTopk)
//   topk_idx_addr: [topk] int64 top-k indices (written earlier by aclnnTopk)
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::init(GM_ADDR probs, GM_ADDR result, GM_ADDR topk_val_addr, GM_ADDR topk_idx_addr, float random_val, float topp, int topk, float temperature, int32_t n) {
_topk = topk;
_voc = n;
_random_val = random_val;
_topp = topp;
// Stored as reciprocal so softmax can multiply instead of divide.
_invTemp = 1.0f / temperature;
// CumSumInfo
// Align lengths up to the 32-byte DataCopy granularity.
_topkAligned = alignTileLen<T>(_topk, BYTE_ALIGN);
_vocAligned = alignTileLen<T>(_voc, BYTE_ALIGN);
// int64 indices: round topk up to a multiple of 4 (4 * 8B = 32B).
_topkIdxAligned = (_topk + 3) / 4 * 4;
// Temp buffers must hold at least one BLOCK_LEN tile of the probs scan.
_bufferLen = _topkAligned > BLOCK_LEN ? _topkAligned : BLOCK_LEN;
// Set GlobalTensor
_pGM.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(probs), _voc);
_topk_valGM.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(topk_val_addr), _topk);
_topk_idxGM.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(topk_idx_addr), _topk);
_resGM.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(result), 1);
// Global input and output
pipe.InitBuffer(_topk_valQue, 1, _topkAligned * sizeof(T));
pipe.InitBuffer(_topk_idxQue, 1, _topkIdxAligned * sizeof(int64_t));
pipe.InitBuffer(_resQue, 1, BYTE_ALIGN); // 32 bytes for aligned
pipe.InitBuffer(_inBuf, BLOCK_LEN * sizeof(T));
pipe.InitBuffer(_tmp1Buf, _bufferLen * sizeof(T));
pipe.InitBuffer(_tmp2Buf, _bufferLen * sizeof(T));
pipe.InitBuffer(_tmp3Buf, _bufferLen * sizeof(T));
pipe.InitBuffer(_softmax_OutBuf, _topkAligned * sizeof(T));
pipe.InitBuffer(_inclusive_sum_OutBuf, _topkAligned * sizeof(T));
}
// Run the three pipeline stages in order:
// copyIn (load top-k + accumulate softmax normalizer over probs),
// compute (softmax, inclusive sum, threshold sampling), copyOut (write result).
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::process() {
copyIn();
compute();
copyOut();
}
// Softmax restricted to the top-k values:
//   out[i] = exp((topkValIn[i] - max) * _invTemp) / _sum
// where _negMax holds -max (taken from the largest top-k value) and _sum is
// the full-vocabulary normalizer, both accumulated during copyIn().
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::SoftMax(LocalTensor<T> &topkValIn,
LocalTensor<T> &softMaxOut) {
float invSum = 1.0f / _sum;
LocalTensor<T> tmpBuffer = _tmp1Buf.Get<T>();
LocalTensor<T> tmpBuffer2 = _tmp2Buf.Get<T>();
LocalTensor<T> tmpBuffer3 = _tmp3Buf.Get<T>();
// x - max  (adds the negated max), then scale by 1/temperature, exp, normalize.
Adds(tmpBuffer, topkValIn, static_cast<T>(_negMax), _topk);
Muls(tmpBuffer2, tmpBuffer, static_cast<T>(_invTemp), _topk);
Exp(tmpBuffer3, tmpBuffer2, _topk);
Muls(softMaxOut, tmpBuffer3, static_cast<T>(invSum), _topk);
}
// Inclusive prefix sum over the (aligned) top-k probabilities, producing the
// cumulative distribution that RandomSample() thresholds against.
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::InclusiveSum(LocalTensor<T> &topkValIn,
LocalTensor<T> &topkValOut) {
// {true, false, false}: inclusive scan, defaults for the remaining options.
static constexpr CumSumConfig cumSumConfig{true, false, false};
LocalTensor<T> lastRowLocal;
CumSum<T, cumSumConfig>(topkValOut, lastRowLocal, topkValIn,
{1, static_cast<uint32_t>(_topkAligned)});
}
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::RandomSample(LocalTensor<T> &valIn,
LocalTensor<int64_t> &Index,
LocalTensor<int64_t> &result) {
int end = 0;
for (end = 0; end < _topk; end++) {
if (static_cast<float>(valIn(end)) >= _topp) {
break;
} }
}
if (end < _topk - 1) {
end += 1;
} else {
end = _topk;
}
auto randomVal = random * static_cast<float>(valIn(end - 1)); auto random_val = _random_val * static_cast<float>(valIn(end - 1));
for (int i = 0; i < end; i++) { for (int i = 0; i < end; i++) {
if (randomVal < static_cast<float>(valIn(i))) { if (random_val < static_cast<float>(valIn(i))) {
result(0) = Index(i); result(0) = Index(i);
return; return;
}
} }
result(0) = Index(end - 1);
} }
result(0) = Index(end - 1);
}
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::copyIn() {
LocalTensor<T> topkValLocal = _topk_valQue.AllocTensor<T>();
LocalTensor<int64_t> topkIdxLocal = _topk_idxQue.AllocTensor<int64_t>();
DataCopy(topkValLocal, _topk_valGM, _topkAligned);
DataCopy(topkIdxLocal, _topk_idxGM, _topkIdxAligned);
// Get Max val of input
_negMax = -static_cast<float>(topkValLocal(0));
__aicore__ inline void CopyIn() { // Copy in p and compute _sum
LocalTensor<T> topkValLocal = topkQue.AllocTensor<T>(); int32_t repeatTimes = _voc / BLOCK_LEN;
LocalTensor<int64_t> topkIdxLocal = topkIdxQue.AllocTensor<int64_t>(); int32_t remainder = _voc % BLOCK_LEN;
DataCopy(topkValLocal, topkGm, topkAligned); float sum_s = 0.f;
DataCopy(topkIdxLocal, topkIdxGm, topkIdxAligned); LocalTensor<T> _inBuffer = _inBuf.Get<T>();
// Get Max val of input LocalTensor<T> tmpBuffer = _tmp1Buf.Get<T>();
negMax = -static_cast<float>(topkValLocal(0)); LocalTensor<T> tmpBuffer2 = _tmp2Buf.Get<T>();
LocalTensor<T> tmpBuffer3 = _tmp3Buf.Get<T>();
// Copy in p and compute sum for (int32_t i = 0; i < repeatTimes; i++) {
int32_t repeatTimes = voc / BLOCK_LEN; DataCopy(_inBuffer, _pGM[i * BLOCK_LEN], BLOCK_LEN);
int32_t remainder = voc % BLOCK_LEN; Adds(tmpBuffer, _inBuffer, static_cast<T>(_negMax), BLOCK_LEN);
float sum_s = 0.f; Muls(tmpBuffer2, tmpBuffer, static_cast<T>(_invTemp), BLOCK_LEN);
LocalTensor<T> inBuffer = inBuf.Get<T>(); Exp(tmpBuffer3, tmpBuffer2, BLOCK_LEN);
LocalTensor<T> tmpBuffer = tmpBuf1.Get<T>(); sum_s = 0.f;
LocalTensor<T> tmpBuffer2 = tmpBuf2.Get<T>(); for (int j = 0; j < BLOCK_LEN; ++j) {
LocalTensor<T> tmpBuffer3 = tmpBuf3.Get<T>(); sum_s += static_cast<float>(tmpBuffer3(j));
for (int32_t i = 0; i < repeatTimes; i++) {
DataCopy(inBuffer, pGm[i * BLOCK_LEN], BLOCK_LEN);
Adds(tmpBuffer, inBuffer, static_cast<T>(negMax), BLOCK_LEN);
Muls(tmpBuffer2, tmpBuffer, static_cast<T>(invTemperature), BLOCK_LEN);
Exp(tmpBuffer3, tmpBuffer2, BLOCK_LEN);
sum_s = 0.f;
for (int j = 0; j < BLOCK_LEN; ++j) {
sum_s += static_cast<float>(tmpBuffer3(j));
}
sum += sum_s;
} }
if (remainder != 0) { _sum += sum_s;
int32_t remainderAligned = remainder * sizeof(T) % 32 == 0 }
? remainder if (remainder != 0) {
: (remainder * sizeof(T) + 31) / 32 * 32 / sizeof(T); int32_t remainderAligned = alignTileLen<T>(remainder, BYTE_ALIGN);
DataCopy(inBuffer, pGm[repeatTimes * BLOCK_LEN], remainderAligned); DataCopy(_inBuffer, _pGM[repeatTimes * BLOCK_LEN], remainderAligned);
Adds(tmpBuffer, inBuffer, static_cast<T>(negMax), remainder); Adds(tmpBuffer, _inBuffer, static_cast<T>(_negMax), remainder);
Muls(tmpBuffer2, tmpBuffer, static_cast<T>(invTemperature), remainder); Muls(tmpBuffer2, tmpBuffer, static_cast<T>(_invTemp), remainder);
Exp(tmpBuffer3, tmpBuffer2, remainder); Exp(tmpBuffer3, tmpBuffer2, remainder);
sum_s = 0.f; sum_s = 0.f;
for (int i = 0; i < remainder; ++i) { for (int i = 0; i < remainder; ++i) {
sum_s += static_cast<float>(tmpBuffer3(i)); sum_s += static_cast<float>(tmpBuffer3(i));
}
sum += sum_s;
} }
_sum += sum_s;
topkQue.EnQue(topkValLocal);
topkIdxQue.EnQue(topkIdxLocal);
} }
__aicore__ inline void Compute() { _topk_valQue.EnQue(topkValLocal);
// Get input data _topk_idxQue.EnQue(topkIdxLocal);
LocalTensor<T> topkValLocal = topkQue.DeQue<T>(); }
// SoftMax
LocalTensor<T> softMaxOutLocal = softMaxOutBuf.Get<T>();
SoftMax(topkValLocal, softMaxOutLocal);
// InclusiveSum template <typename T>
LocalTensor<T> inclusiveOutLocal = inclusiveSumOutBuf.Get<T>(); __aicore__ inline void RandomSampleKernel<T>::compute() {
InclusiveSum(softMaxOutLocal, inclusiveOutLocal); // Get input data
LocalTensor<T> topkValLocal = _topk_valQue.DeQue<T>();
// randomSample // SoftMax
LocalTensor<int64_t> topkIdxLocal = topkIdxQue.DeQue<int64_t>(); LocalTensor<T> softMaxOutLocal = _softmax_OutBuf.Get<T>();
LocalTensor<int64_t> resultLocal = resQue.AllocTensor<int64_t>(); SoftMax(topkValLocal, softMaxOutLocal);
RandomSample(inclusiveOutLocal, topkIdxLocal, resultLocal);
topkQue.FreeTensor(topkValLocal); // InclusiveSum
topkIdxQue.FreeTensor(topkIdxLocal); LocalTensor<T> inclusiveOutLocal = _inclusive_sum_OutBuf.Get<T>();
resQue.EnQue(resultLocal); InclusiveSum(softMaxOutLocal, inclusiveOutLocal);
}
__aicore__ inline void CopyOut() {
LocalTensor<int64_t> resLocal = resQue.DeQue<int64_t>();
DataCopy(resGm, resLocal, 32 / sizeof(int64_t));
resQue.FreeTensor(resLocal);
}
private: // randomSample
GlobalTensor<T> pGm; LocalTensor<int64_t> topkIdxLocal = _topk_idxQue.DeQue<int64_t>();
GlobalTensor<T> topkGm; LocalTensor<int64_t> resultLocal = _resQue.AllocTensor<int64_t>();
GlobalTensor<int64_t> topkIdxGm; RandomSample(inclusiveOutLocal, topkIdxLocal, resultLocal);
GlobalTensor<int64_t> resGm;
TPipe pipe; _topk_valQue.FreeTensor(topkValLocal);
_topk_idxQue.FreeTensor(topkIdxLocal);
_resQue.EnQue(resultLocal);
}
TQue<QuePosition::VECIN, 1> topkQue; template <typename T>
TQue<QuePosition::VECIN, 1> topkIdxQue; __aicore__ inline void RandomSampleKernel<T>::copyOut() {
TQue<QuePosition::VECOUT, 1> resQue; LocalTensor<int64_t> resLocal = _resQue.DeQue<int64_t>();
DataCopy(_resGM, resLocal, BYTE_ALIGN / sizeof(int64_t));
TBuf<TPosition::VECCALC> inBuf; _resQue.FreeTensor(resLocal);
TBuf<TPosition::VECCALC> tmpBuf1; }
TBuf<TPosition::VECCALC> tmpBuf2;
TBuf<TPosition::VECCALC> tmpBuf3;
TBuf<TPosition::VECCALC> softMaxOutBuf;
TBuf<TPosition::VECCALC> inclusiveSumOutBuf;
// Kernel params
int32_t topk;
int32_t voc;
float topp;
float invTemperature;
float random;
float negMax;
float sum;
int32_t topkAligned;
int32_t topkIdxAligned;
int32_t vocAligned;
int32_t bufferLen;
};
extern "C" __global__ __aicore__ void extern "C" __global__ __aicore__ void random_sample_kernel_fp16(
random_sample_kernel_f16(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr, GM_ADDR probs,
GM_ADDR topkIdxAddr, int32_t topk_, int32_t voc_, GM_ADDR result,
float topp_, float temper_, float random_) { GM_ADDR topk_val_addr,
KernelRandomSample<half> op; GM_ADDR topk_idx_addr,
op.Init(p, res, topkAddr, topkIdxAddr, topk_, voc_, topp_, temper_, random_); float random_val,
op.Process(); float topp,
int topk,
float temperature,
int32_t n) {
RandomSampleKernel<half> op;
op.init(probs, result, topk_val_addr, topk_idx_addr, random_val, topp, topk, temperature, n);
op.process();
} }
extern "C" void extern "C" __global__ __aicore__ void random_sample_kernel_fp32(
random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr, GM_ADDR probs,
int32_t topk, int32_t voc, float topp, float temper, GM_ADDR result,
float random, int dtype, void *stream) { GM_ADDR topk_val_addr,
GM_ADDR topk_idx_addr,
float random_val,
float topp,
int topk,
float temperature,
int32_t n) {
RandomSampleKernel<float> op;
op.init(probs, result, topk_val_addr, topk_idx_addr, random_val, topp, topk, temperature, n);
op.process();
}
switch (dtype) { extern "C" infiniStatus_t random_sample_kernel_launch(
case 1: void *probs,
random_sample_kernel_f16<<<1, nullptr, stream>>>( void *result,
p, res, topkAddr, topkIdxAddr, topk, voc, topp, temper, random); void *topk_val_addr,
void *topk_idx_addr,
float random_val,
float topp,
int topk,
float temperature,
uint64_t n,
infiniDtype_t dt_p,
void *stream) {
switch (dt_p) {
case INFINI_DTYPE_F16:
random_sample_kernel_fp16<<<1, nullptr, stream>>>(probs, result, topk_val_addr, topk_idx_addr, random_val, topp, topk, temperature, n);
break;
case INFINI_DTYPE_F32:
random_sample_kernel_fp32<<<1, nullptr, stream>>>(probs, result, topk_val_addr, topk_idx_addr, random_val, topp, topk, temperature, n);
break;
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
} }
return INFINI_STATUS_SUCCESS;
} }
#include "../../../devices/ascend/common_ascend.h"
#include "random_sample_aclnn.h"
#include <aclnnop/aclnn_topk.h>
namespace op::random_sample::ascend {
// Backend-private state: aclnn tensor descriptors for the probs input and
// the result output, created once in Descriptor::create and owned (and
// released) here.
struct Descriptor::Opaque {
aclnnTensorDescriptor_t probs;
aclnnTensorDescriptor_t result;
~Opaque() {
delete probs;
delete result;
}
};
Descriptor::~Descriptor() = default;
// Validate the result/probs descriptors and build an Ascend random-sample
// Descriptor. The result index dtype must be INFINI_DTYPE_I64.
// The workspace holds the aclnnTopk outputs: one probs-sized buffer of the
// probs dtype (values) plus one probs-sized buffer of int64 (indices).
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t result_desc,
infiniopTensorDescriptor_t probs_desc) {
auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
auto result = RandomSampleInfo::create(result_desc, probs_desc);
CHECK_RESULT(result);
CHECK_DTYPE(result->dt_i, INFINI_DTYPE_I64);
auto workspace_size = probs_desc->numel() * infiniSizeOf(probs_desc->dtype()) + probs_desc->numel() * infiniSizeOf(infiniDtype_t::INFINI_DTYPE_I64);
// Cached aclnn views of the result/probs tensors, owned by Opaque.
auto tresult = new aclnnTensorDescriptor(result_desc);
auto tprobs = new aclnnTensorDescriptor(probs_desc);
*desc_ptr
= new Descriptor(
result.take(),
workspace_size,
new Opaque{tprobs, tresult},
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
// Minimum workspace bytes required by calculate(), fixed at create time.
size_t Descriptor::minWorkspaceSize() const {
return _min_workspace_size;
}
extern "C" infiniStatus_t random_sample_kernel_launch(
void *probs,
void *result,
void *topk_val_addr,
void *topk_idx_addr,
float random_val,
float topp,
int topk,
float temperature,
uint64_t n,
infiniDtype_t dt_p,
void *stream);
// Run top-k/top-p random sampling: aclnnTopk selects the top-k (or the
// argmax directly into `result` when sampling is disabled), then the
// custom AscendC kernel performs softmax + inclusive-sum + sampling.
// workspace layout: [top-k values | top-k indices].
infiniStatus_t
Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *result,
    const void *probs,
    float random_val,
    float topp,
    int topk,
    float temperature,
    void *stream) const {
    if (workspace_size < _min_workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    // Clamp topk to the vocabulary size.
    auto topk_ = topk <= (int)_info.n ? topk : (int)_info.n;
    // Any of these knobs being zero degenerates sampling into plain argmax.
    bool dosample = topk_ > 1 && temperature != 0.0f && topp != 0.0f && random_val != 0.0f;
    auto topk_shape = std::vector<int64_t>{dosample ? topk_ : 1};
    auto topk_stride = std::vector<int64_t>{1};
    // BUGFIX: these were heap-allocated with `new` and never deleted,
    // leaking two aclnnTensorDescriptors per call (and on every early
    // CHECK_ACL return). Stack (RAII) lifetime releases them on all paths.
    aclnnTensorDescriptor topk_idx(toAclDataType(_info.dt_i), topk_shape, topk_stride);
    aclnnTensorDescriptor topk_val(toAclDataType(_info.dt_p), topk_shape, topk_stride);
    // Workspace layout: values first, int64 indices right after.
    auto topk_val_addr = workspace;
    auto topk_idx_addr = (void *)((uint8_t *)workspace + topk_ * infiniSizeOf(_info.dt_p));
    uint64_t topk_workspace_size = 0;
    aclOpExecutor *topk_executor = nullptr;
    CHECK_ACL(aclnnTopkGetWorkspaceSize(_opaque->probs->tensor,
                                        topk_shape[0],
                                        0,
                                        true,
                                        true,
                                        topk_val.tensor,
                                        dosample ? topk_idx.tensor : _opaque->result->tensor,
                                        &topk_workspace_size,
                                        &topk_executor));
    // Tensor addresses are bound after the workspace-size query, which
    // requires a repeatable executor.
    CHECK_ACL(aclSetAclOpExecutorRepeatable(topk_executor));
    void *topk_workspace = nullptr;
    if (topk_workspace_size > 0) {
        // Guard: aclrtMalloc rejects zero-byte allocations.
        CHECK_ACL(aclrtMalloc(&topk_workspace, topk_workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
    }
    AclSetTensorAddr(topk_executor, 0, _opaque->probs->tensor, (void *)probs);
    AclSetTensorAddr(topk_executor, 1, topk_val.tensor, topk_val_addr);
    if (!dosample) {
        // Argmax path: aclnnTopk writes the index straight into result.
        AclSetTensorAddr(topk_executor, 2, _opaque->result->tensor, result);
    } else {
        AclSetTensorAddr(topk_executor, 2, topk_idx.tensor, topk_idx_addr);
    }
    CHECK_ACL(aclnnTopk(topk_workspace, topk_workspace_size, topk_executor, stream));
    // BUGFIX: a repeatable executor is not auto-released after aclnnTopk;
    // it must be destroyed explicitly or it leaks per call.
    CHECK_ACL(aclDestroyAclOpExecutor(topk_executor));
    if (topk_workspace != nullptr) {
        CHECK_ACL(aclrtFree(topk_workspace));
    }
    if (dosample) {
        auto status = random_sample_kernel_launch((void *)probs, result, topk_val_addr, topk_idx_addr, random_val, topp, topk_, temperature, _info.n, _info.dt_p, stream);
        CHECK_STATUS(status);
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::random_sample::ascend
...@@ -11,8 +11,12 @@ ...@@ -11,8 +11,12 @@
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
#include "maca/random_sample_maca.h" #include "maca/random_sample_maca.h"
#endif #endif
#ifdef ENABLE_ASCEND_API
#include "ascend/random_sample_aclnn.h"
#endif
__C infiniStatus_t infiniopCreateRandomSampleDescriptor( __C infiniStatus_t
infiniopCreateRandomSampleDescriptor(
infiniopHandle_t handle, infiniopHandle_t handle,
infiniopRandomSampleDescriptor_t *desc_ptr, infiniopRandomSampleDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t result,
...@@ -37,6 +41,9 @@ __C infiniStatus_t infiniopCreateRandomSampleDescriptor( ...@@ -37,6 +41,9 @@ __C infiniStatus_t infiniopCreateRandomSampleDescriptor(
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca); CREATE(INFINI_DEVICE_METAX, maca);
#endif #endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -67,6 +74,9 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize( ...@@ -67,6 +74,9 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca); GET(INFINI_DEVICE_METAX, maca);
#endif #endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -107,6 +117,9 @@ __C infiniStatus_t infiniopRandomSample( ...@@ -107,6 +117,9 @@ __C infiniStatus_t infiniopRandomSample(
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca); CALCULATE(INFINI_DEVICE_METAX, maca);
#endif #endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -134,6 +147,9 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor( ...@@ -134,6 +147,9 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca); DELETE(INFINI_DEVICE_METAX, maca);
#endif #endif
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -141,4 +157,3 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor( ...@@ -141,4 +157,3 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#undef DELETE #undef DELETE
} }
...@@ -43,6 +43,7 @@ _TOLERANCE_MAP = { ...@@ -43,6 +43,7 @@ _TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 0}, torch.float16: {"atol": 0, "rtol": 0},
} }
DEBUG = False DEBUG = False
PROFILE = False PROFILE = False
NUM_PRERUN = 10 NUM_PRERUN = 10
...@@ -59,27 +60,18 @@ infiniopRandomSampleDescriptor_t = POINTER(RandomSampleDescriptor) ...@@ -59,27 +60,18 @@ infiniopRandomSampleDescriptor_t = POINTER(RandomSampleDescriptor)
def random_sample(data, random_val, topp, topk, voc, temperature): def random_sample(data, random_val, topp, topk, voc, temperature):
if topp > 0 and topk > 1: if topp > 0 and topk > 1:
sorted_vals, sorted_indices = torch.sort(data, descending=True) sorted_vals, sorted_indices = torch.sort(data, descending=True)
scaled_vals = (sorted_vals - sorted_vals[0]) / temperature scaled_vals = (sorted_vals - sorted_vals[0]) / temperature
probs = torch.softmax(scaled_vals, dim=0) probs = torch.softmax(scaled_vals, dim=0)
cum_probs = torch.cumsum(probs, dim=0) cum_probs = torch.cumsum(probs, dim=0)
k_index = min(topk, voc) - 1 k_index = min(topk, voc) - 1
threshold = min(cum_probs[k_index], topp) * random_val threshold = min(cum_probs[k_index], topp) * random_val
idx = torch.searchsorted(cum_probs, threshold) idx = torch.searchsorted(cum_probs, threshold)
return sorted_indices[idx] return sorted_indices[idx]
return torch.argmax(data)
def random_sample(data, random_val, topp, topk, voc, temperature): return torch.argmax(data)
if topp > 0 and topk > 1:
ans = random_sample_1(data.to("cpu"), random_val, topp, topk, voc, temperature)
else:
ans = random_sample_0(data)
return ans
def test( def test(
...@@ -92,7 +84,7 @@ def test( ...@@ -92,7 +84,7 @@ def test(
topk, topk,
temperature, temperature,
dtype=torch.float16, dtype=torch.float16,
sync=None sync=None,
): ):
print( print(
f"Testing RandomSample on {torch_device} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{dtype}" f"Testing RandomSample on {torch_device} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{dtype}"
...@@ -136,6 +128,7 @@ def test( ...@@ -136,6 +128,7 @@ def test(
) )
) )
workspace = create_workspace(workspace_size.value, torch_device) workspace = create_workspace(workspace_size.value, torch_device)
def lib_random_sample(): def lib_random_sample():
check_error( check_error(
lib.infiniopRandomSample( lib.infiniopRandomSample(
...@@ -223,4 +216,5 @@ if __name__ == "__main__": ...@@ -223,4 +216,5 @@ if __name__ == "__main__":
# Execute tests # Execute tests
for device in get_test_devices(args): for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment