Commit c1fa267c authored by zhangyunze's avatar zhangyunze
Browse files

feat:重构random sample ascend算子

parent b5c6c7b8
......@@ -5,7 +5,7 @@ project(Ascend_C)
set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "ASCEND CANN package installation directory")
set(RUN_MODE "npu" CACHE STRING "run mode: npu")
set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
......
......@@ -7,6 +7,7 @@
constexpr size_t BLOCK_NUM = 8;
constexpr size_t BUFFER_NUM = 2;
constexpr size_t BYTE_ALIGN = 32;
constexpr size_t BLOCK_LEN = 256;
template <typename T>
__aicore__ inline size_t alignTileLen(size_t tile_len, size_t byte_align) {
......
#ifndef __ACLNN_RANDOM_SAMPLE_H__
#define __ACLNN_RANDOM_SAMPLE_H__
#include "../random_sample.h"
DESCRIPTOR(ascend)
#endif // __ACLNN_RANDOM_SAMPLE_H__
#include "random_sample_ascend.h"
// Construct a descriptor bound to the given device; the four aclnn tensor
// descriptors are allocated empty and filled in later by
// ascendCreateRandomSampleDescriptor / ascendRandomSample.
InfiniopRandomSampleAscendDescriptor::InfiniopRandomSampleAscendDescriptor(infiniDevice_t device_)
    : device(device_),
      device_id(0),
      pDesc(new aclnnTensorDescriptor()),
      topkValDesc(new aclnnTensorDescriptor()),
      topkIdxDesc(new aclnnTensorDescriptor()),
      resDesc(new aclnnTensorDescriptor()) {}
// Validate shapes and build an Ascend random-sample descriptor.
// probs:  1-D probability distribution over the vocabulary.
// result: 1-D tensor holding exactly one element (the sampled index).
infiniopStatus_t ascendCreateRandomSampleDescriptor(infiniopAscendHandle_t handle,
                                                    infiniopRandomSampleAscendDescriptor_t *desc_ptr,
                                                    infiniopTensorDescriptor_t result,
                                                    infiniopTensorDescriptor_t probs) {
    if (probs->ndim != 1) {
        return INFINIOP_STATUS_BAD_TENSOR_SHAPE;
    }
    // FIX: was `ndim != 1 && shape[0] != 1`, which accepted e.g. a 2-D tensor
    // whose first dimension happened to be 1. Both conditions must hold, and
    // short-circuiting avoids reading shape[0] when ndim is wrong.
    if (result->ndim != 1 || result->shape[0] != 1) {
        return INFINIOP_STATUS_BAD_TENSOR_SHAPE;
    }
    (*desc_ptr) = new InfiniopRandomSampleAscendDescriptor(handle->device);
    (*desc_ptr)->device_id = handle->device_id;
    CHECK_STATUS((*desc_ptr)->pDesc->fromInfiniOpTensorDescriptor(probs), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS((*desc_ptr)->resDesc->fromInfiniOpTensorDescriptor(result), INFINIOP_STATUS_SUCCESS);
    // Ascend aclnnTopk doesn't support U64 type; force INT64 indices.
    (*desc_ptr)->resDesc->dataType = aclDataType::ACL_INT64;
    return INFINIOP_STATUS_SUCCESS;
}
// Report the device workspace needed by ascendRandomSample: one probs-sized
// buffer of the probs dtype (top-k values) plus one probs-sized buffer of
// int64 (top-k indices) — a worst-case bound since topk <= voc.
infiniopStatus_t ascendGetRandomSampleWorkspaceSize(infiniopRandomSampleAscendDescriptor_t desc,
                                                    uint64_t *size) {
    auto const &probsDesc = desc->pDesc;
    auto const elemCount = numElements(probsDesc->shape.data(), probsDesc->ndim);
    *size = elemCount * aclDataTypeSize(probsDesc->dataType)
          + elemCount * infiniSizeof(infiniDtype_t::INFINI_DTYPE_I64);
    return INFINIOP_STATUS_SUCCESS;
}
// Run top-k/top-p random sampling on Ascend:
//   1. aclnnTopk selects the topk_ largest probabilities (and their indices)
//      into the caller's workspace; when sampling degenerates (topk==1,
//      temperature==0 or topp==0) it writes the argmax index directly into
//      `result`.
//   2. Otherwise random_sample_do performs softmax + top-p sampling on device.
infiniopStatus_t ascendRandomSample(infiniopRandomSampleAscendDescriptor_t desc,
                                    void *workspace,
                                    uint64_t workspace_size,
                                    void *result,
                                    void const *probs,
                                    float random_val,
                                    float topp,
                                    int topk,
                                    float temperature,
                                    void *stream) {
    // Validate sampling hyper-parameters.
    if (topk <= 0 || topp < 0 || topp > 1.0) {
        return INFINIOP_STATUS_BAD_PARAM;
    }
    if (random_val < 0 || random_val > 1.0) {
        return INFINIOP_STATUS_BAD_PARAM;
    }
    auto &pDesc = desc->pDesc;
    auto &topkIdxDesc = desc->topkIdxDesc;
    auto &topkValDesc = desc->topkValDesc;
    auto ndim = static_cast<int64_t>(pDesc->ndim);
    auto voc = pDesc->shape[0];
    // Clamp topk to the vocabulary size.
    auto topk_ = topk <= voc ? topk : voc;
    bool doSample = topk_ > 1 && temperature != 0 && topp != 0;
    auto topkShape = std::vector<int64_t>(pDesc->shape);
    topkShape[ndim - 1] = doSample ? topk_ : 1;
    auto topkStrides = std::vector<int64_t>(pDesc->strides);
    // Infer contiguous strides
    topkStrides[ndim - 1] = 1;
    for (int64_t i = ndim - 2; i >= 0; --i) {
        topkStrides[i] = topkStrides[i + 1] * topkShape[i + 1];
    }
    CHECK_STATUS(topkValDesc->setDescriptor(pDesc->dataType, topkShape, topkStrides), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(topkIdxDesc->setDescriptor(aclDataType::ACL_INT64, topkShape, topkStrides), INFINIOP_STATUS_SUCCESS);
    // Partition the caller workspace: [ top-k values | top-k indices ].
    auto workspaceTmp = workspace;
    auto topkValAddr = workspaceTmp;
    workspaceTmp = (void *)((uint8_t *)workspace + numElements(topkValDesc->shape.data(), topkValDesc->ndim) * aclDataTypeSize(topkValDesc->dataType));
    auto topkIdxAddr = workspaceTmp;
    auto pAddr = (void *)probs;
    // Create aclTensor
    CHECK_STATUS(pDesc->createTensor(pAddr), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(topkValDesc->createTensor(topkValAddr), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(topkIdxDesc->createTensor(topkIdxAddr), INFINIOP_STATUS_SUCCESS);
    if (!doSample) {
        // Degenerate (argmax) path: aclnnTopk writes the index into `result`.
        CHECK_STATUS(desc->resDesc->createTensor(result), INFINIOP_STATUS_SUCCESS);
    }
    // Do Topk calculate
    uint64_t topkWorkspaceSize = 0;
    aclOpExecutor *topkExecutor = nullptr;
    auto ret = aclnnTopkGetWorkspaceSize(pDesc->t,
                                         topkShape[ndim - 1],
                                         ndim - 1,
                                         true,
                                         true,
                                         topkValDesc->t,
                                         doSample ? topkIdxDesc->t
                                                  : desc->resDesc->t,
                                         &topkWorkspaceSize,
                                         &topkExecutor);
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclnnTopkGetWorkspaceSize failed ERROR: %d\n", ret);
              return INFINIOP_STATUS_INTERNAL_ERROR);
    void *topkWorkspace;
    CHECK_STATUS(mallocWorkspace(&topkWorkspace, topkWorkspaceSize), INFINIOP_STATUS_SUCCESS);
    ret = aclnnTopk(topkWorkspace,
                    topkWorkspaceSize,
                    topkExecutor,
                    stream);
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclnnTopk failed ERROR: %d\n", ret);
              return INFINIOP_STATUS_INTERNAL_ERROR);
    CHECK_STATUS(freeWorkspace(topkWorkspace), INFINIOP_STATUS_SUCCESS);
    if (doSample) {
        // Do softmax and topp random sample on device.
        random_sample_do(
            pAddr,
            result,
            topkValAddr,
            topkIdxAddr,
            // FIX: pass the clamped top-k. The workspace only holds `topk_`
            // values/indices (topkShape was built from topk_); passing the
            // unclamped `topk` would make the kernel read past valid data
            // whenever topk > voc.
            static_cast<int32_t>(topk_),
            static_cast<int>(pDesc->shape[0]),
            topp,
            temperature,
            random_val,
            pDesc->dataType,
            stream);
    }
    return INFINIOP_STATUS_SUCCESS;
}
// Release all resources owned by the descriptor.
infiniopStatus_t ascendDestroyRandomSampleDescriptor(infiniopRandomSampleAscendDescriptor_t desc) {
    delete desc->pDesc;
    delete desc->topkIdxDesc;
    delete desc->topkValDesc;
    // FIX: resDesc is allocated in the constructor but was never freed here,
    // leaking one aclnnTensorDescriptor per descriptor lifetime.
    delete desc->resDesc;
    delete desc;
    return INFINIOP_STATUS_SUCCESS;
}
#ifndef __RANDOM_SAMPLE_ASCEND_H__
#define __RANDOM_SAMPLE_ASCEND_H__
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "random_sample_ascend_api.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <acl/acl_rt.h>
#include <aclnnop/aclnn_topk.h>
// Per-operator state for random sampling on Ascend: the target device plus
// aclnn tensor descriptors for the probability input (pDesc), the top-k
// values/indices scratch tensors, and the single-element int64 result.
struct InfiniopRandomSampleAscendDescriptor {
infiniDevice_t device; // device type this descriptor targets
int device_id; // Ascend device index (set from the handle at create time)
// probs / top-k values / top-k indices / result tensor descriptors
aclnnTensorDescriptor_t pDesc, topkValDesc, topkIdxDesc, resDesc;
InfiniopRandomSampleAscendDescriptor(infiniDevice_t device_);
};
extern "C" void
random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr,
int32_t topk, int32_t voc, float topp, float temper,
float random, int dtype, void *stream);
#endif
#ifndef __RANDOM_SAMPLE_ASCEND_API_H__
#define __RANDOM_SAMPLE_ASCEND_API_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "infiniop/operator.h"
// Opaque handle to the Ascend random-sample operator state.
struct InfiniopRandomSampleAscendDescriptor;
typedef struct InfiniopRandomSampleAscendDescriptor *infiniopRandomSampleAscendDescriptor_t;
// Validate `results`/`probs` shapes and allocate a descriptor into *desc_ptr.
infiniopStatus_t ascendCreateRandomSampleDescriptor(infiniopAscendHandle_t handle,
infiniopRandomSampleAscendDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t results,
infiniopTensorDescriptor_t probs);
// Query the device workspace size (bytes) required by ascendRandomSample.
infiniopStatus_t ascendGetRandomSampleWorkspaceSize(infiniopRandomSampleAscendDescriptor_t desc,
uint64_t *size);
// Sample one token index from `probs` using top-k/top-p with the given
// temperature and pre-drawn uniform random value, writing it to `result`.
infiniopStatus_t ascendRandomSample(infiniopRandomSampleAscendDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *result,
void const *probs,
float random_val,
float topp,
int topk,
float temperature,
void *stream);
// Free the descriptor and everything it owns.
infiniopStatus_t ascendDestroyRandomSampleDescriptor(infiniopRandomSampleAscendDescriptor_t desc);
#endif
#include "kernel_operator.h"
#include "../../../devices/ascend/ascend_kernel_common.h"
using namespace AscendC;
const int32_t BLOCK_LEN = 256;
template <typename T>
class KernelRandomSample {
class RandomSampleKernel {
public:
__aicore__ inline KernelRandomSample() {}
__aicore__ inline void Init(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr,
GM_ADDR topkIdxAddr, int32_t topk_, int32_t voc_,
float topp_, float temper_, float random_) {
topk = topk_;
voc = voc_;
topp = topp_;
invTemperature = 1.0f / temper_;
random = random_;
negMax = 0.f;
sum = 0.f;
__aicore__ inline RandomSampleKernel() {}
__aicore__ inline void init(GM_ADDR probs, GM_ADDR result, GM_ADDR topk_val_addr, GM_ADDR topk_idx_addr, float random_val, float topp, int topk, float temperature, int32_t n);
__aicore__ inline void process();
private:
__aicore__ inline void copyIn();
__aicore__ inline void copyOut();
__aicore__ inline void compute();
__aicore__ inline void SoftMax(LocalTensor<T> &topkValIn,
LocalTensor<T> &softMaxOut);
__aicore__ inline void InclusiveSum(LocalTensor<T> &topkValIn,
LocalTensor<T> &topkValOut);
__aicore__ inline void RandomSample(LocalTensor<T> &valIn,
LocalTensor<int64_t> &Index,
LocalTensor<int64_t> &result);
GlobalTensor<T> _pGM,
_topk_valGM;
GlobalTensor<int64_t> _topk_idxGM, _resGM;
TPipe pipe;
TQue<QuePosition::VECIN, 1> _topk_valQue;
TQue<QuePosition::VECIN, 1> _topk_idxQue;
TQue<QuePosition::VECOUT, 1> _resQue;
TBuf<TPosition::VECCALC> _inBuf;
TBuf<TPosition::VECCALC> _tmp1Buf;
TBuf<TPosition::VECCALC> _tmp2Buf;
TBuf<TPosition::VECCALC> _tmp3Buf;
TBuf<TPosition::VECCALC> _softmax_OutBuf;
TBuf<TPosition::VECCALC> _inclusive_sum_OutBuf;
int32_t _topk;
int32_t _voc;
float _random_val;
float _topp;
float _invTemp;
float _negMax = 0.f;
float _sum = 0.f;
int32_t _topkAligned;
int32_t _topkIdxAligned;
int32_t _vocAligned;
int32_t _bufferLen;
};
// Initialize the kernel: record sampling parameters, compute 32-byte-aligned
// tile lengths, bind global-memory tensors, and carve UB queues/buffers.
// FIX: removed interleaved leftovers of the pre-refactor KernelRandomSample
// implementation (lines referencing removed members pGm/topkAligned/... and a
// duplicate InitBuffer sequence) that made this function ill-formed.
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::init(GM_ADDR probs, GM_ADDR result, GM_ADDR topk_val_addr, GM_ADDR topk_idx_addr, float random_val, float topp, int topk, float temperature, int32_t n) {
    _topk = topk;
    _voc = n;
    _random_val = random_val;
    _topp = topp;
    _invTemp = 1.0f / temperature;
    // Element counts rounded up so each DataCopy stays BYTE_ALIGN-aligned.
    _topkAligned = alignTileLen<T>(_topk, BYTE_ALIGN);
    _vocAligned = alignTileLen<T>(_voc, BYTE_ALIGN);
    _topkIdxAligned = (_topk + 3) / 4 * 4; // int64 elements, 32B-aligned
    _bufferLen = _topkAligned > BLOCK_LEN ? _topkAligned : BLOCK_LEN;
    // Set GlobalTensor
    _pGM.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(probs), _voc);
    _topk_valGM.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(topk_val_addr), _topk);
    _topk_idxGM.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(topk_idx_addr), _topk);
    _resGM.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(result), 1);
    // Queues for the top-k inputs / result, scratch buffers for the math.
    pipe.InitBuffer(_topk_valQue, 1, _topkAligned * sizeof(T));
    pipe.InitBuffer(_topk_idxQue, 1, _topkIdxAligned * sizeof(int64_t));
    pipe.InitBuffer(_resQue, 1, BYTE_ALIGN); // 32 bytes for aligned
    pipe.InitBuffer(_inBuf, BLOCK_LEN * sizeof(T));
    pipe.InitBuffer(_tmp1Buf, _bufferLen * sizeof(T));
    pipe.InitBuffer(_tmp2Buf, _bufferLen * sizeof(T));
    pipe.InitBuffer(_tmp3Buf, _bufferLen * sizeof(T));
    pipe.InitBuffer(_softmax_OutBuf, _topkAligned * sizeof(T));
    pipe.InitBuffer(_inclusive_sum_OutBuf, _topkAligned * sizeof(T));
}
private:
// Softmax
__aicore__ inline void SoftMax(LocalTensor<T> &topkValIn,
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::process() {
copyIn();
compute();
copyOut();
}
// Softmax over the top-k values: exp((x - max) / T) / sum, where _negMax and
// _sum were accumulated over the full vocabulary during copyIn.
// FIX: removed the duplicated pre-refactor body (referencing removed members
// negMax/invTemperature/tmpBuf1...) that was interleaved by the diff.
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::SoftMax(LocalTensor<T> &topkValIn,
                                                      LocalTensor<T> &softMaxOut) {
    float invSum = 1.0f / _sum;
    LocalTensor<T> tmpBuffer = _tmp1Buf.Get<T>();
    LocalTensor<T> tmpBuffer2 = _tmp2Buf.Get<T>();
    LocalTensor<T> tmpBuffer3 = _tmp3Buf.Get<T>();
    Adds(tmpBuffer, topkValIn, static_cast<T>(_negMax), _topk);
    Muls(tmpBuffer2, tmpBuffer, static_cast<T>(_invTemp), _topk);
    Exp(tmpBuffer3, tmpBuffer2, _topk);
    Muls(softMaxOut, tmpBuffer3, static_cast<T>(invSum), _topk);
}
// Inclusive prefix sum over the softmaxed top-k values (the CDF used for
// top-p sampling). FIX: merged the diff-interleaved old/new bodies; the
// refactored version uses _topkAligned.
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::InclusiveSum(LocalTensor<T> &topkValIn,
                                                           LocalTensor<T> &topkValOut) {
    static constexpr CumSumConfig cumSumConfig{true, false, false};
    LocalTensor<T> lastRowLocal;
    CumSum<T, cumSumConfig>(topkValOut, lastRowLocal, topkValIn,
                            {1, static_cast<uint32_t>(_topkAligned)});
}
// Top-p sampling over the CDF in valIn: truncate the distribution at the
// first position whose cumulative mass reaches _topp, rescale the random
// draw to that mass, and pick the first index whose CDF exceeds it.
// FIX: merged the diff-interleaved old/new bodies (old names topk/topp/random
// vs. new _topk/_topp/_random_val) into the refactored version.
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::RandomSample(LocalTensor<T> &valIn,
                                                           LocalTensor<int64_t> &Index,
                                                           LocalTensor<int64_t> &result) {
    int end = 0;
    for (end = 0; end < _topk; end++) {
        if (static_cast<float>(valIn(end)) >= _topp) {
            break;
        }
    }
    // Include the element that crossed the topp threshold.
    if (end < _topk - 1) {
        end += 1;
    } else {
        end = _topk;
    }
    // Scale the uniform draw by the truncated distribution's total mass.
    auto random_val = _random_val * static_cast<float>(valIn(end - 1));
    for (int i = 0; i < end; i++) {
        if (random_val < static_cast<float>(valIn(i))) {
            result(0) = Index(i);
            return;
        }
    }
    result(0) = Index(end - 1);
}
// Stage the top-k values/indices into UB and, in BLOCK_LEN-sized tiles,
// stream the full probability vector through UB to accumulate the softmax
// denominator _sum = sum(exp((p - max) / T)) and record _negMax.
// FIX: merged the diff-interleaved pre-refactor CopyIn (names pGm/inBuf/
// negMax/...) with the refactored copyIn; only the new version is kept.
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::copyIn() {
    LocalTensor<T> topkValLocal = _topk_valQue.AllocTensor<T>();
    LocalTensor<int64_t> topkIdxLocal = _topk_idxQue.AllocTensor<int64_t>();
    DataCopy(topkValLocal, _topk_valGM, _topkAligned);
    DataCopy(topkIdxLocal, _topk_idxGM, _topkIdxAligned);
    // Get max value of input (element 0 of the aclnnTopk output).
    _negMax = -static_cast<float>(topkValLocal(0));
    // Copy in p and compute _sum
    int32_t repeatTimes = _voc / BLOCK_LEN;
    int32_t remainder = _voc % BLOCK_LEN;
    float sum_s = 0.f;
    LocalTensor<T> _inBuffer = _inBuf.Get<T>();
    LocalTensor<T> tmpBuffer = _tmp1Buf.Get<T>();
    LocalTensor<T> tmpBuffer2 = _tmp2Buf.Get<T>();
    LocalTensor<T> tmpBuffer3 = _tmp3Buf.Get<T>();
    for (int32_t i = 0; i < repeatTimes; i++) {
        DataCopy(_inBuffer, _pGM[i * BLOCK_LEN], BLOCK_LEN);
        Adds(tmpBuffer, _inBuffer, static_cast<T>(_negMax), BLOCK_LEN);
        Muls(tmpBuffer2, tmpBuffer, static_cast<T>(_invTemp), BLOCK_LEN);
        Exp(tmpBuffer3, tmpBuffer2, BLOCK_LEN);
        sum_s = 0.f;
        for (int j = 0; j < BLOCK_LEN; ++j) {
            sum_s += static_cast<float>(tmpBuffer3(j));
        }
        _sum += sum_s;
    }
    if (remainder != 0) {
        // Copy a 32B-aligned chunk but only reduce over the valid remainder.
        int32_t remainderAligned = alignTileLen<T>(remainder, BYTE_ALIGN);
        DataCopy(_inBuffer, _pGM[repeatTimes * BLOCK_LEN], remainderAligned);
        Adds(tmpBuffer, _inBuffer, static_cast<T>(_negMax), remainder);
        Muls(tmpBuffer2, tmpBuffer, static_cast<T>(_invTemp), remainder);
        Exp(tmpBuffer3, tmpBuffer2, remainder);
        sum_s = 0.f;
        for (int i = 0; i < remainder; ++i) {
            sum_s += static_cast<float>(tmpBuffer3(i));
        }
        _sum += sum_s;
    }
    _topk_valQue.EnQue(topkValLocal);
    _topk_idxQue.EnQue(topkIdxLocal);
}
// Chain softmax -> inclusive sum (CDF) -> top-p sampling on the staged
// top-k data, then enqueue the sampled index for copyOut.
// FIX: merged the diff-interleaved pre-refactor Compute/CopyOut/member-list
// fragments with the refactored compute; only the new version is kept
// (the new copyOut is defined separately below).
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::compute() {
    // Get input data
    LocalTensor<T> topkValLocal = _topk_valQue.DeQue<T>();
    // SoftMax
    LocalTensor<T> softMaxOutLocal = _softmax_OutBuf.Get<T>();
    SoftMax(topkValLocal, softMaxOutLocal);
    // InclusiveSum
    LocalTensor<T> inclusiveOutLocal = _inclusive_sum_OutBuf.Get<T>();
    InclusiveSum(softMaxOutLocal, inclusiveOutLocal);
    // randomSample
    LocalTensor<int64_t> topkIdxLocal = _topk_idxQue.DeQue<int64_t>();
    LocalTensor<int64_t> resultLocal = _resQue.AllocTensor<int64_t>();
    RandomSample(inclusiveOutLocal, topkIdxLocal, resultLocal);
    _topk_valQue.FreeTensor(topkValLocal);
    _topk_idxQue.FreeTensor(topkIdxLocal);
    _resQue.EnQue(resultLocal);
}
TQue<QuePosition::VECIN, 1> topkQue;
TQue<QuePosition::VECIN, 1> topkIdxQue;
TQue<QuePosition::VECOUT, 1> resQue;
TBuf<TPosition::VECCALC> inBuf;
TBuf<TPosition::VECCALC> tmpBuf1;
TBuf<TPosition::VECCALC> tmpBuf2;
TBuf<TPosition::VECCALC> tmpBuf3;
TBuf<TPosition::VECCALC> softMaxOutBuf;
TBuf<TPosition::VECCALC> inclusiveSumOutBuf;
// Kernel params
int32_t topk;
int32_t voc;
float topp;
float invTemperature;
float random;
float negMax;
float sum;
int32_t topkAligned;
int32_t topkIdxAligned;
int32_t vocAligned;
int32_t bufferLen;
};
// Write the sampled index (one int64) from the VECOUT queue back to global
// memory.
template <typename T>
__aicore__ inline void RandomSampleKernel<T>::copyOut() {
LocalTensor<int64_t> resLocal = _resQue.DeQue<int64_t>();
// DataCopy length must be 32-byte aligned: BYTE_ALIGN / sizeof(int64_t) = 4
// elements are copied even though only element 0 is meaningful.
DataCopy(_resGM, resLocal, BYTE_ALIGN / sizeof(int64_t));
_resQue.FreeTensor(resLocal);
}
// Device entry point for half-precision probabilities.
// FIX: removed the diff-interleaved pre-refactor entry point
// (random_sample_kernel_f16 / KernelRandomSample) fused into this definition.
extern "C" __global__ __aicore__ void random_sample_kernel_fp16(
    GM_ADDR probs,
    GM_ADDR result,
    GM_ADDR topk_val_addr,
    GM_ADDR topk_idx_addr,
    float random_val,
    float topp,
    int topk,
    float temperature,
    int32_t n) {
    RandomSampleKernel<half> op;
    op.init(probs, result, topk_val_addr, topk_idx_addr, random_val, topp, topk, temperature, n);
    op.process();
}
extern "C" void
random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr,
int32_t topk, int32_t voc, float topp, float temper,
float random, int dtype, void *stream) {
extern "C" __global__ __aicore__ void random_sample_kernel_fp32(
GM_ADDR probs,
GM_ADDR result,
GM_ADDR topk_val_addr,
GM_ADDR topk_idx_addr,
float random_val,
float topp,
int topk,
float temperature,
int32_t n) {
RandomSampleKernel<float> op;
op.init(probs, result, topk_val_addr, topk_idx_addr, random_val, topp, topk, temperature, n);
op.process();
}
switch (dtype) {
case 1:
random_sample_kernel_f16<<<1, nullptr, stream>>>(
p, res, topkAddr, topkIdxAddr, topk, voc, topp, temper, random);
extern "C" infiniStatus_t random_sample_kernel_launch(
void *probs,
void *result,
void *topk_val_addr,
void *topk_idx_addr,
float random_val,
float topp,
int topk,
float temperature,
uint64_t n,
infiniDtype_t dt_p,
void *stream) {
switch (dt_p) {
case INFINI_DTYPE_F16:
random_sample_kernel_fp16<<<1, nullptr, stream>>>(probs, result, topk_val_addr, topk_idx_addr, random_val, topp, topk, temperature, n);
break;
case INFINI_DTYPE_F32:
random_sample_kernel_fp32<<<1, nullptr, stream>>>(probs, result, topk_val_addr, topk_idx_addr, random_val, topp, topk, temperature, n);
break;
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
#include "../../../devices/ascend/common_ascend.h"
#include "random_sample_aclnn.h"
#include <aclnnop/aclnn_topk.h>
namespace op::random_sample::ascend {
// Backend-private state: aclnn descriptors for the probability input and the
// result tensor, created once in Descriptor::create and reused per call via
// AclSetTensorAddr. Owned by the Descriptor; freed in ~Opaque.
struct Descriptor::Opaque {
aclnnTensorDescriptor_t probs;
aclnnTensorDescriptor_t result;
~Opaque() {
delete probs;
delete result;
}
};
Descriptor::~Descriptor() = default;
// Build an Ascend random-sample Descriptor.
// Validates the result/probs pair via RandomSampleInfo::create, requires an
// INT64 result dtype, and sizes the workspace as one probs-sized buffer of
// the probs dtype plus one probs-sized buffer of int64 (worst case, since
// topk <= n at calculate time).
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t result_desc,
infiniopTensorDescriptor_t probs_desc) {
auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
auto result = RandomSampleInfo::create(result_desc, probs_desc);
CHECK_RESULT(result);
CHECK_DTYPE(result->dt_i, INFINI_DTYPE_I64);
auto workspace_size = probs_desc->numel() * infiniSizeOf(probs_desc->dtype()) + probs_desc->numel() * infiniSizeOf(infiniDtype_t::INFINI_DTYPE_I64);
// aclnn descriptors are handed to Opaque, which owns and deletes them.
auto tresult = new aclnnTensorDescriptor(result_desc);
auto tprobs = new aclnnTensorDescriptor(probs_desc);
*desc_ptr
= new Descriptor(
result.take(),
workspace_size,
new Opaque{tprobs, tresult},
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
// Minimum device workspace (bytes) callers must pass to calculate();
// computed once in Descriptor::create.
size_t Descriptor::minWorkspaceSize() const {
return _min_workspace_size;
}
extern "C" infiniStatus_t random_sample_kernel_launch(
void *probs,
void *result,
void *topk_val_addr,
void *topk_idx_addr,
float random_val,
float topp,
int topk,
float temperature,
uint64_t n,
infiniDtype_t dt_p,
void *stream);
// Run top-k/top-p sampling:
//   1. aclnnTopk selects the topk_ largest probabilities into the workspace
//      (values then indices); when sampling degenerates (topk_<=1, or any of
//      temperature/topp/random_val is 0) it writes the argmax index straight
//      into `result`.
//   2. Otherwise random_sample_kernel_launch finishes softmax + top-p
//      sampling on device.
infiniStatus_t
Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *result,
    const void *probs,
    float random_val,
    float topp,
    int topk,
    float temperature,
    void *stream) const {
    if (workspace_size < _min_workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    // Clamp topk to the vocabulary size.
    auto topk_ = topk <= (int)_info.n ? topk : (int)_info.n;
    bool dosample = topk_ > 1 && temperature != 0.0f && topp != 0.0f && random_val != 0.0f;
    auto topk_shape = std::vector<int64_t>{dosample ? topk_ : 1};
    auto topk_stride = std::vector<int64_t>{1};
    // FIX: these were heap-allocated with `new` and never deleted, leaking two
    // aclnnTensorDescriptors per call. Stack objects are destroyed on every
    // return path, including the CHECK_ACL early returns.
    aclnnTensorDescriptor topk_idx(toAclDataType(_info.dt_i), topk_shape, topk_stride);
    aclnnTensorDescriptor topk_val(toAclDataType(_info.dt_p), topk_shape, topk_stride);
    // Workspace layout: [ top-k values | top-k indices ].
    auto topk_val_addr = workspace;
    auto topk_idx_addr = (void *)((uint8_t *)workspace + topk_ * infiniSizeOf(_info.dt_p));
    uint64_t topk_workspace_size = 0;
    aclOpExecutor *topk_executor = nullptr;
    CHECK_ACL(aclnnTopkGetWorkspaceSize(_opaque->probs->tensor,
                                        topk_shape[0],
                                        0,
                                        true,
                                        true,
                                        topk_val.tensor,
                                        dosample ? topk_idx.tensor : _opaque->result->tensor,
                                        &topk_workspace_size,
                                        &topk_executor));
    // NOTE(review): the executor is marked repeatable but never destroyed
    // afterwards — confirm whether aclDestroyAclOpExecutor is required here.
    CHECK_ACL(aclSetAclOpExecutorRepeatable(topk_executor));
    void *topk_workspace;
    CHECK_ACL(aclrtMalloc(&topk_workspace, topk_workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
    // Bind the real device addresses for this invocation.
    AclSetTensorAddr(topk_executor, 0, _opaque->probs->tensor, (void *)probs);
    AclSetTensorAddr(topk_executor, 1, topk_val.tensor, topk_val_addr);
    if (!dosample) {
        AclSetTensorAddr(topk_executor, 2, _opaque->result->tensor, result);
    } else {
        AclSetTensorAddr(topk_executor, 2, topk_idx.tensor, topk_idx_addr);
    }
    CHECK_ACL(aclnnTopk(topk_workspace, topk_workspace_size, topk_executor, stream));
    CHECK_ACL(aclrtFree(topk_workspace));
    if (dosample) {
        auto status = random_sample_kernel_launch((void *)probs, result, topk_val_addr, topk_idx_addr, random_val, topp, topk_, temperature, _info.n, _info.dt_p, stream);
        CHECK_STATUS(status);
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::random_sample::ascend
......@@ -11,8 +11,12 @@
#ifdef ENABLE_METAX_API
#include "maca/random_sample_maca.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/random_sample_aclnn.h"
#endif
__C infiniStatus_t infiniopCreateRandomSampleDescriptor(
__C infiniStatus_t
infiniopCreateRandomSampleDescriptor(
infiniopHandle_t handle,
infiniopRandomSampleDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t result,
......@@ -37,6 +41,9 @@ __C infiniStatus_t infiniopCreateRandomSampleDescriptor(
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -67,6 +74,9 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -107,6 +117,9 @@ __C infiniStatus_t infiniopRandomSample(
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -134,6 +147,9 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -141,4 +157,3 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#undef DELETE
}
......@@ -43,6 +43,7 @@ _TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 0},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
......@@ -73,15 +74,6 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
return torch.argmax(data)
def random_sample(data, random_val, topp, topk, voc, temperature):
if topp > 0 and topk > 1:
ans = random_sample_1(data.to("cpu"), random_val, topp, topk, voc, temperature)
else:
ans = random_sample_0(data)
return ans
def test(
lib,
handle,
......@@ -92,7 +84,7 @@ def test(
topk,
temperature,
dtype=torch.float16,
sync=None
sync=None,
):
print(
f"Testing RandomSample on {torch_device} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{dtype}"
......@@ -136,6 +128,7 @@ def test(
)
)
workspace = create_workspace(workspace_size.value, torch_device)
def lib_random_sample():
check_error(
lib.infiniopRandomSample(
......@@ -223,4 +216,5 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment