Commit b5c6c7b8 authored by zhangyue's avatar zhangyue Committed by zhangyunze
Browse files

issue/11: add random sample ascend

parent 384cb5bf
...@@ -5,7 +5,7 @@ project(Ascend_C) ...@@ -5,7 +5,7 @@ project(Ascend_C)
set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type") set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "ASCEND CANN package installation directory") set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "ASCEND CANN package installation directory")
set(RUN_MODE "npu" CACHE STRING "run mode: npu") set(RUN_MODE "npu" CACHE STRING "run mode: npu")
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
...@@ -26,6 +26,7 @@ include_directories( ...@@ -26,6 +26,7 @@ include_directories(
ascendc_library(ascend_kernels STATIC ascendc_library(ascend_kernels STATIC
../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp ../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
../../ops/rope/ascend/rope_ascend_kernel.cpp ../../ops/rope/ascend/rope_ascend_kernel.cpp
# ../../ops/random_sample/ascend/random_sample_kernel.cpp ../../ops/random_sample/ascend/random_sample_kernel.cpp
) )
target_include_directories(ascend_kernels PRIVATE ../../../../include)
#include "random_sample_ascend.h"
InfiniopRandomSampleAscendDescriptor::InfiniopRandomSampleAscendDescriptor(infiniDevice_t device_) {
    // Record the target device; device_id is refined later by
    // ascendCreateRandomSampleDescriptor from the handle.
    device = device_;
    device_id = 0;
    // Allocate the four aclnn tensor descriptors this operator owns:
    // result index, top-k values, top-k indices, and input probabilities.
    resDesc = new aclnnTensorDescriptor();
    topkValDesc = new aclnnTensorDescriptor();
    topkIdxDesc = new aclnnTensorDescriptor();
    pDesc = new aclnnTensorDescriptor();
}
// Creates and validates a random-sample descriptor for the Ascend backend.
// probs must be 1-D; result must be a 1-D tensor holding exactly one index.
// Returns INFINIOP_STATUS_BAD_TENSOR_SHAPE on shape mismatch.
infiniopStatus_t ascendCreateRandomSampleDescriptor(infiniopAscendHandle_t handle,
                                                    infiniopRandomSampleAscendDescriptor_t *desc_ptr,
                                                    infiniopTensorDescriptor_t result,
                                                    infiniopTensorDescriptor_t probs) {
    if (probs->ndim != 1) {
        return INFINIOP_STATUS_BAD_TENSOR_SHAPE;
    }
    // BUGFIX: was `result->ndim != 1 && result->shape[0] != 1`, which only
    // rejected tensors failing BOTH checks (e.g. it accepted a 2-D tensor with
    // shape[0] == 1) and read shape[0] even when ndim == 0. `||` rejects both
    // cases and short-circuits before touching shape[0].
    if (result->ndim != 1 || result->shape[0] != 1) {
        return INFINIOP_STATUS_BAD_TENSOR_SHAPE;
    }
    (*desc_ptr) = new InfiniopRandomSampleAscendDescriptor(handle->device);
    (*desc_ptr)->device_id = handle->device_id;
    // NOTE(review): if either conversion below fails, CHECK_STATUS appears to
    // return early and leak *desc_ptr — confirm CHECK_STATUS semantics and
    // free the descriptor on the error path.
    CHECK_STATUS((*desc_ptr)->pDesc->fromInfiniOpTensorDescriptor(probs), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS((*desc_ptr)->resDesc->fromInfiniOpTensorDescriptor(result), INFINIOP_STATUS_SUCCESS);
    // Ascend aclnnTopk doesn't support U64 type, so force the index output to I64.
    (*desc_ptr)->resDesc->dataType = aclDataType::ACL_INT64;
    return INFINIOP_STATUS_SUCCESS;
}
// Reports the device workspace needed by ascendRandomSample: one buffer of
// top-k values (input dtype) plus one buffer of top-k indices (int64), each
// conservatively sized for the full input element count.
infiniopStatus_t ascendGetRandomSampleWorkspaceSize(infiniopRandomSampleAscendDescriptor_t desc,
                                                    uint64_t *size) {
    const auto &probDesc = desc->pDesc;
    const auto elemCount = numElements(probDesc->shape.data(), probDesc->ndim);
    const auto valBytes = elemCount * aclDataTypeSize(probDesc->dataType);
    const auto idxBytes = elemCount * infiniSizeof(infiniDtype_t::INFINI_DTYPE_I64);
    *size = valBytes + idxBytes;
    return INFINIOP_STATUS_SUCCESS;
}
// Runs random sampling on the Ascend device:
//   1. aclnnTopk extracts the top-k candidates (or the argmax when sampling
//      is effectively disabled).
//   2. A custom kernel (random_sample_do) applies softmax + top-p sampling
//      over those candidates and writes one int64 index into `result`.
// `workspace` must hold at least the size reported by
// ascendGetRandomSampleWorkspaceSize; it is split into
// [top-k values | top-k indices].
infiniopStatus_t ascendRandomSample(infiniopRandomSampleAscendDescriptor_t desc,
                                    void *workspace,
                                    uint64_t workspace_size,
                                    void *result,
                                    void const *probs,
                                    float random_val,
                                    float topp,
                                    int topk,
                                    float temperature,
                                    void *stream) {
    // Validate sampling parameters: topk >= 1, topp and random_val in [0, 1].
    if (topk <= 0 || topp < 0 || topp > 1.0) {
        return INFINIOP_STATUS_BAD_PARAM;
    }
    if (random_val < 0 || random_val > 1.0) {
        return INFINIOP_STATUS_BAD_PARAM;
    }
    auto &pDesc = desc->pDesc;
    auto &topkIdxDesc = desc->topkIdxDesc;
    auto &topkValDesc = desc->topkValDesc;
    auto ndim = static_cast<int64_t>(pDesc->ndim);
    auto voc = pDesc->shape[0];
    // Clamp topk to the vocabulary size; every top-k buffer below is sized
    // with the clamped value.
    auto topk_ = topk <= voc ? topk : voc;
    // Sampling is only meaningful with more than one candidate and
    // non-degenerate temperature/topp; otherwise fall back to pure argmax.
    bool doSample = topk_ > 1 && temperature != 0 && topp != 0;
    auto topkShape = std::vector<int64_t>(pDesc->shape);
    topkShape[ndim - 1] = doSample ? topk_ : 1;
    auto topkStrides = std::vector<int64_t>(pDesc->strides);
    // Infer contiguous strides for the top-k output tensors.
    topkStrides[ndim - 1] = 1;
    for (int64_t i = ndim - 2; i >= 0; --i) {
        topkStrides[i] = topkStrides[i + 1] * topkShape[i + 1];
    }
    CHECK_STATUS(topkValDesc->setDescriptor(pDesc->dataType, topkShape, topkStrides), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(topkIdxDesc->setDescriptor(aclDataType::ACL_INT64, topkShape, topkStrides), INFINIOP_STATUS_SUCCESS);
    // Carve the caller-provided workspace into [top-k values | top-k indices].
    // NOTE(review): workspace_size is never checked against the required
    // size — callers are trusted to have used
    // ascendGetRandomSampleWorkspaceSize; consider validating here.
    auto workspaceTmp = workspace;
    auto topkValAddr = workspaceTmp;
    workspaceTmp = (void *)((uint8_t *)workspace + numElements(topkValDesc->shape.data(), topkValDesc->ndim) * aclDataTypeSize(topkValDesc->dataType));
    auto topkIdxAddr = workspaceTmp;
    auto pAddr = (void *)probs;
    // Create aclTensor views over the raw device pointers.
    CHECK_STATUS(pDesc->createTensor(pAddr), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(topkValDesc->createTensor(topkValAddr), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(topkIdxDesc->createTensor(topkIdxAddr), INFINIOP_STATUS_SUCCESS);
    if (!doSample) {
        // Argmax path: aclnnTopk writes the winning index straight into result.
        CHECK_STATUS(desc->resDesc->createTensor(result), INFINIOP_STATUS_SUCCESS);
    }
    // Do Topk calculate: sorted, largest-first, along the last dimension.
    uint64_t topkWorkspaceSize = 0;
    aclOpExecutor *topkExecutor = nullptr;
    auto ret = aclnnTopkGetWorkspaceSize(pDesc->t,
                                         topkShape[ndim - 1],
                                         ndim - 1,
                                         true,
                                         true,
                                         topkValDesc->t,
                                         doSample ? topkIdxDesc->t
                                                  : desc->resDesc->t,
                                         &topkWorkspaceSize,
                                         &topkExecutor);
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclnnTopkGetWorkspaceSize failed ERROR: %d\n", ret);
              return INFINIOP_STATUS_INTERNAL_ERROR);
    void *topkWorkspace;
    CHECK_STATUS(mallocWorkspace(&topkWorkspace, topkWorkspaceSize), INFINIOP_STATUS_SUCCESS);
    ret = aclnnTopk(topkWorkspace,
                    topkWorkspaceSize,
                    topkExecutor,
                    stream);
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclnnTopk failed ERROR: %d\n", ret);
              return INFINIOP_STATUS_INTERNAL_ERROR);
    CHECK_STATUS(freeWorkspace(topkWorkspace), INFINIOP_STATUS_SUCCESS);
    if (doSample) {
        // Do softmax and topp random sample on the top-k candidates.
        // BUGFIX: pass the clamped topk_ (not the raw topk) — when
        // topk > voc, the top-k buffers above only hold topk_ elements, so
        // passing topk made the kernel read past the written region.
        random_sample_do(
            pAddr,
            result,
            topkValAddr,
            topkIdxAddr,
            static_cast<int32_t>(topk_),
            static_cast<int>(pDesc->shape[0]),
            topp,
            temperature,
            random_val,
            pDesc->dataType,
            stream);
    }
    return INFINIOP_STATUS_SUCCESS;
}
// Frees a descriptor created by ascendCreateRandomSampleDescriptor, including
// every aclnn tensor descriptor allocated by the constructor.
infiniopStatus_t ascendDestroyRandomSampleDescriptor(infiniopRandomSampleAscendDescriptor_t desc) {
    delete desc->pDesc;
    delete desc->topkIdxDesc;
    delete desc->topkValDesc;
    // BUGFIX: resDesc is allocated in the constructor but was never freed
    // here, leaking one descriptor per create/destroy cycle.
    delete desc->resDesc;
    delete desc;
    return INFINIOP_STATUS_SUCCESS;
}
#ifndef __RANDOM_SAMPLE_ASCEND_H__
#define __RANDOM_SAMPLE_ASCEND_H__
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "random_sample_ascend_api.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <acl/acl_rt.h>
#include <aclnnop/aclnn_topk.h>
// Per-operator state for Ascend random sampling. Owns four aclnn tensor
// descriptors allocated by the constructor: input probabilities (pDesc),
// aclnnTopk value/index outputs (topkValDesc/topkIdxDesc), and the sampled
// result index (resDesc).
struct InfiniopRandomSampleAscendDescriptor {
    infiniDevice_t device;   // device type this descriptor was created for
    int device_id;           // physical device index, copied from the handle
    aclnnTensorDescriptor_t pDesc, topkValDesc, topkIdxDesc, resDesc;
    InfiniopRandomSampleAscendDescriptor(infiniDevice_t device_);
};
// Host-side launcher for the AscendC sampling kernel: consumes the top-k
// values/indices produced by aclnnTopk and writes one sampled int64 index
// into res. dtype selects the kernel specialization (1 == f16).
extern "C" void
random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr,
                 int32_t topk, int32_t voc, float topp, float temper,
                 float random, int dtype, void *stream);
#endif
#ifndef __RANDOM_SAMPLE_ASCEND_API_H__
#define __RANDOM_SAMPLE_ASCEND_API_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "infiniop/operator.h"
struct InfiniopRandomSampleAscendDescriptor;
typedef struct InfiniopRandomSampleAscendDescriptor *infiniopRandomSampleAscendDescriptor_t;
// Creates a random-sample descriptor. `probs` must be 1-D; `results` must be
// a 1-D tensor holding exactly one element (the sampled index).
infiniopStatus_t ascendCreateRandomSampleDescriptor(infiniopAscendHandle_t handle,
                                                    infiniopRandomSampleAscendDescriptor_t *desc_ptr,
                                                    infiniopTensorDescriptor_t results,
                                                    infiniopTensorDescriptor_t probs);
// Reports the device workspace (in bytes) that ascendRandomSample requires.
infiniopStatus_t ascendGetRandomSampleWorkspaceSize(infiniopRandomSampleAscendDescriptor_t desc,
                                                    uint64_t *size);
// Samples one token index from `probs` into `result` using top-k/top-p
// sampling with the given temperature and uniform draw random_val in [0, 1].
// Degenerate parameters (topk <= 1, topp == 0, or temperature == 0) fall
// back to argmax.
infiniopStatus_t ascendRandomSample(infiniopRandomSampleAscendDescriptor_t desc,
                                    void *workspace,
                                    uint64_t workspace_size,
                                    void *result,
                                    void const *probs,
                                    float random_val,
                                    float topp,
                                    int topk,
                                    float temperature,
                                    void *stream);
// Destroys a descriptor created by ascendCreateRandomSampleDescriptor.
infiniopStatus_t ascendDestroyRandomSampleDescriptor(infiniopRandomSampleAscendDescriptor_t desc);
#endif
#include "kernel_operator.h"
using namespace AscendC;
const int32_t BLOCK_LEN = 256;
// AscendC device-side kernel that finishes random sampling after aclnnTopk:
// it computes the softmax normalizer over the full vocabulary, applies
// softmax + inclusive cumulative sum to the top-k values, and picks one
// index by top-p (nucleus) sampling.
template <typename T>
class KernelRandomSample {
public:
    __aicore__ inline KernelRandomSample() {}
    // p:           full probability/logit vector in global memory (voc_ elems)
    // res:         output buffer for the single sampled int64 index
    // topkAddr:    top-k values produced by aclnnTopk (sorted descending)
    // topkIdxAddr: matching top-k indices (int64)
    // topp_/temper_/random_: nucleus threshold, temperature (caller
    //   guarantees non-zero), and uniform draw in [0, 1]
    __aicore__ inline void Init(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr,
                                GM_ADDR topkIdxAddr, int32_t topk_, int32_t voc_,
                                float topp_, float temper_, float random_) {
        topk = topk_;
        voc = voc_;
        topp = topp_;
        invTemperature = 1.0f / temper_;
        random = random_;
        negMax = 0.f;
        sum = 0.f;
        // CumSumInfo: round element counts up so each local buffer is a
        // multiple of 32 bytes, as DataCopy requires.
        topkAligned = topk * sizeof(T) % 32 == 0
                          ? topk
                          : (topk * sizeof(T) + 31) / 32 * 32 / sizeof(T);
        vocAligned = voc * sizeof(T) % 32 == 0
                         ? voc
                         : (voc * sizeof(T) + 31) / 32 * 32 / sizeof(T);
        topkIdxAligned = (topk + 3) / 4 * 4; // int64 elems: 32B / 8B = 4 per block
        bufferLen = topkAligned > BLOCK_LEN ? topkAligned : BLOCK_LEN;
        // Set Gm: bind global-memory views over the raw device pointers.
        pGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(p), voc);
        topkGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(topkAddr), topk);
        topkIdxGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(topkIdxAddr), topk);
        resGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(res), 1);
        // Global input and output queues plus scratch buffers.
        pipe.InitBuffer(topkQue, 1, topkAligned * sizeof(T));
        pipe.InitBuffer(topkIdxQue, 1, topkIdxAligned * sizeof(int64_t));
        pipe.InitBuffer(resQue, 1, 32); // 32 bytes for aligned
        pipe.InitBuffer(inBuf, BLOCK_LEN * sizeof(T));
        pipe.InitBuffer(tmpBuf1, bufferLen * sizeof(T));
        pipe.InitBuffer(tmpBuf2, bufferLen * sizeof(T));
        pipe.InitBuffer(tmpBuf3, bufferLen * sizeof(T));
        pipe.InitBuffer(softMaxOutBuf, topkAligned * sizeof(T));
        pipe.InitBuffer(inclusiveSumOutBuf, topkAligned * sizeof(T));
    }
    // Standard three-stage pipeline: stage data in, compute, write result out.
    __aicore__ inline void Process() {
        CopyIn();
        Compute();
        CopyOut();
    }
private:
    // Softmax over the top-k values: exp((x - max) / temper) / sum, where
    // negMax and sum were accumulated over the whole vocabulary in CopyIn.
    __aicore__ inline void SoftMax(LocalTensor<T> &topkValIn,
                                   LocalTensor<T> &softMaxOut) {
        float invSum = 1.0f / sum;
        LocalTensor<T> tmpBuffer = tmpBuf1.Get<T>();
        LocalTensor<T> tmpBuffer2 = tmpBuf2.Get<T>();
        LocalTensor<T> tmpBuffer3 = tmpBuf3.Get<T>();
        Adds(tmpBuffer, topkValIn, static_cast<T>(negMax), topk);
        Muls(tmpBuffer2, tmpBuffer, static_cast<T>(invTemperature), topk);
        Exp(tmpBuffer3, tmpBuffer2, topk);
        Muls(softMaxOut, tmpBuffer3, static_cast<T>(invSum), topk);
    }
    // Cumsum: inclusive prefix sum of the softmaxed top-k row.
    __aicore__ inline void InclusiveSum(LocalTensor<T> &topkValIn,
                                        LocalTensor<T> &topkValOut) {
        static constexpr CumSumConfig cumSumConfig{true, false, false};
        LocalTensor<T> lastRowLocal;
        CumSum<T, cumSumConfig>(topkValOut, lastRowLocal, topkValIn,
                                {1, static_cast<uint32_t>(topkAligned)});
    }
    // Random sample: valIn is the ascending inclusive cumulative softmax and
    // Index the matching token ids. Find the nucleus cutoff (first position
    // whose cumulative mass reaches topp), rescale the uniform draw into that
    // prefix, and emit the first index whose cumulative mass exceeds it.
    __aicore__ inline void RandomSample(LocalTensor<T> &valIn,
                                        LocalTensor<int64_t> &Index,
                                        LocalTensor<int64_t> &result) {
        int end = 0;
        for (end = 0; end < topk; end++) {
            if (static_cast<float>(valIn(end)) >= topp) {
                break;
            }
        }
        // Include the element that crossed topp; if none did, keep all topk.
        if (end < topk - 1) {
            end += 1;
        } else {
            end = topk;
        }
        // Scale the draw by the truncated mass so it lands inside the nucleus.
        auto randomVal = random * static_cast<float>(valIn(end - 1));
        for (int i = 0; i < end; i++) {
            if (randomVal < static_cast<float>(valIn(i))) {
                result(0) = Index(i);
                return;
            }
        }
        result(0) = Index(end - 1); // numeric-edge fallback: last candidate
    }
    __aicore__ inline void CopyIn() {
        LocalTensor<T> topkValLocal = topkQue.AllocTensor<T>();
        LocalTensor<int64_t> topkIdxLocal = topkIdxQue.AllocTensor<int64_t>();
        DataCopy(topkValLocal, topkGm, topkAligned);
        DataCopy(topkIdxLocal, topkIdxGm, topkIdxAligned);
        // Get Max val of input: top-k values are sorted descending, so element
        // 0 is the global max. NOTE(review): this scalar read happens right
        // after an asynchronous DataCopy with no queue synchronization in
        // between — confirm the copy has completed before this read.
        negMax = -static_cast<float>(topkValLocal(0));
        // Copy in p block-by-block and accumulate the softmax denominator
        // sum = Σ exp((x - max) / temper) over the whole vocabulary.
        int32_t repeatTimes = voc / BLOCK_LEN;
        int32_t remainder = voc % BLOCK_LEN;
        float sum_s = 0.f;
        LocalTensor<T> inBuffer = inBuf.Get<T>();
        LocalTensor<T> tmpBuffer = tmpBuf1.Get<T>();
        LocalTensor<T> tmpBuffer2 = tmpBuf2.Get<T>();
        LocalTensor<T> tmpBuffer3 = tmpBuf3.Get<T>();
        for (int32_t i = 0; i < repeatTimes; i++) {
            DataCopy(inBuffer, pGm[i * BLOCK_LEN], BLOCK_LEN);
            Adds(tmpBuffer, inBuffer, static_cast<T>(negMax), BLOCK_LEN);
            Muls(tmpBuffer2, tmpBuffer, static_cast<T>(invTemperature), BLOCK_LEN);
            Exp(tmpBuffer3, tmpBuffer2, BLOCK_LEN);
            sum_s = 0.f;
            for (int j = 0; j < BLOCK_LEN; ++j) {
                sum_s += static_cast<float>(tmpBuffer3(j));
            }
            sum += sum_s;
        }
        if (remainder != 0) {
            // Round the tail copy up to a 32-byte multiple; only `remainder`
            // elements are folded into the sum. NOTE(review): the aligned copy
            // may read a few elements past the end of pGm — confirm the
            // source buffer tolerates this over-read.
            int32_t remainderAligned = remainder * sizeof(T) % 32 == 0
                                           ? remainder
                                           : (remainder * sizeof(T) + 31) / 32 * 32 / sizeof(T);
            DataCopy(inBuffer, pGm[repeatTimes * BLOCK_LEN], remainderAligned);
            Adds(tmpBuffer, inBuffer, static_cast<T>(negMax), remainder);
            Muls(tmpBuffer2, tmpBuffer, static_cast<T>(invTemperature), remainder);
            Exp(tmpBuffer3, tmpBuffer2, remainder);
            sum_s = 0.f;
            for (int i = 0; i < remainder; ++i) {
                sum_s += static_cast<float>(tmpBuffer3(i));
            }
            sum += sum_s;
        }
        topkQue.EnQue(topkValLocal);
        topkIdxQue.EnQue(topkIdxLocal);
    }
    __aicore__ inline void Compute() {
        // Get input data staged by CopyIn.
        LocalTensor<T> topkValLocal = topkQue.DeQue<T>();
        // SoftMax over the top-k candidates.
        LocalTensor<T> softMaxOutLocal = softMaxOutBuf.Get<T>();
        SoftMax(topkValLocal, softMaxOutLocal);
        // InclusiveSum turns probabilities into a cumulative distribution.
        LocalTensor<T> inclusiveOutLocal = inclusiveSumOutBuf.Get<T>();
        InclusiveSum(softMaxOutLocal, inclusiveOutLocal);
        // randomSample picks the output index from the CDF.
        LocalTensor<int64_t> topkIdxLocal = topkIdxQue.DeQue<int64_t>();
        LocalTensor<int64_t> resultLocal = resQue.AllocTensor<int64_t>();
        RandomSample(inclusiveOutLocal, topkIdxLocal, resultLocal);
        topkQue.FreeTensor(topkValLocal);
        topkIdxQue.FreeTensor(topkIdxLocal);
        resQue.EnQue(resultLocal);
    }
    __aicore__ inline void CopyOut() {
        LocalTensor<int64_t> resLocal = resQue.DeQue<int64_t>();
        // Copies a full 32-byte block (4 int64) for alignment; only element 0
        // is meaningful.
        DataCopy(resGm, resLocal, 32 / sizeof(int64_t));
        resQue.FreeTensor(resLocal);
    }
private:
    GlobalTensor<T> pGm;               // full probability/logit vector
    GlobalTensor<T> topkGm;            // top-k values from aclnnTopk
    GlobalTensor<int64_t> topkIdxGm;   // top-k indices from aclnnTopk
    GlobalTensor<int64_t> resGm;       // single sampled index
    TPipe pipe;
    TQue<QuePosition::VECIN, 1> topkQue;
    TQue<QuePosition::VECIN, 1> topkIdxQue;
    TQue<QuePosition::VECOUT, 1> resQue;
    TBuf<TPosition::VECCALC> inBuf;
    TBuf<TPosition::VECCALC> tmpBuf1;
    TBuf<TPosition::VECCALC> tmpBuf2;
    TBuf<TPosition::VECCALC> tmpBuf3;
    TBuf<TPosition::VECCALC> softMaxOutBuf;
    TBuf<TPosition::VECCALC> inclusiveSumOutBuf;
    // Kernel params
    int32_t topk;
    int32_t voc;
    float topp;
    float invTemperature;   // 1 / temperature, precomputed in Init
    float random;           // uniform draw in [0, 1]
    float negMax;           // -(max logit), for numerically stable softmax
    float sum;              // softmax denominator over the full vocabulary
    int32_t topkAligned;    // topk rounded up to a 32-byte multiple of T
    int32_t topkIdxAligned; // topk rounded up to a 32-byte multiple of int64
    int32_t vocAligned;     // voc rounded up to a 32-byte multiple of T
    int32_t bufferLen;      // scratch length: max(topkAligned, BLOCK_LEN)
};
// Kernel entry point for half-precision (f16) inputs: constructs the sampler
// and runs its CopyIn/Compute/CopyOut pipeline.
extern "C" __global__ __aicore__ void
random_sample_kernel_f16(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr,
                         GM_ADDR topkIdxAddr, int32_t topk_, int32_t voc_,
                         float topp_, float temper_, float random_) {
    KernelRandomSample<half> op;
    op.Init(p, res, topkAddr, topkIdxAddr, topk_, voc_, topp_, temper_, random_);
    op.Process();
}
// Host-side launcher: dispatches on the element dtype and launches the
// sampling kernel on one core. dtype 1 selects the f16 specialization; any
// other dtype is silently ignored (same as the original dispatch table).
extern "C" void
random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr,
                 int32_t topk, int32_t voc, float topp, float temper,
                 float random, int dtype, void *stream) {
    if (dtype == 1) {
        random_sample_kernel_f16<<<1, nullptr, stream>>>(
            p, res, topkAddr, topkIdxAddr, topk, voc, topp, temper, random);
    }
}
...@@ -141,3 +141,4 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor( ...@@ -141,3 +141,4 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#undef DELETE #undef DELETE
} }
...@@ -43,7 +43,6 @@ _TOLERANCE_MAP = { ...@@ -43,7 +43,6 @@ _TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 0}, torch.float16: {"atol": 0, "rtol": 0},
} }
DEBUG = False DEBUG = False
PROFILE = False PROFILE = False
NUM_PRERUN = 10 NUM_PRERUN = 10
...@@ -74,6 +73,15 @@ def random_sample(data, random_val, topp, topk, voc, temperature): ...@@ -74,6 +73,15 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
return torch.argmax(data) return torch.argmax(data)
def random_sample(data, random_val, topp, topk, voc, temperature):
    """Reference dispatcher: top-p/top-k sampling when enabled, else argmax.

    Mirrors the operator's behavior — sampling is only active when both
    topp > 0 and topk > 1; otherwise the greedy (argmax) path is used.
    """
    use_sampling = topp > 0 and topk > 1
    if use_sampling:
        return random_sample_1(data.to("cpu"), random_val, topp, topk, voc, temperature)
    return random_sample_0(data)
def test( def test(
lib, lib,
handle, handle,
...@@ -128,7 +136,6 @@ def test( ...@@ -128,7 +136,6 @@ def test(
) )
) )
workspace = create_workspace(workspace_size.value, torch_device) workspace = create_workspace(workspace_size.value, torch_device)
def lib_random_sample(): def lib_random_sample():
check_error( check_error(
lib.infiniopRandomSample( lib.infiniopRandomSample(
...@@ -216,5 +223,4 @@ if __name__ == "__main__": ...@@ -216,5 +223,4 @@ if __name__ == "__main__":
# Execute tests # Execute tests
for device in get_test_devices(args): for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment