Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
b5ccf30f
Commit
b5ccf30f
authored
Mar 18, 2025
by
YdrMaster
Browse files
issue/50/fix: 添加 random sample/cpu 并修改测例
Signed-off-by:
YdrMaster
<
ydrml@hotmail.com
>
parent
bd8ae651
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
391 additions
and
165 deletions
+391
-165
.github/workflows/build.yml
.github/workflows/build.yml
+1
-0
include/infiniop/ops/random_sample.h
include/infiniop/ops/random_sample.h
+21
-13
src/infiniop/ops/random_sample/cpu/random_sample_cpu.cc
src/infiniop/ops/random_sample/cpu/random_sample_cpu.cc
+193
-0
src/infiniop/ops/random_sample/cpu/random_sample_cpu.h
src/infiniop/ops/random_sample/cpu/random_sample_cpu.h
+8
-0
src/infiniop/ops/random_sample/operator.cc
src/infiniop/ops/random_sample/operator.cc
+87
-129
src/infiniop/ops/random_sample/random_sample.h
src/infiniop/ops/random_sample/random_sample.h
+56
-0
src/utils/check.h
src/utils/check.h
+13
-0
test/infiniop/random_sample.py
test/infiniop/random_sample.py
+12
-23
No files found.
.github/workflows/build.yml
View file @
b5ccf30f
...
...
@@ -47,3 +47,4 @@ jobs:
pip install torch
LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/matmul.py --cpu
LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/rms_norm.py --cpu
LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/random_sample.py --cpu
include/infiniop/ops/random_sample.h
View file @
b5ccf30f
...
...
@@ -5,21 +5,29 @@
typedef InfiniopDescriptor *infiniopRandomSampleDescriptor_t;

// Create a random-sample descriptor for a scalar integer `result`
// sampled from the 1-D probability tensor `probs`.
__C __export infiniStatus_t infiniopCreateRandomSampleDescriptor(
    infiniopHandle_t handle,
    infiniopRandomSampleDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t result,
    infiniopTensorDescriptor_t probs);

// Query the device workspace size (in bytes) required by the descriptor.
__C __export infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
    infiniopRandomSampleDescriptor_t desc,
    size_t *size);

// Perform top-k / top-p sampling (or argmax for degenerate parameters)
// and write the chosen index into `result`.
__C __export infiniStatus_t infiniopRandomSample(
    infiniopRandomSampleDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *result,
    const void *probs,
    float random_val,
    float topp,
    int topk,
    float temperature,
    void *stream);

// Release the descriptor created by infiniopCreateRandomSampleDescriptor.
__C __export infiniStatus_t infiniopDestroyRandomSampleDescriptor(
    infiniopRandomSampleDescriptor_t desc);

#endif
src/infiniop/ops/random_sample/cpu/random_sample_cpu.cc
0 → 100644
View file @
b5ccf30f
#include "random_sample_cpu.h"

#include "../../../devices/cpu/common_cpu.h"
#include "../../../devices/cpu/cpu_handle.h"
#include "../../../tensor.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <vector>
namespace op::random_sample::cpu {

Descriptor::~Descriptor() = default;

// Validate tensor descriptors and build a CPU random-sample descriptor.
// `result` must be a scalar integer tensor; `probs` a contiguous 1-D
// floating-point tensor.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t result_desc,
    infiniopTensorDescriptor_t probs_desc) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);

    auto dt_i = result_desc->dtype();
    auto dt_p = probs_desc->dtype();

    // The result holds an index: any signed/unsigned integer type works.
    CHECK_DTYPE(dt_i,
                INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64,
                INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
    // Probabilities must be floating point.
    CHECK_DTYPE(dt_p, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);

    // Shapes: scalar result, contiguous 1-D probs.
    CHECK_API_OR(result_desc->ndim(), 0, return INFINI_STATUS_BAD_TENSOR_SHAPE);
    CHECK_API_OR(probs_desc->ndim(), 1, return INFINI_STATUS_BAD_TENSOR_SHAPE);
    CHECK_API_OR(probs_desc->stride(0), 1, return INFINI_STATUS_BAD_TENSOR_STRIDES);

    *desc_ptr = new Descriptor(
        dt_i, dt_p,
        probs_desc->dim(0), // vocabulary size
        0,                  // CPU path needs no workspace
        nullptr,            // no opaque backend state
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Maps a storage type to the type used for arithmetic.
// By default compute in the storage type itself; fp16 is widened to float.
template <typename DT>
struct ComputeType {
    using type = DT;
};

template <>
struct ComputeType<fp16_t> {
    using type = float;
};
// Sampling kernels for one (index type, value type) combination.
// Tidx is the integer type written to `result`; Tval the storage type of
// `probs`; Tcompute the widened arithmetic type (float for fp16).
template <class Tidx, class Tval>
struct Scheme {
    using Tcompute = typename ComputeType<Tval>::type;

    // Load probs[i] and widen it to the compute type.
    static Tcompute get(void const *ptr, size_t i) {
        return utils::cast<Tcompute, Tval>(reinterpret_cast<Tval const *>(ptr)[i]);
    }

    // Greedy sampling: write the index of the largest probability.
    static void argmax(void *result, void const *probs, size_t n) {
        auto idx = reinterpret_cast<Tidx *>(result);
        *idx = 0;
        auto max_val = get(probs, 0);
        // start at 1: element 0 already seeds max_val
        for (size_t i = 1; i < n; i++) {
            if (auto val = get(probs, i); val > max_val) {
                max_val = val;
                *idx = static_cast<Tidx>(i);
            }
        }
    }

    // Temperature-scaled top-k / top-p sampling.
    // random_val is expected in [0, 1); topk <= 0 is treated as 1.
    static void random(
        void *result,
        void const *probs,
        size_t n,
        float random_val,
        float topp,
        int topk,
        float temperature) {
        struct KVPair {
            Tidx idx;
            Tcompute val;
            // descending order of probability
            bool operator<(const KVPair &other) const {
                return val > other.val;
            }
        };

        auto idx = reinterpret_cast<Tidx *>(result);

        // build & sort (largest probability first)
        std::vector<KVPair> pairs(n);
        for (size_t i = 0; i < n; i++) {
            pairs[i] = {static_cast<Tidx>(i), get(probs, i)};
        }
        std::sort(pairs.begin(), pairs.end());

        // softmax & prefix sum in place: pairs[i].val becomes the cumulative
        // (unnormalized) softmax mass of the i+1 most probable tokens.
        auto const max_val = pairs[0].val;
        pairs[0].val = 1; // exp((max - max) / temperature) == 1
        for (size_t i = 1; i < n; i++) {
            pairs[i].val = pairs[i - 1].val
                         + std::exp((pairs[i].val - max_val) / temperature);
        }

        // top-k & top-p cutoffs.
        // Clamp topk to [1, n]: topk <= 0 would otherwise underflow the
        // size_t cast and read pairs[-1] (only topk == 1 is routed to argmax).
        auto const k = std::min(static_cast<size_t>(std::max(topk, 1)), n);
        auto const pk = pairs[k - 1].val,
                   pp = pairs[n - 1].val * topp,
                   plimit = random_val * std::min(pk, pp);

        // sample: first token whose cumulative mass reaches the limit.
        // Fallback to the least probable token so *idx is always written,
        // even if floating-point rounding pushes plimit past every value.
        *idx = pairs[n - 1].idx;
        for (size_t i = 0; i < n; i++) {
            if (plimit <= pairs[i].val) {
                *idx = pairs[i].idx;
                break;
            }
        }
    }
};
// Dispatch between greedy argmax and true random sampling.
// Degenerate sampling parameters all collapse to argmax.
template <class Tidx, class Tval>
void switch_f(
    size_t n,
    void *result,
    const void *probs,
    float random_val,
    float topp,
    int topk,
    float temperature) {
    const bool greedy = random_val == 0
                     || topp == 0
                     || topk == 1
                     || temperature == 0;
    if (greedy) {
        Scheme<Tidx, Tval>::argmax(result, probs, n);
    } else {
        Scheme<Tidx, Tval>::random(result, probs, n,
                                   random_val, topp, topk, temperature);
    }
}
// Dispatch on the probability dtype, instantiating switch_f with the
// matching C++ value type.
template <class Tidx>
void switch_val(
    infiniDtype_t dt_p,
    size_t n,
    void *result,
    void const *probs,
    float random_val,
    float topp,
    int topk,
    float temperature) {
    switch (dt_p) {
    case INFINI_DTYPE_F16:
        switch_f<Tidx, fp16_t>(n, result, probs, random_val, topp, topk, temperature);
        break;
    case INFINI_DTYPE_F32:
        switch_f<Tidx, float>(n, result, probs, random_val, topp, topk, temperature);
        break;
    case INFINI_DTYPE_F64:
        switch_f<Tidx, double>(n, result, probs, random_val, topp, topk, temperature);
        break;
    default:
        // unreachable: dtype was validated in Descriptor::create
        std::abort();
    }
}
// Dispatch on the result (index) dtype, then forward to switch_val for
// the probability dtype.
void switch_idx(
    infiniDtype_t dt_i,
    infiniDtype_t dt_p,
    size_t n,
    void *result,
    void const *probs,
    float random_val,
    float topp,
    int topk,
    float temperature) {

#define CASE(DT_VAL, DT_TYP)                                                          \
    case DT_VAL:                                                                      \
        switch_val<DT_TYP>(dt_p, n, result, probs, random_val, topp, topk, temperature); \
        break

    switch (dt_i) {
        CASE(INFINI_DTYPE_I8, int8_t);
        CASE(INFINI_DTYPE_I16, int16_t);
        CASE(INFINI_DTYPE_I32, int32_t);
        CASE(INFINI_DTYPE_I64, int64_t);
        CASE(INFINI_DTYPE_U8, uint8_t);
        CASE(INFINI_DTYPE_U16, uint16_t);
        CASE(INFINI_DTYPE_U32, uint32_t);
        CASE(INFINI_DTYPE_U64, uint64_t);
    default:
        // unreachable: dtype was validated in Descriptor::create
        std::abort();
    }

#undef CASE
}
// Run sampling on the CPU. workspace / workspace_size / stream are part of
// the common operator interface and unused by this backend.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *result,
    const void *probs,
    float random_val,
    float topp,
    int topk,
    float temperature,
    void *stream) const {
    switch_idx(_dt_i, _dt_p, _n,
               result, probs,
               random_val, topp, topk, temperature);
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::random_sample::cpu
src/infiniop/ops/random_sample/cpu/random_sample_cpu.h
0 → 100644
View file @
b5ccf30f
#ifndef __RANDOM_SAMPLE_CPU_H__
#define __RANDOM_SAMPLE_CPU_H__

#include "../random_sample.h"

// Declare the random-sample Descriptor class in namespace
// op::random_sample::cpu (see the DESCRIPTOR macro in random_sample.h).
DESCRIPTOR(cpu)

#endif // __RANDOM_SAMPLE_CPU_H__
src/infiniop/ops/random_sample/operator.cc
View file @
b5ccf30f
...
...
@@ -2,152 +2,110 @@
#include "../../handle.h"
#include "infiniop/ops/random_sample.h"
#ifdef ENABLE_CPU_API
#include "cpu/random_sample_cpu.h"
#endif

// Route descriptor creation to the backend matching handle->device.
__C infiniStatus_t infiniopCreateRandomSampleDescriptor(
    infiniopHandle_t handle,
    infiniopRandomSampleDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t result,
    infiniopTensorDescriptor_t probs) {

#define CREATE(CASE, NAMESPACE)                                                      \
    case CASE:                                                                       \
        return op::random_sample::NAMESPACE::Descriptor::create(                     \
            handle,                                                                  \
            reinterpret_cast<op::random_sample::NAMESPACE::Descriptor **>(desc_ptr), \
            result,                                                                  \
            probs)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

#undef CREATE
}
// Report the workspace size recorded in the backend descriptor.
__C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
    infiniopRandomSampleDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                                          \
    case CASE:                                                                                        \
        *size = reinterpret_cast<const op::random_sample::NAMESPACE::Descriptor *>(desc)->workspace_size; \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

#undef GET
}
// Route the sampling call to the backend matching desc->device_type.
__C infiniStatus_t infiniopRandomSample(
    infiniopRandomSampleDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *result,
    const void *probs,
    float random_val,
    float topp,
    int topk,
    float temperature,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                                      \
    case CASE:                                                                          \
        return reinterpret_cast<const op::random_sample::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size,                                      \
                        result, probs,                                                  \
                        random_val,                                                     \
                        topp, topk, temperature,                                        \
                        stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

#undef CALCULATE
}
// Destroy the backend descriptor matching desc->device_type.
__C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
    infiniopRandomSampleDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                                          \
    case CASE:                                                                           \
        delete reinterpret_cast<const op::random_sample::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

#undef DELETE
}
src/infiniop/ops/random_sample/random_sample.h
0 → 100644
View file @
b5ccf30f
#ifndef __RANDOM_SAMPLE_H__
#define __RANDOM_SAMPLE_H__

#include "../../../utils.h"
#include "../../operator.h"

// Declares the per-backend random-sample Descriptor class inside
// namespace op::random_sample::<NAMESPACE>. Each backend header invokes
// DESCRIPTOR(<backend>) and the backend .cc file defines ~Descriptor(),
// create(), and calculate(). The Opaque struct holds backend-private
// state; _dt_i/_dt_p are the result-index and probability dtypes and
// _n is the vocabulary size (probs length).
#define DESCRIPTOR(NAMESPACE)                                       \
                                                                    \
    namespace op::random_sample::NAMESPACE {                        \
    class Descriptor final : public InfiniopDescriptor {            \
        struct Opaque;                                              \
        Opaque *_opaque;                                            \
                                                                    \
        infiniDtype_t _dt_i, _dt_p;                                 \
        size_t _n;                                                  \
                                                                    \
        Descriptor(                                                 \
            infiniDtype_t dt_i,                                     \
            infiniDtype_t dt_p,                                     \
            size_t n,                                               \
            size_t workspace_size_,                                 \
            Opaque *opaque,                                         \
            infiniDevice_t device_type,                             \
            int device_id)                                          \
            : InfiniopDescriptor{device_type, device_id},           \
              _opaque(opaque),                                      \
              _dt_i(dt_i),                                          \
              _dt_p(dt_p),                                          \
              _n(n),                                                \
              workspace_size(workspace_size_) {}                    \
                                                                    \
    public:                                                         \
        size_t workspace_size;                                      \
                                                                    \
        ~Descriptor();                                              \
                                                                    \
        static infiniStatus_t create(                               \
            infiniopHandle_t handle,                                \
            Descriptor **desc_ptr,                                  \
            infiniopTensorDescriptor_t result_desc,                 \
            infiniopTensorDescriptor_t probs_desc);                 \
                                                                    \
        infiniStatus_t calculate(                                   \
            void *workspace,                                        \
            size_t workspace_size,                                  \
            void *result,                                           \
            const void *probs,                                      \
            float random_val,                                       \
            float topp,                                             \
            int topk,                                               \
            float temperature,                                      \
            void *stream) const;                                    \
    };                                                              \
    }

#endif // __RANDOM_SAMPLE_H__
src/utils/check.h
View file @
b5ccf30f
...
...
@@ -17,4 +17,17 @@
#define CHECK_STATUS(API) CHECK_API_OR(API, INFINI_STATUS_SUCCESS, return api_result_)
// Verify that dtype DT is one of the dtypes listed in __VA_ARGS__;
// otherwise return INFINI_STATUS_BAD_TENSOR_DTYPE from the enclosing
// function (via CHECK_API_OR). Wrapped in do/while(0) so it behaves as
// a single statement.
#define CHECK_DTYPE(DT, ...)                             \
    do {                                                 \
        auto found_supported_dtype = false;              \
        for (auto dt : {__VA_ARGS__}) {                  \
            if (dt == DT) {                              \
                found_supported_dtype = true;            \
                break;                                   \
            }                                            \
        }                                                \
        CHECK_API_OR(found_supported_dtype, true,        \
                     return INFINI_STATUS_BAD_TENSOR_DTYPE); \
    } while (0)
#endif // INFINIUTILS_CHECK_H
test/infiniop/random_sample.py
View file @
b5ccf30f
...
...
@@ -82,25 +82,14 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
globalM
=
dataNp
[
0
]
dataNp
=
(
dataNp
-
globalM
)
/
temperature
dataNp
=
torch
.
softmax
(
dataNp
.
float
(),
dim
=
0
)
sum_s
=
0
for
end
in
range
(
topk
):
sum_s
+=
dataNp
[
end
]
if
sum_s
>=
topp
:
break
if
end
<
topk
-
1
:
end
+=
1
else
:
end
=
topk
sum_s
=
0
for
i
in
range
(
end
):
sum_s
+=
dataNp
[
i
]
random_val
*=
sum_s
sum_s
=
0
for
i
in
range
(
end
):
sum_s
+=
dataNp
[
i
]
if
random_val
<
sum_s
:
for
i
in
range
(
1
,
voc
):
dataNp
[
i
]
+=
dataNp
[
i
-
1
]
limit_k
=
dataNp
[
min
(
topk
,
voc
)
-
1
]
limit_p
=
dataNp
[
voc
-
1
]
*
topp
limit
=
min
(
limit_k
,
limit_p
)
*
random_val
for
i
in
range
(
voc
):
if
limit
<
dataNp
[
i
]:
return
indices
[
i
]
else
:
return
torch
.
argmax
(
data
)
...
...
@@ -129,7 +118,7 @@ def test(
data
,
random_val
,
topp
,
topk
,
voc
,
temperature
)
# 这个函数在device速度可能会很慢,可以通过data.to("cpu")方式加快计算过程
indices
=
torch
.
zeros
([
1
],
dtype
=
torch
.
int64
).
to
(
torch_device
)
indices
=
torch
.
zeros
([],
dtype
=
torch
.
int64
).
to
(
torch_device
)
x_tensor
,
indices_tensor
=
[
to_tensor
(
tensor
,
lib
)
for
tensor
in
[
data
,
indices
]]
...
...
@@ -147,7 +136,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for
tensor
in
[
x_tensor
,
indices_tensor
]:
tensor
.
des
criptor
.
contents
.
invalidate
(
)
tensor
.
des
troyDesc
(
lib
)
workspace_size
=
c_uint64
(
0
)
check_error
(
...
...
@@ -181,13 +170,13 @@ def test(
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
if
DEBUG
:
debug_all
(
(
indices
[
0
]
.
type
(
ans
.
dtype
),
data
[
indices
[
0
]
]),
(
indices
.
type
(
ans
.
dtype
),
data
[
indices
]),
(
ans
,
data
[
ans
]),
"or"
,
atol
=
atol
,
rtol
=
rtol
,
)
assert
indices
[
0
]
.
type
(
ans
.
dtype
)
==
ans
or
data
[
ans
]
==
data
[
indices
[
0
]
]
assert
indices
.
type
(
ans
.
dtype
)
==
ans
or
data
[
ans
]
==
data
[
indices
]
# Profiling workflow
if
PROFILE
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment