Unverified Commit 9ad23fad authored by blkmjsian, committed by GitHub

[T2-2-3] blkmjsian

- dequantize AWQ
- RoPE v2
parent b3170335
#ifndef __INFINIOP_ROPE_V2_CPU_H__
#define __INFINIOP_ROPE_V2_CPU_H__
#include "../rope_v2.h"
DESCRIPTOR(cpu)
#endif // __INFINIOP_ROPE_V2_CPU_H__
#ifndef __INFINIOP_ROPE_V2_CUDA_KERNEL_CUH__
#define __INFINIOP_ROPE_V2_CUDA_KERNEL_CUH__
template <typename Tdata, typename Tindex, typename Tangle>
__device__ void ropeThreadPerItemBlock(
Tdata *y_,
const Tdata *x_,
const Tindex *__restrict__ pos_ids,
const Tangle *__restrict__ sin_table,
const Tangle *__restrict__ cos_table,
size_t table_dim,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead) {
auto y_offset = blockIdx.x * y_stride_seqlen + blockIdx.y * y_stride_nhead;
auto x_offset = blockIdx.x * x_stride_seqlen + blockIdx.y * x_stride_nhead;
size_t pos_id = size_t(pos_ids[blockIdx.x]);
auto table_offset = pos_id * table_dim;
const size_t half_dim = table_dim; // Head dimension = 2 * table_dim
for (size_t i = threadIdx.x; i < table_dim; i += blockDim.x) {
Tangle sin__ = sin_table[table_offset + i];
Tangle cos__ = cos_table[table_offset + i];
// Calculate positions in first and second halves
size_t pos0 = i;
size_t pos1 = i + half_dim;
if constexpr (std::is_same<Tdata, half>::value) {
Tangle x0 = __half2float(x_[x_offset + pos0]);
Tangle x1 = __half2float(x_[x_offset + pos1]);
Tangle y0 = x0 * cos__ - x1 * sin__;
Tangle y1 = x0 * sin__ + x1 * cos__;
y_[y_offset + pos0] = __float2half(y0);
y_[y_offset + pos1] = __float2half(y1);
} else if constexpr (std::is_same<Tdata, cuda_bfloat16>::value) {
Tangle x0 = __bfloat162float(x_[x_offset + pos0]);
Tangle x1 = __bfloat162float(x_[x_offset + pos1]);
Tangle y0 = x0 * cos__ - x1 * sin__;
Tangle y1 = x0 * sin__ + x1 * cos__;
y_[y_offset + pos0] = __float2bfloat16(y0);
y_[y_offset + pos1] = __float2bfloat16(y1);
} else {
Tangle x0 = x_[x_offset + pos0];
Tangle x1 = x_[x_offset + pos1];
y_[y_offset + pos0] = x0 * cos__ - x1 * sin__;
y_[y_offset + pos1] = x0 * sin__ + x1 * cos__;
}
}
}
#endif // __INFINIOP_ROPE_V2_CUDA_KERNEL_CUH__
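// For reference, a minimal host-side sketch of the same rotate-half transform the
// kernel above applies per (position, head) pair. The function name and the
// std::vector interface are illustrative, not part of the library.
#include <cstddef>
#include <vector>

inline void rope_rotate_half_reference(std::vector<float> &y,
                                       const std::vector<float> &x,
                                       const std::vector<float> &sin_row,  // sin_table[pos_id]
                                       const std::vector<float> &cos_row)  // cos_table[pos_id]
{
    const std::size_t half = sin_row.size(); // table_dim; dhead == 2 * table_dim
    for (std::size_t i = 0; i < half; ++i) {
        const float x0 = x[i];
        const float x1 = x[i + half];
        y[i] = x0 * cos_row[i] - x1 * sin_row[i];        // first half of the head
        y[i + half] = x0 * sin_row[i] + x1 * cos_row[i]; // second half of the head
    }
}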
#ifndef __INFINIOP_ROPE_V2_METAX_H__
#define __INFINIOP_ROPE_V2_METAX_H__
#include "../rope_v2.h"
DESCRIPTOR(metax)
#endif // __INFINIOP_ROPE_V2_METAX_H__
#include "../../../devices/metax/metax_common.h"
#include "rope_metax.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
template <typename Tdata, typename Tindex, typename Tangle>
INFINIOP_METAX_KERNEL ropeThreadPerItemKernel(
Tdata *y_,
const Tdata *x_,
const Tindex *__restrict__ pos_ids,
const Tangle *__restrict__ sin_table,
const Tangle *__restrict__ cos_table,
size_t table_dim,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead) {
ropeThreadPerItemBlock(
y_, x_, pos_ids,
sin_table, cos_table,
table_dim,
y_stride_seqlen, y_stride_nhead,
x_stride_seqlen, x_stride_nhead);
}
namespace op::rope_v2::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t pos_desc,
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto info = RoPEv2Info::createRoPEv2Info(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
CHECK_RESULT(info);
// Create descriptor
*desc_ptr = new Descriptor(
info.take(),
0,
new Opaque{handle->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename Tdata, typename Tindex>
infiniStatus_t calculateRoPE(const RoPEv2Info &info,
int block_size,
Tdata *y,
const Tdata *x,
const Tindex *pos_ids,
const Tdata *sin_table,
const Tdata *cos_table,
hcStream_t stream) {
// One block per (sequence position, head); threads stride over table_dim
auto dimx = uint32_t(info.seqlen),
     dimy = uint32_t(info.nhead);
int nthreads = std::min(int(info.table_dim), block_size); // never exceed the device's block limit
ropeThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
y, x, pos_ids, sin_table, cos_table, info.table_dim,
info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE_ROPE(TDATA, TINDEX) \
calculateRoPE(_info, \
_opaque->internal->maxThreadsPerBlock(), \
(TDATA *)y, \
(const TDATA *)x, \
(const TINDEX *)pos_ids, \
(const TDATA *)sin_table, \
(const TDATA *)cos_table, \
(hcStream_t)stream)
#define ROPE_TYPE(TDATA) \
switch (_info.pos_type) { \
case INFINI_DTYPE_U8: \
return CALCULATE_ROPE(TDATA, uint8_t); \
case INFINI_DTYPE_U16: \
return CALCULATE_ROPE(TDATA, uint16_t); \
case INFINI_DTYPE_U32: \
return CALCULATE_ROPE(TDATA, uint32_t); \
case INFINI_DTYPE_U64: \
return CALCULATE_ROPE(TDATA, uint64_t); \
case INFINI_DTYPE_I8: \
return CALCULATE_ROPE(TDATA, int8_t); \
case INFINI_DTYPE_I16: \
return CALCULATE_ROPE(TDATA, int16_t); \
case INFINI_DTYPE_I32: \
return CALCULATE_ROPE(TDATA, int32_t); \
case INFINI_DTYPE_I64: \
return CALCULATE_ROPE(TDATA, int64_t); \
default: \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *pos_ids,
const void *sin_table,
const void *cos_table,
void *stream) const {
switch (_info.data_type) {
case INFINI_DTYPE_F16:
ROPE_TYPE(half);
case INFINI_DTYPE_BF16:
ROPE_TYPE(cuda_bfloat16);
case INFINI_DTYPE_F32:
ROPE_TYPE(float);
case INFINI_DTYPE_F64:
ROPE_TYPE(double);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
#undef ROPE_TYPE
#undef CALCULATE_ROPE
} // namespace op::rope::metax
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "rope_v2_nvidia.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
namespace op::rope_v2::nvidia {
template <typename Tdata, typename Tindex, typename Tangle>
INFINIOP_CUDA_KERNEL ropev2ThreadPerItemKernel(
Tdata *y_,
const Tdata *x_,
const Tindex *__restrict__ pos_ids,
const Tangle *__restrict__ sin_table,
const Tangle *__restrict__ cos_table,
size_t table_dim,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead) {
ropeThreadPerItemBlock(
y_, x_, pos_ids,
sin_table, cos_table,
table_dim,
y_stride_seqlen, y_stride_nhead,
x_stride_seqlen, x_stride_nhead);
}
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t pos_desc,
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto info = RoPEv2Info::createRoPEv2Info(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
CHECK_RESULT(info);
// Create descriptor
*desc_ptr = new Descriptor(
info.take(),
0,
new Opaque{handle->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename Tdata, typename Tindex>
infiniStatus_t calculateRoPEv2(const RoPEv2Info &info,
int block_size,
Tdata *y,
const Tdata *x,
const Tindex *pos_ids,
const Tdata *sin_table,
const Tdata *cos_table,
cudaStream_t stream) {
// One block per (sequence position, head); threads stride over table_dim
auto dimx = uint32_t(info.seqlen),
     dimy = uint32_t(info.nhead);
int nthreads = std::min(int(info.table_dim), block_size); // never exceed the device's block limit
ropev2ThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
y, x, pos_ids, sin_table, cos_table, info.table_dim,
info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE_ROPE_V2(TDATA, TINDEX) \
calculateRoPEv2(_info, \
_opaque->internal->maxThreadsPerBlock(), \
(TDATA *)y, \
(const TDATA *)x, \
(const TINDEX *)pos_ids, \
(const TDATA *)sin_table, \
(const TDATA *)cos_table, \
(cudaStream_t)stream)
#define ROPE_TYPE(TDATA) \
switch (_info.pos_type) { \
case INFINI_DTYPE_U8: \
return CALCULATE_ROPE_V2(TDATA, uint8_t); \
case INFINI_DTYPE_U16: \
return CALCULATE_ROPE_V2(TDATA, uint16_t); \
case INFINI_DTYPE_U32: \
return CALCULATE_ROPE_V2(TDATA, uint32_t); \
case INFINI_DTYPE_U64: \
return CALCULATE_ROPE_V2(TDATA, uint64_t); \
case INFINI_DTYPE_I8: \
return CALCULATE_ROPE_V2(TDATA, int8_t); \
case INFINI_DTYPE_I16: \
return CALCULATE_ROPE_V2(TDATA, int16_t); \
case INFINI_DTYPE_I32: \
return CALCULATE_ROPE_V2(TDATA, int32_t); \
case INFINI_DTYPE_I64: \
return CALCULATE_ROPE_V2(TDATA, int64_t); \
default: \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *pos_ids,
const void *sin_table,
const void *cos_table,
void *stream) const {
switch (_info.data_type) {
case INFINI_DTYPE_F16:
ROPE_TYPE(half);
case INFINI_DTYPE_BF16:
ROPE_TYPE(cuda_bfloat16);
case INFINI_DTYPE_F32:
ROPE_TYPE(float);
case INFINI_DTYPE_F64:
ROPE_TYPE(double);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
#undef ROPE_TYPE
#undef CALCULATE_ROPE_V2
} // namespace op::rope_v2::nvidia
#ifndef __INFINIOP_ROPE_V2_CUDA_H__
#define __INFINIOP_ROPE_V2_CUDA_H__
#include "../rope_v2.h"
DESCRIPTOR(nvidia)
#endif // __INFINIOP_ROPE_V2_CUDA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/rope_v2.h"
#ifdef ENABLE_CPU_API
#include "cpu/rope_v2_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/rope_v2_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rope_v2_ascend.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/rope_v2_bang.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/rope_v2_metax.h"
#endif
__C infiniStatus_t infiniopCreateRoPEv2Descriptor(
infiniopHandle_t handle,
infiniopRoPEv2Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
infiniopTensorDescriptor_t pos_ids,
infiniopTensorDescriptor_t sin_table,
infiniopTensorDescriptor_t cos_table) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::rope_v2::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::rope_v2::NAMESPACE::Descriptor **>(desc_ptr), \
y, \
x, \
pos_ids, \
sin_table, \
cos_table)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
    // NOTE: the legacy MUSA RoPE API rotates a single tensor in place;
    // y is passed as that tensor here.
    return musaCreateRoPEDescriptor((MusaHandle_t)handle,
                                    (RoPEMusaDescriptor_t *)desc_ptr, y,
                                    pos_ids, sin_table, cos_table);
}
#endif
}
#undef CREATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopGetRoPEv2WorkspaceSize(infiniopRoPEv2Descriptor_t desc,
size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<const op::rope_v2::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_API
GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaGetRoPEWorkspaceSize((RoPEMusaDescriptor_t)desc, size);
}
#endif
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopRoPEv2(
infiniopRoPEv2Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *pos_ids,
const void *sin_table,
const void *cos_table,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::rope_v2::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, x, pos_ids, sin_table, cos_table, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_API
CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
    // NOTE: legacy in-place MUSA API; y is used as the single tensor argument.
    return musaRoPE((RoPEMusaDescriptor_t)desc, workspace, workspace_size,
                    y, pos_ids, sin_table, cos_table, stream);
}
#endif
}
#undef CALCULATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t
infiniopDestroyRoPEv2Descriptor(infiniopRoPEv2Descriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::rope_v2::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaDestroyRoPEDescriptor((RoPEMusaDescriptor_t)desc);
}
#endif
}
#undef DELETE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#ifndef __ROPE_V2_H__
#define __ROPE_V2_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::rope_v2::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
RoPEv2Info _info; \
size_t _workspace_size; \
\
Descriptor( \
RoPEv2Info info, \
size_t workspace_size_, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t pos_desc, \
infiniopTensorDescriptor_t sin_desc, \
infiniopTensorDescriptor_t cos_desc); \
\
infiniStatus_t calculate( \
void *workspace, \
size_t workspace_size, \
void *y, \
const void *x, \
const void *pos_ids, \
const void *sin_table, \
const void *cos_table, \
void *stream) const; \
}; \
}
class RoPEv2Info {
private:
RoPEv2Info() = default;
public:
infiniDtype_t data_type, pos_type;
size_t seqlen, nhead, dhead, table_len, table_dim;
ptrdiff_t
y_stride_seqlen,
y_stride_nhead,
x_stride_seqlen,
x_stride_nhead;
static utils::Result<RoPEv2Info> createRoPEv2Info(
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t pos_desc,
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
CHECK_OR_RETURN(
    y_desc != nullptr && x_desc != nullptr && pos_desc != nullptr && sin_desc != nullptr && cos_desc != nullptr,
    INFINI_STATUS_NULL_POINTER);
const infiniDtype_t data_type = y_desc->dtype();
const infiniDtype_t pos_type = pos_desc->dtype();
CHECK_OR_RETURN(data_type == x_desc->dtype() && data_type == sin_desc->dtype() && data_type == cos_desc->dtype(),
INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_DTYPE_ANY_INT(pos_type);
CHECK_OR_RETURN(y_desc->ndim() == 3
&& x_desc->ndim() == 3
&& pos_desc->ndim() == 1
&& sin_desc->ndim() == 2
&& cos_desc->ndim() == 2,
INFINI_STATUS_BAD_TENSOR_SHAPE);
const auto seqlen = y_desc->dim(0),
nhead = y_desc->dim(1),
dhead = y_desc->dim(2),
table_len = sin_desc->dim(0),
table_dim = sin_desc->dim(1);
CHECK_OR_RETURN(seqlen == x_desc->dim(0)
&& seqlen == pos_desc->dim(0)
&& nhead == x_desc->dim(1) && dhead == x_desc->dim(2)
&& table_len == cos_desc->dim(0) && table_dim == cos_desc->dim(1),
INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(dhead == table_dim * 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
// Last dimension of x and y must be contiguous
CHECK_OR_RETURN(y_desc->stride(2) == 1 && x_desc->stride(2) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES);
// sin table and cos table must be totally contiguous
CHECK_OR_RETURN(sin_desc->isContiguous() && cos_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES);
return utils::Result<RoPEv2Info>(RoPEv2Info{
data_type,
pos_type,
seqlen,
nhead,
dhead,
table_len,
table_dim,
y_desc->stride(0),
y_desc->stride(1),
x_desc->stride(0),
x_desc->stride(1),
});
}
};
#endif // __ROPE_V2_H__
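// A worked example of the shape contract enforced above, with hypothetical sizes
// chosen purely for illustration: y and x are [seqlen, nhead, dhead] with a
// contiguous last dimension, pos_ids is [seqlen] of any integer dtype, and
// sin/cos are fully contiguous [table_len, table_dim].
#include <cstddef>
namespace rope_v2_shape_example {
constexpr std::size_t seqlen = 16, nhead = 8, dhead = 128; // y, x: [16, 8, 128]
constexpr std::size_t table_len = 4096, table_dim = 64;    // sin, cos: [4096, 64]
static_assert(seqlen > 0 && nhead > 0 && table_len > 0, "dimensions must be non-zero");
static_assert(dhead == 2 * table_dim, "createRoPEv2Info requires dhead == 2 * table_dim");
} // namespace rope_v2_shape_example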
#include "topkrouter_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
namespace op::topkrouter::cpu {
Descriptor::~Descriptor() {}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t correction_bias_desc) {
return INFINI_STATUS_NOT_IMPLEMENTED;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
float *values, int *indices, void *x, float *correction_bias,
float routed_scaling_factor,
size_t topk,
void *stream) const {
return INFINI_STATUS_NOT_IMPLEMENTED;
}
} // namespace op::topkrouter::cpu
#ifndef __Topkrouter_CPU_H__
#define __Topkrouter_CPU_H__
#include "../topkrouter.h"
DESCRIPTOR(cpu)
#endif
#ifndef _Topkrouter_KERNEL_CUH__
#define _Topkrouter_KERNEL_CUH__
#include <cfloat>
#include <cub/block/block_load.cuh>
#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_store.cuh>
#include <cub/cub.cuh>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
template <typename T>
inline __device__ float exp_func(T x) {
float data;
if constexpr (std::is_same_v<T, float>) {
data = x;
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
data = __bfloat162float(x);
} else if constexpr (std::is_same_v<T, half>) {
data = __half2float(x);
}
return __expf(data);
}
template <typename T>
inline __device__ float sigmoid_func(T x) {
    // sigmoid(x) = 1 / (1 + exp(-x)), computed in float regardless of T
    return 1.0f / (1.0f + exp_func<T>(-x));
}
struct CustomLess {
    // Returns "greater than" so that Sort() produces a descending order
    template <typename DataType>
    __device__ bool operator()(const DataType &lhs, const DataType &rhs) {
        return lhs > rhs;
    }
};
//
// DeepSeek-style top-k expert routing
//
template <typename T, int BLOCK_THREADS = 256>
__global__ void topkrouter_kernel(float *values_topk,        // output values, shape [N, topk]
                                  int *indices_topk,         // output indices, shape [N, topk]
                                  T *input,                  // input logits, shape [N, width]
                                  float *d_correction_bias,  // correction bias, shape [width]
                                  float routed_scaling_factor, // scale applied to the normalized weights
                                  const size_t N,     // number of rows (tokens)
                                  const size_t width, // elements per row (number of experts)
                                  const size_t topk
) {
const int bid = blockIdx.x;
if (bid >= N) {
return;
}
const int tid = threadIdx.x;
const T *data_input = input + bid * width;
float *values_topk_output = values_topk + bid * topk;
int *indices_topk_output = indices_topk + bid * topk;
constexpr int warp_threads = 32;
constexpr int block_threads = BLOCK_THREADS; // kernel assumes blockDim.x == BLOCK_THREADS (256 here)
constexpr int warps_per_block = block_threads / warp_threads;
const int warp_id = tid / warp_threads;
const int lane_id = tid % warp_threads;
__shared__ float share_data[256];
__shared__ float share_data_group[8];
__shared__ float share_data_group_mask[8]; // mask of the selected (valid) groups
__shared__ float share_sum;
if (tid < 8) {
share_data_group_mask[tid] = 0.0f;
}
// ------------------------------------------------------ //
// Apply sigmoid to the input                              //
// ------------------------------------------------------ //
float value = sigmoid_func(data_input[tid]);
// ------------------------------------------------------ //
// Add the correction bias                                 //
// ------------------------------------------------------ //
value += d_correction_bias[tid];
// ------------------------------------------------------------- //
// One warp per group (8 groups); find each group's top-2 values  //
// ------------------------------------------------------------- //
float thread_values[1] = {value};
int thread_indices[1] = {tid};
using WarpMergeSortT = cub::WarpMergeSort<float, 1, warp_threads, int>;
{
__shared__ typename WarpMergeSortT::TempStorage temp_storage[warps_per_block];
WarpMergeSortT(temp_storage[warp_id]).Sort(thread_values, thread_indices, CustomLess());
}
__syncthreads();
share_data[tid] = thread_values[0];
// ------------------------------------------------------------- //
// Per-group score: sum of the top-2 values in each group         //
// ------------------------------------------------------------- //
__syncthreads();
if (0 == lane_id) {
share_data_group[warp_id] = share_data[warp_id * warp_threads] + share_data[warp_id * warp_threads + 1];
}
__syncthreads();
// ------------------------------------------------------------- //
// Select the top-4 groups                                        //
// ------------------------------------------------------------- //
if (0 == warp_id) {
float thread_values[1] = {-FLT_MAX};
int thread_indices[1] = {-1};
if (lane_id < 8) {
thread_values[0] = share_data_group[lane_id];
thread_indices[0] = lane_id;
}
__shared__ typename WarpMergeSortT::TempStorage temp_storage[1];
WarpMergeSortT(temp_storage[0]).Sort(thread_values, thread_indices, CustomLess());
if (lane_id < 4) {
int indices = thread_indices[0];
share_data_group_mask[indices] = 1.0f;
}
}
__syncthreads();
// ------------------------------------------------------------- //
// Final top-k over the group-masked values                       //
// ------------------------------------------------------------- //
value = value * share_data_group_mask[warp_id];
thread_values[0] = value;
thread_indices[0] = tid;
{
typedef cub::BlockRadixSort<float, BLOCK_THREADS, 1, int> BlockRadixSort;
__shared__ typename BlockRadixSort::TempStorage temp_storage;
BlockRadixSort(temp_storage).SortDescending(thread_values, thread_indices);
}
__syncthreads();
// ------------------------------------------------------------- //
// Normalize the selected weights                                 //
// ------------------------------------------------------------- //
if (0 == warp_id) {
value = 0.0f;
if (tid < 8) { // assumes topk == 8
int index = thread_indices[0];
value = sigmoid_func(data_input[index]);
}
typedef cub::WarpReduce<float, warp_threads> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage;
// partial reduction over the selected entries (threads 8..31 contribute 0)
float warp_sum = WarpReduce(temp_storage).Sum(value);
if (0 == tid) {
share_sum = warp_sum + 1e-20f; // guard against division by zero
}
__syncwarp();
if (tid < 8) { // write the topk (== 8) outputs
int index = thread_indices[0];
indices_topk_output[tid] = index;
values_topk_output[tid] = routed_scaling_factor * value / share_sum;
}
}
}
#endif // _Topkrouter_KERNEL_CUH__
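// For reference, a minimal host-side sketch of the routing math above for a single
// token, under the kernel's fixed geometry (width == 256, 8 groups of 32 experts,
// top-4 groups, topk == 8). It mirrors the kernel's multiplicative group masking
// and its normalization by the unbiased sigmoid scores; illustrative only.
#include <algorithm>
#include <array>
#include <cmath>
#include <functional>
#include <utility>

inline void topkrouter_reference(const std::array<float, 256> &logits,
                                 const std::array<float, 256> &bias,
                                 float routed_scaling_factor,
                                 std::array<float, 8> &values,
                                 std::array<int, 8> &indices) {
    std::array<float, 256> s, b;
    for (int i = 0; i < 256; ++i) {
        s[i] = 1.0f / (1.0f + std::exp(-logits[i])); // sigmoid score
        b[i] = s[i] + bias[i];                       // biased score (used for selection only)
    }
    // Rank the 8 groups by the sum of their top-2 biased scores; keep the best 4.
    std::array<std::pair<float, int>, 8> groups;
    for (int g = 0; g < 8; ++g) {
        std::array<float, 32> w;
        std::copy(b.begin() + g * 32, b.begin() + (g + 1) * 32, w.begin());
        std::partial_sort(w.begin(), w.begin() + 2, w.end(), std::greater<float>());
        groups[g] = {w[0] + w[1], g};
    }
    std::partial_sort(groups.begin(), groups.begin() + 4, groups.end(),
                      [](const auto &l, const auto &r) { return l.first > r.first; });
    std::array<float, 8> mask{};
    for (int k = 0; k < 4; ++k) {
        mask[groups[k].second] = 1.0f;
    }
    // Final top-8 over the group-masked biased scores.
    std::array<int, 256> order;
    for (int i = 0; i < 256; ++i) {
        order[i] = i;
    }
    std::partial_sort(order.begin(), order.begin() + 8, order.end(),
                      [&](int l, int r) { return b[l] * mask[l / 32] > b[r] * mask[r / 32]; });
    // Normalize the selected experts' unbiased sigmoid scores.
    float sum = 1e-20f;
    for (int k = 0; k < 8; ++k) {
        sum += s[order[k]];
    }
    for (int k = 0; k < 8; ++k) {
        indices[k] = order[k];
        values[k] = routed_scaling_factor * s[order[k]] / sum;
    }
}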
#ifndef __topkrouter_INFO_H__
#define __topkrouter_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::topkrouter {
class TopkrouterInfo {
TopkrouterInfo() = default;
public:
infiniDtype_t xtype;
std::vector<size_t> shape;
std::vector<ptrdiff_t> x_strides;
size_t N;
size_t width;
public:
size_t ndim() const { return shape.size(); }
size_t dim() const { return shape[ndim() - 1]; }
static utils::Result<TopkrouterInfo> create(infiniopTensorDescriptor_t x_desc) {
auto xtype = x_desc->dtype();
if ((xtype != infiniDtype_t::INFINI_DTYPE_F32) && (xtype != infiniDtype_t::INFINI_DTYPE_F16) && (xtype != infiniDtype_t::INFINI_DTYPE_BF16)) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
if (x_desc->ndim() != 2) { // check rank before indexing shape()
    return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
size_t N = x_desc->shape()[0];     // number of tokens
size_t width = x_desc->shape()[1]; // number of experts
return utils::Result<TopkrouterInfo>(TopkrouterInfo{xtype, x_desc->shape(), x_desc->strides(), N, width});
}
};
} // namespace op::topkrouter
#endif // __topkrouter_INFO_H__
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "topkrouter_nvidia.cuh"
#include <cub/block/block_reduce.cuh>
namespace op::topkrouter::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t correction_bias_desc) {
auto result = TopkrouterInfo::create(x_desc);
CHECK_RESULT(result);
auto info = result.take();
if (info.x_strides[1] != 1) {
return INFINI_STATUS_BAD_TENSOR_STRIDES;
}
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <int BLOCK_SIZE = 128>
infiniStatus_t launch_topkrouter(float *d_values_out, int *d_indices_out, void *d_input, float *d_correction_bias, float routed_scaling_factor,
size_t N, size_t width, size_t topk, infiniDtype_t xtype, cudaStream_t stream) {
const int block_threads = BLOCK_SIZE;
dim3 blocks(N);
dim3 threads(block_threads);
if (xtype == INFINI_DTYPE_F32) {
topkrouter_kernel<float, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, (float *)d_input, d_correction_bias, routed_scaling_factor, N, width, topk);
} else if (xtype == INFINI_DTYPE_F16) {
topkrouter_kernel<half, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, (half *)d_input, d_correction_bias, routed_scaling_factor, N, width, topk);
} else if (xtype == INFINI_DTYPE_BF16) {
topkrouter_kernel<__nv_bfloat16, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, (__nv_bfloat16 *)d_input, d_correction_bias, routed_scaling_factor, N, width, topk);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
float *values, int *indices, void *x, float *correction_bias, float routed_scaling_factor, size_t topk, void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
size_t N = _info.N;
size_t width = _info.width; // 256
// size_t n_routed_experts = 256;
// size_t n_group = 8;
// size_t topk_group = 4;
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
if (256 == width) {
    // Propagate the launch status (e.g. unsupported dtype) rather than dropping it
    return launch_topkrouter<256>(values, indices, x, correction_bias, routed_scaling_factor, N, width, topk, _info.xtype, cuda_stream);
} else {
return INFINI_STATUS_INTERNAL_ERROR;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::topkrouter::nvidia
#ifndef __Topkrouter_CUDA_H__
#define __Topkrouter_CUDA_H__
#include "../topkrouter.h"
DESCRIPTOR(nvidia)
#endif
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/topkrouter.h"
#ifdef ENABLE_CPU_API
#include "cpu/topkrouter_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API)
#include "nvidia/topkrouter_nvidia.cuh"
#endif
__C infiniStatus_t infiniopCreateTopkrouterDescriptor(
infiniopHandle_t handle,
infiniopTopkrouterDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t correction_bias_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::topkrouter::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor **>(desc_ptr), \
x_desc, correction_bias_desc)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
}
#undef CREATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void *workspace, size_t workspace_size,
void *values, void *indices, void *x, void *correction_bias, float routed_scaling_factor, size_t topk, void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, (float *)values, (int *)indices, x, (float *)correction_bias, routed_scaling_factor, topk, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
}
#undef CALCULATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescriptor_t desc) {
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
}
#undef DESTROY
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#ifndef _Topkrouter_H_
#define _Topkrouter_H_
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::topkrouter::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
TopkrouterInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
TopkrouterInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t correction_bias_desc); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
float *values, \
int *indices, \
void *x, \
float *correction_bias, \
float routed_scaling_factor, \
size_t topk, \
void *stream) const; \
}; \
}
#endif // _Topkrouter_H_
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
# NOTE: these gemm-style cases only drive the harness loop; the dequantize test
# below uses fixed AWQ shapes regardless of the per-case parameters.
_TEST_CASES = [
# alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride
(1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None),
(1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None),
(1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1)),
(1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
(1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
]
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 0, "rtol": 1e-2},
InfiniDtype.F32: {"atol": 0, "rtol": 1e-3},
InfiniDtype.BF16: {"atol": 0, "rtol": 5e-2},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# PyTorch implementation for matrix multiplication
def gemm(d, _c, beta, _a, _b, alpha):
    try:
        if _c.ndim == 2:
            torch.addmm(_c, _a, _b, beta=beta, alpha=alpha, out=d)
        elif _c.ndim == 3:
            torch.baddbmm(_c, _a, _b, beta=beta, alpha=alpha, out=d)
        else:
            raise ValueError("unsupported ndim for addmm/baddbmm")
    except Exception:
        # Fallback: plain matmul, then apply alpha/beta manually
        torch.matmul(_a, _b, out=d)
        d.mul_(alpha).add_(_c, alpha=beta)
# The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES
def test(
handle,
device,
alpha,
beta,
a_shape,
b_shape,
c_shape,
a_stride=None,
b_stride=None,
c_stride=None,
dtype=InfiniDtype.F16,
sync=None,
):
    print(
        f"Testing Dequantize on {InfiniDeviceNames[device]} with dtype:{InfiniDtypeNames[dtype]}"
        f" (gemm-style case parameters are accepted by the harness but unused here)"
    )
    # AWQ int4 layout: K = 8192, N = 2048, group_size = 8192 / 64 = 128
    qweight = TestTensor((8192, 256), None, InfiniDtype.I32, device, mode="randint")  # [K, N/8]: 8 nibbles per int32
    scales = TestTensor((64, 2048), None, InfiniDtype.F16, device)  # [K/group_size, N]
    zeros = TestTensor((64, 256), None, InfiniDtype.I32, device, mode="zeros")  # [K/group_size, N/8]
    out = TestTensor((8192, 2048), None, InfiniDtype.F16, device, mode="zeros")  # [K, N]
print(out.actual_tensor())
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateDequantizeDescriptor(
handle,
ctypes.byref(descriptor),
out.descriptor,
qweight.descriptor,
scales.descriptor,
zeros.descriptor,
)
)
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    # for tensor in [out, qweight, scales, zeros]:
    #     tensor.destroy_desc()
# Get workspace size and create workspace
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetDequantizeWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, device)
    # Execute the infiniop dequantize operator
def lib_dequantize():
check_error(
LIBINFINIOP.infiniopDequantize(
descriptor,
workspace.data(),
workspace_size.value,
out.data(),
qweight.data(),
scales.data(),
zeros.data(),
0,
0,
0,
None,
)
)
lib_dequantize()
print(out.actual_tensor())
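    # For a sanity check, a minimal PyTorch sketch of AWQ int4 dequantization under
    # common assumptions (8 nibbles per int32, AWQ nibble order [0, 4, 1, 5, 2, 6, 3, 7],
    # group_size = K // number of scale rows); a reference to compare against,
    # not the library's kernel.
    def awq_dequantize_reference(qweight_t, scales_t, zeros_t):
        K = qweight_t.shape[0]
        group_size = K // scales_t.shape[0]
        shifts = torch.arange(0, 32, 4, device=qweight_t.device)
        order = torch.tensor([0, 4, 1, 5, 2, 6, 3, 7], device=qweight_t.device)
        # Unpack 8 nibbles per int32 and undo the interleaved packing order
        iw = (torch.bitwise_right_shift(qweight_t[:, :, None], shifts) & 0xF)[:, :, order]
        iz = (torch.bitwise_right_shift(zeros_t[:, :, None], shifts) & 0xF)[:, :, order]
        iw = iw.reshape(K, -1).float()
        iz = iz.reshape(zeros_t.shape[0], -1).float()
        # Broadcast per-group zeros and scales over group_size rows, then dequantize
        return ((iw - iz.repeat_interleave(group_size, dim=0))
                * scales_t.float().repeat_interleave(group_size, dim=0)).half()
    # e.g. compare awq_dequantize_reference(qweight.torch_tensor(),
    #      scales.torch_tensor(), zeros.torch_tensor()) against out.actual_tensor()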
    # # Validate results (e.g. against awq_dequantize_reference above)
    # atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    # ans = awq_dequantize_reference(qweight.torch_tensor(), scales.torch_tensor(), zeros.torch_tensor())
    # if DEBUG:
    #     debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
    # assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
    # # Profiling workflow
    # if PROFILE:
    #     profile_operation("   lib", lambda: lib_dequantize(), device, NUM_PRERUN, NUM_ITERATIONS)
    check_error(LIBINFINIOP.infiniopDestroyDequantizeDescriptor(descriptor))
# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == "__main__":
args = get_args()
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
# Execute tests
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
@@ -387,6 +387,42 @@ def rope_(lib):
]
@OpRegister.operator
def rope_v2_(lib):
lib.infiniopCreateRoPEv2Descriptor.restype = c_int32
    lib.infiniopCreateRoPEv2Descriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # y
        infiniopTensorDescriptor_t,  # x
        infiniopTensorDescriptor_t,  # pos_ids
        infiniopTensorDescriptor_t,  # sin_table
        infiniopTensorDescriptor_t,  # cos_table
    ]
lib.infiniopGetRoPEv2WorkspaceSize.restype = c_int32
lib.infiniopGetRoPEv2WorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
    lib.infiniopRoPEv2.restype = c_int32
    lib.infiniopRoPEv2.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace_size
        c_void_p,  # y
        c_void_p,  # x
        c_void_p,  # pos_ids
        c_void_p,  # sin_table
        c_void_p,  # cos_table
        c_void_p,  # stream
    ]
lib.infiniopDestroyRoPEv2Descriptor.restype = c_int32
lib.infiniopDestroyRoPEv2Descriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def sub_(lib):
lib.infiniopCreateSubDescriptor.restype = c_int32
@@ -489,3 +525,74 @@ def conv_(lib):
lib.infiniopDestroyConvDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def topkrouter_(lib):
lib.infiniopCreateTopkrouterDescriptor.restype = c_int32
    lib.infiniopCreateTopkrouterDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # x
        infiniopTensorDescriptor_t,  # correction_bias
    ]
lib.infiniopGetTopkrouterWorkspaceSize.restype = c_int32
lib.infiniopGetTopkrouterWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopTopkrouter.restype = c_int32
lib.infiniopTopkrouter.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_float,
c_size_t,
c_void_p,
]
lib.infiniopDestroyTopkrouterDescriptor.restype = c_int32
lib.infiniopDestroyTopkrouterDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
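# A hypothetical end-to-end call through the bindings registered above; the tensor
# arguments are assumed to expose .descriptor and .data() like the test utilities'
# TestTensor, and a zero return status is assumed to mean success. Illustrative only.
def _example_topkrouter_call(lib, handle, x, correction_bias, values, indices,
                             routed_scaling_factor=2.5, topk=8):
    import ctypes
    from ctypes import c_size_t
    desc = infiniopOperatorDescriptor_t()
    assert lib.infiniopCreateTopkrouterDescriptor(
        handle, ctypes.byref(desc), x.descriptor, correction_bias.descriptor) == 0
    size = c_size_t(0)
    assert lib.infiniopGetTopkrouterWorkspaceSize(desc, ctypes.byref(size)) == 0
    # A real caller would allocate size.value bytes of device workspace here;
    # the NVIDIA implementation above reports a zero-sized workspace.
    assert lib.infiniopTopkrouter(
        desc, None, size.value,
        values.data(), indices.data(), x.data(), correction_bias.data(),
        routed_scaling_factor, topk, None) == 0
    assert lib.infiniopDestroyTopkrouterDescriptor(desc) == 0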
@OpRegister.operator
def dequantize_(lib):
lib.infiniopCreateDequantizeDescriptor.restype = c_int32
lib.infiniopCreateDequantizeDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetDequantizeWorkspaceSize.restype = c_int32
lib.infiniopGetDequantizeWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
    lib.infiniopDequantize.restype = c_int32
    # Pointer/size layout matches the ten arguments passed in the test above
    lib.infiniopDequantize.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace_size
        c_void_p,  # out
        c_void_p,  # qweight
        c_void_p,  # scales
        c_void_p,  # zeros
        c_size_t,
        c_size_t,
        c_size_t,
        c_void_p,  # stream
    ]
lib.infiniopDestroyDequantizeDescriptor.restype = c_int32
lib.infiniopDestroyDequantizeDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@@ -78,6 +78,8 @@ class TestTensor(CTensor):
self._torch_tensor = torch.ones(
torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
)
elif mode == "randint":
self._torch_tensor = torch.randint(-2000000000,2000000000, torch_shape,dtype=to_torch_dtype(dt), device=torch_device_map[device])
elif mode == "manual":
assert set_tensor is not None
assert torch_shape == list(set_tensor.shape)
@@ -37,7 +37,7 @@ _TEST_CASES_ = [
# w (weight) types
# Note: 'None' means the same as input dtype
_WEIGHT_DTYPES = [None, InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16]
# x types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16]