issue/251: 摩尔平台 rope 算子开发

9c87dbb1 · spike-zhu · GitHub · f2bb97ad · 9c87dbb1 · 9c87dbb1
Unverified Commit 9c87dbb1 authored Aug 27, 2025 by spike-zhu Committed by GitHub Aug 27, 2025
4 changed files
--- a/src/infiniop/ops/rope/moore/rope_kernel_moore.h
+++ b/src/infiniop/ops/rope/moore/rope_kernel_moore.h
+#ifndef __INFINIOP_ROPE_MOORE_KERNEL_H__
+#define __INFINIOP_ROPE_MOORE_KERNEL_H__
+
+/*
+ * This file contains the RoPE operation implementation for the MUSA backend.
+ *
+ * It maintain a consistent code structure and interface with the CUDA implementation,
+ * which ensuring code alignment across different hardware platforms.
+ */
+
+template <typename Tdata, typename Tindex, typename Tangle>
+__device__ void ropeThreadPerItemBlock(
+    Tdata *y_,
+    const Tdata *x_,
+    const Tindex *__restrict__ pos_ids,
+    const Tangle *__restrict__ sin_table,
+    const Tangle *__restrict__ cos_table,
+    size_t table_dim,
+    ptrdiff_t y_stride_seqlen,
+    ptrdiff_t y_stride_nhead,
+    ptrdiff_t x_stride_seqlen,
+    ptrdiff_t x_stride_nhead) {
+
+    auto y_offset = blockIdx.x * y_stride_seqlen + blockIdx.y * y_stride_nhead;
+    auto x_offset = blockIdx.x * x_stride_seqlen + blockIdx.y * x_stride_nhead;
+    size_t pos_id = size_t(pos_ids[blockIdx.x]);
+    auto table_offset = pos_id * table_dim;
+
+    for (size_t i = threadIdx.x; i < table_dim; i += blockDim.x) {
+        Tangle sin__ = sin_table[table_offset + i],
+               cos__ = cos_table[table_offset + i];
+        if constexpr (std::is_same<Tdata, half>::value) {
+            auto &y = reinterpret_cast<half2 &>(y_[y_offset + 2 * i]);
+            auto &x = reinterpret_cast<const half2 &>(x_[x_offset + 2 * i]);
+            Tangle y0 = x.x * cos__ - x.y * sin__,
+                   y1 = x.x * sin__ + x.y * cos__;
+            y = half2(y0, y1);
+        } else if constexpr (std::is_same<Tdata, cuda_bfloat16>::value) {
+            auto &y = reinterpret_cast<cuda_bfloat162 &>(y_[y_offset + 2 * i]);
+            auto &x = reinterpret_cast<const cuda_bfloat162 &>(x_[x_offset + 2 * i]);
+
+            /*
+             * The original code used CUDA-specific functions (__low2bfloat16, __high2bfloat16)
+             * to extract bfloat16 values from a packed variable.
+             *
+             * This code has been modified for the MUSA platform, which does not support
+             * these CUDA built-in functions. Instead, MUSA provides a different set of
+             * built-in functions (`__low2float`, `__high2float`) that directly convert
+             * the bfloat16 values to float.
+             *
+             * This change ensures cross-platform compatibility and resolves compilation errors.
+             */
+
+            Tangle x0 = __low2float(x);
+            Tangle x1 = __high2float(x);
+
+            Tangle y0 = x0 * cos__ - x1 * sin__;
+            Tangle y1 = x0 * sin__ + x1 * cos__;
+
+            y = __floats2bfloat162_rn(y0, y1);
+        } else {
+            Tangle x0 = x_[x_offset + 2 * i],
+                   x1 = x_[x_offset + 2 * i + 1];
+            y_[y_offset + 2 * i] = Tdata(x0 * cos__ - x1 * sin__);
+            y_[y_offset + 2 * i + 1] = Tdata(x0 * sin__ + x1 * cos__);
+        }
+    }
+}
+
+#endif
--- a/src/infiniop/ops/rope/moore/rope_moore.h
+++ b/src/infiniop/ops/rope/moore/rope_moore.h
+#ifndef __INFINIOP_ROPE_MOORE_H__
+#define __INFINIOP_ROPE_MOORE_H__
+
+#include "../rope.h"
+
+DESCRIPTOR(moore)
+
+#endif // __INFINIOP_ROPE_MOORE_H__
--- a/src/infiniop/ops/rope/moore/rope_moore.mu
+++ b/src/infiniop/ops/rope/moore/rope_moore.mu
+#include "../../../devices/moore/moore_common.h"
+#include "rope_moore.h"
+
+#include "../../../devices/moore/moore_kernel_common.h"
+
+#include "rope_kernel_moore.h"
+
+template <typename Tdata, typename Tindex, typename Tangle>
+INFINIOP_MOORE_KERNEL ropeThreadPerItemKernel(
+    Tdata *y_,
+    const Tdata *x_,
+    const Tindex *__restrict__ pos_ids,
+    const Tangle *__restrict__ sin_table,
+    const Tangle *__restrict__ cos_table,
+    size_t table_dim,
+    ptrdiff_t y_stride_seqlen,
+    ptrdiff_t y_stride_nhead,
+    ptrdiff_t x_stride_seqlen,
+    ptrdiff_t x_stride_nhead) {
+    ropeThreadPerItemBlock(
+        y_, x_, pos_ids,
+        sin_table, cos_table,
+        table_dim,
+        y_stride_seqlen, y_stride_nhead,
+        x_stride_seqlen, x_stride_nhead);
+}
+
+namespace op::rope::moore {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::moore::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    infiniopTensorDescriptor_t pos_desc,
+    infiniopTensorDescriptor_t sin_desc,
+    infiniopTensorDescriptor_t cos_desc) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+
+    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
+    CHECK_RESULT(info);
+
+    // Create descriptor
+    *desc_ptr = new Descriptor(
+        info.take(),
+        0,
+        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename Tdata, typename Tindex>
+infiniStatus_t calculateRoPE(const RoPEInfo &info,
+                             int block_size,
+                             Tdata *y,
+                             const Tdata *x,
+                             const Tindex *pos_ids,
+                             const Tdata *sin_table,
+                             const Tdata *cos_table,
+                             musaStream_t stream) {
+    auto dimx = uint32_t(info.seqlen),
+         dimy = uint32_t(info.nhead);
+    int nthreads = std::max(int(info.table_dim), block_size);
+
+    ropeThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
+        y, x, pos_ids, sin_table, cos_table, info.table_dim,
+        info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+#define CALCULATE_ROPE(TDATA, TINDEX)                      \
+    calculateRoPE(_info,                                   \
+                  _opaque->internal->maxThreadsPerBlock(), \
+                  (TDATA *)y,                              \
+                  (const TDATA *)x,                        \
+                  (const TINDEX *)pos_ids,                 \
+                  (const TDATA *)sin_table,                \
+                  (const TDATA *)cos_table,                \
+                  (musaStream_t)stream)
+
+#define ROPE_TYPE(TDATA)                        \
+    switch (_info.pos_type) {                   \
+    case INFINI_DTYPE_U8:                       \
+        return CALCULATE_ROPE(TDATA, uint8_t);  \
+    case INFINI_DTYPE_U16:                      \
+        return CALCULATE_ROPE(TDATA, uint16_t); \
+    case INFINI_DTYPE_U32:                      \
+        return CALCULATE_ROPE(TDATA, uint32_t); \
+    case INFINI_DTYPE_U64:                      \
+        return CALCULATE_ROPE(TDATA, uint64_t); \
+    case INFINI_DTYPE_I8:                       \
+        return CALCULATE_ROPE(TDATA, int8_t);   \
+    case INFINI_DTYPE_I16:                      \
+        return CALCULATE_ROPE(TDATA, int16_t);  \
+    case INFINI_DTYPE_I32:                      \
+        return CALCULATE_ROPE(TDATA, int32_t);  \
+    case INFINI_DTYPE_I64:                      \
+        return CALCULATE_ROPE(TDATA, int64_t);  \
+    default:                                    \
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;  \
+    }
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    const void *pos_ids,
+    const void *sin_table,
+    const void *cos_table,
+    void *stream) const {
+
+    switch (_info.data_type) {
+    case INFINI_DTYPE_F16:
+        ROPE_TYPE(half);
+    case INFINI_DTYPE_BF16:
+        ROPE_TYPE(cuda_bfloat16);
+    case INFINI_DTYPE_F32:
+        ROPE_TYPE(float);
+    case INFINI_DTYPE_F64:
+        ROPE_TYPE(double);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+#undef ROPE_TYPE
+#undef CALCULATE_ROPE
+
+} // namespace op::rope::moore
--- a/src/infiniop/ops/rope/operator.cc
+++ b/src/infiniop/ops/rope/operator.cc
@@ -20,6 +20,9 @@
 #ifdef ENABLE_KUNLUN_API
 #include "kunlun/rope_kunlun.h"
 #endif
+#ifdef ENABLE_MOORE_API
+#include "moore/rope_moore.h"
+#endif

 __C infiniStatus_t infiniopCreateRoPEDescriptor(
    infiniopHandle_t handle,
@@ -51,6 +54,9 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
 #ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
 #ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -62,13 +68,6 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
 #endif
 #ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_MTHREADS_GPU
-    case DevMthreadsGpu: {
-        return musaCreateRoPEDescriptor((MusaHandle_t)handle,
-                                        (RoPEMusaDescriptor_t *)desc_ptr, t,
-                                        pos_ids, sin_table, cos_table);
-    }
 #endif
    }

@@ -94,6 +93,9 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
 #ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
 #ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
 #endif
@@ -105,11 +107,6 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
 #endif
 #ifdef ENABLE_ASCEND_API
        GET(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_MTHREADS_GPU
-    case DevMthreadsGpu: {
-        return musaGetRoPEWorkspaceSize((RoPEMusaDescriptor_t)desc, size);
-    }
 #endif
    }

@@ -144,6 +141,9 @@ __C infiniStatus_t infiniopRoPE(
 #ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
 #ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -155,12 +155,6 @@ __C infiniStatus_t infiniopRoPE(
 #endif
 #ifdef ENABLE_ASCEND_API
        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_MTHREADS_GPU
-    case DevMthreadsGpu: {
-        return musaRoPE((RoPEMusaDescriptor_t)desc, workspace, workspace_size,
-                        t, pos_ids, sin_table, cos_table, stream);
-    }
 #endif
    }

@@ -187,6 +181,9 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
 #ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
 #ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -198,11 +195,6 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
 #endif
 #ifdef ENABLE_ASCEND_API
        DELETE(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_MTHREADS_GPU
-    case DevMthreadsGpu: {
-        return musaDestroyRoPEDescriptor((RoPEMusaDescriptor_t)desc);
-    }
 #endif
    }