Merge pull request #160 from InfiniTensor/issue/40

issue/40: 实现沐曦rms_norm算子

Merge pull request #160 from InfiniTensor/issue/40
issue/40: 实现沐曦rms_norm算子
b985bc5e · PanZezhong1725 · GitHub · c667efbd · 7847868e · b985bc5e
Unverified Commit b985bc5e authored Apr 14, 2025 by PanZezhong1725 Committed by GitHub Apr 14, 2025
6 changed files
--- a/src/infiniop/devices/maca/common_maca.h
+++ b/src/infiniop/devices/maca/common_maca.h
@@ -17,9 +17,24 @@ class Handle::Internal {
    template <typename T>
    using Fn = std::function<infiniStatus_t(T)>;

+    int _warp_size,
+        _max_threads_per_block,
+        _block_size[3],
+        _grid_size[3];
+
 public:
+    Internal(int);
    infiniStatus_t useMcblas(hcStream_t stream, const Fn<hcblasHandle_t> &f) const;
    infiniStatus_t useMcdnn(hcStream_t stream, const Fn<hcdnnHandle_t> &f) const;
+
+    int warpSize() const;
+    int maxThreadsPerBlock() const;
+    int blockSizeX() const;
+    int blockSizeY() const;
+    int blockSizeZ() const;
+    int gridSizeX() const;
+    int gridSizeY() const;
+    int gridSizeZ() const;
 };

 hcdnnDataType_t getHcdnnDtype(infiniDtype_t dt);

--- a/src/infiniop/devices/maca/maca_handle.cc
+++ b/src/infiniop/devices/maca/maca_handle.cc
@@ -3,7 +3,7 @@
 namespace device::maca {
 Handle::Handle(infiniDevice_t device, int device_id)
    : InfiniopHandle{device, device_id},
-      _internal(std::make_shared<Handle::Internal>()) {}
+      _internal(std::make_shared<Handle::Internal>(device_id)) {}

 Handle::Handle(int device_id) : Handle(INFINI_DEVICE_METAX, device_id) {}

@@ -11,6 +11,19 @@ auto Handle::internal() const -> const std::shared_ptr<Internal> & {
    return _internal;
 }

+// Caches the launch limits that hcGetDeviceProperties reports for `device_id`.
+Handle::Internal::Internal(int device_id) {
+    // Zero-init: the query's status is unchecked, so never read indeterminate fields.
+    hcDeviceProp_t prop{};
+    hcGetDeviceProperties(&prop, device_id);
+    _warp_size = prop.warpSize;
+    _max_threads_per_block = prop.maxThreadsPerBlock;
+    for (int axis = 0; axis < 3; ++axis) {
+        _block_size[axis] = prop.maxThreadsDim[axis];
+        _grid_size[axis] = prop.maxGridSize[axis];
+    }
+}
+
 infiniStatus_t Handle::Internal::useMcblas(hcStream_t stream, const Fn<hcblasHandle_t> &f) const {
    auto handle = mcblas_handles.pop();
    if (!handle) {
@@ -33,6 +46,15 @@ infiniStatus_t Handle::Internal::useMcdnn(hcStream_t stream, const Fn<hcdnnHandl
    return INFINI_STATUS_SUCCESS;
 }

+auto Handle::Internal::warpSize() const -> int { return _warp_size; }
+auto Handle::Internal::maxThreadsPerBlock() const -> int { return _max_threads_per_block; }
+auto Handle::Internal::blockSizeX() const -> int { return _block_size[0]; }
+auto Handle::Internal::blockSizeY() const -> int { return _block_size[1]; }
+auto Handle::Internal::blockSizeZ() const -> int { return _block_size[2]; }
+auto Handle::Internal::gridSizeX() const -> int { return _grid_size[0]; }
+auto Handle::Internal::gridSizeY() const -> int { return _grid_size[1]; }
+auto Handle::Internal::gridSizeZ() const -> int { return _grid_size[2]; }
+
 hcdnnDataType_t getHcdnnDtype(infiniDtype_t dt) {
    switch (dt) {
    case INFINI_DTYPE_F16:

--- a/src/infiniop/ops/rms_norm/maca/rms_norm_maca.cuh
+++ b/src/infiniop/ops/rms_norm/maca/rms_norm_maca.cuh
+#ifndef __RMS_NORM_MACA_CUH__
+#define __RMS_NORM_MACA_CUH__
+
+#include "../rms_norm.h"
+
+// Declares the RMSNorm descriptor for the `maca` backend via the DESCRIPTOR macro.
+// NOTE(review): the macro is presumably provided by ../rms_norm.h and expands to
+// op::rms_norm::maca::Descriptor — confirm against that header.
+DESCRIPTOR(maca)
+
+#endif // __RMS_NORM_MACA_CUH__
--- a/src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca
+++ b/src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca
+#include "../../../devices/maca/common_maca.h"
+#include "../cuda/rms_norm_kernel.cuh"
+#include "rms_norm_maca.cuh"
+
+namespace op::rms_norm::maca {
+
+// Backend-private state of the descriptor.
+struct Descriptor::Opaque {
+    // Shared reference to the device handle's Internal object; calculate() reads
+    // maxThreadsPerBlock() from it to pick the kernel block size.
+    std::shared_ptr<device::maca::Handle::Internal> internal;
+};
+
+// Releases the backend-private state.
+// NOTE(review): `_opaque` is presumably the Opaque allocated with `new` in create() — confirm
+// against the Descriptor constructor generated by DESCRIPTOR.
+Descriptor::~Descriptor() {
+    delete _opaque;
+}
+
+// Builds an RMSNorm descriptor for the maca backend.
+//
+// Validates the tensor descriptors through RMSNormInfo::create, rejects layouts whose
+// last dimension is not contiguous, and on success stores a newly allocated Descriptor
+// in `*desc_ptr`.
+//
+// Returns INFINI_STATUS_SUCCESS, INFINI_STATUS_BAD_TENSOR_STRIDES, or whatever status
+// CHECK_RESULT propagates from RMSNormInfo::create.
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    infiniopTensorDescriptor_t w_desc,
+    float epsilon) {
+    auto info_result = RMSNormInfo::create(y_desc, x_desc, w_desc, epsilon);
+    CHECK_RESULT(info_result);
+    auto info = info_result.take();
+
+    // Only a contiguous last dimension is supported for both input and output.
+    if (!(info.x_strides[1] == 1 && info.y_strides[1] == 1)) {
+        return INFINI_STATUS_BAD_TENSOR_STRIDES;
+    }
+
+    // Keep the device handle's internal state alive for as long as the descriptor lives.
+    auto *maca_handle = reinterpret_cast<device::maca::Handle *>(handle);
+    auto *opaque = new Opaque{maca_handle->internal()};
+
+    // NOTE(review): the literal 0 is presumably the required workspace size — confirm
+    // against the Descriptor constructor.
+    *desc_ptr = new Descriptor(opaque, std::move(info), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Selects the rmsnormBlock instantiation matching the activation/weight dtype pair and
+// launches it on `maca_stream` with a grid of `batch_size` blocks of BLOCK_SIZE threads
+// and no dynamic shared memory.
+//
+// Supported (atype, wtype) pairs: (F16, F16), (F16, F32), (F32, F32); accumulation type is
+// always float. Any other pair returns INFINI_STATUS_BAD_TENSOR_DTYPE without launching.
+template <unsigned int BLOCK_SIZE>
+infiniStatus_t launchKernel(
+    uint32_t batch_size, size_t dim,
+    void *y, infiniDtype_t atype, ptrdiff_t stride_y,
+    const void *x, ptrdiff_t stride_x,
+    const void *w, infiniDtype_t wtype,
+    float epsilon,
+    hcStream_t maca_stream) {
+
+// Local helper: casts the untyped buffers to the chosen element types and launches.
+#define RMSNORM_DISPATCH(Tdata, Tweight, Tcompute)                                                  \
+    rmsnormBlock<BLOCK_SIZE, Tdata, Tweight, Tcompute><<<batch_size, BLOCK_SIZE, 0, maca_stream>>>( \
+        reinterpret_cast<Tdata *>(y), stride_y,                                                     \
+        reinterpret_cast<const Tdata *>(x), stride_x,                                               \
+        reinterpret_cast<const Tweight *>(w),                                                       \
+        dim, epsilon)
+
+    if (atype == INFINI_DTYPE_F16) {
+        // Half-precision activations accept half or float weights.
+        if (wtype == INFINI_DTYPE_F16) {
+            RMSNORM_DISPATCH(half, half, float);
+        } else if (wtype == INFINI_DTYPE_F32) {
+            RMSNORM_DISPATCH(half, float, float);
+        } else {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    } else if (atype == INFINI_DTYPE_F32 && wtype == INFINI_DTYPE_F32) {
+        RMSNORM_DISPATCH(float, float, float);
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+#undef RMSNORM_DISPATCH
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Runs RMSNorm on `stream`, reading `x` and `w` and writing `y`.
+//
+// Returns INFINI_STATUS_INSUFFICIENT_WORKSPACE when `workspace_size` is smaller than the
+// descriptor's requirement, INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED when the device's
+// max threads per block is not CUDA_BLOCK_SIZE_1024, otherwise the status of launchKernel.
+infiniStatus_t Descriptor::calculate(
+    void *workspace, size_t workspace_size,
+    void *y, const void *x, const void *w,
+    void *stream) const {
+
+    if (_workspace_size > workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // Only the 1024-thread block configuration is dispatched here.
+    if (_opaque->internal->maxThreadsPerBlock() != CUDA_BLOCK_SIZE_1024) {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+
+    // The leading shape entry becomes the launch grid size; leading strides step between rows.
+    const auto rows = static_cast<uint32_t>(_info.shape[0]);
+    const auto row_stride_y = _info.y_strides[0];
+    const auto row_stride_x = _info.x_strides[0];
+    auto maca_stream = reinterpret_cast<hcStream_t>(stream);
+
+    CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(rows, _info.dim(), y, _info.atype, row_stride_y, x, row_stride_x, w, _info.wtype, _info.epsilon, maca_stream));
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::rms_norm::maca
--- a/src/infiniop/ops/rms_norm/operator.cc
+++ b/src/infiniop/ops/rms_norm/operator.cc
@@ -11,6 +11,9 @@
 #ifdef ENABLE_ASCEND_API
 #include "ascend/rms_norm_aclnn.h"
 #endif
+#ifdef ENABLE_METAX_API
+#include "maca/rms_norm_maca.cuh"
+#endif
 #ifdef ENABLE_MOORE_API
 #include "musa/rms_norm_musa.cuh"
 #endif
@@ -54,10 +57,8 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
 #ifdef ENABLE_ASCEND_API
        CREATE(INFINI_DEVICE_ASCEND, ascend)
 #endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaCreateRMSNormDescriptor((MacaHandle_t)handle, (RMSNormMacaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
-    }
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, maca)
 #endif
 #ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, musa)
@@ -94,10 +95,8 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
 #ifdef ENABLE_ASCEND_API
        GET(INFINI_DEVICE_ASCEND, ascend)
 #endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t)desc, size);
-    }
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, maca)
 #endif
 #ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, musa)
@@ -135,10 +134,8 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
 #ifdef ENABLE_ASCEND_API
        CALCULATE(INFINI_DEVICE_ASCEND, ascend)
 #endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaRMSNorm((RMSNormMacaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
-    }
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, maca)
 #endif
 #ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, musa)
@@ -175,10 +172,8 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
 #ifdef ENABLE_ASCEND_API
        DESTROY(INFINI_DEVICE_ASCEND, ascend)
 #endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t)desc);
-    }
+#ifdef ENABLE_METAX_API
+        DESTROY(INFINI_DEVICE_METAX, maca)
 #endif
 #ifdef ENABLE_MOORE_API
        DESTROY(INFINI_DEVICE_MOORE, musa)

--- a/xmake/maca.lua
+++ b/xmake/maca.lua

 local MACA_ROOT = os.getenv("MACA_PATH") or os.getenv("MACA_HOME") or os.getenv("MACA_ROOT")
-
 add_includedirs(MACA_ROOT .. "/include")
 add_linkdirs(MACA_ROOT .. "/lib")
-add_links("libhcdnn.so")
-add_links("libhcblas.so")
-add_links("libhcruntime.so")
+add_links("hcdnn", "hcblas", "hcruntime")

 rule("maca")
    set_extensions(".maca")
@@ -34,13 +31,11 @@ rule_end()
 target("infiniop-metax")
    set_kind("static")
    on_install(function (target) end)
-    add_cxflags("-lstdc++ -Wall -fPIC")
    set_languages("cxx17")
-    set_warnings("all")
-
+    set_warnings("all", "error")
+    add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing")
    add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc")
    add_files("../src/infiniop/ops/*/maca/*.maca", {rule = "maca"})
-
 target_end()

 target("infinirt-metax")
@@ -48,7 +43,7 @@ target("infinirt-metax")
    set_languages("cxx17")
    on_install(function (target) end)
    add_deps("infini-utils")
-    -- Add files
-    add_files("$(projectdir)/src/infinirt/maca/*.cc")
-    add_cxflags("-lstdc++ -Wall -Werror -fPIC")
+    set_warnings("all", "error")
+    add_cxflags("-lstdc++", "-fPIC") -- one flag per argument, as in the infiniop-metax target
+    add_files("../src/infinirt/maca/*.cc")
 target_end()