Commit fd58c99c authored by qinyiqun's avatar qinyiqun
Browse files

issue/41 添加沐曦matmul 算子,fix format

parent d3409518
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#ifdef ENABLE_KUNLUN_API #ifdef ENABLE_KUNLUN_API
#include "kunlun/kunlun_handle.h" #include "kunlun/kunlun_handle.h"
#endif #endif
#ifdef ENABLE_MACA_API #ifdef ENABLE_METAX_API
#include "maca/maca_handle.h" #include "maca/maca_handle.h"
#endif #endif
...@@ -56,7 +56,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) { ...@@ -56,7 +56,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
return createKunlunHandle((infiniopKunlunHandle_t *)handle_ptr); return createKunlunHandle((infiniopKunlunHandle_t *)handle_ptr);
} }
#endif #endif
#ifdef ENABLE_MACA_API #ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca); CREATE(INFINI_DEVICE_METAX, maca);
#endif #endif
...@@ -96,7 +96,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { ...@@ -96,7 +96,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
return destroyKunlunHandle((infiniopKunlunHandle_t)handle); return destroyKunlunHandle((infiniopKunlunHandle_t)handle);
} }
#endif #endif
#ifdef ENABLE_MACA_API #ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca); DELETE(INFINI_DEVICE_METAX, maca);
#endif #endif
default: default:
......
...@@ -18,8 +18,8 @@ class Handle::Internal { ...@@ -18,8 +18,8 @@ class Handle::Internal {
using Fn = std::function<infiniStatus_t(T)>; using Fn = std::function<infiniStatus_t(T)>;
public: public:
infiniStatus_t use_mcblas(hcStream_t stream, const Fn<hcblasHandle_t> &f) const; infiniStatus_t useMcblas(hcStream_t stream, const Fn<hcblasHandle_t> &f) const;
infiniStatus_t use_mcdnn(hcStream_t stream, const Fn<hcdnnHandle_t> &f) const; infiniStatus_t useMcdnn(hcStream_t stream, const Fn<hcdnnHandle_t> &f) const;
}; };
hcdnnDataType_t getHcdnnDtype(infiniDtype_t dt); hcdnnDataType_t getHcdnnDtype(infiniDtype_t dt);
......
...@@ -9,7 +9,7 @@ auto Handle::internal() const -> const std::shared_ptr<Internal> & { ...@@ -9,7 +9,7 @@ auto Handle::internal() const -> const std::shared_ptr<Internal> & {
return _internal; return _internal;
} }
infiniStatus_t Handle::Internal::use_mcblas(hcStream_t stream, const Fn<hcblasHandle_t> &f) const { infiniStatus_t Handle::Internal::useMcblas(hcStream_t stream, const Fn<hcblasHandle_t> &f) const {
auto handle = mcblas_handles.pop(); auto handle = mcblas_handles.pop();
if (!handle) { if (!handle) {
CHECK_MCBLAS(hcblasCreate(&(*handle))); CHECK_MCBLAS(hcblasCreate(&(*handle)));
...@@ -20,7 +20,7 @@ infiniStatus_t Handle::Internal::use_mcblas(hcStream_t stream, const Fn<hcblasHa ...@@ -20,7 +20,7 @@ infiniStatus_t Handle::Internal::use_mcblas(hcStream_t stream, const Fn<hcblasHa
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
infiniStatus_t Handle::Internal::use_mcdnn(hcStream_t stream, const Fn<hcdnnHandle_t> &f) const { infiniStatus_t Handle::Internal::useMcdnn(hcStream_t stream, const Fn<hcdnnHandle_t> &f) const {
auto handle = mcdnn_handles.pop(); auto handle = mcdnn_handles.pop();
if (!handle) { if (!handle) {
CHECK_MCDNN(hcdnnCreate(&(*handle))); CHECK_MCDNN(hcdnnCreate(&(*handle)));
......
#include "matmul_maca.h"
#include "../../../devices/maca/common_maca.h"
#include "../../../devices/maca/maca_handle.h"
namespace op::matmul::maca {

struct Descriptor::Opaque {
    // Shared per-device MACA context; supplies pooled hcBLAS handles via useMcblas().
    std::shared_ptr<device::maca::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

/// Create a matmul descriptor for the Metax (MACA) backend.
///
/// @param handle_   Opaque infiniop handle; must wrap a device::maca::Handle.
/// @param desc_ptr  Out: receives the newly allocated Descriptor on success.
/// @param c_desc    Output tensor descriptor (its dtype selects the GEMM path).
/// @param a_desc    Left operand descriptor.
/// @param b_desc    Right operand descriptor.
/// @return INFINI_STATUS_SUCCESS, INFINI_STATUS_BAD_TENSOR_DTYPE for dtypes
///         other than F16/F32, or whatever status MatmulInfo reports.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {
    auto handle = reinterpret_cast<device::maca::Handle *>(handle_);
    auto dtype = c_desc->dtype();

    // Only F16 and F32 are wired to hcblasGemmStridedBatchedEx below.
    if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    infiniStatus_t status;
    // COL_MAJOR: hcBLAS (like cuBLAS) expects column-major matrices.
    auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::COL_MAJOR);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    *desc_ptr = new Descriptor(
        dtype, info, 0, // workspace size 0: this backend needs no scratch memory
        new Opaque{handle->internal()},
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

/// Compute C = alpha * op(A) * op(B) + beta * C with a strided-batched GEMM.
///
/// @param workspace       Unused (descriptor advertises a workspace size of 0).
/// @param workspace_size  Unused.
/// @param c       Output buffer; existing contents are scaled by beta.
/// @param beta    Scale applied to the prior contents of c.
/// @param a       Left operand buffer.
/// @param b       Right operand buffer.
/// @param alpha   Scale applied to the product a * b.
/// @param stream  MACA stream (hcStream_t) the GEMM is enqueued on.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *c,
    float beta,
    const void *a,
    const void *b,
    float alpha,
    void *stream) const {
    hpccDataType a_type, b_type, c_type;
    hcblasComputeType_t compute_type;

    switch (_dtype) {
    case INFINI_DTYPE_F16:
        a_type = b_type = c_type = HPCC_R_16F;
        compute_type = HCBLAS_COMPUTE_32F; // F16 inputs, accumulate in F32
        break;
    case INFINI_DTYPE_F32:
        a_type = b_type = c_type = HPCC_R_32F;
        // FAST_TF32 trades a few mantissa bits for tensor-core throughput.
        compute_type = HCBLAS_COMPUTE_32F_FAST_TF32;
        // BUGFIX: a stray `#endif` (no matching `#ifdef`) previously sat here
        // and broke preprocessing of this file; it has been removed.
        break;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    // is_transed presumably means MatmulInfo swapped the operands while
    // building the column-major view — swap the raw pointers back to match.
    // TODO(review): confirm against MatmulInfo in ../matmul.h.
    if (_info.is_transed) {
        std::swap(a, b);
    }

    // Unit row stride => matrix is already in hcBLAS's native layout (no
    // transpose needed); otherwise ask the GEMM to transpose on the fly.
    auto op_a = _info.a_matrix.row_stride == 1 ? HCBLAS_OP_N : HCBLAS_OP_T;
    auto op_b = _info.b_matrix.row_stride == 1 ? HCBLAS_OP_N : HCBLAS_OP_T;

    CHECK_STATUS(_opaque->internal->useMcblas(
        (hcStream_t)stream,
        [&](hcblasHandle_t handle) {
            CHECK_MCBLAS(
                hcblasGemmStridedBatchedEx(
                    handle,
                    op_a,
                    op_b,
                    static_cast<int>(_info.m),
                    static_cast<int>(_info.n),
                    static_cast<int>(_info.k),
                    &alpha,
                    a,
                    a_type,
                    static_cast<int>(_info.a_matrix.ld()),
                    _info.a_matrix.stride,
                    b,
                    b_type,
                    static_cast<int>(_info.b_matrix.ld()),
                    _info.b_matrix.stride,
                    &beta,
                    c,
                    c_type,
                    static_cast<int>(_info.c_matrix.ld()),
                    _info.c_matrix.stride,
                    static_cast<int>(_info.batch),
                    compute_type,
                    HCBLAS_GEMM_DEFAULT_TENSOR_OP));
            return INFINI_STATUS_SUCCESS;
        }));
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::matmul::maca
// Metax (MACA) backend header for the matmul operator.
#ifndef __MATMUL_MACA_H__
#define __MATMUL_MACA_H__
#include "../matmul.h"
// DESCRIPTOR(maca) presumably expands to the op::matmul::maca::Descriptor
// class declaration (create/calculate/etc.) — see the macro in ../matmul.h.
DESCRIPTOR(maca)
#endif // __MATMUL_MACA_H__
...@@ -14,6 +14,9 @@ ...@@ -14,6 +14,9 @@
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
#include "ascend/matmul_ascend.h" #include "ascend/matmul_ascend.h"
#endif #endif
#ifdef ENABLE_METAX_API
#include "maca/matmul_maca.h"
#endif
#ifdef ENABLE_KUNLUN_API #ifdef ENABLE_KUNLUN_API
#include "kunlun/matmul_kunlun.h" #include "kunlun/matmul_kunlun.h"
#endif #endif
...@@ -48,6 +51,9 @@ __C infiniStatus_t infiniopCreateMatmulDescriptor( ...@@ -48,6 +51,9 @@ __C infiniStatus_t infiniopCreateMatmulDescriptor(
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend); CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif #endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_KUNLUN_API #ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun); CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif #endif
...@@ -83,6 +89,9 @@ infiniopGetMatmulWorkspaceSize( ...@@ -83,6 +89,9 @@ infiniopGetMatmulWorkspaceSize(
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend); GET(INFINI_DEVICE_ASCEND, ascend);
#endif #endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_KUNLUN_API #ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun); GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif #endif
...@@ -126,6 +135,9 @@ __C infiniStatus_t infiniopMatmul( ...@@ -126,6 +135,9 @@ __C infiniStatus_t infiniopMatmul(
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend); CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif #endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_KUNLUN_API #ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif #endif
...@@ -159,6 +171,9 @@ infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) { ...@@ -159,6 +171,9 @@ infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) {
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend); DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif #endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_KUNLUN_API #ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun); DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif #endif
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#define CHECK_MACART(RT_API) CHECK_INTERNAL(RT_API, hcSuccess) #define CHECK_MACART(RT_API) CHECK_INTERNAL(RT_API, hcSuccess)
namespace infinirt::cuda { namespace infinirt::maca {
infiniStatus_t getDeviceCount(int *count) { infiniStatus_t getDeviceCount(int *count) {
CHECK_MACART(hcGetDeviceCount(count)); CHECK_MACART(hcGetDeviceCount(count));
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
...@@ -90,7 +90,7 @@ infiniStatus_t freeHost(void *ptr) { ...@@ -90,7 +90,7 @@ infiniStatus_t freeHost(void *ptr) {
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
cudaMemcpyKind toMacaMemcpyKind(infinirtMemcpyKind_t kind) { hcMemcpyKind toMacaMemcpyKind(infinirtMemcpyKind_t kind) {
switch (kind) { switch (kind) {
case INFINIRT_MEMCPY_H2D: case INFINIRT_MEMCPY_H2D:
return hcMemcpyHostToDevice; return hcMemcpyHostToDevice;
...@@ -124,4 +124,4 @@ infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) { ...@@ -124,4 +124,4 @@ infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) {
CHECK_MACART(hcFreeAsync(ptr, (hcStream_t)stream)); CHECK_MACART(hcFreeAsync(ptr, (hcStream_t)stream));
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace infinirt::cuda } // namespace infinirt::maca
...@@ -2,12 +2,12 @@ ...@@ -2,12 +2,12 @@
#define __INFINIRT_MACA_H__ #define __INFINIRT_MACA_H__
#include "../infinirt_impl.h" #include "../infinirt_impl.h"
namespace infinirt::mca { namespace infinirt::maca {
#ifdef ENABLE_MACA_API #ifdef ENABLE_METAX_API
INFINIRT_DEVICE_API_IMPL INFINIRT_DEVICE_API_IMPL
#else #else
INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API_NOOP
#endif #endif
} // namespace infinirt::mca } // namespace infinirt::maca
#endif // __INFINIRT_MACA_H__ #endif // __INFINIRT_MACA_H__
...@@ -166,6 +166,11 @@ def get_args(): ...@@ -166,6 +166,11 @@ def get_args():
action="store_true", action="store_true",
help="Run ASCEND NPU test", help="Run ASCEND NPU test",
) )
parser.add_argument(
"--metax",
action="store_true",
help="Run METAX GPU test",
)
parser.add_argument( parser.add_argument(
"--kunlun", "--kunlun",
action="store_true", action="store_true",
...@@ -434,6 +439,10 @@ def get_test_devices(args): ...@@ -434,6 +439,10 @@ def get_test_devices(args):
torch.npu.set_device(0) # Ascend NPU needs explicit device initialization torch.npu.set_device(0) # Ascend NPU needs explicit device initialization
devices_to_test.append(InfiniDeviceEnum.ASCEND) devices_to_test.append(InfiniDeviceEnum.ASCEND)
if args.metax:
import torch
devices_to_test.append(InfiniDeviceEnum.METAX)
if args.kunlun: if args.kunlun:
import torch_xmlir import torch_xmlir
......
...@@ -74,7 +74,7 @@ option("metax-gpu") ...@@ -74,7 +74,7 @@ option("metax-gpu")
option_end() option_end()
if has_config("metax-gpu") then if has_config("metax-gpu") then
add_defines("ENABLE_MACA_API") add_defines("ENABLE_METAX_API")
includes("xmake/maca.lua") includes("xmake/maca.lua")
end end
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment