Commit 8583cae7 authored by zhangyue's avatar zhangyue
Browse files

issue/25: kunlun foundation and matmul

parent 45175dbf
...@@ -11,6 +11,9 @@ ...@@ -11,6 +11,9 @@
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
#include "ascend/ascend_handle.h" #include "ascend/ascend_handle.h"
#endif #endif
#ifdef ENABLE_KUNLUN_API
#include "./kunlun/kunlun_handle.h"
#endif
__C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr,
infiniDevice_t device) { infiniDevice_t device) {
...@@ -37,6 +40,11 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, ...@@ -37,6 +40,11 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr,
case INFINI_DEVICE_ASCEND: { case INFINI_DEVICE_ASCEND: {
return createAscendHandle((infiniopAscendHandle_t *)handle_ptr); return createAscendHandle((infiniopAscendHandle_t *)handle_ptr);
} }
#endif
#ifdef ENABLE_KUNLUN_API
case INFINI_DEVICE_KUNLUN: {
return createKunlunHandle((infiniopKunlunHandle_t *)handle_ptr);
}
#endif #endif
} }
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -62,6 +70,11 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { ...@@ -62,6 +70,11 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
case INFINI_DEVICE_ASCEND: { case INFINI_DEVICE_ASCEND: {
return destroyAscendHandle((infiniopAscendHandle_t)handle); return destroyAscendHandle((infiniopAscendHandle_t)handle);
} }
#endif
#ifdef ENABLE_KUNLUN_API
case INFINI_DEVICE_KUNLUN: {
return destroyKunlunHandle((infiniopKunlunHandle_t)handle);
}
#endif #endif
} }
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
#ifndef __INFINIOP_COMMON_KUNLUN_H__
#define __INFINIOP_COMMON_KUNLUN_H__
#include "../pool.h"
#include "infinicore.h"
#include "kunlun_handle.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
#include <memory>

// Shorthand for Baidu's XPU DNN operator library namespace.
namespace xdnn = baidu::xpu::api;
// An xdnn execution context; use_xdnn() binds one to a stream per call.
typedef xdnn::Context *xdnnHandle_t;

// Evaluates `call`; on any result other than XPU_SUCCESS, prints the failing
// file/line plus the xpu_strerror() text to stderr and returns
// INFINIOP_STATUS_INTERNAL_ERROR from the *enclosing* function.
// Only usable inside a function (or lambda) whose return type accepts
// INFINIOP_STATUS_INTERNAL_ERROR.
#define CHECK_KUNLUN(call) \
    { \
        auto err = call; \
        if (XPU_SUCCESS != err) { \
            fprintf(stderr, "KUNLUN error in %s:%i : %s.\n", __FILE__, \
                    __LINE__, xpu_strerror(err)); \
            return INFINIOP_STATUS_INTERNAL_ERROR; \
        } \
    }

// Per-device Kunlun handle, shared by every operator descriptor created
// through it (see createKunlunHandle / kunlunCreateMatmulDescriptor).
struct InfiniopKunlunHandle {
    infiniDevice_t device; // always INFINI_DEVICE_KUNLUN
    int device_id;         // XPU device index, from xpu_current_device()
    // Pool of reusable xdnn contexts; also referenced by operator descriptors.
    std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handle_pool;
};
/// Borrow an xdnn context from the pool, bind it to `stream`, run `f` with
/// it, and return the context to the pool afterwards.
///
/// @param xdnn_handle_pool pool of cached xdnn contexts (shared with the handle)
/// @param stream           XPU stream the context is bound to for this call
/// @param f                callable invoked as f(xdnnHandle_t); its status is returned
/// @return whatever status `f` produced
template <typename T>
infiniopStatus_t use_xdnn(std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handle_pool,
                          XPUStream stream,
                          T const &f) {
    auto handle = xdnn_handle_pool->pop();
    if (!handle) {
        // Pool was empty: create a fresh context. Assign to the pooled slot
        // itself rather than writing through it -- the original
        // `*handle = xdnn::create_context();` dereferenced a disengaged
        // optional, which is undefined behavior.
        handle = xdnn::create_context();
    }
    (*handle)->set_stream(stream);
    auto ret = f(*handle);
    xdnn_handle_pool->push(std::move(*handle));
    return ret;
}
#endif
#include "common_kunlun.h"
/// Create a Kunlun device handle bound to the currently selected XPU device.
/// Seeds the xdnn context pool with one pre-created context so the first
/// use_xdnn() call does not pay context-creation cost.
///
/// @param handle_ptr receives the newly allocated handle on success
/// @return INFINIOP_STATUS_SUCCESS, or INFINIOP_STATUS_INTERNAL_ERROR if the
///         current device cannot be queried
infiniopStatus_t createKunlunHandle(infiniopKunlunHandle_t *handle_ptr) {
    int device_id;
    CHECK_KUNLUN(xpu_current_device(&device_id))
    auto pool = std::make_shared<Pool<xdnnHandle_t>>();
    // xdnnHandle_t is a raw pointer, so the original's named local plus
    // std::move was a no-op copy; push the freshly created context directly.
    pool->push(xdnn::create_context());
    *handle_ptr = new InfiniopKunlunHandle{
        INFINI_DEVICE_KUNLUN,
        device_id,
        std::move(pool),
    };
    return INFINIOP_STATUS_SUCCESS;
}
/// Destroy a Kunlun device handle.
/// Deleting the handle runs the shared_ptr member's destructor, which drops
/// this handle's reference to the xdnn context pool -- the explicit
/// `handle->xdnn_handle_pool = nullptr;` before `delete` was redundant and
/// has been removed.
/// NOTE(review): the pooled xdnn::Context* objects are raw pointers; if
/// Pool's destructor does not call xdnn::destroy_context on them they leak
/// when the last pool reference dies -- confirm Pool's ownership semantics.
infiniopStatus_t destroyKunlunHandle(infiniopKunlunHandle_t handle) {
    delete handle;
    return INFINIOP_STATUS_SUCCESS;
}
#ifndef __INFINIOP_KUNLUN_HANDLE_H__
#define __INFINIOP_KUNLUN_HANDLE_H__
#include "infiniop/handle.h"

// Opaque Kunlun device handle; the definition lives in common_kunlun.h.
struct InfiniopKunlunHandle;
typedef struct InfiniopKunlunHandle *infiniopKunlunHandle_t;

// Allocates a handle bound to the current XPU device and stores it in
// *handle_ptr. Returns INFINIOP_STATUS_SUCCESS on success.
infiniopStatus_t createKunlunHandle(infiniopKunlunHandle_t *handle_ptr);
// Frees a handle previously returned by createKunlunHandle.
infiniopStatus_t destroyKunlunHandle(infiniopKunlunHandle_t handle);
#endif
#include "matmul_xdnn.h"
// Shared batched-matmul implementation for the Kunlun backend, templated on
// the element type T (float16 or float; see kunlunMatmul). Per batch i it
// issues one xdnn::fc_fusion call computing, per fc_fusion semantics,
// roughly C_i = alpha * op(A_i) * op(B_i) + beta * C_i.
// Returns INFINIOP_STATUS_INTERNAL_ERROR if any xdnn call fails (via
// CHECK_KUNLUN inside the lambda, whose status use_xdnn propagates),
// INFINIOP_STATUS_SUCCESS otherwise.
template <typename T>
infiniopStatus_t matmulKunlunCommon(infiniopMatmulKunlunDescriptor_t desc,
                                    void *c,
                                    float beta,
                                    void const *a,
                                    void const *b,
                                    float alpha,
                                    void *stream) {
    auto info = desc->info;
    if (info.is_transed) {
        // NOTE(review): when MatmulInfo flagged the problem as transposed the
        // raw operand pointers are swapped here; this assumes MatmulInfo has
        // already arranged a_matrix/b_matrix metadata (strides, ld, m/n/k) to
        // match the swap -- confirm against the MatmulInfo implementation.
        std::swap(a, b);
    }
    // A matrix whose column stride is not 1 is not row-contiguous, so it is
    // presented to fc_fusion as transposed.
    auto transA = info.a_matrix.col_stride == 1 ? false : true;
    auto transB = info.b_matrix.col_stride == 1 ? false : true;
    auto ret = use_xdnn(desc->xdnn_handle_pool,
                        (XPUStream)stream,
                        [&](xdnnHandle_t handle) {
                            // Batch strides are in elements, so each is scaled
                            // by the element size before char* arithmetic.
                            // NOTE(review): the nullptr arguments presumably
                            // correspond to max-value pointers and bias in the
                            // fc_fusion signature -- verify against the xdnn
                            // headers for the toolchain in use.
                            for (size_t i = 0; i < info.batch; i++) {
                                CHECK_KUNLUN((
                                    xdnn::fc_fusion<T, T, T, int16_t>(
                                        handle,
                                        (T *)((char *)a + i * info.a_matrix.stride * infiniSizeof(desc->dtype)),
                                        (T *)((char *)b + i * info.b_matrix.stride * infiniSizeof(desc->dtype)),
                                        (T *)((char *)c + i * info.c_matrix.stride * infiniSizeof(desc->dtype)),
                                        info.m,
                                        info.n,
                                        info.k,
                                        transA,
                                        transB,
                                        nullptr,
                                        nullptr,
                                        nullptr,
                                        info.a_matrix.ld(),
                                        info.b_matrix.ld(),
                                        info.c_matrix.ld(),
                                        alpha,
                                        beta,
                                        nullptr,
                                        xdnn::Activation_t::LINEAR,
                                        nullptr)));
                            }
                            return INFINIOP_STATUS_SUCCESS;
                        });
    return ret;
}
/// Build a matmul descriptor for the Kunlun backend.
/// Only F16 and F32 output dtypes are accepted; shape/stride validation is
/// delegated to MatmulInfo. On success *desc_ptr owns a new descriptor that
/// shares the handle's xdnn context pool.
infiniopStatus_t kunlunCreateMatmulDescriptor(infiniopKunlunHandle_t handle,
                                              infiniopMatmulKunlunDescriptor_t *desc_ptr,
                                              infiniopTensorDescriptor_t c_desc,
                                              infiniopTensorDescriptor_t a_desc,
                                              infiniopTensorDescriptor_t b_desc) {
    const infiniDtype_t dtype = c_desc->dtype;
    const bool dtype_supported = (dtype == INFINI_DTYPE_F16) || (dtype == INFINI_DTYPE_F32);
    if (!dtype_supported) {
        return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
    }
    // MatmulInfo reports validation failures through `info_status`.
    infiniopStatus_t info_status;
    auto matmul_info = MatmulInfo(c_desc, a_desc, b_desc, &info_status, false);
    if (info_status != INFINIOP_STATUS_SUCCESS) {
        return info_status;
    }
    *desc_ptr = new InfiniopMatmulKunlunDescriptor{
        INFINI_DEVICE_KUNLUN,
        dtype,
        handle->device_id,
        matmul_info,
        handle->xdnn_handle_pool};
    return INFINIOP_STATUS_SUCCESS;
}
/// Report the scratch-workspace requirement for a Kunlun matmul.
/// The fc_fusion path used by this backend needs no extra workspace, so the
/// answer is always 0. `desc` is accepted for API symmetry with the other
/// backends and is intentionally unused.
infiniopStatus_t kunlunGetMatmulWorkspaceSize(infiniopMatmulKunlunDescriptor_t desc,
                                              size_t *size) {
    *size = 0;
    return INFINIOP_STATUS_SUCCESS;
}
/// Dispatch a Kunlun matmul to the dtype-specific implementation.
/// `workspace`/`workspace_size` are accepted for API uniformity; this
/// backend requires no workspace (kunlunGetMatmulWorkspaceSize reports 0).
infiniopStatus_t kunlunMatmul(infiniopMatmulKunlunDescriptor_t desc,
                              void *workspace,
                              size_t workspace_size,
                              void *c,
                              void const *a,
                              void const *b,
                              float alpha,
                              float beta,
                              void *stream) {
    switch (desc->dtype) {
    case INFINI_DTYPE_F16:
        return matmulKunlunCommon<float16>(desc, c, beta, a, b, alpha, stream);
    case INFINI_DTYPE_F32:
        return matmulKunlunCommon<float>(desc, c, beta, a, b, alpha, stream);
    default:
        return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
    }
}
/// Destroy a Kunlun matmul descriptor.
/// `delete` already runs the shared_ptr member's destructor, dropping the
/// descriptor's reference to the xdnn context pool, so the explicit
/// `desc->xdnn_handle_pool = nullptr;` before deletion was redundant and has
/// been removed.
infiniopStatus_t kunlunDestroyMatmulDescriptor(infiniopMatmulKunlunDescriptor_t desc) {
    delete desc;
    return INFINIOP_STATUS_SUCCESS;
}
#ifndef __MATMUL_XDNN_H__
#define __MATMUL_XDNN_H__
#include "../../../devices/kunlun/common_kunlun.h"
#include "../../utils.h"
#include "../blas.h"
#include "matmul_xdnn_api.h"

// Internal descriptor for a Kunlun matmul. Allocated by
// kunlunCreateMatmulDescriptor and freed by kunlunDestroyMatmulDescriptor.
struct InfiniopMatmulKunlunDescriptor {
    infiniDevice_t device; // always INFINI_DEVICE_KUNLUN
    infiniDtype_t dtype;   // element dtype of C (F16 or F32)
    int device_id;         // XPU device the owning handle was created on
    MatmulInfo info;       // validated shape/stride metadata for C = A * B
    // xdnn context pool shared with the owning InfiniopKunlunHandle.
    std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handle_pool;
};
#endif
#ifndef __MATMUL_XDNN_API_H__
#define __MATMUL_XDNN_API_H__
#include "../../../devices/kunlun/kunlun_handle.h"
#include "infiniop/operator.h"

// Opaque matmul descriptor; the definition lives in matmul_xdnn.h.
struct InfiniopMatmulKunlunDescriptor;
typedef struct InfiniopMatmulKunlunDescriptor *infiniopMatmulKunlunDescriptor_t;

// Validates dtypes/shapes and allocates a descriptor for C = A * B into
// *desc_ptr. The descriptor shares the handle's xdnn context pool.
infiniopStatus_t kunlunCreateMatmulDescriptor(infiniopKunlunHandle_t handle,
                                              infiniopMatmulKunlunDescriptor_t *desc_ptr,
                                              infiniopTensorDescriptor_t c_desc,
                                              infiniopTensorDescriptor_t a_desc,
                                              infiniopTensorDescriptor_t b_desc);
// Writes the required workspace byte count (always 0 for this backend).
infiniopStatus_t kunlunGetMatmulWorkspaceSize(infiniopMatmulKunlunDescriptor_t desc,
                                              size_t *size);
// Runs the matmul described by `desc` on the given stream.
infiniopStatus_t kunlunMatmul(infiniopMatmulKunlunDescriptor_t desc,
                              void *workspace,
                              size_t workspace_size,
                              void *c,
                              void const *a,
                              void const *b,
                              float alpha,
                              float beta,
                              void *stream);
// Frees a descriptor created by kunlunCreateMatmulDescriptor.
infiniopStatus_t kunlunDestroyMatmulDescriptor(infiniopMatmulKunlunDescriptor_t desc);
#endif
...@@ -12,6 +12,9 @@ ...@@ -12,6 +12,9 @@
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
#include "ascend/matmul_ascend.h" #include "ascend/matmul_ascend.h"
#endif #endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/matmul_xdnn_api.h"
#endif
__C infiniStatus_t infiniopCreateMatmulDescriptor( __C infiniStatus_t infiniopCreateMatmulDescriptor(
infiniopHandle_t handle, infiniopHandle_t handle,
......
...@@ -166,6 +166,11 @@ def get_args(): ...@@ -166,6 +166,11 @@ def get_args():
action="store_true", action="store_true",
help="Run ASCEND NPU test", help="Run ASCEND NPU test",
) )
parser.add_argument(
"--kunlun",
action="store_true",
help="Run KUNLUN XPU test",
)
return parser.parse_args() return parser.parse_args()
...@@ -428,6 +433,9 @@ def get_test_devices(args): ...@@ -428,6 +433,9 @@ def get_test_devices(args):
torch.npu.set_device(0) # Ascend NPU needs explicit device initialization torch.npu.set_device(0) # Ascend NPU needs explicit device initialization
devices_to_test.append(InfiniDeviceEnum.ASCEND) devices_to_test.append(InfiniDeviceEnum.ASCEND)
if args.kunlun:
import torch_xmlir
devices_to_test.append(InfiniDeviceEnum.KUNLUN)
if not devices_to_test: if not devices_to_test:
devices_to_test = [InfiniDeviceEnum.CPU] devices_to_test = [InfiniDeviceEnum.CPU]
......
...@@ -100,6 +100,17 @@ if has_config("sugon-dcu") then ...@@ -100,6 +100,17 @@ if has_config("sugon-dcu") then
add_defines("ENABLE_SUGON_CUDA_API") add_defines("ENABLE_SUGON_CUDA_API")
end end
-- Kunlun chip (昆仑芯): optional XPU backend, off by default.
option("kunlun-xpu")
    set_default(false)
    set_showmenu(true)
    set_description("Enable or disable Kunlun XPU kernel")
option_end()

-- When enabled, expose ENABLE_KUNLUN_API to the sources and pull in the
-- Kunlun toolchain/target definitions.
if has_config("kunlun-xpu") then
    add_defines("ENABLE_KUNLUN_API")
    includes("xmake/kunlun.lua")
end
target("infiniop") target("infiniop")
set_kind("shared") set_kind("shared")
...@@ -134,6 +145,9 @@ target("infiniop") ...@@ -134,6 +145,9 @@ target("infiniop")
if has_config("metax-gpu") then if has_config("metax-gpu") then
add_deps("metax-gpu") add_deps("metax-gpu")
end end
if has_config("kunlun-xpu") then
add_deps("infiniop-kunlun")
end
set_languages("cxx17") set_languages("cxx17")
add_files("src/infiniop/devices/handle.cc") add_files("src/infiniop/devices/handle.cc")
add_files("src/infiniop/ops/*/operator.cc") add_files("src/infiniop/ops/*/operator.cc")
......
-- Build configuration for the Kunlun XPU backend; included from the top-level
-- xmake.lua only when the kunlun-xpu option is enabled.
add_defines("ENABLE_KUNLUN_API")
-- NOTE(review): if KUNLUN_HOME is unset, path.join(nil, ...) below will
-- error; consider asserting the env var is present with a clear message.
local KUNLUN_HOME = os.getenv("KUNLUN_HOME")
-- Add include dirs
add_includedirs(path.join(KUNLUN_HOME, "include"), {public=true})
add_linkdirs(path.join(KUNLUN_HOME, "lib64"))
-- Kunlun runtime and xdnn operator libraries.
add_links("xpurt")
add_links("xpuapi")

-- Static library holding all Kunlun device/operator sources.
target("infiniop-kunlun")
    -- Other configs
    set_kind("static")
    set_languages("cxx17")
    on_install(function (target) end)
    -- Add files
    add_files("$(projectdir)/src/infiniop/devices/kunlun/*.cc", "$(projectdir)/src/infiniop/ops/*/kunlun/*.cc")
    -- NOTE(review): -lstdc++ is a linker flag and has no effect among
    -- compile flags; move it to add_ldflags if it is actually needed.
    add_cxflags("-lstdc++ -Wall -Werror -fPIC")
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment