jerrrrry / infinicore · Commit b2b4526b
Authored May 16, 2025 by wooway777

issue/216 - Merge remote-tracking branch main into issue/216

Parents: 3c696932, 0c803397

Showing 17 changed files with 486 additions and 57 deletions (+486 -57)
src/infiniop/devices/ascend/CMakeLists.txt                        +8   -4
src/infiniop/devices/ascend/ascend_kernel_common.h                +11  -0
src/infiniop/devices/ascend/common_ascend.cc                      +11  -5
src/infiniop/ops/causal_softmax/ascend/causal_softmax_ascend.cc   +17  -17
src/infiniop/ops/causal_softmax/ascend/causal_softmax_ascend.h    +0   -0
src/infiniop/ops/causal_softmax/operator.cc                       +1   -1
src/infiniop/ops/rearrange/ascend/rearrange_ascend.cc             +94  -0
src/infiniop/ops/rearrange/ascend/rearrange_ascend.h              +8   -0
src/infiniop/ops/rearrange/operator.cc                            +12  -0
src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc                +10  -13
src/infiniop/ops/swiglu/ascend/swiglu_ascend.cc                   +53  -0
src/infiniop/ops/swiglu/ascend/swiglu_ascend.h                    +73  -0
src/infiniop/ops/swiglu/ascend/swiglu_ascend_kernel.cpp           +170 -0
src/infiniop/ops/swiglu/operator.cc                               +10  -12
test/infiniop/causal_softmax.py                                   +4   -1
test/infiniop/libinfiniop/utils.py                                +2   -1
xmake/ascend.lua                                                  +2   -3
src/infiniop/devices/ascend/CMakeLists.txt

@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.16.0)
 # project information
 project(Ascend_C)
 set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
-set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME} CACHE PATH "ASCEND CANN package installation directory")
+set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "ASCEND CANN package installation directory")
 set(RUN_MODE "npu" CACHE STRING "run mode: npu")
 set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
 set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)

@@ -19,10 +19,14 @@ else()
 endif()
 include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+include_directories(${CMAKE_SOURCE_DIR}/../../../../include/infiniop/)

 ascendc_library(ascend_kernels STATIC
-    ../../ops/swiglu/ascend/swiglu_kernel.cpp
-    ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
-    ../../ops/random_sample/ascend/random_sample_kernel.cpp
+    ../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
+    # ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
+    # ../../ops/random_sample/ascend/random_sample_kernel.cpp
 )
src/infiniop/devices/ascend/ascend_kernel_common.h (new file, 0 → 100644)

#ifndef __INFINIOP_ASCEND_KERNEL_COMMON_H__
#define __INFINIOP_ASCEND_KERNEL_COMMON_H__

#include "../../../../include/infinicore.h"
#include "kernel_operator.h"

constexpr int32_t BLOCK_NUM = 8;
constexpr int32_t BUFFER_NUM = 2;
constexpr int32_t BYTE_ALIGN = 32;

#endif
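Note: these three constants drive the new SwiGLU kernel later in this commit: BLOCK_NUM kernel blocks, double buffering, and 32-byte-aligned copies. As a quick illustration (a hypothetical standalone helper, not part of the commit), this mirrors the rounding the kernel applies to its per-block copy length:

#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the _copy_len computation in
// swiglu_ascend_kernel.cpp: round a tile of `len` elements of `elem_size`
// bytes up to the next BYTE_ALIGN (32-byte) boundary.
constexpr int64_t align_elements(int64_t len, int64_t elem_size, int64_t byte_align = 32) {
    int64_t bytes = len * elem_size;
    return bytes % byte_align == 0 ? len : (bytes + (byte_align - bytes % byte_align)) / elem_size;
}

int main() {
    // 13 half-precision elements occupy 26 bytes; padding to 32 bytes gives 16 elements.
    std::printf("%lld\n", (long long)align_elements(13, 2)); // prints 16
    return 0;
}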
src/infiniop/devices/ascend/common_ascend.cc

 #include "common_ascend.h"

 std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
-    auto index = std::max_element(strides.begin(), strides.end());
-    uint64_t max_stride_index = std::distance(strides.begin(), index);
-    auto storageShape = std::vector<int64_t>({shape[max_stride_index] * strides[max_stride_index]});
+    if (shape.size() != strides.size()) {
+        throw std::invalid_argument("Shape and strides must have the same length.");
+    }
+    int64_t max_offset = 0;
+    for (size_t i = 0; i < shape.size(); ++i) {
+        max_offset += (shape[i] - 1) * strides[i];
+    }
-    return storageShape;
+    // storage shape is 1D buffer that must cover all accessed elements
+    return {max_offset + 1};
 }

 size_t aclnnTensorDescriptor::numel() const {

@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
     this->strides = std::vector<int64_t>(ndim);
     for (uint64_t i = 0; i < ndim; ++i) {
         this->shape[i] = static_cast<int64_t>(desc->dim(i));
-        this->strides[i] = desc->stride(i);
+        this->strides[i] = static_cast<int64_t>(desc->stride(i));
     }
     this->storageShape = inferStorageShape(this->shape, this->strides);
     this->dataType = toAclDataType(desc->dtype());
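Note: the rewritten inferStorageShape sizes the flat backing buffer from the largest reachable offset instead of from the single largest stride. A minimal standalone check of that rule (hypothetical code, not part of the commit):

#include <cstdint>
#include <cstdio>
#include <vector>

// Same rule as the rewritten inferStorageShape: the flat backing buffer must
// hold the element at offset sum((shape[i] - 1) * strides[i]), so it needs
// that offset + 1 elements.
int64_t storage_elems(const std::vector<int64_t> &shape, const std::vector<int64_t> &strides) {
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    return max_offset + 1;
}

int main() {
    // shape (2, 100) with strides (100, 2): the farthest element sits at
    // offset 1*100 + 99*2 = 298, so 299 elements are required, while the old
    // max-stride heuristic (shape[0] * strides[0]) would report only 200.
    std::printf("%lld\n", (long long)storage_elems({2, 100}, {100, 2})); // prints 299
    return 0;
}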
src/infiniop/ops/causal_softmax/ascend/causal_softmax_aclnn.cc → src/infiniop/ops/causal_softmax/ascend/causal_softmax_ascend.cc

-#include "causal_softmax_aclnn.h"
+#include "causal_softmax_ascend.h"
 #include "../../../devices/ascend/common_ascend.h"

 #include <aclnnop/aclnn_masked_fill_tensor.h>
 #include <aclnnop/aclnn_softmax.h>

@@ -12,6 +12,8 @@ struct Descriptor::Opaque {
     aclnnTensorDescriptor_t value;
     void *mask_addr;
     void *value_addr;
+    uint64_t workspacesize;
+    aclOpExecutor *executor;

     ~Opaque() {
         delete x;

@@ -21,6 +23,9 @@ struct Descriptor::Opaque {
         aclrtFree(mask_addr);
         aclrtFree(value_addr);
+
+        // Delete useless executor
+        aclDestroyAclOpExecutor(executor);
     }
 };

@@ -92,18 +97,18 @@ infiniStatus_t Descriptor::create(
     aclTensor *tvalue = value->tensor;

     CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
     int64_t dim = 2;
     CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
+    // set executor reusable
+    aclSetAclOpExecutorRepeatable(executor);

-    // Create the descriptor
-    size_t all_workspacesize = workspacesize_softmax + workspacesize_mask;
-    *desc_ptr = new Descriptor(
-        new Opaque{x, mask, y, value, mask_addr, value_addr},
-        std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
+    // Create the descripto
+    size_t all_workspacesize = std::max(workspacesize_softmax, workspacesize_mask);
     // Delete useless executor
-    aclDestroyAclOpExecutor(executor);
     aclDestroyAclOpExecutor(mask_executor);
+    *desc_ptr = new Descriptor(
+        new Opaque{x, mask, y, value, mask_addr, value_addr, workspacesize_softmax, executor},
+        std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);

     return INFINI_STATUS_SUCCESS;
 }

@@ -116,23 +121,18 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
     auto ty = _opaque->y->tensor;
     auto tmask = _opaque->mask->tensor;
     auto tvalue = _opaque->value->tensor;

-    aclOpExecutor *executor = nullptr;
     aclOpExecutor *mask_executor = nullptr;
-    size_t workspacesize_softmax = 0;
     size_t workspacesize_mask = 0;
-    int64_t dim = 2;

     AclSetTensorAddr(mask_executor, 0, tx, (void *)x);
     AclSetTensorAddr(mask_executor, 1, tmask, _opaque->mask_addr);
     AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
     CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
     CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
     CHECK_ACL(aclrtSynchronizeStream(stream));

-    AclSetTensorAddr(executor, 0, tx, (void *)x);
-    AclSetTensorAddr(executor, 1, ty, y);
-    CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
-    CHECK_ACL(aclnnSoftmax(workspace, workspacesize_softmax, executor, stream));
+    AclSetTensorAddr(_opaque->executor, 0, tx, (void *)x);
+    AclSetTensorAddr(_opaque->executor, 1, ty, y);
+    CHECK_ACL(aclnnSoftmax(workspace, _opaque->workspacesize, _opaque->executor, stream));

     return INFINI_STATUS_SUCCESS;
 }
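Note: the net effect of these causal-softmax changes is the usual CANN executor-reuse pattern: build the softmax executor once in create(), mark it repeatable, and only rebind device addresses before replaying it in calculate(). A condensed sketch using the same calls as the diff (AclSetTensorAddr is the project helper from common_ascend.h; error handling and the masked-fill step are elided, so this is an illustration rather than a drop-in implementation):

#include <aclnnop/aclnn_softmax.h>
#include "../../../devices/ascend/common_ascend.h"

struct SoftmaxPlan {
    aclOpExecutor *executor = nullptr; // built once, reused across calls
    uint64_t workspacesize = 0;
};

// create(): query the workspace once and keep the executor alive.
static void plan_softmax(aclTensor *tx, aclTensor *ty, int64_t dim, SoftmaxPlan &plan) {
    aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &plan.workspacesize, &plan.executor);
    aclSetAclOpExecutorRepeatable(plan.executor); // keep it valid after the first launch
}

// calculate(): rebind the device buffers, then replay the stored executor.
static void run_softmax(const SoftmaxPlan &plan, aclTensor *tx, aclTensor *ty,
                        void *x, void *y, void *workspace, void *stream) {
    AclSetTensorAddr(plan.executor, 0, tx, x); // rebind input buffer
    AclSetTensorAddr(plan.executor, 1, ty, y); // rebind output buffer
    aclnnSoftmax(workspace, plan.workspacesize, plan.executor, stream);
}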
src/infiniop/ops/causal_softmax/ascend/causal_softmax_aclnn.h → src/infiniop/ops/causal_softmax/ascend/causal_softmax_ascend.h

File moved
src/infiniop/ops/causal_softmax/operator.cc

@@ -9,7 +9,7 @@
 #include "cuda/causal_softmax_cuda.cuh"
 #endif
 #ifdef ENABLE_ASCEND_API
-#include "ascend/causal_softmax_aclnn.h"
+#include "ascend/causal_softmax_ascend.h"
 #endif

 __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
src/infiniop/ops/rearrange/ascend/rearrange_ascend.cc (new file, 0 → 100644)

#include "rearrange_ascend.h"
#include "../../../devices/ascend/common_ascend.h"

#include <aclnnop/aclnn_copy.h>

namespace op::rearrange::ascend {

struct Descriptor::Opaque {
    aclnnTensorDescriptor_t dst;
    aclnnTensorDescriptor_t src;
    void *workspace; // aclnnInplaceCopy workspace
    uint64_t workspace_size;

    ~Opaque() {
        delete dst;
        delete src;
        aclrtFree(workspace);
    }
};

Descriptor::~Descriptor() {
    delete _opaque;
};

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {
    auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
    auto dtype = y_desc->dtype();
    auto ndim = y_desc->ndim();
    auto shape = y_desc->shape();

    CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
    for (size_t i = 0; i < ndim; ++i) {
        CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
    }

    auto dst_strides = y_desc->strides();
    auto src_strides = x_desc->strides();
    auto element_size = infiniSizeOf(dtype);

    auto result = utils::RearrangeMeta::create(shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
    CHECK_RESULT(result);

    aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
    aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);

    uint64_t workspace_size = 0;
    aclOpExecutor *executor = nullptr;
    void *workspace = nullptr;
    aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor, &workspace_size, &executor);
    if (workspace_size != 0) {
        CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
    }

    *desc_ptr = new Descriptor(
        result.take(),
        new Opaque{dst, src, workspace, workspace_size},
        handle->device, handle->device_id);

    // Delete useless executor
    aclDestroyAclOpExecutor(executor);

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *y,
    const void *x,
    void *stream) const {
    auto tdst = _opaque->dst->tensor;
    auto tsrc = _opaque->src->tensor;

    uint64_t workspace_size = 0;
    aclOpExecutor *executor = nullptr;
    AclSetTensorAddr(executor, 0, tdst, y);
    AclSetTensorAddr(executor, 1, tsrc, (void *)x);
    CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));

    // Execute InplaceCopy
    CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size, executor, stream));

    return INFINI_STATUS_SUCCESS;
}
} // namespace op::rearrange::ascend
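Note: the rearrange operator is a pure layout copy: every element of a strided source view is written into a (possibly differently) strided destination view, which is what the aclnnInplaceCopy call provides on device. A hypothetical plain-C++ illustration of those semantics (example shapes and strides chosen here, not taken from the commit):

#include <cstdio>
#include <vector>

int main() {
    std::vector<float> src = {0, 1, 2, 3, 4, 5}; // shape (2, 3), strides (3, 1): row-major
    std::vector<float> dst(6, 0.0f);             // shape (2, 3), strides (1, 2): column-major
    const long shape[2] = {2, 3};
    const long src_strides[2] = {3, 1};
    const long dst_strides[2] = {1, 2};
    // Copy element (i, j) of the source layout into element (i, j) of the destination layout.
    for (long i = 0; i < shape[0]; ++i) {
        for (long j = 0; j < shape[1]; ++j) {
            dst[i * dst_strides[0] + j * dst_strides[1]] = src[i * src_strides[0] + j * src_strides[1]];
        }
    }
    for (float v : dst) {
        std::printf("%g ", v); // prints: 0 3 1 4 2 5
    }
    return 0;
}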
src/infiniop/ops/rearrange/ascend/rearrange_ascend.h (new file, 0 → 100644)

#ifndef __REARRANGE_ASCEND_H__
#define __REARRANGE_ASCEND_H__

#include "../rearrange.h"

DESCRIPTOR(ascend)

#endif // __REARRANGE_ASCEND_H__
src/infiniop/ops/rearrange/operator.cc

@@ -5,6 +5,9 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/rearrange_cpu.h"
 #endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/rearrange_ascend.h"
+#endif
 #ifdef ENABLE_CUDA_API
 #include "cuda/rearrange_cuda.cuh"

@@ -29,6 +32,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
 #ifdef ENABLE_CPU_API
     CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    CREATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 #ifdef ENABLE_CUDA_API
     CREATE(INFINI_DEVICE_NVIDIA, cuda);

@@ -57,6 +63,9 @@ __C infiniStatus_t infiniopRearrange(
 #ifdef ENABLE_CPU_API
     CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    CALCULATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 #ifdef ENABLE_CUDA_API
     CALCULATE(INFINI_DEVICE_NVIDIA, cuda);

@@ -82,6 +91,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
 #ifdef ENABLE_CPU_API
     DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    DELETE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 #ifdef ENABLE_CUDA_API
     DELETE(INFINI_DEVICE_NVIDIA, cuda);
src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc

@@ -10,12 +10,15 @@ struct Descriptor::Opaque {
     aclnnTensorDescriptor_t w;
     aclnnTensorDescriptor_t rstd;
     size_t workspaceSize;
+    aclOpExecutor *executor;

     ~Opaque() {
         delete y;
         delete x;
         delete w;
         delete rstd;
+
+        aclDestroyAclOpExecutor(executor);
     }
 };

@@ -62,17 +65,16 @@ infiniStatus_t Descriptor::create(
     // Get WorkspaceSize and set executor
     CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd,
                                            &workspace_size, &executor));
+    aclSetAclOpExecutorRepeatable(executor);

     auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
     size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);

     *desc_ptr = new Descriptor(
-        new Opaque{y, x, w, rstd, workspace_size},
+        new Opaque{y, x, w, rstd, workspace_size, executor},
         std::move(info), all_workspace_size, handle_ascend->device, handle_ascend->device_id);

-    aclDestroyAclOpExecutor(executor);
-
     return INFINI_STATUS_SUCCESS;
 }

@@ -88,21 +90,16 @@ infiniStatus_t Descriptor::calculate(
     auto tx = _opaque->x->tensor;
     auto ty = _opaque->y->tensor;
     auto trstd = _opaque->rstd->tensor;

-    size_t workspace_size_ = 0;
-    aclOpExecutor *executor = nullptr;
-    CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(_info.epsilon), ty, trstd,
-                                           &workspace_size_, &executor));
-    CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));

     void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);
     auto unit = infiniSizeOf(_info.atype);
-    AclSetTensorAddr(executor, 1, tw, (void *)w);
-    AclSetTensorAddr(executor, 3, trstd, rstdPtr);
+    AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
+    AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
     for (size_t i = 0; i < (_info.shape)[0]; ++i) {
-        AclSetTensorAddr(executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
-        AclSetTensorAddr(executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
-        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, executor, stream));
+        AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
+        AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
+        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
     }

     return INFINI_STATUS_SUCCESS;
 }
src/infiniop/ops/swiglu/ascend/swiglu_ascend.cc (new file, 0 → 100644)

#include "swiglu_ascend.h"
#include "../../../devices/ascend/common_ascend.h"

namespace op::swiglu::ascend {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    std::vector<infiniopTensorDescriptor_t> input_descs) {
    auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
    auto dtype = c_desc->dtype();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);

    const auto &a_desc = input_descs[0];
    const auto &b_desc = input_descs[1];
    auto result = SwigluInfo::create(c_desc, a_desc, b_desc);
    CHECK_RESULT(result);
    SwigluInfo info = result.take();

    // https://www.hiascend.com/document/detail/zh/canncommercial/800/apiref/ascendcopapi/atlasascendc_api_07_0777.html
    size_t workspace_size = 0;

    *desc_ptr = new Descriptor(
        std::move(info), workspace_size,
        handle_ascend->device, handle_ascend->device_id);

    return INFINI_STATUS_SUCCESS;
}

extern "C" infiniStatus_t swiglu_kernel_launch(
    void *c, void *a, void *b,
    infiniDtype_t dtype, size_t batch, size_t seq, size_t hd,
    ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
    ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b,
    void *stream);

infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *c,
    std::vector<const void *> inputs,
    void *stream) const {
    auto batch = _info.ndim == 2 ? 1 : _info.shape[0];
    auto seq_len = _info.ndim == 2 ? _info.shape[0] : _info.shape[1];
    auto hidden_size = _info.shape[_info.ndim - 1];
    auto stride_batch_c = _info.ndim == 2 ? 1 : _info.c_strides[0];
    auto stride_batch_a = _info.ndim == 2 ? 1 : _info.a_strides[0];
    auto stride_batch_b = _info.ndim == 2 ? 1 : _info.b_strides[0];
    auto stride_seq_c = _info.ndim == 2 ? _info.c_strides[0] : _info.c_strides[1];
    auto stride_seq_a = _info.ndim == 2 ? _info.a_strides[0] : _info.a_strides[1];
    auto stride_seq_b = _info.ndim == 2 ? _info.b_strides[0] : _info.b_strides[1];

    auto status = swiglu_kernel_launch(
        c, (void *)inputs[0], (void *)inputs[1], _info.dtype,
        batch, seq_len, hidden_size,
        stride_batch_c, stride_batch_a, stride_batch_b,
        stride_seq_c, stride_seq_a, stride_seq_b, stream);
    return status;
}
} // namespace op::swiglu::ascend
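Note: Descriptor::calculate() collapses the 2-D (seq, hd) and 3-D (batch, seq, hd) cases into one launch signature by treating a 2-D input as batch == 1. A hypothetical worked example of that mapping (values chosen for illustration, not taken from the commit):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // A 2-D, contiguous (seq, hd) tensor, as handled by the ndim == 2 branch above.
    std::vector<size_t> shape = {7, 128};
    std::vector<ptrdiff_t> strides = {128, 1};
    int ndim = (int)shape.size();

    size_t batch = ndim == 2 ? 1 : shape[0];
    size_t seq = ndim == 2 ? shape[0] : shape[1];
    size_t hd = shape[ndim - 1];
    ptrdiff_t stride_batch = ndim == 2 ? 1 : strides[0]; // unused when batch == 1
    ptrdiff_t stride_seq = ndim == 2 ? strides[0] : strides[1];

    std::printf("batch=%zu seq=%zu hd=%zu stride_batch=%td stride_seq=%td\n",
                batch, seq, hd, stride_batch, stride_seq);
    // prints: batch=1 seq=7 hd=128 stride_batch=1 stride_seq=128
    return 0;
}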
src/infiniop/ops/swiglu/ascend/swiglu_ascend.h (new file, 0 → 100644)

#ifndef __ACLNN_SWIGLU_H__
#define __ACLNN_SWIGLU_H__

#include "../../../../utils.h"
#include "../../../../utils/check.h"
#include "../../../operator.h"
#include "../../../tensor.h"

namespace op::swiglu::ascend {
class SwigluInfo {
    SwigluInfo() = default;

public:
    infiniDtype_t dtype;
    std::vector<size_t> shape;
    int32_t ndim;
    std::vector<ptrdiff_t> c_strides;
    std::vector<ptrdiff_t> a_strides;
    std::vector<ptrdiff_t> b_strides;

    static utils::Result<SwigluInfo> create(
        infiniopTensorDescriptor_t c_desc,
        infiniopTensorDescriptor_t a_desc,
        infiniopTensorDescriptor_t b_desc) {
        CHECK_OR_RETURN(c_desc && a_desc && b_desc, INFINI_STATUS_BAD_PARAM);
        CHECK_OR_RETURN(!c_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
        CHECK_OR_RETURN(c_desc->ndim() == a_desc->ndim()
                            && c_desc->ndim() == b_desc->ndim()
                            && (c_desc->ndim() == 2 || c_desc->ndim() == 3),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);
        CHECK_SAME_SHAPE(c_desc->shape(), a_desc->shape(), b_desc->shape());
        int32_t ndim = c_desc->ndim();
        CHECK_OR_RETURN(c_desc->stride(ndim - 1) == 1
                            && a_desc->stride(ndim - 1) == 1
                            && b_desc->stride(ndim - 1) == 1,
                        INFINI_STATUS_BAD_TENSOR_STRIDES);
        CHECK_OR_RETURN(c_desc->dtype() == a_desc->dtype() && c_desc->dtype() == b_desc->dtype(),
                        INFINI_STATUS_BAD_TENSOR_DTYPE);

        return utils::Result<SwigluInfo>(SwigluInfo{
            c_desc->dtype(),
            c_desc->shape(),
            ndim,
            c_desc->strides(),
            a_desc->strides(),
            b_desc->strides(),
        });
    }
};

class Descriptor final : public InfiniopDescriptor {
    SwigluInfo _info;
    size_t _workspace_size;

    Descriptor(SwigluInfo info, size_t workspace_size, infiniDevice_t device_type, int device_id)
        : InfiniopDescriptor{device_type, device_id},
          _info(info), _workspace_size(workspace_size) {}

public:
    ~Descriptor();

    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t c_desc,
        std::vector<infiniopTensorDescriptor_t> input_descs);

    size_t workspaceSize() const { return _workspace_size; }

    infiniStatus_t calculate(
        void *workspace, size_t workspace_size,
        void *c,
        std::vector<const void *> inputs,
        void *stream) const;
};
} // namespace op::swiglu::ascend

#endif // __ACLNN_SWIGLU_H__
src/infiniop/ops/swiglu/ascend/swiglu_ascend_kernel.cpp (new file, 0 → 100644)

#include "../../../devices/ascend/ascend_kernel_common.h"

using namespace AscendC;

template <typename T>
class SwigluKernel {
public:
    __aicore__ inline SwigluKernel() {}
    __aicore__ inline void init(GM_ADDR c, GM_ADDR a, GM_ADDR b,
                                int64_t batch_, int64_t seq, int64_t hd,
                                int64_t stride_batch_c, int64_t stride_batch_a, int64_t stride_batch_b,
                                int64_t stride_seq_c, int64_t stride_seq_a, int64_t stride_seq_b);
    __aicore__ inline void process();

private:
    __aicore__ inline void copyIn(int64_t i);
    __aicore__ inline void compute(int64_t i);
    __aicore__ inline void copyOut(int64_t i);

private:
    GlobalTensor<T> _c_gm, _a_gm, _b_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> _in_queue_a, _in_queue_b;
    TQue<QuePosition::VECOUT, BUFFER_NUM> _out_queue_c;
    TPipe _pipe;
    float _beta_value = 1.0f;
    int64_t _block_idx, _tile_len, _copy_len,
        _batch, _seq_len, _hidden_size,
        _stride_seq_a, _stride_seq_b, _stride_seq_c;
    int64_t _stride_batch_a = 1, _stride_batch_b = 1, _stride_batch_c = 1;
};

template <typename T>
__aicore__ inline void SwigluKernel<T>::init(GM_ADDR c, GM_ADDR a, GM_ADDR b,
                                             int64_t batch_, int64_t seq, int64_t hd,
                                             int64_t stride_batch_c, int64_t stride_batch_a, int64_t stride_batch_b,
                                             int64_t stride_seq_c, int64_t stride_seq_a, int64_t stride_seq_b) {
    // Init Shape & Stride Variables
    _batch = batch_;
    _seq_len = seq;
    _hidden_size = hd;
    _stride_batch_a = stride_batch_a;
    _stride_batch_b = stride_batch_b;
    _stride_batch_c = stride_batch_c;
    _stride_seq_a = stride_seq_a;
    _stride_seq_b = stride_seq_b;
    _stride_seq_c = stride_seq_c;

    _block_idx = GetBlockIdx();
    _tile_len = _block_idx < (_hidden_size % BLOCK_NUM)
                    ? (_hidden_size / BLOCK_NUM) + 1
                    : (_hidden_size / BLOCK_NUM);
    _copy_len = (_tile_len * sizeof(T)) % BYTE_ALIGN == 0
                    ? _tile_len
                    : (_tile_len * sizeof(T) + (BYTE_ALIGN - _tile_len * sizeof(T) % BYTE_ALIGN)) / sizeof(T);

    // Set global tensor
    _a_gm.SetGlobalBuffer((__gm__ T *)a);
    _b_gm.SetGlobalBuffer((__gm__ T *)b);
    _c_gm.SetGlobalBuffer((__gm__ T *)c);

    // _pipe alloc memory to queue, the unit is bytes
    _pipe.InitBuffer(_in_queue_a, BUFFER_NUM, _copy_len * sizeof(T));
    _pipe.InitBuffer(_in_queue_b, BUFFER_NUM, _copy_len * sizeof(T));
    _pipe.InitBuffer(_out_queue_c, BUFFER_NUM, _copy_len * sizeof(T));
}

template <typename T>
__aicore__ inline void SwigluKernel<T>::copyIn(int64_t i) {
    // Alloc tensor from queue memory
    LocalTensor<T> aLocal = _in_queue_a.AllocTensor<T>();
    LocalTensor<T> bLocal = _in_queue_b.AllocTensor<T>();
    // Get idx of current tile
    auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
    auto seq_idx = _batch == 1 ? i : i % _seq_len;
    int64_t idxa = batch_idx * _stride_batch_a + seq_idx * _stride_seq_a + _block_idx * _tile_len;
    int64_t idxb = batch_idx * _stride_batch_b + seq_idx * _stride_seq_b + _block_idx * _tile_len;
    // Copy process_th tile from global tensor to local tensor
    DataCopy(aLocal, _a_gm[idxa], _copy_len);
    DataCopy(bLocal, _b_gm[idxb], _copy_len);
    // Enque input tensor to VECIN queue
    _in_queue_a.EnQue(aLocal);
    _in_queue_b.EnQue(bLocal);
}

template <typename T>
__aicore__ inline void SwigluKernel<T>::compute(int64_t i) {
    // Deque input tensors from VECIN queue
    LocalTensor<T> aLocal = _in_queue_a.DeQue<T>();
    LocalTensor<T> bLocal = _in_queue_b.DeQue<T>();
    LocalTensor<T> cLocal = _out_queue_c.AllocTensor<T>();
    // Call SwiGLU ascend api
    SwiGLU<T, false>(cLocal, aLocal, bLocal, _beta_value, _copy_len);
    // Enque result and free input
    _out_queue_c.EnQue<T>(cLocal);
    _in_queue_a.FreeTensor(aLocal);
    _in_queue_b.FreeTensor(bLocal);
}

template <typename T>
__aicore__ inline void SwigluKernel<T>::copyOut(int64_t i) {
    // Deque output tensor from VECOUT queue
    LocalTensor<T> cLocal = _out_queue_c.DeQue<T>();
    auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
    auto seq_idx = _batch == 1 ? i : i % _seq_len;
    int64_t idxc = batch_idx * _stride_batch_c + seq_idx * _stride_seq_c + _block_idx * _tile_len;
    // Copy progress_th tile from local tensor to global tensor
    if (_tile_len * sizeof(T) % BYTE_ALIGN != 0) {
        DataCopyExtParams dcep = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
        DataCopyPad(_c_gm[idxc], cLocal, dcep);
    } else {
        DataCopy(_c_gm[idxc], cLocal, _tile_len);
    }
    // Free output Local tensor
    _out_queue_c.FreeTensor(cLocal);
}

template <typename T>
__aicore__ inline void SwigluKernel<T>::process() {
    for (int64_t i = 0; i < _batch * _seq_len; ++i) {
        copyIn(i);
        compute(i);
        copyOut(i);
    }
}

#define DEFINE_SWIGLU_KERNEL(KERNEL_NAME, TYPE)                              \
    __global__ __aicore__ void KERNEL_NAME(GM_ADDR c, GM_ADDR a, GM_ADDR b,  \
                                           int64_t batch, int64_t seq, int64_t hd, \
                                           int64_t stride_batch_c,           \
                                           int64_t stride_batch_a,           \
                                           int64_t stride_batch_b,           \
                                           int64_t stride_seq_c,             \
                                           int64_t stride_seq_a,             \
                                           int64_t stride_seq_b) {           \
        SwigluKernel<TYPE> op;                                               \
        op.init(c, a, b,                                                     \
                batch, seq, hd,                                              \
                stride_batch_c, stride_batch_a, stride_batch_b,              \
                stride_seq_c, stride_seq_a, stride_seq_b);                   \
        op.process();                                                        \
    }

DEFINE_SWIGLU_KERNEL(swiglu_kernel_half, half)
DEFINE_SWIGLU_KERNEL(swiglu_kernel_float, float)

#undef DEFINE_SWIGLU_KERNEL

extern "C" infiniStatus_t swiglu_kernel_launch(
    void *c, void *a, void *b,
    infiniDtype_t dtype, size_t batch, size_t seq, size_t hd,
    ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
    ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b,
    void *stream) {

#define LAUNCH_SWIGLU_KERNEL(DTYPE_ENUM, KERNEL_NAME)            \
    case DTYPE_ENUM:                                             \
        KERNEL_NAME<<<BLOCK_NUM, nullptr, stream>>>(             \
            c, a, b,                                             \
            static_cast<int64_t>(batch),                         \
            static_cast<int64_t>(seq),                           \
            static_cast<int64_t>(hd),                            \
            stride_batch_c, stride_batch_a, stride_batch_b,      \
            stride_seq_c, stride_seq_a, stride_seq_b);           \
        return INFINI_STATUS_SUCCESS;

    switch (dtype) {
        LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F16, swiglu_kernel_half)
        LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F32, swiglu_kernel_float)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

#undef LAUNCH_SWIGLU_KERNEL
}
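Note: in init(), the hidden dimension is split across BLOCK_NUM blocks, with the first hd % BLOCK_NUM blocks taking one extra element; copyIn()/copyOut() then offset into global memory by _block_idx * _tile_len. A hypothetical host-side check of that split (not part of the commit):

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t BLOCK_NUM = 8; // mirrors ascend_kernel_common.h
    const int64_t hd = 100;
    for (int64_t block_idx = 0; block_idx < BLOCK_NUM; ++block_idx) {
        // Same expression as _tile_len in SwigluKernel<T>::init().
        int64_t tile_len = block_idx < (hd % BLOCK_NUM) ? hd / BLOCK_NUM + 1 : hd / BLOCK_NUM;
        std::printf("block %lld: %lld elements\n", (long long)block_idx, (long long)tile_len);
    }
    // blocks 0-3 get 13 elements, blocks 4-7 get 12 (4*13 + 4*12 = 100)
    return 0;
}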
src/infiniop/ops/swiglu/operator.cc

@@ -11,6 +11,9 @@
 #ifdef ENABLE_KUNLUN_API
 #include "kunlun/swiglu_kunlun.h"
 #endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/swiglu_ascend.h"
+#endif

 __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
     infiniopHandle_t handle,

@@ -46,11 +49,8 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
             c_desc,
             a_desc,
             b_desc);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
-    case DevAscendNpu:
-        return ascendCreateSwiGLUDescriptor(
-            (AscendHandle_t)handle,
-            (SwiGLUAscendDescriptor_t *)desc_ptr,
-            c_desc, a_desc, b_desc);
+#ifdef ENABLE_ASCEND_API
+    CREATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_GPU
     case DevMetaxGpu: {

@@ -94,7 +94,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
         return bangGetSwiGLUWorkspaceSize((SwiGLUBangDescriptor_t)desc, size);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
+#ifdef ENABLE_ASCEND_API
     GET(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_METAX_GPU

@@ -144,9 +144,8 @@ __C infiniStatus_t infiniopSwiGLU(
         return bangSwiGLU((SwiGLUBangDescriptor_t)desc, c, a, b, stream);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
-    case DevAscendNpu:
-        return ascendSwiGLU((SwiGLUAscendDescriptor_t)desc, c, a, b, stream);
+#ifdef ENABLE_ASCEND_API
+    CALCULATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_GPU
     case DevMetaxGpu:

@@ -188,9 +187,8 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
         return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t)desc);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
-    case DevAscendNpu:
-        return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t)desc);
+#ifdef ENABLE_ASCEND_API
+    DELETE(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_METAX_GPU
     case DevMetaxGpu:
test/infiniop/causal_softmax.py

@@ -37,7 +37,7 @@ _TENSOR_DTYPES = [torch.float16]
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    torch.float16: {"atol": 0, "rtol": 1e-2},
+    torch.float16: {"atol": 1e-3, "rtol": 1e-2},
 }

@@ -143,6 +143,9 @@ def test(
     )

     lib_causal_softmax()

+    if sync is not None:
+        sync()
+
     atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
     if DEBUG:
test/infiniop/libinfiniop/utils.py

@@ -476,10 +476,11 @@ def get_test_devices(args):
 def get_sync_func(device):
     import torch

+    device_str = infiniDeviceEnum_str_map[device]
     if device == InfiniDeviceEnum.CPU:
         sync = None
     else:
-        sync = getattr(torch, infiniDeviceEnum_str_map[device]).synchronize
+        sync = getattr(torch, device_str).synchronize
     return sync
xmake/ascend.lua

@@ -50,9 +50,8 @@ target("infiniop-ascend")
     add_files("$(projectdir)/src/infiniop/devices/ascend/*.cc", "$(projectdir)/src/infiniop/ops/*/ascend/*.cc")

     -- Add operator
-    -- TODO: add it back after ascend-kernels is fixed
-    -- add_rules("ascend-kernels")
-    -- add_links(builddir.."/libascend_kernels.a")
+    add_rules("ascend-kernels")
+    add_links(builddir.."/libascend_kernels.a")

 target_end()

 target("infinirt-ascend")