Unverified commit bd37042c authored by PanZezhong1725, committed by GitHub

Merge pull request #175 from InfiniTensor/issue/174-rearrange-ascend

issue/174: Rearrange ASCEND
parents 125afeb5 b4302732
#include "common_ascend.h"
std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
    if (shape.size() != strides.size()) {
        throw std::invalid_argument("Shape and strides must have the same length.");
    }
    // The storage shape is a 1D buffer that must cover every accessed element,
    // i.e. one past the largest linear offset reachable with this shape/strides.
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    return {max_offset + 1};
}
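For intuition, a standalone check of the new formula (the function body is copied from above so the example compiles on its own; the shape/stride values are made up for illustration):

#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Copied from common_ascend above so the example is self-contained.
std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
    if (shape.size() != strides.size()) {
        throw std::invalid_argument("Shape and strides must have the same length.");
    }
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    return {max_offset + 1};
}

int main() {
    // A 2x3 view with a padded row stride of 4 touches offsets {0,1,2, 4,5,6},
    // so the backing buffer needs max_offset + 1 = 7 elements, not 2*4 = 8
    // as the previous max-stride heuristic would have reported.
    auto s = inferStorageShape({2, 3}, {4, 1});
    std::printf("storage elements: %lld\n", static_cast<long long>(s[0])); // prints 7
    return 0;
}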
size_t aclnnTensorDescriptor::numel() const {
@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
this->strides = std::vector<int64_t>(ndim);
for (uint64_t i = 0; i < ndim; ++i) {
this->shape[i] = static_cast<int64_t>(desc->dim(i));
this->strides[i] = static_cast<int64_t>(desc->stride(i));
}
this->storageShape = inferStorageShape(this->shape, this->strides);
this->dataType = toAclDataType(desc->dtype());
...
@@ -12,6 +12,8 @@ struct Descriptor::Opaque {
aclnnTensorDescriptor_t value;
void *mask_addr;
void *value_addr;
uint64_t workspacesize;
aclOpExecutor *executor;
~Opaque() {
delete x;
@@ -21,6 +23,9 @@ struct Descriptor::Opaque {
aclrtFree(mask_addr);
aclrtFree(value_addr);
// Destroy the cached (repeatable) softmax executor with the descriptor
aclDestroyAclOpExecutor(executor);
}
};
@@ -92,18 +97,18 @@ infiniStatus_t Descriptor::create(
aclTensor *tvalue = value->tensor;
CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
int64_t dim = 2;
CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
// Mark the softmax executor repeatable so calculate() can reuse it
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
// Create the descriptor. The mask-fill and softmax steps run sequentially
// on the same buffer, so the workspace only needs the larger of the two sizes.
size_t all_workspacesize = std::max(workspacesize_softmax, workspacesize_mask);
// The mask executor is not reused, so it can be destroyed here; the softmax
// executor is kept in the Opaque and destroyed with the descriptor.
aclDestroyAclOpExecutor(mask_executor);
*desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr,
                                      workspacesize_softmax, executor},
                           std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
return INFINI_STATUS_SUCCESS;
}
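The pattern introduced above — plan once, mark the executor repeatable, rebind device addresses per call, destroy it with the descriptor — is worth spelling out. A minimal sketch, assuming CHECK_ACL and AclSetTensorAddr are the project's helpers from common_ascend.h and using the slot indices seen in this diff; it compiles against the CANN headers but is not a drop-in replacement for the code above:

#include <acl/acl.h>
#include <aclnnop/aclnn_softmax.h>
#include "common_ascend.h" // assumed home of CHECK_ACL / AclSetTensorAddr

struct SoftmaxPlan {
    aclOpExecutor *executor = nullptr;
    uint64_t workspace_size = 0;
};

// create(): plan once and mark the executor repeatable.
void plan_softmax(aclTensor *tx, aclTensor *ty, int64_t dim, SoftmaxPlan &plan) {
    CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &plan.workspace_size, &plan.executor));
    CHECK_ACL(aclSetAclOpExecutorRepeatable(plan.executor));
}

// calculate(): rebind device addresses and launch, any number of times.
void run_softmax(const SoftmaxPlan &plan, aclTensor *tx, aclTensor *ty,
                 void *x, void *y, void *workspace, void *stream) {
    AclSetTensorAddr(plan.executor, 0, tx, x); // slot 0: input
    AclSetTensorAddr(plan.executor, 1, ty, y); // slot 1: output
    CHECK_ACL(aclnnSoftmax(workspace, plan.workspace_size, plan.executor, stream));
}

// teardown: a repeatable executor must be destroyed explicitly.
void destroy_softmax(SoftmaxPlan &plan) {
    aclDestroyAclOpExecutor(plan.executor);
}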
@@ -116,23 +121,18 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
auto ty = _opaque->y->tensor;
auto tmask = _opaque->mask->tensor;
auto tvalue = _opaque->value->tensor;
aclOpExecutor *mask_executor = nullptr;
size_t workspacesize_mask = 0;
// The mask-fill executor is rebuilt on every call; only the softmax executor is cached.
CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
AclSetTensorAddr(mask_executor, 0, tx, (void *)x);
AclSetTensorAddr(mask_executor, 1, tmask, _opaque->mask_addr);
AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
// The in-place mask fill must finish before softmax reads x.
CHECK_ACL(aclrtSynchronizeStream(stream));
// Rebind the cached repeatable softmax executor to the current addresses.
AclSetTensorAddr(_opaque->executor, 0, tx, (void *)x);
AclSetTensorAddr(_opaque->executor, 1, ty, y);
CHECK_ACL(aclnnSoftmax(workspace, _opaque->workspacesize, _opaque->executor, stream));
return INFINI_STATUS_SUCCESS;
}
...
#include "rearrange_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
#include <aclnnop/aclnn_copy.h>
namespace op::rearrange::ascend {
struct Descriptor::Opaque {
aclnnTensorDescriptor_t dst;
aclnnTensorDescriptor_t src;
void *workspace; // aclnnInplaceCopy workspace
uint64_t workspace_size;
~Opaque() {
delete dst;
delete src;
aclrtFree(workspace);
}
};
Descriptor::~Descriptor() {
delete _opaque;
};
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc) {
auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
auto dtype = y_desc->dtype();
auto ndim = y_desc->ndim();
auto shape = y_desc->shape();
CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < ndim; ++i) {
CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
}
auto dst_strides = y_desc->strides();
auto src_strides = x_desc->strides();
auto element_size = infiniSizeOf(dtype);
auto result = utils::RearrangeMeta::create(shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
CHECK_RESULT(result);
aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
void *workspace = nullptr;
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor,
                                           &workspace_size, &executor));
if (workspace_size != 0) {
CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
}
*desc_ptr = new Descriptor(
result.take(),
new Opaque{
dst,
src,
workspace,
workspace_size},
handle->device,
handle->device_id);
// This executor is only used for the workspace query and is never
// executed or reused, so destroy it here
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *y,
const void *x,
void *stream) const {
auto tdst = _opaque->dst->tensor;
auto tsrc = _opaque->src->tensor;
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));
// Bind the real device addresses, then launch the strided copy
AclSetTensorAddr(executor, 0, tdst, y);
AclSetTensorAddr(executor, 1, tsrc, (void *)x);
// Execute InplaceCopy with the workspace preallocated in create()
CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size,
                           executor, stream));
return INFINI_STATUS_SUCCESS;
}
} // namespace op::rearrange::ascend
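With the dispatch entries below wired up, this backend becomes reachable through the public rearrange API. A hypothetical call sequence for reference — only the three infiniop* entry points are taken from this diff; handle, the descriptors, the device buffers, the stream, and the CHECK_STATUS helper are assumed to exist, and the argument order of infiniopRearrange is inferred from Descriptor::calculate above:

// Hypothetical usage sketch: y_desc/x_desc describe the same shape with
// different strides; y_dev/x_dev are Ascend device buffers.
infiniopRearrangeDescriptor_t desc;
CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &desc, y_desc, x_desc));
// No external workspace is needed: the Ascend Opaque preallocates its
// aclnn workspace in create().
CHECK_STATUS(infiniopRearrange(desc, y_dev, x_dev, stream));
CHECK_STATUS(infiniopDestroyRearrangeDescriptor(desc));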
#ifndef __REARRANGE_ASCEND_H__
#define __REARRANGE_ASCEND_H__
#include "../rearrange.h"
DESCRIPTOR(ascend)
#endif // __REARRANGE_ASCEND_H__
@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/rearrange_cpu.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rearrange_ascend.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/rearrange_cuda.cuh"
@@ -29,6 +32,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
@@ -57,6 +63,9 @@ __C infiniStatus_t infiniopRearrange(
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
@@ -82,6 +91,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
...
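The CREATE/CALCULATE/DELETE dispatch macros used above are defined earlier in operator.cc and are not part of this diff. For context, a sketch of what CREATE presumably expands to, based on the Descriptor::create signature in this diff — an assumption, not the actual macro:

// Assumed shape of the dispatch macro (illustrative only):
#define CREATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                    \
        return op::rearrange::NAMESPACE::Descriptor::create(                      \
            handle,                                                               \
            reinterpret_cast<op::rearrange::NAMESPACE::Descriptor **>(desc_ptr),  \
            y_desc, x_desc)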
@@ -10,12 +10,15 @@ struct Descriptor::Opaque {
aclnnTensorDescriptor_t w;
aclnnTensorDescriptor_t rstd;
size_t workspaceSize;
aclOpExecutor *executor;
~Opaque() {
delete y;
delete x;
delete w;
delete rstd;
aclDestroyAclOpExecutor(executor);
}
};
@@ -62,17 +65,16 @@ infiniStatus_t Descriptor::create(
// Get WorkspaceSize and set executor
CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd, &workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
*desc_ptr = new Descriptor(
new Opaque{y, x, w, rstd, workspace_size, executor},
std::move(info),
all_workspace_size,
handle_ascend->device, handle_ascend->device_id);
return INFINI_STATUS_SUCCESS;
}
@@ -88,21 +90,16 @@ infiniStatus_t Descriptor::calculate(
auto tx = _opaque->x->tensor;
auto ty = _opaque->y->tensor;
auto trstd = _opaque->rstd->tensor;
// rstd lives in the workspace tail, right after the op workspace
void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);
auto unit = infiniSizeOf(_info.atype);
AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
for (size_t i = 0; i < (_info.shape)[0]; ++i) {
    AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
    AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
    CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
}
return INFINI_STATUS_SUCCESS;
}
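The workspace layout here is easy to get wrong: create() requests workspace_size plus room for rstd, and calculate() carves rstd out of the tail. A small self-contained sketch of that arithmetic, with made-up sizes for illustration:

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    // Illustration only: a 4 KiB op workspace plus 4 fp32 rstd values,
    // mirroring all_workspace_size = workspace_size + rstd->numel() * dtype size.
    const size_t workspace_size = 4096;
    const size_t rstd_bytes = 4 * sizeof(float);
    const size_t all_workspace_size = workspace_size + rstd_bytes;
    uint8_t *workspace = new uint8_t[all_workspace_size];
    // Mirrors calculate(): rstd sits immediately after the op workspace.
    void *rstdPtr = (void *)(workspace + workspace_size);
    std::printf("rstd offset: %zu bytes\n",
                (size_t)((uint8_t *)rstdPtr - workspace)); // prints 4096
    delete[] workspace;
    return 0;
}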
...
@@ -94,8 +94,8 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
return bangGetSwiGLUWorkspaceSize((SwiGLUBangDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_ASCEND_API
// GET(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
...
@@ -37,7 +37,7 @@ _TENSOR_DTYPES = [torch.float16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
    torch.float16: {"atol": 1e-3, "rtol": 1e-2},
}
@@ -144,6 +144,9 @@ def test(
lib_causal_softmax()
if sync is not None:
    sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
    debug(y, ans, atol=atol, rtol=rtol)
...
@@ -476,10 +476,11 @@ def get_test_devices(args):
def get_sync_func(device):
    import torch
    device_str = infiniDeviceEnum_str_map[device]
    if device_str == "cpu":
        sync = None
    else:
        sync = getattr(torch, device_str).synchronize
    return sync