Commit 98270602 authored by zhangyue's avatar zhangyue
Browse files

issue/174: fix rearrange, change getStorageShape

parent 46a2678f
#include "common_ascend.h"
// Compute the minimal 1-D storage (buffer) shape that covers every element
// addressable by `shape` with `strides`.
//
// The buffer must reach the largest reachable element offset, i.e.
// sum_i (shape[i] - 1) * strides[i], plus one element. This is correct for
// arbitrary (permuted / non-contiguous) strides, unlike the former
// shape[argmax(strides)] * max(strides) formula, which miscounts for
// transposed layouts and is undefined for empty stride vectors.
//
// @param shape   per-dimension extents
// @param strides per-dimension element strides (same length as `shape`)
// @return one-element vector holding the required flat buffer length
// @throws std::invalid_argument when shape and strides differ in length
std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
    // Validate before touching either vector's contents.
    if (shape.size() != strides.size()) {
        throw std::invalid_argument("Shape and strides must have the same length.");
    }
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    // Storage shape is a 1-D buffer that must cover all accessed elements.
    return {max_offset + 1};
}
size_t aclnnTensorDescriptor::numel() const {
......@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
this->strides = std::vector<int64_t>(ndim);
for (uint64_t i = 0; i < ndim; ++i) {
this->shape[i] = static_cast<int64_t>(desc->dim(i));
this->strides[i] = desc->stride(i);
this->strides[i] = static_cast<int64_t>(desc->stride(i));
}
this->storageShape = inferStorageShape(this->shape, this->strides);
this->dataType = toAclDataType(desc->dtype());
......
......@@ -97,7 +97,8 @@ infiniStatus_t Descriptor::create(
CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
// Create the descriptor
size_t all_workspacesize = workspacesize_softmax + workspacesize_mask;
size_t all_workspacesize = std::max(workspacesize_softmax, workspacesize_mask);
*desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr},
std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
......@@ -127,7 +128,6 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
CHECK_ACL(aclrtSynchronizeStream(stream));
AclSetTensorAddr(executor, 0, tx, (void *)x);
AclSetTensorAddr(executor, 1, ty, y);
......
......@@ -5,10 +5,16 @@
namespace op::rearrange::ascend {
// Backend-private state for the rearrange descriptor.
// NOTE(review): this view appears to merge two diff versions of the struct —
// dt/shape/dst_strides/src_strides look like the pre-change layout, while
// dst/src/workspace/workspace_size correspond to the new cached-descriptor
// layout; confirm against the final file which set survives.
struct Descriptor::Opaque {
aclDataType dt;
std::vector<int64_t> shape;
std::vector<int64_t> dst_strides;
std::vector<int64_t> src_strides;
// Owning destination/source tensor descriptors (deleted in the dtor).
aclnnTensorDescriptor_t dst;
aclnnTensorDescriptor_t src;
void *workspace; // aclnnInplaceCopy workspace
uint64_t workspace_size; // byte size passed to aclrtMalloc for `workspace`
// Release the cached descriptors and the device workspace buffer.
~Opaque() {
delete dst;
delete src;
aclrtFree(workspace);
}
};
Descriptor::~Descriptor() {
......@@ -37,24 +43,31 @@ infiniStatus_t Descriptor::create(
auto result = utils::RearrangeMeta::create(shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
CHECK_RESULT(result);
std::vector<int64_t> shape_(ndim);
std::vector<int64_t> dst_strides_(ndim);
std::vector<int64_t> src_strides_(ndim);
for (size_t i = 0; i < ndim; i++) {
shape_[i] = static_cast<int64_t>(shape[i]);
dst_strides_[i] = static_cast<int64_t>(dst_strides[i]);
src_strides_[i] = static_cast<int64_t>(src_strides[i]);
aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
void *workspace = nullptr;
aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor,
&workspace_size, &executor);
if (workspace_size != 0) {
CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
}
*desc_ptr = new Descriptor(
result.take(),
new Opaque{
toAclDataType(dtype),
shape_,
dst_strides_,
src_strides_},
dst,
src,
workspace,
workspace_size},
handle->device,
handle->device_id);
// Delete useless executor
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
......@@ -62,20 +75,19 @@ infiniStatus_t Descriptor::calculate(
void *y,
const void *x,
void *stream) const {
auto tdst = _opaque->dst->tensor;
auto tsrc = _opaque->src->tensor;
auto y_ = aclnnTensorDescriptor(_opaque->dt, _opaque->shape, _opaque->dst_strides, y);
auto x_ = aclnnTensorDescriptor(_opaque->dt, _opaque->shape, _opaque->src_strides, (void *)x);
auto ty = y_.tensor;
auto tx = x_.tensor;
size_t workspace_size = 0;
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
void *workspace = nullptr;
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(ty, tx, &workspace_size, &executor));
if (workspace_size != 0) {
CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
}
CHECK_ACL(aclnnInplaceCopy(workspace, workspace_size, executor, stream));
AclSetTensorAddr(executor, 0, tdst, y);
AclSetTensorAddr(executor, 1, tsrc, (void *)x);
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));
// Execute InplaceCopy
CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size,
executor, stream));
return INFINI_STATUS_SUCCESS;
}
......
......@@ -37,7 +37,7 @@ _TENSOR_DTYPES = [torch.float16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2},
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
}
......@@ -143,6 +143,9 @@ def test(
)
lib_causal_softmax()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
......
......@@ -476,10 +476,11 @@ def get_test_devices(args):
def get_sync_func(device):
    """Return a device-synchronization callable for the given device enum.

    Looks the enum up in ``infiniDeviceEnum_str_map`` (module-level map from
    device enum to torch backend name -- assumed defined elsewhere in this
    file) and returns ``None`` for CPU, since CPU needs no synchronization;
    otherwise returns that torch backend's ``synchronize`` function.
    """
    import torch

    # Resolve the backend name once so the comparison and the getattr use
    # the same string (the pre-fix code compared the raw enum to "cpu",
    # which never matched).
    device_str = infiniDeviceEnum_str_map[device]
    if device_str == "cpu":
        sync = None
    else:
        sync = getattr(torch, device_str).synchronize
    return sync
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment