Merge pull request #199 from InfiniTensor/fix-ascend-executor

修复昇腾调用 aclSetAclOpExecutorRepeatable 的潜在危险 (Fix a potential hazard in how the Ascend backend calls aclSetAclOpExecutorRepeatable)

Merge pull request #199 from InfiniTensor/fix-ascend-executor
修复昇腾调用 aclSetAclOpExecutorRepeatable 的潜在危险 (Fix a potential hazard in how the Ascend backend calls aclSetAclOpExecutorRepeatable)
11e7df93 · PanZezhong1725 · GitHub · 17415721 · 66e7dc56 · 11e7df93
Unverified commit 11e7df93 — authored Apr 25, 2025 by PanZezhong1725; committed by GitHub on Apr 25, 2025
3 changed files
--- a/src/infiniop/ops/causal_softmax/ascend/causal_softmax_aclnn.cc
+++ b/src/infiniop/ops/causal_softmax/ascend/causal_softmax_aclnn.cc
@@ -6,22 +6,21 @@
 namespace op::causal_softmax::ascend {

 struct Descriptor::Opaque {
-    mutable aclOpExecutor *executor;
-    mutable aclOpExecutor *mask_executor;
    aclnnTensorDescriptor_t x;
    aclnnTensorDescriptor_t mask;
    aclnnTensorDescriptor_t y;
+    aclnnTensorDescriptor_t value;
    void *mask_addr;
-    size_t workspacesize_softmax;
-    size_t workspacesize_mask;
+    void *value_addr;

    ~Opaque() {
        delete x;
        delete mask;
        delete y;
+        delete value;

-        aclDestroyAclOpExecutor(executor);
-        aclDestroyAclOpExecutor(mask_executor);
+        aclrtFree(mask_addr);
+        aclrtFree(value_addr);
    }
 };

@@ -64,13 +63,13 @@ infiniStatus_t Descriptor::create(
        auto size = aclDataTypeSize(aclDataType::ACL_FLOAT16);
        CHECK_ACL(aclrtMalloc(&value_addr, size, ACL_MEM_MALLOC_HUGE_FIRST));
        CHECK_ACL(aclrtMemcpy(value_addr, size, &mask_value, size, ACL_MEMCPY_HOST_TO_DEVICE));
-        value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT16, {}, {}, value_addr);
+        value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT16, {}, {});
    } else {
        uint32_t mask_value = 0xff800000;
        auto size = aclDataTypeSize(aclDataType::ACL_FLOAT);
        CHECK_ACL(aclrtMalloc(&value_addr, size, ACL_MEM_MALLOC_HUGE_FIRST));
        CHECK_ACL(aclrtMemcpy(value_addr, size, &mask_value, size, ACL_MEMCPY_HOST_TO_DEVICE));
-        value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT, {}, {}, value_addr);
+        value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT, {}, {});
    }

    // Fill Mask Tensor
@@ -93,17 +92,19 @@ infiniStatus_t Descriptor::create(
    aclTensor *tvalue = value->tensor;

    CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
-    aclSetAclOpExecutorRepeatable(mask_executor);
    int64_t dim = 2;

    CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
-    aclSetAclOpExecutorRepeatable(executor);

    // Create the descriptor
    size_t all_workspacesize = workspacesize_softmax + workspacesize_mask;
-    *desc_ptr = new Descriptor(new Opaque{executor, mask_executor, x, mask, y, mask_addr, workspacesize_softmax, workspacesize_mask},
+    *desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr},
                               std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);

+    // Delete useless executor
+    aclDestroyAclOpExecutor(executor);
+    aclDestroyAclOpExecutor(mask_executor);
+
    return INFINI_STATUS_SUCCESS;
 }

@@ -114,18 +115,24 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
    auto tx = _opaque->x->tensor;
    auto ty = _opaque->y->tensor;
    auto tmask = _opaque->mask->tensor;
-    auto executor = _opaque->executor;
-    auto mask_executor = _opaque->mask_executor;
-    auto mask_addr = _opaque->mask_addr;
+    auto tvalue = _opaque->value->tensor;
+    aclOpExecutor *executor = nullptr;
+    aclOpExecutor *mask_executor = nullptr;
+    size_t workspacesize_softmax = 0;
+    size_t workspacesize_mask = 0;
+    int64_t dim = 2;

    AclSetTensorAddr(mask_executor, 0, tx, (void *)x);
-    AclSetTensorAddr(mask_executor, 1, tmask, mask_addr);
-    CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, _opaque->workspacesize_mask, mask_executor, stream));
+    AclSetTensorAddr(mask_executor, 1, tmask, _opaque->mask_addr);
+    AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
+    CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
+    CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
    CHECK_ACL(aclrtSynchronizeStream(stream));

    AclSetTensorAddr(executor, 0, tx, (void *)x);
    AclSetTensorAddr(executor, 1, ty, y);
-    CHECK_ACL(aclnnSoftmax(workspace, _opaque->workspacesize_softmax, executor, stream));
+    CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
+    CHECK_ACL(aclnnSoftmax(workspace, workspacesize_softmax, executor, stream));

    return INFINI_STATUS_SUCCESS;
 }

--- a/src/infiniop/ops/gemm/ascend/gemm_ascend.cc
+++ b/src/infiniop/ops/gemm/ascend/gemm_ascend.cc
@@ -6,7 +6,6 @@
 namespace op::gemm::ascend {

 struct Descriptor::Opaque {
-    mutable aclOpExecutor *executor;
    aclnnTensorDescriptor_t c, a, b;
    // cubeMathType
    // see doc:
@@ -17,7 +16,6 @@ struct Descriptor::Opaque {
        delete c;
        delete a;
        delete b;
-        aclDestroyAclOpExecutor(executor);
    }
 };

@@ -56,8 +54,8 @@ infiniStatus_t Descriptor::create(
         ta = a->tensor,
         tb = b->tensor;

-    aclOpExecutor *executor;
-    size_t workspace_size;
+    aclOpExecutor *executor = nullptr;
+    size_t workspace_size = 0;
    // aclnnGemm support C = alpha * A @ B + beta * C
    // see
    // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
@@ -69,13 +67,15 @@ infiniStatus_t Descriptor::create(
    *desc_ptr = new Descriptor(
        dtype, info, workspace_size,
        new Opaque{
-            executor,
            c,
            a,
            b,
            mt,
        },
        handle->device, handle->device_id);
+
+    aclDestroyAclOpExecutor(executor);
+
    return INFINI_STATUS_SUCCESS;
 }

@@ -93,22 +93,24 @@ infiniStatus_t Descriptor::calculate(
         ta = _opaque->a->tensor,
         tb = _opaque->b->tensor;

-    size_t workspace_size;
+    size_t workspace_size = 0;
+    aclOpExecutor *executor = nullptr;
+
    CHECK_ACL(aclnnGemmGetWorkspaceSize(
        ta, tb, tc, alpha, beta, 0, 0, tc, _opaque->mt,
-        &workspace_size, &(_opaque->executor)));
+        &workspace_size, &executor));
    if (workspaceSize_ < workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
-    aclSetAclOpExecutorRepeatable(_opaque->executor);
+    CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));

    auto unit = infiniSizeOf(_dtype);
    for (size_t i = 0; i < _info.batch; ++i) {
-        AclSetTensorAddr(_opaque->executor, 0, ta, ((char *)a) + i * _info.a_matrix.stride * unit);
-        AclSetTensorAddr(_opaque->executor, 1, tb, ((char *)b) + i * _info.b_matrix.stride * unit);
-        AclSetTensorAddr(_opaque->executor, 2, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
-        AclSetTensorAddr(_opaque->executor, 3, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
-        CHECK_ACL(aclnnGemm(workspace, workspace_size, _opaque->executor, stream));
+        AclSetTensorAddr(executor, 0, ta, ((char *)a) + i * _info.a_matrix.stride * unit);
+        AclSetTensorAddr(executor, 1, tb, ((char *)b) + i * _info.b_matrix.stride * unit);
+        AclSetTensorAddr(executor, 2, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
+        AclSetTensorAddr(executor, 3, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
+        CHECK_ACL(aclnnGemm(workspace, workspace_size, executor, stream));
    }

    return INFINI_STATUS_SUCCESS;

--- a/src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc
+++ b/src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc
@@ -5,7 +5,6 @@
 namespace op::rms_norm::ascend {

 struct Descriptor::Opaque {
-    mutable aclOpExecutor *executor;
    aclnnTensorDescriptor_t y;
    aclnnTensorDescriptor_t x;
    aclnnTensorDescriptor_t w;
@@ -17,7 +16,6 @@ struct Descriptor::Opaque {
        delete x;
        delete w;
        delete rstd;
-        aclDestroyAclOpExecutor(executor);
    }
 };

@@ -64,16 +62,17 @@ infiniStatus_t Descriptor::create(

    // Get WorkspaceSize and set executor
    CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd, &workspace_size, &executor));
-    aclSetAclOpExecutorRepeatable(executor);

    auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
    size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
    *desc_ptr = new Descriptor(
-        new Opaque{executor, y, x, w, rstd, workspace_size},
+        new Opaque{y, x, w, rstd, workspace_size},
        std::move(info),
        all_workspace_size,
        handle_ascend->device, handle_ascend->device_id);

+    aclDestroyAclOpExecutor(executor);
+
    return INFINI_STATUS_SUCCESS;
 }

@@ -89,16 +88,21 @@ infiniStatus_t Descriptor::calculate(
    auto tx = _opaque->x->tensor;
    auto ty = _opaque->y->tensor;
    auto trstd = _opaque->rstd->tensor;
+    size_t workspace_size_ = 0;
+    aclOpExecutor *executor = nullptr;
+
+    CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(_info.epsilon), ty, trstd, &workspace_size_, &executor));
+    CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));

    void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);

    auto unit = infiniSizeOf(_info.atype);
-    AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
-    AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
+    AclSetTensorAddr(executor, 1, tw, (void *)w);
+    AclSetTensorAddr(executor, 3, trstd, rstdPtr);
    for (size_t i = 0; i < (_info.shape)[0]; ++i) {
-        AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
-        AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
-        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
+        AclSetTensorAddr(executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
+        AclSetTensorAddr(executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
+        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, executor, stream));
    }
    return INFINI_STATUS_SUCCESS;
 }