Unverified commit 11e7df93, authored by PanZezhong1725 and committed by GitHub
Browse files

Merge pull request #199 from InfiniTensor/fix-ascend-executor

修复昇腾调用aclSetAclOpExecutorRepeatable的潜在危险
parents 17415721 66e7dc56
......@@ -6,22 +6,21 @@
namespace op::causal_softmax::ascend {
// Backend-private state held by the causal-softmax Ascend descriptor.
// Its destructor releases the tensor descriptors, the executors, and the two
// raw buffers listed below.
// NOTE(review): this listing was captured from a diff view without +/- markers,
// so members the commit removes and members it adds may both appear here. The
// six-value initializer `Opaque{x, mask, y, value, mask_addr, value_addr}` used
// in Descriptor::create suggests the executor and workspace-size fields are the
// removed ones -- confirm against the repository before relying on this layout.
struct Descriptor::Opaque {
// Executor filled in by aclnnSoftmaxGetWorkspaceSize.
mutable aclOpExecutor *executor;
// Executor filled in by aclnnInplaceMaskedFillTensorGetWorkspaceSize.
mutable aclOpExecutor *mask_executor;
// Tensor descriptors; their `tensor` handles are what the aclnn calls receive.
aclnnTensorDescriptor_t x;
aclnnTensorDescriptor_t mask;
aclnnTensorDescriptor_t y;
// Descriptor of the fill value handed to the masked-fill operator.
aclnnTensorDescriptor_t value;
// Address bound to the mask tensor before the masked fill is launched.
void *mask_addr;
// Workspace sizes reported for the softmax and masked-fill steps respectively.
size_t workspacesize_softmax;
size_t workspacesize_mask;
// Buffer obtained from aclrtMalloc; the fill value is copied into it host-to-device.
void *value_addr;
// Releases everything this struct refers to: deletes the four tensor
// descriptors, destroys both executors, then frees both buffers with aclrtFree.
// Return codes of the ACL calls are not checked here.
~Opaque() {
delete x;
delete mask;
delete y;
delete value;
aclDestroyAclOpExecutor(executor);
aclDestroyAclOpExecutor(mask_executor);
aclrtFree(mask_addr);
aclrtFree(value_addr);
}
};
......@@ -64,13 +63,13 @@ infiniStatus_t Descriptor::create(
auto size = aclDataTypeSize(aclDataType::ACL_FLOAT16);
CHECK_ACL(aclrtMalloc(&value_addr, size, ACL_MEM_MALLOC_HUGE_FIRST));
CHECK_ACL(aclrtMemcpy(value_addr, size, &mask_value, size, ACL_MEMCPY_HOST_TO_DEVICE));
value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT16, {}, {}, value_addr);
value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT16, {}, {});
} else {
uint32_t mask_value = 0xff800000;
auto size = aclDataTypeSize(aclDataType::ACL_FLOAT);
CHECK_ACL(aclrtMalloc(&value_addr, size, ACL_MEM_MALLOC_HUGE_FIRST));
CHECK_ACL(aclrtMemcpy(value_addr, size, &mask_value, size, ACL_MEMCPY_HOST_TO_DEVICE));
value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT, {}, {}, value_addr);
value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT, {}, {});
}
// Fill Mask Tensor
......@@ -93,17 +92,19 @@ infiniStatus_t Descriptor::create(
aclTensor *tvalue = value->tensor;
CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
aclSetAclOpExecutorRepeatable(mask_executor);
int64_t dim = 2;
CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
aclSetAclOpExecutorRepeatable(executor);
// Create the descriptor
size_t all_workspacesize = workspacesize_softmax + workspacesize_mask;
*desc_ptr = new Descriptor(new Opaque{executor, mask_executor, x, mask, y, mask_addr, workspacesize_softmax, workspacesize_mask},
*desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr},
std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
// Delete useless executor
aclDestroyAclOpExecutor(executor);
aclDestroyAclOpExecutor(mask_executor);
return INFINI_STATUS_SUCCESS;
}
......@@ -114,18 +115,24 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
auto tx = _opaque->x->tensor;
auto ty = _opaque->y->tensor;
auto tmask = _opaque->mask->tensor;
auto executor = _opaque->executor;
auto mask_executor = _opaque->mask_executor;
auto mask_addr = _opaque->mask_addr;
auto tvalue = _opaque->value->tensor;
aclOpExecutor *executor = nullptr;
aclOpExecutor *mask_executor = nullptr;
size_t workspacesize_softmax = 0;
size_t workspacesize_mask = 0;
int64_t dim = 2;
AclSetTensorAddr(mask_executor, 0, tx, (void *)x);
AclSetTensorAddr(mask_executor, 1, tmask, mask_addr);
CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, _opaque->workspacesize_mask, mask_executor, stream));
AclSetTensorAddr(mask_executor, 1, tmask, _opaque->mask_addr);
AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
CHECK_ACL(aclrtSynchronizeStream(stream));
AclSetTensorAddr(executor, 0, tx, (void *)x);
AclSetTensorAddr(executor, 1, ty, y);
CHECK_ACL(aclnnSoftmax(workspace, _opaque->workspacesize_softmax, executor, stream));
CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
CHECK_ACL(aclnnSoftmax(workspace, workspacesize_softmax, executor, stream));
return INFINI_STATUS_SUCCESS;
}
......
......@@ -6,7 +6,6 @@
namespace op::gemm::ascend {
struct Descriptor::Opaque {
mutable aclOpExecutor *executor;
aclnnTensorDescriptor_t c, a, b;
// cubeMathType
// see doc:
......@@ -17,7 +16,6 @@ struct Descriptor::Opaque {
delete c;
delete a;
delete b;
aclDestroyAclOpExecutor(executor);
}
};
......@@ -56,8 +54,8 @@ infiniStatus_t Descriptor::create(
ta = a->tensor,
tb = b->tensor;
aclOpExecutor *executor;
size_t workspace_size;
aclOpExecutor *executor = nullptr;
size_t workspace_size = 0;
// aclnnGemm support C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
......@@ -69,13 +67,15 @@ infiniStatus_t Descriptor::create(
*desc_ptr = new Descriptor(
dtype, info, workspace_size,
new Opaque{
executor,
c,
a,
b,
mt,
},
handle->device, handle->device_id);
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
......@@ -93,22 +93,24 @@ infiniStatus_t Descriptor::calculate(
ta = _opaque->a->tensor,
tb = _opaque->b->tensor;
size_t workspace_size;
size_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
CHECK_ACL(aclnnGemmGetWorkspaceSize(
ta, tb, tc, alpha, beta, 0, 0, tc, _opaque->mt,
&workspace_size, &(_opaque->executor)));
&workspace_size, &executor));
if (workspaceSize_ < workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
aclSetAclOpExecutorRepeatable(_opaque->executor);
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
auto unit = infiniSizeOf(_dtype);
for (size_t i = 0; i < _info.batch; ++i) {
AclSetTensorAddr(_opaque->executor, 0, ta, ((char *)a) + i * _info.a_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 1, tb, ((char *)b) + i * _info.b_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 2, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 3, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
CHECK_ACL(aclnnGemm(workspace, workspace_size, _opaque->executor, stream));
AclSetTensorAddr(executor, 0, ta, ((char *)a) + i * _info.a_matrix.stride * unit);
AclSetTensorAddr(executor, 1, tb, ((char *)b) + i * _info.b_matrix.stride * unit);
AclSetTensorAddr(executor, 2, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
AclSetTensorAddr(executor, 3, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
CHECK_ACL(aclnnGemm(workspace, workspace_size, executor, stream));
}
return INFINI_STATUS_SUCCESS;
......
......@@ -5,7 +5,6 @@
namespace op::rms_norm::ascend {
struct Descriptor::Opaque {
mutable aclOpExecutor *executor;
aclnnTensorDescriptor_t y;
aclnnTensorDescriptor_t x;
aclnnTensorDescriptor_t w;
......@@ -17,7 +16,6 @@ struct Descriptor::Opaque {
delete x;
delete w;
delete rstd;
aclDestroyAclOpExecutor(executor);
}
};
......@@ -64,16 +62,17 @@ infiniStatus_t Descriptor::create(
// Get WorkspaceSize and set executor
CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd, &workspace_size, &executor));
aclSetAclOpExecutorRepeatable(executor);
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
*desc_ptr = new Descriptor(
new Opaque{executor, y, x, w, rstd, workspace_size},
new Opaque{y, x, w, rstd, workspace_size},
std::move(info),
all_workspace_size,
handle_ascend->device, handle_ascend->device_id);
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
......@@ -89,16 +88,21 @@ infiniStatus_t Descriptor::calculate(
auto tx = _opaque->x->tensor;
auto ty = _opaque->y->tensor;
auto trstd = _opaque->rstd->tensor;
size_t workspace_size_ = 0;
aclOpExecutor *executor = nullptr;
CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(_info.epsilon), ty, trstd, &workspace_size_, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);
auto unit = infiniSizeOf(_info.atype);
AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
AclSetTensorAddr(executor, 1, tw, (void *)w);
AclSetTensorAddr(executor, 3, trstd, rstdPtr);
for (size_t i = 0; i < (_info.shape)[0]; ++i) {
AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
AclSetTensorAddr(executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
AclSetTensorAddr(executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, executor, stream));
}
return INFINI_STATUS_SUCCESS;
}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment