Commit 98270602 authored by zhangyue's avatar zhangyue
Browse files

issue/174: fix rearrange, change getStorageShape

parent 46a2678f
#include "common_ascend.h"
// Compute the minimal 1-D storage (buffer) shape that covers every element
// addressable by `shape` with `strides`.
//
// The buffer must reach the largest reachable element offset, i.e.
// sum_i (shape[i] - 1) * strides[i], plus one element. This is correct for
// arbitrary (permuted / non-contiguous) strides, unlike the former
// shape[argmax(strides)] * max(strides) formula, which miscounts for
// transposed layouts and is undefined for empty stride vectors.
//
// @param shape   per-dimension extents
// @param strides per-dimension element strides (same length as `shape`)
// @return one-element vector holding the required flat buffer length
// @throws std::invalid_argument when shape and strides differ in length
std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
    // Validate before touching either vector's contents.
    if (shape.size() != strides.size()) {
        throw std::invalid_argument("Shape and strides must have the same length.");
    }
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    // Storage shape is a 1-D buffer that must cover all accessed elements.
    return {max_offset + 1};
}
size_t aclnnTensorDescriptor::numel() const {
......@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
this->strides = std::vector<int64_t>(ndim);
for (uint64_t i = 0; i < ndim; ++i) {
this->shape[i] = static_cast<int64_t>(desc->dim(i));
this->strides[i] = desc->stride(i);
this->strides[i] = static_cast<int64_t>(desc->stride(i));
}
this->storageShape = inferStorageShape(this->shape, this->strides);
this->dataType = toAclDataType(desc->dtype());
......
......@@ -97,7 +97,8 @@ infiniStatus_t Descriptor::create(
CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
// Create the descriptor
size_t all_workspacesize = workspacesize_softmax + workspacesize_mask;
size_t all_workspacesize = std::max(workspacesize_softmax, workspacesize_mask);
*desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr},
std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
......@@ -127,7 +128,6 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
CHECK_ACL(aclrtSynchronizeStream(stream));
AclSetTensorAddr(executor, 0, tx, (void *)x);
AclSetTensorAddr(executor, 1, ty, y);
......
......@@ -5,10 +5,16 @@
namespace op::rearrange::ascend {
// Backend-private state for the rearrange descriptor.
// NOTE(review): this view appears to merge two diff versions of the struct —
// dt/shape/dst_strides/src_strides look like the pre-change layout, while
// dst/src/workspace/workspace_size correspond to the new cached-descriptor
// layout; confirm against the final file which set survives.
struct Descriptor::Opaque {
aclDataType dt;
std::vector<int64_t> shape;
std::vector<int64_t> dst_strides;
std::vector<int64_t> src_strides;
// Owning destination/source tensor descriptors (deleted in the dtor).
aclnnTensorDescriptor_t dst;
aclnnTensorDescriptor_t src;
void *workspace; // aclnnInplaceCopy workspace
uint64_t workspace_size; // byte size passed to aclrtMalloc for `workspace`
// Release the cached descriptors and the device workspace buffer.
~Opaque() {
delete dst;
delete src;
aclrtFree(workspace);
}
};
Descriptor::~Descriptor() {
......@@ -37,24 +43,31 @@ infiniStatus_t Descriptor::create(
auto result = utils::RearrangeMeta::create(shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
CHECK_RESULT(result);
std::vector<int64_t> shape_(ndim);
std::vector<int64_t> dst_strides_(ndim);
std::vector<int64_t> src_strides_(ndim);
for (size_t i = 0; i < ndim; i++) {
shape_[i] = static_cast<int64_t>(shape[i]);
dst_strides_[i] = static_cast<int64_t>(dst_strides[i]);
src_strides_[i] = static_cast<int64_t>(src_strides[i]);
aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
void *workspace = nullptr;
aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor,
&workspace_size, &executor);
if (workspace_size != 0) {
CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
}
*desc_ptr = new Descriptor(
result.take(),
new Opaque{
toAclDataType(dtype),
shape_,
dst_strides_,
src_strides_},
dst,
src,
workspace,
workspace_size},
handle->device,
handle->device_id);
// Delete useless executor
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
......@@ -62,20 +75,19 @@ infiniStatus_t Descriptor::calculate(
void *y,
const void *x,
void *stream) const {
auto tdst = _opaque->dst->tensor;
auto tsrc = _opaque->src->tensor;
auto y_ = aclnnTensorDescriptor(_opaque->dt, _opaque->shape, _opaque->dst_strides, y);
auto x_ = aclnnTensorDescriptor(_opaque->dt, _opaque->shape, _opaque->src_strides, (void *)x);
auto ty = y_.tensor;
auto tx = x_.tensor;
size_t workspace_size = 0;
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
void *workspace = nullptr;
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(ty, tx, &workspace_size, &executor));
if (workspace_size != 0) {
CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
}
CHECK_ACL(aclnnInplaceCopy(workspace, workspace_size, executor, stream));
AclSetTensorAddr(executor, 0, tdst, y);
AclSetTensorAddr(executor, 1, tsrc, (void *)x);
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));
// Execute InplaceCopy
CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size,
executor, stream));
return INFINI_STATUS_SUCCESS;
}
......
......@@ -37,7 +37,7 @@ _TENSOR_DTYPES = [torch.float16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2},
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
}
......@@ -143,6 +143,9 @@ def test(
)
lib_causal_softmax()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
......
......@@ -476,10 +476,11 @@ def get_test_devices(args):
def get_sync_func(device):
    """Return a device-synchronization callable for the given device enum.

    Looks the enum up in ``infiniDeviceEnum_str_map`` (module-level map from
    device enum to torch backend name -- assumed defined elsewhere in this
    file) and returns ``None`` for CPU, since CPU needs no synchronization;
    otherwise returns that torch backend's ``synchronize`` function.
    """
    import torch

    # Resolve the backend name once so the comparison and the getattr use
    # the same string (the pre-fix code compared the raw enum to "cpu",
    # which never matched).
    device_str = infiniDeviceEnum_str_map[device]
    if device_str == "cpu":
        sync = None
    else:
        sync = getattr(torch, device_str).synchronize
    return sync
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment