Unverified commit bd37042c authored by PanZezhong1725, committed by GitHub

Merge pull request #175 from InfiniTensor/issue/174-rearrange-ascend

issue/174: Rearrange ASCEND
parents 125afeb5 b4302732
#include "common_ascend.h"
std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
    if (shape.size() != strides.size()) {
        throw std::invalid_argument("Shape and strides must have the same length.");
    }
    // The storage shape is a 1D buffer that must cover every accessed element,
    // i.e. one past the largest linear offset reachable with this shape/strides.
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    return {max_offset + 1};
}
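For intuition, a standalone check of the new formula (the function body is copied from above so the example compiles on its own; the shape/stride values are made up for illustration):

#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Copied from common_ascend above so the example is self-contained.
std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
    if (shape.size() != strides.size()) {
        throw std::invalid_argument("Shape and strides must have the same length.");
    }
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    return {max_offset + 1};
}

int main() {
    // A 2x3 view with a padded row stride of 4 touches offsets {0,1,2, 4,5,6},
    // so the backing buffer needs max_offset + 1 = 7 elements, not 2*4 = 8
    // as the previous max-stride heuristic would have reported.
    auto s = inferStorageShape({2, 3}, {4, 1});
    std::printf("storage elements: %lld\n", static_cast<long long>(s[0])); // prints 7
    return 0;
}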
size_t aclnnTensorDescriptor::numel() const {
@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
this->strides = std::vector<int64_t>(ndim);
for (uint64_t i = 0; i < ndim; ++i) {
this->shape[i] = static_cast<int64_t>(desc->dim(i));
this->strides[i] = static_cast<int64_t>(desc->stride(i));
}
this->storageShape = inferStorageShape(this->shape, this->strides);
this->dataType = toAclDataType(desc->dtype());
...
@@ -12,6 +12,8 @@ struct Descriptor::Opaque {
aclnnTensorDescriptor_t value;
void *mask_addr;
void *value_addr;
uint64_t workspacesize;
aclOpExecutor *executor;
~Opaque() {
delete x;
@@ -21,6 +23,9 @@ struct Descriptor::Opaque {
aclrtFree(mask_addr);
aclrtFree(value_addr);
// Destroy the cached (repeatable) softmax executor with the descriptor
aclDestroyAclOpExecutor(executor);
}
};
@@ -92,18 +97,18 @@ infiniStatus_t Descriptor::create(
aclTensor *tvalue = value->tensor;
CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
int64_t dim = 2;
CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
// Mark the softmax executor repeatable so calculate() can reuse it
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
// Create the descriptor. The mask-fill and softmax steps run sequentially
// on the same buffer, so the workspace only needs the larger of the two sizes.
size_t all_workspacesize = std::max(workspacesize_softmax, workspacesize_mask);
// The mask executor is not reused, so it can be destroyed here; the softmax
// executor is kept in the Opaque and destroyed with the descriptor.
aclDestroyAclOpExecutor(mask_executor);
*desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr,
                                      workspacesize_softmax, executor},
                           std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
return INFINI_STATUS_SUCCESS;
}
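The pattern introduced above — plan once, mark the executor repeatable, rebind device addresses per call, destroy it with the descriptor — is worth spelling out. A minimal sketch, assuming CHECK_ACL and AclSetTensorAddr are the project's helpers from common_ascend.h and using the slot indices seen in this diff; it compiles against the CANN headers but is not a drop-in replacement for the code above:

#include <acl/acl.h>
#include <aclnnop/aclnn_softmax.h>
#include "common_ascend.h" // assumed home of CHECK_ACL / AclSetTensorAddr

struct SoftmaxPlan {
    aclOpExecutor *executor = nullptr;
    uint64_t workspace_size = 0;
};

// create(): plan once and mark the executor repeatable.
void plan_softmax(aclTensor *tx, aclTensor *ty, int64_t dim, SoftmaxPlan &plan) {
    CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &plan.workspace_size, &plan.executor));
    CHECK_ACL(aclSetAclOpExecutorRepeatable(plan.executor));
}

// calculate(): rebind device addresses and launch, any number of times.
void run_softmax(const SoftmaxPlan &plan, aclTensor *tx, aclTensor *ty,
                 void *x, void *y, void *workspace, void *stream) {
    AclSetTensorAddr(plan.executor, 0, tx, x); // slot 0: input
    AclSetTensorAddr(plan.executor, 1, ty, y); // slot 1: output
    CHECK_ACL(aclnnSoftmax(workspace, plan.workspace_size, plan.executor, stream));
}

// teardown: a repeatable executor must be destroyed explicitly.
void destroy_softmax(SoftmaxPlan &plan) {
    aclDestroyAclOpExecutor(plan.executor);
}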
@@ -116,23 +121,18 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
auto ty = _opaque->y->tensor;
auto tmask = _opaque->mask->tensor;
auto tvalue = _opaque->value->tensor;
aclOpExecutor *mask_executor = nullptr;
size_t workspacesize_mask = 0;
// The mask-fill executor is rebuilt on every call; only the softmax executor is cached.
CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
AclSetTensorAddr(mask_executor, 0, tx, (void *)x);
AclSetTensorAddr(mask_executor, 1, tmask, _opaque->mask_addr);
AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
// The in-place mask fill must finish before softmax reads x.
CHECK_ACL(aclrtSynchronizeStream(stream));
// Rebind the cached repeatable softmax executor to the current addresses.
AclSetTensorAddr(_opaque->executor, 0, tx, (void *)x);
AclSetTensorAddr(_opaque->executor, 1, ty, y);
CHECK_ACL(aclnnSoftmax(workspace, _opaque->workspacesize, _opaque->executor, stream));
return INFINI_STATUS_SUCCESS;
}
...
#include "rearrange_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
#include <aclnnop/aclnn_copy.h>
namespace op::rearrange::ascend {
struct Descriptor::Opaque {
aclnnTensorDescriptor_t dst;
aclnnTensorDescriptor_t src;
void *workspace; // aclnnInplaceCopy workspace
uint64_t workspace_size;
~Opaque() {
delete dst;
delete src;
aclrtFree(workspace);
}
};
Descriptor::~Descriptor() {
delete _opaque;
};
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc) {
auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
auto dtype = y_desc->dtype();
auto ndim = y_desc->ndim();
auto shape = y_desc->shape();
CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < ndim; ++i) {
CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
}
auto dst_strides = y_desc->strides();
auto src_strides = x_desc->strides();
auto element_size = infiniSizeOf(dtype);
auto result = utils::RearrangeMeta::create(shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
CHECK_RESULT(result);
aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
void *workspace = nullptr;
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor,
                                           &workspace_size, &executor));
if (workspace_size != 0) {
CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
}
*desc_ptr = new Descriptor(
result.take(),
new Opaque{
dst,
src,
workspace,
workspace_size},
handle->device,
handle->device_id);
// This executor is only used for the workspace query and is never
// executed or reused, so destroy it here
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *y,
const void *x,
void *stream) const {
auto tdst = _opaque->dst->tensor;
auto tsrc = _opaque->src->tensor;
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));
// Bind the real device addresses, then launch the strided copy
AclSetTensorAddr(executor, 0, tdst, y);
AclSetTensorAddr(executor, 1, tsrc, (void *)x);
// Execute InplaceCopy with the workspace preallocated in create()
CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size,
                           executor, stream));
return INFINI_STATUS_SUCCESS;
}
} // namespace op::rearrange::ascend
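With the dispatch entries below wired up, this backend becomes reachable through the public rearrange API. A hypothetical call sequence for reference — only the three infiniop* entry points are taken from this diff; handle, the descriptors, the device buffers, the stream, and the CHECK_STATUS helper are assumed to exist, and the argument order of infiniopRearrange is inferred from Descriptor::calculate above:

// Hypothetical usage sketch: y_desc/x_desc describe the same shape with
// different strides; y_dev/x_dev are Ascend device buffers.
infiniopRearrangeDescriptor_t desc;
CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &desc, y_desc, x_desc));
// No external workspace is needed: the Ascend Opaque preallocates its
// aclnn workspace in create().
CHECK_STATUS(infiniopRearrange(desc, y_dev, x_dev, stream));
CHECK_STATUS(infiniopDestroyRearrangeDescriptor(desc));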
#ifndef __REARRANGE_ASCEND_H__
#define __REARRANGE_ASCEND_H__
#include "../rearrange.h"
DESCRIPTOR(ascend)
#endif // __REARRANGE_ASCEND_H__
@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/rearrange_cpu.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rearrange_ascend.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/rearrange_cuda.cuh"
@@ -29,6 +32,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
@@ -57,6 +63,9 @@ __C infiniStatus_t infiniopRearrange(
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
@@ -82,6 +91,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
...
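The CREATE/CALCULATE/DELETE dispatch macros used above are defined earlier in operator.cc and are not part of this diff. For context, a sketch of what CREATE presumably expands to, based on the Descriptor::create signature in this diff — an assumption, not the actual macro:

// Assumed shape of the dispatch macro (illustrative only):
#define CREATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                    \
        return op::rearrange::NAMESPACE::Descriptor::create(                      \
            handle,                                                               \
            reinterpret_cast<op::rearrange::NAMESPACE::Descriptor **>(desc_ptr),  \
            y_desc, x_desc)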
@@ -10,12 +10,15 @@ struct Descriptor::Opaque {
aclnnTensorDescriptor_t w;
aclnnTensorDescriptor_t rstd;
size_t workspaceSize;
aclOpExecutor *executor;
~Opaque() {
delete y;
delete x;
delete w;
delete rstd;
aclDestroyAclOpExecutor(executor);
}
};
@@ -62,17 +65,16 @@ infiniStatus_t Descriptor::create(
// Get WorkspaceSize and set executor
CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd, &workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
*desc_ptr = new Descriptor(
new Opaque{y, x, w, rstd, workspace_size, executor},
std::move(info),
all_workspace_size,
handle_ascend->device, handle_ascend->device_id);
return INFINI_STATUS_SUCCESS;
}
@@ -88,21 +90,16 @@ infiniStatus_t Descriptor::calculate(
auto tx = _opaque->x->tensor;
auto ty = _opaque->y->tensor;
auto trstd = _opaque->rstd->tensor;
// rstd lives in the workspace tail, right after the op workspace
void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);
auto unit = infiniSizeOf(_info.atype);
AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
for (size_t i = 0; i < (_info.shape)[0]; ++i) {
    AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
    AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
    CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
}
return INFINI_STATUS_SUCCESS;
}
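The workspace layout here is easy to get wrong: create() requests workspace_size plus room for rstd, and calculate() carves rstd out of the tail. A small self-contained sketch of that arithmetic, with made-up sizes for illustration:

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    // Illustration only: a 4 KiB op workspace plus 4 fp32 rstd values,
    // mirroring all_workspace_size = workspace_size + rstd->numel() * dtype size.
    const size_t workspace_size = 4096;
    const size_t rstd_bytes = 4 * sizeof(float);
    const size_t all_workspace_size = workspace_size + rstd_bytes;
    uint8_t *workspace = new uint8_t[all_workspace_size];
    // Mirrors calculate(): rstd sits immediately after the op workspace.
    void *rstdPtr = (void *)(workspace + workspace_size);
    std::printf("rstd offset: %zu bytes\n",
                (size_t)((uint8_t *)rstdPtr - workspace)); // prints 4096
    delete[] workspace;
    return 0;
}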
...
@@ -94,8 +94,8 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
return bangGetSwiGLUWorkspaceSize((SwiGLUBangDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_ASCEND_API
// GET(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
...
@@ -37,7 +37,7 @@ _TENSOR_DTYPES = [torch.float16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
    torch.float16: {"atol": 1e-3, "rtol": 1e-2},
}
@@ -144,6 +144,9 @@ def test(
lib_causal_softmax()
if sync is not None:
    sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
    debug(y, ans, atol=atol, rtol=rtol)
...
@@ -476,10 +476,11 @@ def get_test_devices(args):
def get_sync_func(device):
    import torch
    device_str = infiniDeviceEnum_str_map[device]
    if device_str == "cpu":
        sync = None
    else:
        sync = getattr(torch, device_str).synchronize
    return sync