Commit 9b32b4b1 authored by Catheriany's avatar Catheriany
Browse files

Merge remote-tracking branch 'origin/main' into issue/150

parents 15bcbdfc 4799ddbf
......@@ -175,6 +175,10 @@ options:
{
"clangd.arguments": [
"--compile-commands-dir=.vscode"
]
],
"xmake.additionalConfigArguments": [
// 在这里配置 XMAKE_CONFIG_FLAGS
"--nv-gpu=y"
],
}
```
......@@ -6,6 +6,7 @@
#include "infiniop/ops/attention.h"
#include "infiniop/ops/avg_pool.h"
#include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/expand.h"
#include "infiniop/ops/gemm.h"
......
#ifndef __INFINIOP_CLIP_API_H__
#define __INFINIOP_CLIP_API_H__
#include "../operator_descriptor.h"
// C API of the Clip operator: create a descriptor, query its workspace size,
// execute the operator, and destroy the descriptor.
// NOTE(review): presumably y = clamp(x, min_val, max_val) element-wise; the exact
// semantics live in the backend implementations — confirm there.

// Opaque handle to a Clip operator descriptor.
typedef struct InfiniopDescriptor *infiniopClipDescriptor_t;
// Creates a Clip descriptor.
//   handle   - operator handle the descriptor is created with.
//   desc_ptr - receives the created descriptor.
//   y        - tensor descriptor of the result.
//   x        - tensor descriptor of the input.
//   min_val  - tensor descriptor of the lower bound (a tensor, not a scalar).
//   max_val  - tensor descriptor of the upper bound (a tensor, not a scalar).
// Returns an infiniStatus_t status code.
__C __export infiniStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle,
infiniopClipDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
infiniopTensorDescriptor_t min_val,
infiniopTensorDescriptor_t max_val);
// Writes the workspace size required by `desc` to `*size`.
// NOTE(review): unit is presumably bytes — confirm against the implementation.
__C __export infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size);
// Executes the Clip operator described by `desc`.
//   workspace / workspace_size - scratch buffer and its size.
//   y                          - output data pointer.
//   x, min_val, max_val        - read-only input data pointers.
//   stream                     - backend stream handle; the test suite passes nullptr.
__C __export infiniStatus_t infiniopClip(infiniopClipDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *min_val,
const void *max_val,
void *stream);
// Destroys a descriptor created by infiniopCreateClipDescriptor.
__C __export infiniStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);
#endif
......@@ -6,10 +6,10 @@
typedef struct InfiniopDescriptor *infiniopMulDescriptor_t;
__C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle,
infiniopMulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c,
infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b);
infiniopMulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c,
infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b);
__C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size);
......@@ -20,7 +20,7 @@ __C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc,
const void *a,
const void *b,
void *stream);
__C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc);
#endif
......@@ -18,6 +18,7 @@ def run_tests(args):
"rms_norm.py",
"rope.py",
"swiglu.py",
"attention.py",
]:
result = subprocess.run(
f"python {test} {args}", text=True, encoding="utf-8", shell=True
......
......@@ -9,6 +9,9 @@ DECLARE_INFINIOP_TEST(gemm)
DECLARE_INFINIOP_TEST(random_sample)
DECLARE_INFINIOP_TEST(mul)
DECLARE_INFINIOP_TEST(rope)
DECLARE_INFINIOP_TEST(clip)
DECLARE_INFINIOP_TEST(swiglu)
DECLARE_INFINIOP_TEST(add)
#define REGISTER_INFINIOP_TEST(name) \
{ \
......@@ -17,6 +20,7 @@ DECLARE_INFINIOP_TEST(rope)
infiniop_test::name::Test::build, \
infiniop_test::name::Test::attribute_names(), \
infiniop_test::name::Test::tensor_names(), \
infiniop_test::name::Test::output_names(), \
}},
/*
......@@ -26,7 +30,10 @@ DECLARE_INFINIOP_TEST(rope)
{ \
REGISTER_INFINIOP_TEST(gemm) \
REGISTER_INFINIOP_TEST(random_sample) \
REGISTER_INFINIOP_TEST(add) \
REGISTER_INFINIOP_TEST(mul) \
REGISTER_INFINIOP_TEST(clip) \
REGISTER_INFINIOP_TEST(swiglu) \
REGISTER_INFINIOP_TEST(rope) \
}
......
......@@ -58,7 +58,9 @@ private:
public:
Tensor(const GGUFTensorInfo *info,
const void *ggml_ptr,
const GGUFKeyValue *strides_meta = nullptr);
const GGUFKeyValue *shape_meta = nullptr,
const GGUFKeyValue *strides_meta = nullptr,
bool isOutput = false);
Tensor(std::shared_ptr<Memory> memory, size_t offset,
const std::vector<size_t> &shape,
const std::vector<ptrdiff_t> &strides,
......
......@@ -92,6 +92,7 @@ public:
\
static std::vector<std::string> attribute_names(); \
static std::vector<std::string> tensor_names(); \
static std::vector<std::string> output_names(); \
\
std::shared_ptr<infiniop_test::Result> run( \
infiniopHandle_t handle, infiniDevice_t device, int device_id, \
......@@ -121,6 +122,7 @@ struct TestBuilder {
BuilderFunc build;
std::vector<std::string> attribute_names;
std::vector<std::string> tensor_names;
std::vector<std::string> output_names;
};
} // namespace infiniop_test
......
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::add {
// Tensors that make up one add test case.
struct Test::Attributes {
// First operand passed to infiniopAdd.
std::shared_ptr<Tensor> a;
// Second operand passed to infiniopAdd.
std::shared_ptr<Tensor> b;
// Buffer the operator writes its result into.
std::shared_ptr<Tensor> c;
// Reference result that `c` is compared against with allClose.
std::shared_ptr<Tensor> ans;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("a") == tensors.end()
|| tensors.find("b") == tensors.end()
|| tensors.find("c") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->a = tensors["a"];
test->_attributes->b = tensors["b"];
test->_attributes->c = tensors["c"];
test->_attributes->ans = tensors["ans"];
return test;
}
// Runs the add operator once, checks the result against `ans`, then benchmarks it.
//
// The operands and the output buffer are transferred to (`device`, `device_id`),
// the operator is executed on them, and `c` is compared with the reference tensor
// using the configured tolerances. `warm_ups` and `iterations` are forwarded to
// `benchmark`.
//
// Returns TEST_PASSED carrying the value returned by `benchmark`, or TEST_FAILED
// naming the stage that failed. The operator descriptor and the workspace are
// released on every return path.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopAddDescriptor_t op_desc;
    auto a = _attributes->a->to(device, device_id);
    auto b = _attributes->b->to(device, device_id);
    auto c = _attributes->c->to(device, device_id);

    // Scope guard: declared after the tensors so it is destroyed before them,
    // i.e. the descriptor and workspace go away while the tensors still exist.
    struct Cleanup {
        infiniopAddDescriptor_t desc = nullptr;
        void *workspace = nullptr;
        ~Cleanup() {
            if (desc != nullptr) {
                infiniopDestroyAddDescriptor(desc);
            }
            if (workspace != nullptr) {
                infinirtFree(workspace);
            }
        }
    } cleanup;

    CHECK_OR(infiniopCreateAddDescriptor(handle, &op_desc,
                                         c->desc(),
                                         a->desc(),
                                         b->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    cleanup.desc = op_desc;

    size_t workspace_size;
    CHECK_OR(infiniopGetAddWorkspaceSize(op_desc, &workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));

    void *workspace = nullptr;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
    cleanup.workspace = workspace;

    CHECK_OR(infiniopAdd(op_desc, workspace, workspace_size,
                         c->data(),
                         a->data(),
                         b->data(),
                         nullptr),
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));

    // allClose reports a mismatch by throwing; turn that into a failed result.
    try {
        allClose(c, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }

    double elapsed_time = 0.;
    elapsed_time = benchmark(
        [=]() {
            infiniopAdd(
                op_desc, workspace, workspace_size,
                c->data(),
                a->data(),
                b->data(),
                nullptr);
        },
        warm_ups, iterations);

    return TEST_PASSED(elapsed_time);
}
// The add test declares no attributes.
std::vector<std::string> Test::attribute_names() {
    return std::vector<std::string>();
}

// Keys of the tensors Test::build requires.
std::vector<std::string> Test::tensor_names() {
    return std::vector<std::string>{"a", "b", "c", "ans"};
}

// Keys of the tensors reported as outputs of this test.
std::vector<std::string> Test::output_names() {
    return std::vector<std::string>{"c"};
}
// Describes this test case as text: the operator name, one line per tensor
// (a, b, c) with its info(), and the tolerances in scientific notation.
std::string Test::toString() const {
    std::ostringstream text;
    text << op_name() << '\n';
    text << "- a: " << _attributes->a->info() << '\n';
    text << "- b: " << _attributes->b->info() << '\n';
    text << "- c: " << _attributes->c->info() << '\n';
    // Tolerances are printed with two digits after the decimal point.
    text << std::scientific << std::setprecision(2);
    text << "- rtol=" << _rtol << ", atol=" << _atol << '\n';
    return text.str();
}
// Frees the Attributes object allocated in Test::build.
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::add
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::clip {
// Tensors that make up one clip test case.
struct Test::Attributes {
// Input passed to infiniopClip.
std::shared_ptr<Tensor> x;
// Lower-bound tensor passed to infiniopClip.
std::shared_ptr<Tensor> min_val;
// Upper-bound tensor passed to infiniopClip.
std::shared_ptr<Tensor> max_val;
// Buffer the operator writes its result into.
std::shared_ptr<Tensor> y;
// Reference result that `y` is compared against with allClose.
std::shared_ptr<Tensor> ans;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("x") == tensors.end()
|| tensors.find("min_val") == tensors.end()
|| tensors.find("max_val") == tensors.end()
|| tensors.find("y") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->x = tensors["x"];
test->_attributes->min_val = tensors["min_val"];
test->_attributes->max_val = tensors["max_val"];
test->_attributes->y = tensors["y"];
test->_attributes->ans = tensors["ans"];
return test;
}
// Runs the clip operator once, checks the result against `ans`, then benchmarks it.
//
// The input, both bound tensors and the output buffer are transferred to
// (`device`, `device_id`), the operator is executed on them, and `y` is compared
// with the reference tensor using the configured tolerances. `warm_ups` and
// `iterations` are forwarded to `benchmark`.
//
// Returns TEST_PASSED carrying the value returned by `benchmark`, or TEST_FAILED
// naming the stage that failed. The operator descriptor and the workspace are
// released on every return path, not only on success.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopClipDescriptor_t op_desc;
    auto x = _attributes->x->to(device, device_id);
    auto min_val = _attributes->min_val->to(device, device_id);
    auto max_val = _attributes->max_val->to(device, device_id);
    auto y = _attributes->y->to(device, device_id);

    // Scope guard: declared after the tensors so it is destroyed before them,
    // i.e. the descriptor and workspace go away while the tensors still exist.
    struct Cleanup {
        infiniopClipDescriptor_t desc = nullptr;
        void *workspace = nullptr;
        ~Cleanup() {
            if (desc != nullptr) {
                infiniopDestroyClipDescriptor(desc);
            }
            if (workspace != nullptr) {
                infinirtFree(workspace);
            }
        }
    } cleanup;

    CHECK_OR(infiniopCreateClipDescriptor(handle, &op_desc,
                                          y->desc(),
                                          x->desc(),
                                          min_val->desc(),
                                          max_val->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create clip descriptor."));
    cleanup.desc = op_desc;

    size_t workspace_size;
    CHECK_OR(infiniopGetClipWorkspaceSize(op_desc, &workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));

    void *workspace = nullptr;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
    cleanup.workspace = workspace;

    CHECK_OR(infiniopClip(op_desc, workspace, workspace_size,
                          y->data(),
                          x->data(),
                          min_val->data(),
                          max_val->data(),
                          nullptr),
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));

    // allClose reports a mismatch by throwing; turn that into a failed result.
    try {
        allClose(y, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }

    double elapsed_time = 0.;
    elapsed_time = benchmark(
        [=]() {
            infiniopClip(
                op_desc, workspace, workspace_size,
                y->data(),
                x->data(),
                min_val->data(),
                max_val->data(),
                nullptr);
        },
        warm_ups, iterations);

    return TEST_PASSED(elapsed_time);
}
// The clip test declares no attributes.
std::vector<std::string> Test::attribute_names() {
    return std::vector<std::string>();
}

// Keys of the tensors Test::build requires.
std::vector<std::string> Test::tensor_names() {
    return std::vector<std::string>{"x", "min_val", "max_val", "y", "ans"};
}

// Keys of the tensors reported as outputs of this test.
std::vector<std::string> Test::output_names() {
    return std::vector<std::string>{"y"};
}
// Describes this test case as text: the operator name, one line per tensor
// (x, min_val, max_val, y) with its info(), and the tolerances in scientific
// notation.
std::string Test::toString() const {
    std::ostringstream text;
    text << op_name() << '\n';
    text << "- x: " << _attributes->x->info() << '\n';
    text << "- min_val: " << _attributes->min_val->info() << '\n';
    text << "- max_val: " << _attributes->max_val->info() << '\n';
    text << "- y: " << _attributes->y->info() << '\n';
    // Tolerances are printed with two digits after the decimal point.
    text << std::scientific << std::setprecision(2);
    text << "- rtol=" << _rtol << ", atol=" << _atol << '\n';
    return text.str();
}
// Frees the Attributes object allocated in Test::build.
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::clip
......@@ -113,6 +113,10 @@ std::vector<std::string> Test::tensor_names() {
return {"a", "b", "c", "ans"};
}
// This test reports no tensor as an output.
std::vector<std::string> Test::output_names() {
    return std::vector<std::string>();
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
......
......@@ -87,6 +87,10 @@ std::vector<std::string> Test::tensor_names() {
return {"a", "b", "c", "ans"};
}
// Keys of the tensors reported as outputs of this test.
std::vector<std::string> Test::output_names() {
    return std::vector<std::string>{"c"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
......
......@@ -109,6 +109,10 @@ std::vector<std::string> Test::tensor_names() {
return {"data", "ans", "result"};
}
// Keys of the tensors reported as outputs of this test.
std::vector<std::string> Test::output_names() {
    return std::vector<std::string>{"result"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
......
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::swiglu {
// Tensors that make up one swiglu test case.
struct Test::Attributes {
// First operand passed to infiniopSwiGLU.
std::shared_ptr<Tensor> a;
// Second operand passed to infiniopSwiGLU.
std::shared_ptr<Tensor> b;
// Reference result that `c` is compared against with allClose.
std::shared_ptr<Tensor> ans;
// Buffer the operator writes its result into.
std::shared_ptr<Tensor> c;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("a") == tensors.end()
|| tensors.find("b") == tensors.end()
|| tensors.find("c") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->a = tensors["a"];
test->_attributes->b = tensors["b"];
test->_attributes->c = tensors["c"];
test->_attributes->ans = tensors["ans"];
return test;
}
// Runs the SwiGLU operator once, checks the result against `ans`, then benchmarks it.
//
// The operands and the output buffer are transferred to (`device`, `device_id`),
// the operator is executed on them, and `c` is compared with the reference tensor
// using the configured tolerances. `warm_ups` and `iterations` are forwarded to
// `benchmark`.
//
// Returns TEST_PASSED carrying the value returned by `benchmark`, or TEST_FAILED
// naming the stage that failed. The operator descriptor and the workspace are
// released on every return path.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopSwiGLUDescriptor_t op_desc;
    auto a = _attributes->a->to(device, device_id);
    auto b = _attributes->b->to(device, device_id);
    auto c = _attributes->c->to(device, device_id);

    // Scope guard: declared after the tensors so it is destroyed before them,
    // i.e. the descriptor and workspace go away while the tensors still exist.
    struct Cleanup {
        infiniopSwiGLUDescriptor_t desc = nullptr;
        void *workspace = nullptr;
        ~Cleanup() {
            if (desc != nullptr) {
                infiniopDestroySwiGLUDescriptor(desc);
            }
            if (workspace != nullptr) {
                infinirtFree(workspace);
            }
        }
    } cleanup;

    CHECK_OR(infiniopCreateSwiGLUDescriptor(handle, &op_desc,
                                            c->desc(),
                                            a->desc(),
                                            b->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    cleanup.desc = op_desc;

    size_t workspace_size;
    CHECK_OR(infiniopGetSwiGLUWorkspaceSize(op_desc, &workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));

    void *workspace = nullptr;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
    cleanup.workspace = workspace;

    // A failure here happens while running the operator, so it is reported as an
    // execution failure rather than a creation failure.
    CHECK_OR(infiniopSwiGLU(op_desc, workspace, workspace_size, c->data(), a->data(), b->data(), nullptr),
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));

    // allClose reports a mismatch by throwing; turn that into a failed result.
    try {
        allClose(c, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }

    double elapsed_time = 0.;
    elapsed_time = benchmark(
        [=]() {
            infiniopSwiGLU(
                op_desc,
                workspace,
                workspace_size,
                c->data(),
                a->data(),
                b->data(),
                nullptr);
        },
        warm_ups, iterations);

    return TEST_PASSED(elapsed_time);
}
// The swiglu test declares no attributes.
std::vector<std::string> Test::attribute_names() {
    return std::vector<std::string>();
}

// Keys of the tensors Test::build requires.
std::vector<std::string> Test::tensor_names() {
    return std::vector<std::string>{"a", "b", "c", "ans"};
}

// Keys of the tensors reported as outputs of this test.
std::vector<std::string> Test::output_names() {
    return std::vector<std::string>{"c"};
}
// Describes this test case as text: the operator name, one line per tensor
// (a, b, c) with its info(), and the tolerances in scientific notation.
std::string Test::toString() const {
    std::ostringstream text;
    text << op_name() << '\n';
    text << "- a: " << _attributes->a->info() << '\n';
    text << "- b: " << _attributes->b->info() << '\n';
    text << "- c: " << _attributes->c->info() << '\n';
    // Tolerances are printed with two digits after the decimal point.
    text << std::scientific << std::setprecision(2);
    text << "- rtol=" << _rtol << ", atol=" << _atol << '\n';
    return text.str();
}
// Frees the Attributes object allocated in Test::build.
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::swiglu
......@@ -98,20 +98,28 @@ void *Tensor::data() const {
Tensor::Tensor(const GGUFTensorInfo *info,
const void *ggml_ptr,
const GGUFKeyValue *strides_meta) {
const GGUFKeyValue *shape_meta,
const GGUFKeyValue *strides_meta,
bool isOutput) {
_ggml_type = info->ggml_type;
_offset = 0;
size_t ndim = static_cast<size_t>(info->ndim);
// `_shape`存储真实的tensor形状(来自shape_meta),`temp_shape`存储用于rearrange和计算内存的tensor形状
_shape = std::vector<size_t>(ndim);
std::vector<size_t> temp_shape(ndim);
_strides = std::vector<ptrdiff_t>(ndim);
std::vector<ptrdiff_t> contiguous_strides(ndim);
for (size_t i = 0; i < ndim; i++) {
_shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
temp_shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
if (i == 0) {
contiguous_strides[ndim - 1] = (ptrdiff_t)1;
} else {
contiguous_strides[ndim - 1 - i] = (ptrdiff_t)info->shape[i - 1] * contiguous_strides[ndim - i];
}
if (isOutput) {
contiguous_strides[i] = (ptrdiff_t)0;
}
}
if (strides_meta == nullptr) {
......@@ -120,7 +128,6 @@ Tensor::Tensor(const GGUFTensorInfo *info,
}
} else {
for (size_t i = 0; i < ndim; i++) {
_shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
if (strides_meta->gguf_type == GGUF_TYPE_INT64) {
_strides[i] = (ptrdiff_t)(reinterpret_cast<const int64_t *>(
strides_meta->value.data())[ndim - 1 - i]);
......@@ -133,18 +140,62 @@ Tensor::Tensor(const GGUFTensorInfo *info,
}
}
infiniopCreateTensorDescriptor(&_desc, ndim, _shape.data(), _strides.data(), ggmlTypeToInfiniType(_ggml_type));
if (isOutput) {
if (shape_meta == nullptr) {
throw std::runtime_error("Error Creating Tensor: shape_meta cannot be null for output tensor");
}
for (size_t i = 0; i < ndim; i++) {
if (shape_meta->gguf_type == GGUF_TYPE_INT64) {
int64_t val = reinterpret_cast<const int64_t *>(shape_meta->value.data())[i];
if (val < 0) {
throw std::runtime_error("Shape must be non-negative");
}
temp_shape[i] = static_cast<size_t>(val);
} else if (shape_meta->gguf_type == GGUF_TYPE_INT32) {
int32_t val = reinterpret_cast<const int32_t *>(shape_meta->value.data())[i];
if (val < 0) {
throw std::runtime_error("Shape must be non-negative");
}
temp_shape[i] = static_cast<size_t>(val);
} else {
throw std::runtime_error("Error Creating Tensor: Unsupported shape type");
}
}
}
infiniopCreateTensorDescriptor(&_desc, ndim, temp_shape.data(), _strides.data(), ggmlTypeToInfiniType(_ggml_type));
size_t size;
calculateTensorMemory(size, _offset, _shape, _strides, ggmlTypeSize(_ggml_type));
calculateTensorMemory(size, _offset, temp_shape, _strides, ggmlTypeSize(_ggml_type));
_memory = std::make_shared<Memory>(size, INFINI_DEVICE_CPU, 0);
utils::rearrange(
(char *)_memory->ptr() + _offset,
(char *)ggml_ptr + info->data_offset,
_shape.data(),
temp_shape.data(),
_strides.data(),
contiguous_strides.data(),
ndim,
ggmlTypeSize(_ggml_type));
if (shape_meta == nullptr) {
_shape = temp_shape;
} else {
for (size_t i = 0; i < ndim; i++) {
if (shape_meta->gguf_type == GGUF_TYPE_INT64) {
int64_t val = reinterpret_cast<const int64_t *>(shape_meta->value.data())[i];
if (val < 0) {
throw std::runtime_error("Shape must be non-negative");
}
_shape[i] = static_cast<size_t>(val);
} else if (shape_meta->gguf_type == GGUF_TYPE_INT32) {
int32_t val = reinterpret_cast<const int32_t *>(shape_meta->value.data())[i];
if (val < 0) {
throw std::runtime_error("Shape must be non-negative");
}
_shape[i] = static_cast<size_t>(val);
} else {
throw std::runtime_error("Error Creating Tensor: Unsupported shape type");
}
}
}
}
Tensor::Tensor(std::shared_ptr<Memory> memory, size_t offset,
......
......@@ -90,14 +90,19 @@ std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader,
attrs[attr_name] = attr->second->value;
}
}
for (auto tensor_name : builder.tensor_names) {
auto info = tensor_info.find("test." + std::to_string(test_id) + "." + tensor_name);
if (info != tensor_info.end()) {
auto shape = meta.find("test." + std::to_string(test_id) + "." + tensor_name + ".shape");
auto strides = meta.find("test." + std::to_string(test_id) + "." + tensor_name + ".strides");
bool is_output = std::find(builder.output_names.begin(), builder.output_names.end(), tensor_name) != builder.output_names.end();
tensors[tensor_name] = std::make_shared<Tensor>(
info->second.get(),
gguf_reader.getGgmlStart(),
strides != meta.end() ? strides->second.get() : nullptr);
shape != meta.end() ? shape->second.get() : nullptr,
strides != meta.end() ? strides->second.get() : nullptr,
is_output);
}
}
std::shared_ptr<infiniop_test::base::Test> test;
......
......@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.16.0)
# project information
project(Ascend_C)
set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME} CACHE PATH "ASCEND CANN package installation directory")
set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "ASCEND CANN package installation directory")
set(RUN_MODE "npu" CACHE STRING "run mode: npu")
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
......@@ -19,10 +19,13 @@ else()
endif()
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
include_directories(
${CMAKE_SOURCE_DIR}/../../../../include/infiniop/
)
ascendc_library(ascend_kernels STATIC
../../ops/swiglu/ascend/swiglu_kernel.cpp
../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
../../ops/random_sample/ascend/random_sample_kernel.cpp
../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
../../ops/rope/ascend/rope_ascend_kernel.cpp
# ../../ops/random_sample/ascend/random_sample_kernel.cpp
)
#ifndef __INFINIOP_ASCEND_KERNEL_COMMON_H__
#define __INFINIOP_ASCEND_KERNEL_COMMON_H__
#include "../../../../include/infinicore.h"
#include "kernel_operator.h"
constexpr size_t BLOCK_NUM = 8;
constexpr size_t BUFFER_NUM = 2;
constexpr size_t BYTE_ALIGN = 32;
// Pads a tile of `tile_len` elements of type T so that its size in bytes is a
// multiple of `byte_align`, and returns the padded length in elements.
// `byte_align` must be non-zero (it is used as a divisor).
template <typename T>
__aicore__ inline size_t alignTileLen(size_t tile_len, size_t byte_align) {
    const size_t raw_bytes = tile_len * sizeof(T);
    const size_t remainder = raw_bytes % byte_align;
    size_t padded_bytes = raw_bytes;
    if (remainder != 0) {
        // Grow to the next multiple of byte_align.
        padded_bytes += byte_align - remainder;
    }
    return padded_bytes / sizeof(T);
}
#endif
#include "common_ascend.h"
// Computes the 1-D storage shape needed to back a strided tensor view.
//
// The storage has to reach the element with the largest offset, so its length is
// sum((shape[i] - 1) * strides[i]) + 1 elements.
//
// Throws std::invalid_argument when `shape` and `strides` differ in length.
// NOTE(review): the formula assumes every dimension is >= 1 and every stride is
// non-negative — confirm that callers never pass empty dimensions or negative strides.
std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
    // Validate first: nothing may be indexed before the lengths are known to match.
    if (shape.size() != strides.size()) {
        throw std::invalid_argument("Shape and strides must have the same length.");
    }

    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }

    // storage shape is 1D buffer that must cover all accessed elements
    return {max_offset + 1};
}
size_t aclnnTensorDescriptor::numel() const {
......@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
this->strides = std::vector<int64_t>(ndim);
for (uint64_t i = 0; i < ndim; ++i) {
this->shape[i] = static_cast<int64_t>(desc->dim(i));
this->strides[i] = desc->stride(i);
this->strides[i] = static_cast<int64_t>(desc->stride(i));
}
this->storageShape = inferStorageShape(this->shape, this->strides);
this->dataType = toAclDataType(desc->dtype());
......
......@@ -16,7 +16,7 @@ typedef XPUStream kunlunStream_t;
typedef XPUEvent kunlunEvent_t;
typedef xdnn::Context *xdnnHandle_t;
#define CHECK_XDNN(API) CHECK_INTERNAL(API, XPU_SUCCESS)
#define CHECK_KUNLUN(API) CHECK_INTERNAL(API, XPU_SUCCESS)
namespace device::kunlun {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment