Commit c2e87202 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/142

parents 41818f84 c203635b
#include "infiniccl.h"
#include "./cuda/infiniccl_cuda.h"
__C infiniStatus_t infinicclCommInitAll(
infiniDevice_t device_type,
infinicclComm_t *comms,
int ndevice,
const int *device_ids) {
#define COMM_INIT_ALL(CASE_, NAMESPACE_) \
case CASE_: \
return infiniccl::NAMESPACE_::commInitAll(comms, ndevice, device_ids);
switch (device_type) {
COMM_INIT_ALL(INFINI_DEVICE_NVIDIA, cuda)
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef COMM_INIT_ALL
}
__C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
if (comm == nullptr) {
return INFINI_STATUS_SUCCESS;
}
#define COMM_DESTROY(CASE_, NAMESPACE_) \
case CASE_: \
return infiniccl::NAMESPACE_::commDestroy(comm);
switch (comm->device_type) {
COMM_DESTROY(INFINI_DEVICE_NVIDIA, cuda)
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef COMM_DESTROY
}
__C infiniStatus_t infinicclAllReduce(
void *sendbuf,
void *recvbuf,
size_t count,
infiniDtype_t dataype,
infinicclReduceOp_t op,
infinicclComm_t comm,
infinirtStream_t stream) {
if (comm == nullptr) {
return INFINI_STATUS_NULL_POINTER;
}
#define ALL_REDUCE(CASE_, NAMESPACE_) \
case CASE_: \
return infiniccl::NAMESPACE_::allReduce(sendbuf, recvbuf, count, dataype, op, comm, stream);
switch (comm->device_type) {
ALL_REDUCE(INFINI_DEVICE_NVIDIA, cuda)
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef ALL_REDUCE
}
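For context, a caller drives this dispatch layer roughly as follows. This is a hypothetical sketch, not part of the commit: the dtype and reduce-op enum names (INFINI_DTYPE_F32, INFINICCL_SUM) are assumptions taken from the public headers, which are not shown here.

#include "infiniccl.h"

// Hypothetical usage sketch (not part of this commit): one communicator per
// NVIDIA device, then a float32 sum all-reduce across both.
infiniStatus_t demoAllReduce(void *sendbufs[2], void *recvbufs[2], size_t count,
                             infinirtStream_t streams[2]) {
    const int device_ids[2] = {0, 1};
    infinicclComm_t comms[2] = {nullptr, nullptr};

    infiniStatus_t status = infinicclCommInitAll(INFINI_DEVICE_NVIDIA, comms, 2, device_ids);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    // Every communicator in the clique must join the collective, each on its
    // own stream; INFINI_DTYPE_F32 and INFINICCL_SUM are assumed enum names.
    for (int i = 0; i < 2 && status == INFINI_STATUS_SUCCESS; ++i) {
        status = infinicclAllReduce(sendbufs[i], recvbufs[i], count,
                                    INFINI_DTYPE_F32, INFINICCL_SUM,
                                    comms[i], streams[i]);
    }

    infinicclCommDestroy(comms[0]);
    infinicclCommDestroy(comms[1]);
    return status;
}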
#ifndef INFINICCL_IMPL_H
#define INFINICCL_IMPL_H

#include "infiniccl.h"

struct InfinicclComm {
    infiniDevice_t device_type;
    int device_id; // the actual device ID, not the rank number
    void *comm;    // the actual communicator
};

#define INFINICCL_DEVICE_API(NAMESPACE, IMPL)              \
    namespace infiniccl::NAMESPACE {                       \
    infiniStatus_t commInitAll(                            \
        infinicclComm_t *comms,                            \
        int ndevice,                                       \
        const int *device_ids) IMPL;                       \
                                                           \
    infiniStatus_t commDestroy(infinicclComm_t comm) IMPL; \
                                                           \
    infiniStatus_t allReduce(                              \
        void *sendbuf,                                     \
        void *recvbuf,                                     \
        size_t count,                                      \
        infiniDtype_t datatype,                            \
        infinicclReduceOp_t op,                            \
        infinicclComm_t comm,                              \
        infinirtStream_t stream) IMPL;                     \
    };

#define INFINICCL_DEVICE_API_IMPL(NAMESPACE) \
    INFINICCL_DEVICE_API(NAMESPACE, )

#define INFINICCL_DEVICE_API_NOOP(NAMESPACE) \
    INFINICCL_DEVICE_API(NAMESPACE, { return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; })

#endif // INFINICCL_IMPL_H
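Each backend then instantiates these declarations once: the IMPL variant emits plain declarations (definitions live in the backend sources), while the NOOP variant emits inline stubs that report the device as unsupported. A minimal sketch of the intended wiring, assuming the header file name (infiniccl_impl.h, inferred from the include guard) and reusing the ENABLE_CUDA_API flag seen elsewhere in this commit:

// Hypothetical backend wiring (sketch, not part of this commit).
#include "infiniccl_impl.h"

#ifdef ENABLE_CUDA_API
INFINICCL_DEVICE_API_IMPL(cuda) // declarations only; definitions in the backend's .cc/.cu
#else
INFINICCL_DEVICE_API_NOOP(cuda) // every entry point returns DEVICE_TYPE_NOT_SUPPORTED
#endif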
@@ -8,6 +8,10 @@
 DECLARE_INFINIOP_TEST(gemm)
 DECLARE_INFINIOP_TEST(random_sample)
 DECLARE_INFINIOP_TEST(rms_norm)
+DECLARE_INFINIOP_TEST(mul)
+DECLARE_INFINIOP_TEST(clip)
+DECLARE_INFINIOP_TEST(swiglu)
+DECLARE_INFINIOP_TEST(add)
 
 #define REGISTER_INFINIOP_TEST(name) \
     {                                \
@@ -16,6 +20,7 @@ DECLARE_INFINIOP_TEST(rms_norm)
         infiniop_test::name::Test::build,             \
         infiniop_test::name::Test::attribute_names(), \
         infiniop_test::name::Test::tensor_names(),    \
+        infiniop_test::name::Test::output_names(),    \
     }},
 
 /*
@@ -25,6 +30,10 @@ DECLARE_INFINIOP_TEST(rms_norm)
     {                                         \
         REGISTER_INFINIOP_TEST(gemm)          \
         REGISTER_INFINIOP_TEST(random_sample) \
+        REGISTER_INFINIOP_TEST(add)           \
+        REGISTER_INFINIOP_TEST(mul)           \
+        REGISTER_INFINIOP_TEST(clip)          \
+        REGISTER_INFINIOP_TEST(swiglu)        \
         REGISTER_INFINIOP_TEST(rms_norm)      \
     }
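The registry pattern this macro feeds is worth seeing in isolation. The following is a self-contained analogue with invented stand-in types (the real BuilderFunc, Test, and Result types are richer and not fully shown in this diff); it only illustrates how the new output_names field travels with each registered op:

// Compilable analogue of the TestBuilder registry (sketch, invented types).
#include <functional>
#include <map>
#include <string>
#include <vector>

struct DemoTestBuilder {
    std::function<void()> build; // stands in for the real BuilderFunc
    std::vector<std::string> attribute_names;
    std::vector<std::string> tensor_names;
    std::vector<std::string> output_names; // the field this commit adds
};

int main() {
    std::map<std::string, DemoTestBuilder> registry{
        {"add", {[] {}, {}, {"a", "b", "c", "ans"}, {"c"}}},
    };
    // runTest() later consults output_names to decide which tensors get the
    // shape_meta/isOutput handling (see the runTest hunk further below).
    return registry.at("add").output_names.front() == "c" ? 0 : 1;
}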
...
@@ -58,7 +58,9 @@ private:
 public:
     Tensor(const GGUFTensorInfo *info,
            const void *ggml_ptr,
-           const GGUFKeyValue *strides_meta = nullptr);
+           const GGUFKeyValue *shape_meta = nullptr,
+           const GGUFKeyValue *strides_meta = nullptr,
+           bool isOutput = false);
     Tensor(std::shared_ptr<Memory> memory, size_t offset,
            const std::vector<size_t> &shape,
            const std::vector<ptrdiff_t> &strides,
...
@@ -92,6 +92,7 @@ public:
                                                         \
     static std::vector<std::string> attribute_names();  \
     static std::vector<std::string> tensor_names();     \
+    static std::vector<std::string> output_names();     \
                                                         \
     std::shared_ptr<infiniop_test::Result> run(         \
         infiniopHandle_t handle, infiniDevice_t device, int device_id, \
@@ -121,6 +122,7 @@ struct TestBuilder {
     BuilderFunc build;
     std::vector<std::string> attribute_names;
     std::vector<std::string> tensor_names;
+    std::vector<std::string> output_names;
 };
 
 } // namespace infiniop_test
...
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::add {
struct Test::Attributes {
std::shared_ptr<Tensor> a;
std::shared_ptr<Tensor> b;
std::shared_ptr<Tensor> c;
std::shared_ptr<Tensor> ans;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("a") == tensors.end()
|| tensors.find("b") == tensors.end()
|| tensors.find("c") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->a = tensors["a"];
test->_attributes->b = tensors["b"];
test->_attributes->c = tensors["c"];
test->_attributes->ans = tensors["ans"];
return test;
}
std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopAddDescriptor_t op_desc;
auto a = _attributes->a->to(device, device_id);
auto b = _attributes->b->to(device, device_id);
auto c = _attributes->c->to(device, device_id);
CHECK_OR(infiniopCreateAddDescriptor(handle, &op_desc,
c->desc(),
a->desc(),
b->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetAddWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
CHECK_OR(infiniopAdd(op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr),
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
try {
allClose(c, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}
double elapsed_time = 0.;
elapsed_time = benchmark(
[=]() {
infiniopAdd(
op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr);
},
warm_ups, iterations);
return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
return {};
}
std::vector<std::string> Test::tensor_names() {
return {"a", "b", "c", "ans"};
}
std::vector<std::string> Test::output_names() {
return {"c"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- a: " << _attributes->a->info() << std::endl;
oss << "- b: " << _attributes->b->info() << std::endl;
oss << "- c: " << _attributes->c->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::add
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::clip {
struct Test::Attributes {
std::shared_ptr<Tensor> x;
std::shared_ptr<Tensor> min_val;
std::shared_ptr<Tensor> max_val;
std::shared_ptr<Tensor> y;
std::shared_ptr<Tensor> ans;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("x") == tensors.end()
|| tensors.find("min_val") == tensors.end()
|| tensors.find("max_val") == tensors.end()
|| tensors.find("y") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->x = tensors["x"];
test->_attributes->min_val = tensors["min_val"];
test->_attributes->max_val = tensors["max_val"];
test->_attributes->y = tensors["y"];
test->_attributes->ans = tensors["ans"];
return test;
}
std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopClipDescriptor_t op_desc;
auto x = _attributes->x->to(device, device_id);
auto min_val = _attributes->min_val->to(device, device_id);
auto max_val = _attributes->max_val->to(device, device_id);
auto y = _attributes->y->to(device, device_id);
CHECK_OR(infiniopCreateClipDescriptor(handle, &op_desc,
y->desc(),
x->desc(),
min_val->desc(),
max_val->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create clip descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetClipWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
CHECK_OR(infiniopClip(op_desc, workspace, workspace_size,
y->data(),
x->data(),
min_val->data(),
max_val->data(),
nullptr),
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
try {
allClose(y, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}
double elapsed_time = 0.;
elapsed_time = benchmark(
[=]() {
infiniopClip(
op_desc, workspace, workspace_size,
y->data(),
x->data(),
min_val->data(),
max_val->data(),
nullptr);
},
warm_ups, iterations);
infiniopDestroyClipDescriptor(op_desc);
infinirtFree(workspace);
return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
return {};
}
std::vector<std::string> Test::tensor_names() {
return {"x", "min_val", "max_val", "y", "ans"};
}
std::vector<std::string> Test::output_names() {
return {"y"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- x: " << _attributes->x->info() << std::endl;
oss << "- min_val: " << _attributes->min_val->info() << std::endl;
oss << "- max_val: " << _attributes->max_val->info() << std::endl;
oss << "- y: " << _attributes->y->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::clip
@@ -113,6 +113,10 @@ std::vector<std::string> Test::tensor_names() {
     return {"a", "b", "c", "ans"};
 }
 
+std::vector<std::string> Test::output_names() {
+    return {};
+}
+
 std::string Test::toString() const {
     std::ostringstream oss;
     oss << op_name() << std::endl;
...
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::mul {
struct Test::Attributes {
std::shared_ptr<Tensor> a;
std::shared_ptr<Tensor> b;
std::shared_ptr<Tensor> c;
std::shared_ptr<Tensor> ans;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("a") == tensors.end()
|| tensors.find("b") == tensors.end()
|| tensors.find("c") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->a = tensors["a"];
test->_attributes->b = tensors["b"];
test->_attributes->c = tensors["c"];
test->_attributes->ans = tensors["ans"];
return test;
}
std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopMulDescriptor_t op_desc;
auto a = _attributes->a->to(device, device_id);
auto b = _attributes->b->to(device, device_id);
auto c = _attributes->c->to(device, device_id);
CHECK_OR(infiniopCreateMulDescriptor(handle, &op_desc,
c->desc(),
a->desc(),
b->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetMulWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
CHECK_OR(infiniopMul(op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr),
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
try {
allClose(c, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}
double elapsed_time = 0.;
elapsed_time = benchmark(
[=]() {
infiniopMul(
op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr);
},
warm_ups, iterations);
return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
return {};
}
std::vector<std::string> Test::tensor_names() {
return {"a", "b", "c", "ans"};
}
std::vector<std::string> Test::output_names() {
return {"c"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- a: " << _attributes->a->info() << std::endl;
oss << "- b: " << _attributes->b->info() << std::endl;
oss << "- c: " << _attributes->c->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::mul
@@ -109,6 +109,10 @@ std::vector<std::string> Test::tensor_names() {
     return {"data", "ans", "result"};
 }
 
+std::vector<std::string> Test::output_names() {
+    return {"result"};
+}
+
 std::string Test::toString() const {
     std::ostringstream oss;
     oss << op_name() << std::endl;
...
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::swiglu {
struct Test::Attributes {
std::shared_ptr<Tensor> a;
std::shared_ptr<Tensor> b;
std::shared_ptr<Tensor> ans;
std::shared_ptr<Tensor> c;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("a") == tensors.end()
|| tensors.find("b") == tensors.end()
|| tensors.find("c") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->a = tensors["a"];
test->_attributes->b = tensors["b"];
test->_attributes->c = tensors["c"];
test->_attributes->ans = tensors["ans"];
return test;
}
std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopSwiGLUDescriptor_t op_desc;
auto a = _attributes->a->to(device, device_id);
auto b = _attributes->b->to(device, device_id);
auto c = _attributes->c->to(device, device_id);
CHECK_OR(infiniopCreateSwiGLUDescriptor(handle, &op_desc,
c->desc(),
a->desc(),
b->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetSwiGLUWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
CHECK_OR(infiniopSwiGLU(op_desc, workspace, workspace_size, c->data(), a->data(), b->data(), nullptr),
return TEST_FAILED(OP_CREATION_FAILED, "Failed during execution."));
try {
allClose(c, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}
double elapsed_time = 0.;
elapsed_time = benchmark(
[=]() {
infiniopSwiGLU(
op_desc,
workspace,
workspace_size,
c->data(),
a->data(),
b->data(),
nullptr);
},
warm_ups, iterations);
return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
return {};
}
std::vector<std::string> Test::tensor_names() {
return {"a", "b", "c", "ans"};
}
std::vector<std::string> Test::output_names() {
return {"c"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- a: " << _attributes->a->info() << std::endl;
oss << "- b: " << _attributes->b->info() << std::endl;
oss << "- c: " << _attributes->c->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::swiglu
@@ -98,20 +98,28 @@ void *Tensor::data() const {
 Tensor::Tensor(const GGUFTensorInfo *info,
                const void *ggml_ptr,
-               const GGUFKeyValue *strides_meta) {
+               const GGUFKeyValue *shape_meta,
+               const GGUFKeyValue *strides_meta,
+               bool isOutput) {
     _ggml_type = info->ggml_type;
     _offset = 0;
     size_t ndim = static_cast<size_t>(info->ndim);
+    // _shape stores the true tensor shape (from shape_meta); temp_shape stores the shape used for rearrange and for sizing the memory buffer
     _shape = std::vector<size_t>(ndim);
+    std::vector<size_t> temp_shape(ndim);
     _strides = std::vector<ptrdiff_t>(ndim);
     std::vector<ptrdiff_t> contiguous_strides(ndim);
     for (size_t i = 0; i < ndim; i++) {
-        _shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
+        temp_shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
         if (i == 0) {
             contiguous_strides[ndim - 1] = (ptrdiff_t)1;
         } else {
             contiguous_strides[ndim - 1 - i] = (ptrdiff_t)info->shape[i - 1] * contiguous_strides[ndim - i];
         }
+        if (isOutput) {
+            contiguous_strides[i] = (ptrdiff_t)0;
+        }
     }
 
     if (strides_meta == nullptr) {
@@ -120,7 +128,6 @@ Tensor::Tensor(const GGUFTensorInfo *info,
         }
     } else {
         for (size_t i = 0; i < ndim; i++) {
-            _shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
             if (strides_meta->gguf_type == GGUF_TYPE_INT64) {
                 _strides[i] = (ptrdiff_t)(reinterpret_cast<const int64_t *>(
                     strides_meta->value.data())[ndim - 1 - i]);
@@ -133,18 +140,62 @@ Tensor::Tensor(const GGUFTensorInfo *info,
         }
     }
 
-    infiniopCreateTensorDescriptor(&_desc, ndim, _shape.data(), _strides.data(), ggmlTypeToInfiniType(_ggml_type));
+    if (isOutput) {
+        if (shape_meta == nullptr) {
+            throw std::runtime_error("Error Creating Tensor: shape_meta cannot be null for output tensor");
+        }
+        for (size_t i = 0; i < ndim; i++) {
+            if (shape_meta->gguf_type == GGUF_TYPE_INT64) {
+                int64_t val = reinterpret_cast<const int64_t *>(shape_meta->value.data())[i];
+                if (val < 0) {
+                    throw std::runtime_error("Shape must be non-negative");
+                }
+                temp_shape[i] = static_cast<size_t>(val);
+            } else if (shape_meta->gguf_type == GGUF_TYPE_INT32) {
+                int32_t val = reinterpret_cast<const int32_t *>(shape_meta->value.data())[i];
+                if (val < 0) {
+                    throw std::runtime_error("Shape must be non-negative");
+                }
+                temp_shape[i] = static_cast<size_t>(val);
+            } else {
+                throw std::runtime_error("Error Creating Tensor: Unsupported shape type");
+            }
+        }
+    }
+
+    infiniopCreateTensorDescriptor(&_desc, ndim, temp_shape.data(), _strides.data(), ggmlTypeToInfiniType(_ggml_type));
     size_t size;
-    calculateTensorMemory(size, _offset, _shape, _strides, ggmlTypeSize(_ggml_type));
+    calculateTensorMemory(size, _offset, temp_shape, _strides, ggmlTypeSize(_ggml_type));
     _memory = std::make_shared<Memory>(size, INFINI_DEVICE_CPU, 0);
     utils::rearrange(
         (char *)_memory->ptr() + _offset,
         (char *)ggml_ptr + info->data_offset,
-        _shape.data(),
+        temp_shape.data(),
         _strides.data(),
         contiguous_strides.data(),
         ndim,
         ggmlTypeSize(_ggml_type));
+
+    if (shape_meta == nullptr) {
+        _shape = temp_shape;
+    } else {
+        for (size_t i = 0; i < ndim; i++) {
+            if (shape_meta->gguf_type == GGUF_TYPE_INT64) {
+                int64_t val = reinterpret_cast<const int64_t *>(shape_meta->value.data())[i];
+                if (val < 0) {
+                    throw std::runtime_error("Shape must be non-negative");
+                }
+                _shape[i] = static_cast<size_t>(val);
+            } else if (shape_meta->gguf_type == GGUF_TYPE_INT32) {
+                int32_t val = reinterpret_cast<const int32_t *>(shape_meta->value.data())[i];
+                if (val < 0) {
+                    throw std::runtime_error("Shape must be non-negative");
+                }
+                _shape[i] = static_cast<size_t>(val);
+            } else {
+                throw std::runtime_error("Error Creating Tensor: Unsupported shape type");
+            }
+        }
+    }
 }
 
 Tensor::Tensor(std::shared_ptr<Memory> memory, size_t offset,
...
@@ -90,14 +90,19 @@ std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader,
             attrs[attr_name] = attr->second->value;
         }
     }
 
     for (auto tensor_name : builder.tensor_names) {
         auto info = tensor_info.find("test." + std::to_string(test_id) + "." + tensor_name);
         if (info != tensor_info.end()) {
+            auto shape = meta.find("test." + std::to_string(test_id) + "." + tensor_name + ".shape");
             auto strides = meta.find("test." + std::to_string(test_id) + "." + tensor_name + ".strides");
+            bool is_output = std::find(builder.output_names.begin(), builder.output_names.end(), tensor_name) != builder.output_names.end();
             tensors[tensor_name] = std::make_shared<Tensor>(
                 info->second.get(),
                 gguf_reader.getGgmlStart(),
-                strides != meta.end() ? strides->second.get() : nullptr);
+                shape != meta.end() ? shape->second.get() : nullptr,
+                strides != meta.end() ? strides->second.get() : nullptr,
+                is_output);
         }
     }
 
     std::shared_ptr<infiniop_test::base::Test> test;
...
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.16.0)
 
 # project information
 project(Ascend_C)
 set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
-set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME} CACHE PATH "ASCEND CANN package installation directory")
+set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "ASCEND CANN package installation directory")
 set(RUN_MODE "npu" CACHE STRING "run mode: npu")
 set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
 set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
@@ -19,10 +19,13 @@ else()
 endif()
 
 include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
 
+include_directories(
+    ${CMAKE_SOURCE_DIR}/../../../../include/infiniop/
+)
+
 ascendc_library(ascend_kernels STATIC
-    ../../ops/swiglu/ascend/swiglu_kernel.cpp
-    ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
-    ../../ops/random_sample/ascend/random_sample_kernel.cpp
+    ../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
+    ../../ops/rope/ascend/rope_ascend_kernel.cpp
+    # ../../ops/random_sample/ascend/random_sample_kernel.cpp
 )
#ifndef __INFINIOP_ASCEND_KERNEL_COMMON_H__
#define __INFINIOP_ASCEND_KERNEL_COMMON_H__

#include "../../../../include/infinicore.h"
#include "kernel_operator.h"

constexpr size_t BLOCK_NUM = 8;
constexpr size_t BUFFER_NUM = 2;
constexpr size_t BYTE_ALIGN = 32;

// Round a tile length up so that its size in bytes is a multiple of byte_align.
template <typename T>
__aicore__ inline size_t alignTileLen(size_t tile_len, size_t byte_align) {
    size_t bytes = tile_len * sizeof(T);
    size_t aligned_bytes = (bytes % byte_align == 0)
                             ? bytes
                             : (bytes + (byte_align - bytes % byte_align));
    return aligned_bytes / sizeof(T);
}

#endif
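A quick worked example of the rounding above: with T = float and the 32-byte alignment, a tile of 10 elements occupies 40 bytes, is padded to 64 bytes, and comes back as 16 elements. The host-side mirror below is illustrative only (the device version is identical apart from the __aicore__ qualifier):

// Host-side mirror of alignTileLen, for illustration (sketch, not project code).
#include <cstddef>
#include <cstdio>

template <typename T>
size_t alignTileLenHost(size_t tile_len, size_t byte_align) {
    size_t bytes = tile_len * sizeof(T);
    size_t aligned = (bytes % byte_align == 0) ? bytes
                                               : bytes + (byte_align - bytes % byte_align);
    return aligned / sizeof(T);
}

int main() {
    // 10 floats = 40 bytes -> padded to 64 bytes -> 16 floats
    std::printf("%zu\n", alignTileLenHost<float>(10, 32)); // prints 16
}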
#include "common_ascend.h" #include "common_ascend.h"
std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) { std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
auto index = std::max_element(strides.begin(), strides.end()); if (shape.size() != strides.size()) {
uint64_t max_stride_index = std::distance(strides.begin(), index); throw std::invalid_argument("Shape and strides must have the same length.");
auto storageShape = std::vector<int64_t>({shape[max_stride_index] * strides[max_stride_index]}); }
int64_t max_offset = 0;
for (size_t i = 0; i < shape.size(); ++i) {
max_offset += (shape[i] - 1) * strides[i];
}
return storageShape; // storage shape is 1D buffer that must cover all accessed elements
return {max_offset + 1};
} }
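The max-offset rule matters for transposed views. Take shape {3, 2} with strides {1, 2}: the old max-stride heuristic yields 2 * 2 = 4 elements, but element (2, 1) lives at offset 2*1 + 1*2 = 4, so the buffer must hold 5 elements. A standalone sketch of the computation (not project code):

// Why the max-offset rule is needed (illustrative sketch).
#include <cassert>
#include <cstdint>
#include <vector>

int64_t storageElems(const std::vector<int64_t> &shape, const std::vector<int64_t> &strides) {
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    return max_offset + 1; // highest touched offset, plus one
}

int main() {
    assert(storageElems({3, 2}, {1, 2}) == 5); // the old heuristic would say 4
}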
 size_t aclnnTensorDescriptor::numel() const {
@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
     this->strides = std::vector<int64_t>(ndim);
     for (uint64_t i = 0; i < ndim; ++i) {
         this->shape[i] = static_cast<int64_t>(desc->dim(i));
-        this->strides[i] = desc->stride(i);
+        this->strides[i] = static_cast<int64_t>(desc->stride(i));
     }
     this->storageShape = inferStorageShape(this->shape, this->strides);
     this->dataType = toAclDataType(desc->dtype());
@@ -41,7 +47,12 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(aclDataType dtype, const std::vecto
     this->strides = strides;
     this->dataType = dtype;
     this->format = aclFormat::ACL_FORMAT_ND;
-    this->storageShape = inferStorageShape(this->shape, this->strides);
+    if (this->ndim != 0) {
+        this->storageShape = inferStorageShape(this->shape, this->strides);
+    } else {
+        // 0-dim (scalar) tensors have no strides to infer from
+        this->storageShape = shape;
+        this->storageNdim = 0;
+    }
     this->tensor = aclCreateTensor(this->shape.data(),
                                    this->ndim,
                                    this->dataType,
...
 #ifndef __INFINIOP_CUDA_COMMON_CUH__
 #define __INFINIOP_CUDA_COMMON_CUH__
 
+#include "../../reduce/cuda/reduce.cuh"
 #include "cuda_handle.cuh"
 #include "infinicore.h"
 
-#ifdef ENABLE_SUGON_CUDA_API
-#define INFINIOP_CUDA_KERNEL __launch_bounds__(512) __global__ void
-#else
-#define INFINIOP_CUDA_KERNEL __global__ void
-#endif
-
-// Possible maximum number of threads per block for CUDA architectures
-// Used for picking correct kernel launch configuration
-#define CUDA_BLOCK_SIZE_1024 1024
-#define CUDA_BLOCK_SIZE_512 512
-
 namespace device::cuda {
 
 cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
 
-// return the memory offset of original tensor, given the flattened index of broadcasted tensor
-__forceinline__ __device__ __host__ size_t
-indexToReducedOffset(
-    size_t flat_index,
-    size_t ndim,
-    const ptrdiff_t *broadcasted_strides,
-    const ptrdiff_t *target_strides) {
-    size_t res = 0;
-    for (size_t i = 0; i < ndim; ++i) {
-        res += flat_index / broadcasted_strides[i] * target_strides[i];
-        flat_index %= broadcasted_strides[i];
-    }
-    return res;
-}
-
-// get the memory offset of the given element in a tensor given its flat index
-__forceinline__ __device__ __host__ size_t
-indexToOffset(
-    size_t flat_index,
-    size_t ndim,
-    const size_t *shape,
-    const ptrdiff_t *strides) {
-    size_t res = 0;
-    for (size_t i = ndim; i-- > 0;) {
-        res += (flat_index % shape[i]) * strides[i];
-        flat_index /= shape[i];
-    }
-    return res;
-}
-
 } // namespace device::cuda
 
 #endif // __INFINIOP_CUDA_COMMON_CUH__
#ifdef ENABLE_SUGON_CUDA_API
#define INFINIOP_CUDA_KERNEL __launch_bounds__(512) __global__ void
#else
#define INFINIOP_CUDA_KERNEL __global__ void
#endif

// Possible maximum numbers of threads per block for CUDA architectures.
// Used for picking the correct kernel launch configuration.
#define CUDA_BLOCK_SIZE_1024 1024
#define CUDA_BLOCK_SIZE_512 512

#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)

namespace device::cuda {

// Return the memory offset in the original tensor, given the flat index
// into the broadcast tensor.
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        flat_index %= broadcasted_strides[i];
    }
    return res;
}

// Get the memory offset of an element in a (possibly strided) tensor,
// given its flat index.
__forceinline__ __device__ __host__ size_t
indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}

} // namespace device::cuda

#ifdef ENABLE_CUDA_API
#include <cuda_fp16.h>

__forceinline__ __device__ float
exp_(const float val) {
    return expf(val);
}

__forceinline__ __device__ long double
exp_(const long double val) {
    return expl(val);
}

__forceinline__ __device__ double
exp_(const double val) {
    return exp(val);
}

__forceinline__ __device__ __half
exp_(const __half x) {
    return hexp(x);
}
#endif
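To make the broadcast-offset helper above concrete: the "broadcasted_strides" are the contiguous (row-major) strides of the broadcast output shape, and the "target_strides" are the input's strides, with 0 on broadcast axes. A plain host-side illustration (sketch, no CUDA required):

// Host-side illustration of indexToReducedOffset (not project code).
// Output shape {2, 3} has contiguous strides {3, 1}; an input of shape {3}
// broadcast along axis 0 has strides {0, 1}. Flat output index 4 maps to
// coordinates (1, 1), which maps to input offset 1.
#include <cassert>
#include <cstddef>

size_t indexToReducedOffsetHost(size_t flat_index, size_t ndim,
                                const ptrdiff_t *broadcasted_strides,
                                const ptrdiff_t *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        flat_index %= broadcasted_strides[i];
    }
    return res;
}

int main() {
    const ptrdiff_t out_strides[] = {3, 1}; // contiguous strides of shape {2, 3}
    const ptrdiff_t in_strides[] = {0, 1};  // stride 0 repeats the input along axis 0
    assert(indexToReducedOffsetHost(4, 2, out_strides, in_strides) == 1);
}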
@@ -16,7 +16,7 @@ typedef XPUStream kunlunStream_t;
 typedef XPUEvent kunlunEvent_t;
 typedef xdnn::Context *xdnnHandle_t;
 
-#define CHECK_XDNN(API) CHECK_INTERNAL(API, XPU_SUCCESS)
+#define CHECK_KUNLUN(API) CHECK_INTERNAL(API, XPU_SUCCESS)
 
 namespace device::kunlun {
...
#ifndef __INFINIOP_KUNLUN_KERNEL_COMMON_H__
#define __INFINIOP_KUNLUN_KERNEL_COMMON_H__

// This header is only included by .xpu files.
#include "kunlun_kernel_dtype.h"
#include "xpu/kernel/xtdk.h"
#include "xpu/kernel/xtdk_math.h"
#include "xpu/kernel/xtdk_simd.h"
#include "xpu/runtime.h"

namespace device::kunlun::kernel {

// Build a mask for KunLun XPU 512-bit register computation. When the data
// does not fill 512 bits, it is zero-padded and the mask marks the valid
// lanes: bits 0..i are set to 1 and all others to 0
// (e.g. i = 3 gives (1 << 4) - 1 = 0b1111).
inline __device__ float lowerBitMask(int i) {
    return (1 << (i + 1)) - 1;
}

// Atomic add on shared memory, used for reductions: read the current value
// into a register, add, and loop based on REG2SM_atomic's return flag.
inline __device__ void atomicAddF32(__shared_ptr__ float *ptr, float value) {
    int success = 1;
    while (success) {
        // SM2REG reads 32-bit data from shared memory into a register
        float a = SM2REG_atomic(ptr);
        a = a + value;
        success = REG2SM_atomic(ptr, a);
    }
}
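// The retry pattern above is the classic load/modify/conditional-store loop.
// For readers unfamiliar with the SM2REG/REG2SM intrinsics, here is the same
// idea expressed with standard C++ atomics (illustrative sketch only, not
// usable in XPU device code):
//
//     #include <atomic>
//
//     inline void atomicAddF32Host(std::atomic<float> &target, float value) {
//         float expected = target.load(std::memory_order_relaxed);
//         // retry until no other thread modified the value between the
//         // load and the store (compare_exchange_weak reloads `expected`
//         // with the current value on failure)
//         while (!target.compare_exchange_weak(expected, expected + value)) {
//         }
//     }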
inline __device__ size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const _ptrdiff_t *broadcasted_strides,
    const _ptrdiff_t *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i].value * target_strides[i].value;
        flat_index %= broadcasted_strides[i].value;
        mfence();
    }
    return res;
}

inline __device__ size_t indexToOffset(
    size_t flat_index,
    size_t ndim,
    const _size_t *shape,
    const _ptrdiff_t *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i].value) * strides[i].value;
        flat_index /= shape[i].value;
        mfence();
    }
    return res;
}

} // namespace device::kunlun::kernel

// TODO: atomicAddF16
// TODO: atomicAddI8

#endif