Commit 52536c5a authored by YdrMaster's avatar YdrMaster
Browse files

issue/63/style: 整理代码,优化风格


Signed-off-by: default avatarYdrMaster <ydrml@hotmail.com>
parent f23aa206
#include "matmul_ascend.h" #include "matmul_ascend.h"
#include "../../../devices/ascend/ascend_handle.h"
#include "../../../devices/ascend/tensor_aclnn.h" #include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h" #include "../../utils.h"
#include <acl/acl_base.h> #include <acl/acl_base.h>
...@@ -10,16 +11,16 @@ namespace matmul::ascend { ...@@ -10,16 +11,16 @@ namespace matmul::ascend {
struct Descriptor::Opaque { struct Descriptor::Opaque {
mutable aclOpExecutor *executor; mutable aclOpExecutor *executor;
aclnnTensorDescriptor_t cDesc, aDesc, bDesc; aclnnTensorDescriptor_t c, a, b;
// cubeMathType // cubeMathType
// see doc: // see doc:
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
int8_t mt; int8_t mt;
~Opaque() { ~Opaque() {
delete cDesc; delete c;
delete aDesc; delete a;
delete bDesc; delete b;
aclDestroyAclOpExecutor(executor); aclDestroyAclOpExecutor(executor);
} }
}; };
...@@ -29,76 +30,77 @@ Descriptor::~Descriptor() { ...@@ -29,76 +30,77 @@ Descriptor::~Descriptor() {
} }
infiniopStatus_t Descriptor::create( infiniopStatus_t Descriptor::create(
infiniopAscendHandle_t handle, infiniopHandle_t handle_,
Descriptor **desc_ptr, Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) { infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype; auto handle = reinterpret_cast<infiniopAscendHandle_t>(handle_);
auto dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) { if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE; return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
} }
infiniopStatus_t status; infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR); auto _info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR);
if (status != INFINIOP_STATUS_SUCCESS) { if (status != INFINIOP_STATUS_SUCCESS) {
return status; return status;
} }
auto cDesc = new aclnnTensorDescriptor(), auto c = new aclnnTensorDescriptor(),
aDesc = new aclnnTensorDescriptor(), a = new aclnnTensorDescriptor(),
bDesc = new aclnnTensorDescriptor(); b = new aclnnTensorDescriptor();
// Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched // Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched
// operation // operation
CHECK_STATUS(cDesc->setDescriptor( CHECK_STATUS(c->setDescriptor(
toAclDataType(c_desc->dtype), toAclDataType(c_desc->dtype),
{static_cast<int64_t>(info.c_matrix.rows), {static_cast<int64_t>(_info.c_matrix.rows),
static_cast<int64_t>(info.c_matrix.cols)}, static_cast<int64_t>(_info.c_matrix.cols)},
{info.c_matrix.row_stride, info.c_matrix.col_stride}), {_info.c_matrix.row_stride, _info.c_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS); INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(aDesc->setDescriptor( CHECK_STATUS(a->setDescriptor(
toAclDataType(a_desc->dtype), toAclDataType(a_desc->dtype),
{static_cast<int64_t>(info.a_matrix.rows), {static_cast<int64_t>(_info.a_matrix.rows),
static_cast<int64_t>(info.a_matrix.cols)}, static_cast<int64_t>(_info.a_matrix.cols)},
{info.a_matrix.row_stride, info.a_matrix.col_stride}), {_info.a_matrix.row_stride, _info.a_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS); INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(bDesc->setDescriptor( CHECK_STATUS(b->setDescriptor(
toAclDataType(b_desc->dtype), toAclDataType(b_desc->dtype),
{static_cast<int64_t>(info.b_matrix.rows), {static_cast<int64_t>(_info.b_matrix.rows),
static_cast<int64_t>(info.b_matrix.cols)}, static_cast<int64_t>(_info.b_matrix.cols)},
{info.b_matrix.row_stride, info.b_matrix.col_stride}), {_info.b_matrix.row_stride, _info.b_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS); INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(cDesc->createTensor(), INFINIOP_STATUS_SUCCESS); CHECK_STATUS(c->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(aDesc->createTensor(), INFINIOP_STATUS_SUCCESS); CHECK_STATUS(a->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(bDesc->createTensor(), INFINIOP_STATUS_SUCCESS); CHECK_STATUS(b->createTensor(), INFINIOP_STATUS_SUCCESS);
auto tc = cDesc->t, auto tc = c->t,
ta = aDesc->t, ta = a->t,
tb = bDesc->t; tb = b->t;
aclOpExecutor *executor; aclOpExecutor *executor;
size_t workspaceSize; size_t workspace_size;
// aclnnGemm support C = alpha * A @ B + beta * C // aclnnGemm support C = alpha * A @ B + beta * C
// see // see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// use alpha = 0.5, beta = 0.5 temporarily // use alpha = 0.5, beta = 0.5 temporarily
int8_t mt = 1; int8_t mt = 1;
auto ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, .5, .5, 0, 0, tc, mt, &workspaceSize, &executor); auto ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, .5, .5, 0, 0, tc, mt, &workspace_size, &executor);
CHECK_RET(ret == ACL_SUCCESS, CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret); LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR); return INFINIOP_STATUS_INTERNAL_ERROR);
aclSetAclOpExecutorRepeatable(executor); aclSetAclOpExecutorRepeatable(executor);
*desc_ptr = new Descriptor( *desc_ptr = new Descriptor(
dtype, info, workspaceSize, dtype, _info, workspace_size,
new Opaque{ new Opaque{
executor, executor,
cDesc, c,
aDesc, a,
bDesc, b,
mt, mt,
}, },
handle->device, handle->device_id); handle->device, handle->device_id);
...@@ -115,28 +117,29 @@ infiniopStatus_t Descriptor::calculate( ...@@ -115,28 +117,29 @@ infiniopStatus_t Descriptor::calculate(
float alpha, float alpha,
void *stream) const { void *stream) const {
auto tc = _opaque->cDesc->t, auto tc = _opaque->c->t,
ta = _opaque->aDesc->t, ta = _opaque->a->t,
tb = _opaque->bDesc->t; tb = _opaque->b->t;
size_t workspaceSize; size_t workspace_size;
auto ret = aclnnGemmGetWorkspaceSize( auto ret = aclnnGemmGetWorkspaceSize(
ta, tb, tc, alpha, beta, 0, 0, tc, _opaque->mt, ta, tb, tc, alpha, beta, 0, 0, tc, _opaque->mt,
&workspaceSize, &(_opaque->executor)); &workspace_size, &(_opaque->executor));
CHECK_RET(ret == ACL_SUCCESS, CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret); LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR); return INFINIOP_STATUS_INTERNAL_ERROR);
if (workspaceSize_ < workspaceSize) { if (workspaceSize_ < workspace_size) {
return INFINIOP_STATUS_INSUFFICIENT_WORKSPACE; return INFINIOP_STATUS_INSUFFICIENT_WORKSPACE;
} }
aclSetAclOpExecutorRepeatable(_opaque->executor); aclSetAclOpExecutorRepeatable(_opaque->executor);
for (size_t i = 0; i < info.batch; ++i) { auto unit = infiniSizeof(_dtype);
AclSetTensorAddr(_opaque->executor, 0, ta, ((char *)a) + i * info.a_matrix.stride * infiniSizeof(dtype)); for (size_t i = 0; i < _info.batch; ++i) {
AclSetTensorAddr(_opaque->executor, 1, tb, ((char *)b) + i * info.b_matrix.stride * infiniSizeof(dtype)); AclSetTensorAddr(_opaque->executor, 0, ta, ((char *)a) + i * _info.a_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 2, tc, ((char *)c) + i * info.c_matrix.stride * infiniSizeof(dtype)); AclSetTensorAddr(_opaque->executor, 1, tb, ((char *)b) + i * _info.b_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 3, tc, ((char *)c) + i * info.c_matrix.stride * infiniSizeof(dtype)); AclSetTensorAddr(_opaque->executor, 2, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
ret = aclnnGemm(workspace, workspaceSize, _opaque->executor, stream); AclSetTensorAddr(_opaque->executor, 3, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
ret = aclnnGemm(workspace, workspace_size, _opaque->executor, stream);
CHECK_RET(ret == ACL_SUCCESS, CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret); LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR); return INFINIOP_STATUS_INTERNAL_ERROR);
......
#ifndef __MATMUL_ASCEND_H__ #ifndef __MATMUL_ASCEND_H__
#define __MATMUL_ASCEND_H__ #define __MATMUL_ASCEND_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "../matmul.h" #include "../matmul.h"
DESCRIPTOR(ascend, infiniopAscendHandle_t) DESCRIPTOR(ascend)
#endif // __MATMUL_ASCEND_H__ #endif // __MATMUL_ASCEND_H__
#include "matmul_bang.h" #include "matmul_bang.h"
#include "../../../devices/bang/bang_handle.h"
#include "../../../devices/bang/common_bang.h" #include "../../../devices/bang/common_bang.h"
#include "../../utils.h" #include "../../utils.h"
#include <cnnl_extra.h> #include <cnnl_extra.h>
...@@ -6,17 +7,17 @@ ...@@ -6,17 +7,17 @@
namespace matmul::bang { namespace matmul::bang {
struct Descriptor::Opaque { struct Descriptor::Opaque {
cnnlMatMulDescriptor_t opDesc; cnnlMatMulDescriptor_t op;
cnnlMatMulAlgo_t algo; cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult; cnnlMatMulHeuristicResult_t algoResult;
cnnlTensorDescriptor_t aDesc, bDesc, cDesc; cnnlTensorDescriptor_t a, b, c;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handle_pool; std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handle_pool;
~Opaque() { ~Opaque() {
cnnlDestroyTensorDescriptor(aDesc); cnnlDestroyTensorDescriptor(a);
cnnlDestroyTensorDescriptor(bDesc); cnnlDestroyTensorDescriptor(b);
cnnlDestroyTensorDescriptor(cDesc); cnnlDestroyTensorDescriptor(c);
cnnlMatMulDescDestroy(opDesc); cnnlMatMulDescDestroy(op);
cnnlMatMulAlgoDestroy(algo); cnnlMatMulAlgoDestroy(algo);
cnnlDestroyMatMulHeuristicResult(algoResult); cnnlDestroyMatMulHeuristicResult(algoResult);
} }
...@@ -59,41 +60,42 @@ Descriptor::~Descriptor() { ...@@ -59,41 +60,42 @@ Descriptor::~Descriptor() {
} }
infiniopStatus_t Descriptor::create( infiniopStatus_t Descriptor::create(
infiniopBangHandle_t handle, infiniopHandle_t handle_,
Descriptor **desc_ptr, Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) { infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype; auto handle = reinterpret_cast<infiniopBangHandle_t>(handle_);
auto dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) { if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE; return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
} }
infiniopStatus_t status; infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR); auto _info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR);
if (status != INFINIOP_STATUS_SUCCESS) { if (status != INFINIOP_STATUS_SUCCESS) {
return status; return status;
} }
cnnlTensorDescriptor_t aDesc, bDesc, cDesc; cnnlTensorDescriptor_t a, b, c;
cnnlCreateTensorDescriptor(&aDesc); cnnlCreateTensorDescriptor(&a);
cnnlCreateTensorDescriptor(&bDesc); cnnlCreateTensorDescriptor(&b);
cnnlCreateTensorDescriptor(&cDesc); cnnlCreateTensorDescriptor(&c);
setMatrixTensorEx(aDesc, info.a_matrix, a_desc->dtype); setMatrixTensorEx(a, _info.a_matrix, a_desc->dtype);
setMatrixTensorEx(bDesc, info.b_matrix, b_desc->dtype); setMatrixTensorEx(b, _info.b_matrix, b_desc->dtype);
setMatrixTensorEx(cDesc, info.c_matrix, c_desc->dtype); setMatrixTensorEx(c, _info.c_matrix, c_desc->dtype);
cnnlMatMulDescriptor_t opDesc; cnnlMatMulDescriptor_t op;
cnnlMatMulAlgo_t algo; cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult; cnnlMatMulHeuristicResult_t algoResult;
cnnlMatMulDescCreate(&opDesc); cnnlMatMulDescCreate(&op);
cnnlMatMulAlgoCreate(&algo); cnnlMatMulAlgoCreate(&algo);
cnnlCreateMatMulHeuristicResult(&algoResult); cnnlCreateMatMulHeuristicResult(&algoResult);
int32_t use_stride = true; int32_t use_stride = true;
cnnlSetMatMulDescAttr( cnnlSetMatMulDescAttr(
opDesc, op,
CNNL_MATMUL_USE_STRIDE, CNNL_MATMUL_USE_STRIDE,
&use_stride, &use_stride,
sizeof(int32_t)); sizeof(int32_t));
...@@ -102,7 +104,7 @@ infiniopStatus_t Descriptor::create( ...@@ -102,7 +104,7 @@ infiniopStatus_t Descriptor::create(
[&](cnnlHandle_t _handle) { [&](cnnlHandle_t _handle) {
cnnlGetBatchMatMulAlgoHeuristic( cnnlGetBatchMatMulAlgoHeuristic(
_handle, _handle,
opDesc, aDesc, bDesc, cDesc, op, a, b, c,
NULL, 1, &algoResult, &count); NULL, 1, &algoResult, &count);
}); });
...@@ -110,14 +112,14 @@ infiniopStatus_t Descriptor::create( ...@@ -110,14 +112,14 @@ infiniopStatus_t Descriptor::create(
cnnlGetBatchMatMulHeuristicResult(algoResult, algo, &workspace_size); cnnlGetBatchMatMulHeuristicResult(algoResult, algo, &workspace_size);
*desc_ptr = new Descriptor( *desc_ptr = new Descriptor(
dtype, info, workspace_size, dtype, _info, workspace_size,
new Opaque{ new Opaque{
opDesc, op,
algo, algo,
algoResult, algoResult,
aDesc, a,
bDesc, b,
cDesc, c,
handle->cnnl_handle_pool}, handle->cnnl_handle_pool},
handle->device, handle->device_id); handle->device, handle->device_id);
return INFINIOP_STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
...@@ -133,7 +135,7 @@ infiniopStatus_t Descriptor::calculate( ...@@ -133,7 +135,7 @@ infiniopStatus_t Descriptor::calculate(
float alpha, float alpha,
void *stream) const { void *stream) const {
if (info.is_transed) { if (_info.is_transed) {
std::swap(a, b); std::swap(a, b);
} }
use_cnnl(_opaque->cnnl_handle_pool, use_cnnl(_opaque->cnnl_handle_pool,
...@@ -141,13 +143,13 @@ infiniopStatus_t Descriptor::calculate( ...@@ -141,13 +143,13 @@ infiniopStatus_t Descriptor::calculate(
[&](cnnlHandle_t handle) { [&](cnnlHandle_t handle) {
cnnlBatchMatMulBCast_v2( cnnlBatchMatMulBCast_v2(
handle, handle,
_opaque->opDesc, _opaque->op,
_opaque->algo, _opaque->algo,
&alpha, &alpha,
_opaque->aDesc, a, _opaque->a, a,
_opaque->bDesc, b, _opaque->b, b,
&beta, &beta,
_opaque->cDesc, c, _opaque->c, c,
workspace, workspace,
workspace_size); workspace_size);
}); });
......
#ifndef __MATMUL_BANG_H__ #ifndef __MATMUL_BANG_H__
#define __MATMUL_BANG_H__ #define __MATMUL_BANG_H__
#include "../../../devices/bang/bang_handle.h"
#include "../matmul.h" #include "../matmul.h"
DESCRIPTOR(bang, infiniopBangHandle_t) DESCRIPTOR(bang)
#endif // __MATMUL_BANG_H__ #endif // __MATMUL_BANG_H__
#include "./matmul_cpu.h" #include "./matmul_cpu.h"
#include "../../../devices/cpu/common_cpu.h" #include "../../../devices/cpu/common_cpu.h"
#include "../../../devices/cpu/cpu_handle.h"
#include <iostream> #include <iostream>
namespace matmul::cpu { namespace matmul::cpu {
...@@ -7,25 +8,26 @@ namespace matmul::cpu { ...@@ -7,25 +8,26 @@ namespace matmul::cpu {
Descriptor::~Descriptor() = default; Descriptor::~Descriptor() = default;
infiniopStatus_t Descriptor::create( infiniopStatus_t Descriptor::create(
infiniopCpuHandle_t handle, infiniopHandle_t handle_,
Descriptor **desc_ptr, Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) { infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype; auto handle = reinterpret_cast<infiniopCpuHandle_t>(handle_);
auto dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) { if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE; return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
} }
infiniopStatus_t status; infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::COL_MAJOR); auto _info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::COL_MAJOR);
if (status != INFINIOP_STATUS_SUCCESS) { if (status != INFINIOP_STATUS_SUCCESS) {
return status; return status;
} }
*desc_ptr = new Descriptor( *desc_ptr = new Descriptor(
dtype, info, 0, dtype, _info, 0,
nullptr, nullptr,
handle->device, handle->device_id); handle->device, handle->device_id);
return INFINIOP_STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
...@@ -33,26 +35,24 @@ infiniopStatus_t Descriptor::create( ...@@ -33,26 +35,24 @@ infiniopStatus_t Descriptor::create(
template <typename Tdata> template <typename Tdata>
void calculate( void calculate(
Descriptor const *desc, MatmulInfo const &_info,
void *c, void *c,
float beta, float beta,
void const *a, void const *a,
void const *b, void const *b,
float alpha) { float alpha) {
auto info = desc->info; if (_info.is_transed) {
if (info.is_transed) {
std::swap(a, b); std::swap(a, b);
} }
for (size_t i = 0; i < info.batch; ++i) { for (size_t i = 0; i < _info.batch; ++i) {
for (size_t m_ = 0; m_ < info.m; ++m_) { for (size_t m_ = 0; m_ < _info.m; ++m_) {
for (size_t n_ = 0; n_ < info.n; ++n_) { for (size_t n_ = 0; n_ < _info.n; ++n_) {
auto c_ = reinterpret_cast<Tdata *>(c) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride; auto c_ = reinterpret_cast<Tdata *>(c) + i * _info.c_matrix.stride + m_ * _info.c_matrix.row_stride + n_ * _info.c_matrix.col_stride;
float sum = 0; float sum = 0;
for (size_t k_ = 0; k_ < info.k; ++k_) { for (size_t k_ = 0; k_ < _info.k; ++k_) {
auto a_ = reinterpret_cast<Tdata const *>(a) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride; auto a_ = reinterpret_cast<Tdata const *>(a) + i * _info.a_matrix.stride + m_ * _info.a_matrix.row_stride + k_ * _info.a_matrix.col_stride;
auto b_ = reinterpret_cast<Tdata const *>(b) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride; auto b_ = reinterpret_cast<Tdata const *>(b) + i * _info.b_matrix.stride + n_ * _info.b_matrix.col_stride + k_ * _info.b_matrix.row_stride;
if constexpr (std::is_same<Tdata, uint16_t>::value) { if constexpr (std::is_same<Tdata, uint16_t>::value) {
sum += f16_to_f32(*a_) * f16_to_f32(*b_); sum += f16_to_f32(*a_) * f16_to_f32(*b_);
} else { } else {
...@@ -83,13 +83,13 @@ infiniopStatus_t Descriptor::calculate( ...@@ -83,13 +83,13 @@ infiniopStatus_t Descriptor::calculate(
float alpha, float alpha,
void *stream) const { void *stream) const {
switch (dtype) { switch (_dtype) {
case INFINI_DTYPE_F16: case INFINI_DTYPE_F16:
cpu::calculate<uint16_t>(this, c, beta, a, b, alpha); cpu::calculate<uint16_t>(_info, c, beta, a, b, alpha);
return INFINIOP_STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
case INFINI_DTYPE_F32: case INFINI_DTYPE_F32:
cpu::calculate<float>(this, c, beta, a, b, alpha); cpu::calculate<float>(_info, c, beta, a, b, alpha);
return INFINIOP_STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
default: default:
......
#ifndef __MATMUL_CPU_H__ #ifndef __MATMUL_CPU_H__
#define __MATMUL_CPU_H__ #define __MATMUL_CPU_H__
#include "../../../devices/cpu/cpu_handle.h"
#include "../matmul.h" #include "../matmul.h"
DESCRIPTOR(cpu, infiniopCpuHandle_t) DESCRIPTOR(cpu)
#endif // __MATMUL_CPU_H__ #endif // __MATMUL_CPU_H__
#include "../../../devices/cuda/common_cuda.cuh"
#include "../../utils.h" #include "../../utils.h"
#include "matmul_cuda.cuh" #include "matmul_cuda.cuh"
...@@ -12,25 +13,26 @@ Descriptor::~Descriptor() { ...@@ -12,25 +13,26 @@ Descriptor::~Descriptor() {
} }
infiniopStatus_t Descriptor::create( infiniopStatus_t Descriptor::create(
infiniopCudaHandle_t handle, infiniopHandle_t handle_,
Descriptor **desc_ptr, Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) { infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype; auto handle = reinterpret_cast<infiniopCudaHandle_t>(handle_);
auto dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) { if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE; return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
} }
infiniopStatus_t status; infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::COL_MAJOR); auto _info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::COL_MAJOR);
if (status != INFINIOP_STATUS_SUCCESS) { if (status != INFINIOP_STATUS_SUCCESS) {
return status; return status;
} }
*desc_ptr = new Descriptor( *desc_ptr = new Descriptor(
dtype, info, 0, dtype, _info, 0,
new Opaque{handle->cublas_handle_pool}, new Opaque{handle->cublas_handle_pool},
handle->device, handle->device_id); handle->device, handle->device_id);
return INFINIOP_STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
...@@ -38,7 +40,7 @@ infiniopStatus_t Descriptor::create( ...@@ -38,7 +40,7 @@ infiniopStatus_t Descriptor::create(
template <typename Tdata> template <typename Tdata>
infiniopStatus_t calculate( infiniopStatus_t calculate(
MatmulInfo const &info, MatmulInfo const &_info,
std::shared_ptr<Pool<cublasHandle_t>> &cublas_handle_pool, std::shared_ptr<Pool<cublasHandle_t>> &cublas_handle_pool,
void *c, void *c,
float beta, float beta,
...@@ -47,7 +49,7 @@ infiniopStatus_t calculate( ...@@ -47,7 +49,7 @@ infiniopStatus_t calculate(
float alpha, float alpha,
cudaStream_t stream) { cudaStream_t stream) {
if (info.is_transed) { if (_info.is_transed) {
std::swap(a, b); std::swap(a, b);
} }
...@@ -65,8 +67,8 @@ infiniopStatus_t calculate( ...@@ -65,8 +67,8 @@ infiniopStatus_t calculate(
#endif #endif
} }
auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
use_cublas(cublas_handle_pool, use_cublas(cublas_handle_pool,
stream, stream,
...@@ -75,24 +77,24 @@ infiniopStatus_t calculate( ...@@ -75,24 +77,24 @@ infiniopStatus_t calculate(
handle, handle,
op_a, op_a,
op_b, op_b,
static_cast<int>(info.m), static_cast<int>(_info.m),
static_cast<int>(info.n), static_cast<int>(_info.n),
static_cast<int>(info.k), static_cast<int>(_info.k),
&alpha, &alpha,
a, a,
a_type, a_type,
static_cast<int>(info.a_matrix.ld()), static_cast<int>(_info.a_matrix.ld()),
info.a_matrix.stride, _info.a_matrix.stride,
b, b,
b_type, b_type,
static_cast<int>(info.b_matrix.ld()), static_cast<int>(_info.b_matrix.ld()),
info.b_matrix.stride, _info.b_matrix.stride,
&beta, &beta,
c, c,
c_type, c_type,
static_cast<int>(info.c_matrix.ld()), static_cast<int>(_info.c_matrix.ld()),
info.c_matrix.stride, _info.c_matrix.stride,
static_cast<int>(info.batch), static_cast<int>(_info.batch),
compute_type, compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP); CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}); });
...@@ -109,13 +111,13 @@ infiniopStatus_t Descriptor::calculate( ...@@ -109,13 +111,13 @@ infiniopStatus_t Descriptor::calculate(
float alpha, float alpha,
void *stream) const { void *stream) const {
switch (dtype) { switch (_dtype) {
case INFINI_DTYPE_F16: case INFINI_DTYPE_F16:
cuda::calculate<uint16_t>(info, _opaque->cublas_handle_pool, c, beta, a, b, alpha, (cudaStream_t)stream); cuda::calculate<uint16_t>(_info, _opaque->cublas_handle_pool, c, beta, a, b, alpha, (cudaStream_t)stream);
return INFINIOP_STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
case INFINI_DTYPE_F32: case INFINI_DTYPE_F32:
cuda::calculate<float>(info, _opaque->cublas_handle_pool, c, beta, a, b, alpha, (cudaStream_t)stream); cuda::calculate<float>(_info, _opaque->cublas_handle_pool, c, beta, a, b, alpha, (cudaStream_t)stream);
return INFINIOP_STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
default: default:
......
#ifndef __MATMUL_CUDA_CUH__ #ifndef __MATMUL_CUDA_CUH__
#define __MATMUL_CUDA_CUH__ #define __MATMUL_CUDA_CUH__
#include "../../../devices/cuda/cuda_handle.h"
#include "../matmul.h" #include "../matmul.h"
DESCRIPTOR(cuda, infiniopCudaHandle_t) DESCRIPTOR(cuda)
#endif // __MATMUL_CUDA_CUH__ #endif // __MATMUL_CUDA_CUH__
...@@ -2,37 +2,38 @@ ...@@ -2,37 +2,38 @@
#define __MATMUL_H__ #define __MATMUL_H__
#include "blas.h" #include "blas.h"
#include "infiniop/handle.h"
#include "infiniop/operator.h" #include "infiniop/operator.h"
#define DESCRIPTOR(NAMESPACE, HANDLE) \ #define DESCRIPTOR(NAMESPACE) \
\ \
namespace matmul::NAMESPACE { \ namespace matmul::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \ class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \ struct Opaque; \
Opaque *_opaque; \ Opaque *_opaque; \
infiniDtype_t _dtype; \
MatmulInfo _info; \
\ \
Descriptor( \ Descriptor( \
infiniDtype_t dtype_, \ infiniDtype_t dtype, \
MatmulInfo info_, \ MatmulInfo info, \
size_t workspace_size_, \ size_t workspace_size_, \
Opaque *opaque, \ Opaque *opaque, \
infiniDevice_t device_type, \ infiniDevice_t device_type, \
int device_id) \ int device_id) \
: InfiniopDescriptor{device_type, device_id}, \ : InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \ _opaque(opaque), \
dtype(dtype_), \ _dtype(dtype), \
info(info_), \ _info(info), \
workspace_size(workspace_size_) {} \ workspace_size(workspace_size_) {} \
\ \
public: \ public: \
infiniDtype_t dtype; \
MatmulInfo info; \
size_t workspace_size; \ size_t workspace_size; \
\ \
~Descriptor(); \ ~Descriptor(); \
\ \
static infiniopStatus_t create( \ static infiniopStatus_t create( \
HANDLE handle, \ infiniopHandle_t handle, \
Descriptor **desc_ptr, \ Descriptor **desc_ptr, \
infiniopTensorDescriptor_t c_desc, \ infiniopTensorDescriptor_t c_desc, \
infiniopTensorDescriptor_t a_desc, \ infiniopTensorDescriptor_t a_desc, \
......
...@@ -20,10 +20,10 @@ __C infiniopStatus_t infiniopCreateMatmulDescriptor( ...@@ -20,10 +20,10 @@ __C infiniopStatus_t infiniopCreateMatmulDescriptor(
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) { infiniopTensorDescriptor_t b_desc) {
#define CREATE(CASE, HANDLE, NAMESPACE) \ #define CREATE(CASE, NAMESPACE) \
case CASE: \ case CASE: \
return matmul::NAMESPACE::Descriptor::create( \ return matmul::NAMESPACE::Descriptor::create( \
reinterpret_cast<HANDLE>(handle), \ handle, \
reinterpret_cast<matmul::NAMESPACE::Descriptor **>(desc_ptr), \ reinterpret_cast<matmul::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \ c_desc, \
a_desc, \ a_desc, \
...@@ -32,16 +32,16 @@ __C infiniopStatus_t infiniopCreateMatmulDescriptor( ...@@ -32,16 +32,16 @@ __C infiniopStatus_t infiniopCreateMatmulDescriptor(
switch (handle->device) { switch (handle->device) {
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, infiniopCpuHandle_t, cpu); CREATE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, infiniopCudaHandle_t, cuda); CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, infiniopBangHandle_t, bang); CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif #endif
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, infiniopAscendHandle_t, ascend); CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif #endif
default: default:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment