Commit 46da1a27 authored by PanZezhongQY

feat: cpu and cuda matmul

#ifndef __INFINIOP_CPU_HANDLE_H__
#define __INFINIOP_CPU_HANDLE_H__
#include "infiniop/handle.h"
typedef infiniopHandle_t infiniopCpuHandle_t;
infiniopStatus_t createCpuHandle(infiniopCpuHandle_t *handle_ptr);
#endif
#ifndef __INFINIOP_COMMON_CUDA_H__
#define __INFINIOP_COMMON_CUDA_H__
#define MAX_THREADS_PER_BLOCK 1024
#define MAX_WARP_PER_BLOCK 32
#define WARP_SIZE 32
#include <iostream>
#define checkCudaErrorWithCode(call, errorCode) \
do { \
if (auto status = call; status != cudaSuccess) { \
std::cerr << "CUDA error: " << cudaGetErrorString(status) \
<< " in file " << __FILE__ << ", function " << __func__ \
<< ", line " << __LINE__ << std::endl; \
return errorCode; \
} \
} while (0)
#define checkCudaError(call) checkCudaErrorWithCode(call, INFINIOP_STATUS_BAD_DEVICE)
#define checkCudnnError(call) \
do { \
if (auto status = call; status != CUDNN_STATUS_SUCCESS) { \
std::cerr << "CUDNN error: " << cudnnGetErrorString(status) \
<< " in file " << __FILE__ << ", function " << __func__ \
<< ", line " << __LINE__ << std::endl; \
return INFINIOP_STATUS_INTERNAL_ERROR; \
} \
} while (0)
#include "infinicore.h"
#include <cudnn.h>
#include <cublas_v2.h>
#include <memory>
#include "../pool.h"
#include "cuda_handle.h"
#include <cuda_fp16.h>
struct InfiniopCudaHandle {
infiniDevice_t device;
int device_id;
std::shared_ptr<Pool<cublasHandle_t>> cublas_handles_t;
std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handles_t;
cudaDeviceProp prop;
int compute_capability_major;
int compute_capability_minor;
};
template<typename T>
void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> cublas_handles_t, int device_id, cudaStream_t stream, T const &f) {
auto handle = cublas_handles_t->pop();
if (!handle) {
// Pool was empty: create a new cublas handle bound to this device.
cudaSetDevice(device_id);
cublasHandle_t new_handle;
cublasCreate(&new_handle);
handle = new_handle;
}
cublasSetStream(*handle, (cudaStream_t) stream);
f(*handle);
cublas_handles_t->push(std::move(*handle));
}
template<typename T>
cudnnStatus_t use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handles_t, int device_id, cudaStream_t stream, T const &f) {
auto handle = cudnn_handles_t->pop();
if (!handle) {
// Pool was empty: create a new cudnn handle bound to this device.
cudaSetDevice(device_id);
cudnnHandle_t new_handle;
cudnnCreate(&new_handle);
handle = new_handle;
}
cudnnSetStream(*handle, stream);
cudnnStatus_t status = f(*handle);
cudnn_handles_t->push(std::move(*handle));
return status;
}
inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
switch (dt) {
case INFINI_DTYPE_F16:
return CUDNN_DATA_HALF;
case INFINI_DTYPE_F32:
return CUDNN_DATA_FLOAT;
case INFINI_DTYPE_F64:
return CUDNN_DATA_DOUBLE;
case INFINI_DTYPE_BF16:
return CUDNN_DATA_BFLOAT16;
case INFINI_DTYPE_I8:
return CUDNN_DATA_INT8;
case INFINI_DTYPE_I32:
return CUDNN_DATA_INT32;
case INFINI_DTYPE_I64:
return CUDNN_DATA_INT64;
case INFINI_DTYPE_U8:
return CUDNN_DATA_UINT8;
default:
return CUDNN_DATA_FLOAT;
}
}
// Return the memory offset into the original tensor, given the flat index of
// the broadcasted tensor
inline __device__ __host__ size_t indexToReducedOffset(
size_t flat_index, size_t ndim, int64_t const *broadcasted_strides,
int64_t const *target_strides) {
size_t res = 0;
for (size_t i = 0; i < ndim; ++i) {
res += flat_index / broadcasted_strides[i] * target_strides[i];
flat_index %= broadcasted_strides[i];
}
return res;
}
// get the memory offset of the given element in a tensor given its flat index
inline __device__ __host__ size_t indexToOffset(size_t flat_index, size_t ndim,
size_t const *shape,
int64_t const *strides) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
flat_index /= shape[i];
}
return res;
}
#endif // __INFINIOP_COMMON_CUDA_H__
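// --- usage sketch (not part of this commit): what indexToOffset and
// indexToReducedOffset above compute, restated as plain host C++ so the
// example builds without the CUDA toolkit (the __device__ __host__
// qualifiers are dropped; the arithmetic is the same).
#include <cstdint>
#include <cstdio>

static size_t index_to_offset(size_t flat_index, size_t ndim,
                              size_t const *shape, int64_t const *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}

static size_t index_to_reduced_offset(size_t flat_index, size_t ndim,
                                      int64_t const *broadcasted_strides,
                                      int64_t const *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        flat_index %= broadcasted_strides[i];
    }
    return res;
}

int main() {
    // A 2x3 tensor stored with a padded row stride of 4 (not contiguous):
    // flat (logical) index 4 is element (1, 1), at memory offset 1*4 + 1*1 = 5.
    size_t shape[2] = {2, 3};
    int64_t strides[2] = {4, 1};
    std::printf("%zu\n", index_to_offset(4, 2, shape, strides)); // prints 5

    // A {1, 3} tensor broadcast to {2, 3}: the broadcasted shape's contiguous
    // strides are {3, 1}, the original tensor's strides are {0, 1} (stride 0 on
    // the broadcast axis), so flat index 4 -> element (1, 1) -> offset 1.
    int64_t bcast_strides[2] = {3, 1};
    int64_t target_strides[2] = {0, 1};
    std::printf("%zu\n", index_to_reduced_offset(4, 2, bcast_strides, target_strides)); // prints 1
    return 0;
}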
#include "./common_cuda.cuh"
infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, int device_id, infiniDevice_t cuda_device_type) {
// Check if device_id is valid
int device_count;
checkCudaError(cudaGetDeviceCount(&device_count));
if (device_id >= device_count) {
return INFINIOP_STATUS_BAD_DEVICE;
}
// Create a new cublas handle pool
auto pool = std::make_shared<Pool<cublasHandle_t>>();
if (cudaSetDevice(device_id) != cudaSuccess) {
return INFINIOP_STATUS_BAD_DEVICE;
}
cublasHandle_t handle;
cublasCreate(&handle);
pool->push(std::move(handle));
// create a cudnn handle pool
auto cudnn_pool = std::make_shared<Pool<cudnnHandle_t>>();
cudnnHandle_t cudnn_handle;
checkCudnnError(cudnnCreate(&cudnn_handle));
cudnn_pool->push(std::move(cudnn_handle));
// set CUDA device property
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, device_id);
// set device compute capability numbers
int capability_major;
int capability_minor;
cudaDeviceGetAttribute(&capability_major, cudaDevAttrComputeCapabilityMajor, device_id);
cudaDeviceGetAttribute(&capability_minor, cudaDevAttrComputeCapabilityMinor, device_id);
*handle_ptr = new InfiniopCudaHandle{
cuda_device_type,
device_id,
std::move(pool),
std::move(cudnn_pool),
std::move(prop),
capability_major,
capability_minor,
};
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t deleteCudaHandle(infiniopCudaHandle_t handle_ptr) {
handle_ptr->cublas_handles_t = nullptr;
handle_ptr->cudnn_handles_t = nullptr;
delete handle_ptr;
return INFINIOP_STATUS_SUCCESS;
}
#ifndef __INFINIOP_CUDA_HANDLE_H__
#define __INFINIOP_CUDA_HANDLE_H__
#include "infiniop/handle.h"
struct InfiniopCudaHandle;
typedef struct InfiniopCudaHandle *infiniopCudaHandle_t;
infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, int device_id, infiniDevice_t cuda_device_type);
infiniopStatus_t deleteCudaHandle(infiniopCudaHandle_t handle_ptr);
#endif
#include "infiniop/handle.h"
#ifdef ENABLE_CPU_API
#include "./cpu/cpu_handle.h"
#endif
#ifdef ENABLE_CUDA_API
#include "./cuda/cuda_handle.h"
#endif
#ifdef ENABLE_CAMBRICON_MLU
#include "./bang/bang_handle.h"
#endif
#ifdef ENABLE_ASCEND_NPU
#include "./ascend/ascend_handle.h"
#endif
__C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, infiniDevice_t device, int device_id) {
if (handle_ptr == nullptr) {
return INFINIOP_STATUS_NULL_POINTER;
}
if (device_id < 0) {
return INFINIOP_STATUS_BAD_DEVICE;
}
switch (device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return createCpuHandle((infiniopCpuHandle_t *) handle_ptr);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return createCudaHandle((infiniopCudaHandle_t *) handle_ptr, device_id, device);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return createBangHandle((infiniopBangHandle_t *) handle_ptr, device_id);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return createAscendHandle((infiniopAscendHandle_t *) handle_ptr, device_id);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
switch (handle->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
delete handle;
return INFINIOP_STATUS_SUCCESS;
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return deleteCudaHandle((infiniopCudaHandle_t) handle);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
delete (infiniopBangHandle_t) handle;
return INFINIOP_STATUS_SUCCESS;
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return deleteAscendHandle((infiniopAscendHandle_t) handle);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
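// --- usage sketch (not part of this commit): creating and destroying a
// handle through the public C API declared in "infiniop/handle.h".
// Built with ENABLE_CPU_API; for the CPU backend the device id is only
// checked for being non-negative.
#include "infiniop/handle.h"
#include <cstdio>

int main() {
    infiniopHandle_t handle = nullptr;
    if (infiniopCreateHandle(&handle, INFINI_DEVICE_CPU, 0) != INFINIOP_STATUS_SUCCESS) {
        std::printf("failed to create handle\n");
        return 1;
    }
    // ... create operator descriptors and run operators with this handle ...
    infiniopDestroyHandle(handle);
    return 0;
}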
#ifndef __POOL_H__
#define __POOL_H__
#include <atomic>
#include <mutex>
#include <optional>
#include <utility>
template<class T>
class Pool {
public:
Pool() : _head(nullptr) {}
Pool(const Pool &) = delete;
Pool(Pool &&pool) noexcept : _head(pool._head.exchange(nullptr)) {}
~Pool() {
while (this->pop()) {}
}
void push(T &&val) const {
Node<T> *new_node = new Node<T>(std::move(val));
new_node->next = _head.load();
while (!_head.compare_exchange_weak(new_node->next, new_node));
}
std::optional<T> pop() const {
Node<T> *top = _head.load();
Node<T> *new_head = nullptr;
do {
if (!top) {
return std::nullopt;
}
new_head = top->next;
} while (!_head.compare_exchange_weak(top, new_head));
return {std::move(top->data)};
}
private:
template<class U>
struct Node {
U data;
Node<U> *next;
Node(U &&data) : data(std::move(data)), next(nullptr) {}
};
mutable std::atomic<Node<T> *> _head;
};
#endif // __POOL_H__
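// --- usage sketch (not part of this commit): the lock-free Pool above used
// as a cache of reusable resources, assuming the header is available as
// "pool.h". pop() returns std::nullopt when the pool is empty, which is the
// caller's cue to create a fresh resource; the same pattern is used by
// use_cublas / use_cudnn with library handles.
#include "pool.h"
#include <cstdio>
#include <string>
#include <thread>
#include <vector>

int main() {
    Pool<std::string> pool;
    pool.push(std::string("buffer-0"));

    auto worker = [&pool](int id) {
        auto item = pool.pop();
        if (!item) {
            // Pool was empty: create a new resource instead.
            item = "buffer-" + std::to_string(id);
        }
        // ... use *item ...
        pool.push(std::move(*item)); // return it to the pool for reuse
    };

    std::vector<std::thread> threads;
    for (int i = 1; i <= 4; ++i) {
        threads.emplace_back(worker, i);
    }
    for (auto &t : threads) {
        t.join();
    }
    std::printf("done\n");
    return 0;
}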
#include "matmul_aclnn.h"
MatmulAclnnDescriptor::MatmulAclnnDescriptor(Device _device) {
device = _device;
device_id = 0;
executor = nullptr;
info = nullptr;
cDesc = new aclnnTensorDescriptor();
aDesc = new aclnnTensorDescriptor();
bDesc = new aclnnTensorDescriptor();
alpha = 1.0;
beta = 0;
mt = 1;
workspaceSize = 0;
}
infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle,
MatmulAclnnDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta,
int8_t mt) {
DT dtype = c_desc->dt;
if (dtype != F16 && dtype != F32) {
return STATUS_BAD_TENSOR_DTYPE;
}
*desc_ptr = new MatmulAclnnDescriptor(handle->device);
(*desc_ptr)->device_id = handle->device_id;
(*desc_ptr)->dtype = dtype;
(*desc_ptr)->mt = mt;
(*desc_ptr)->alpha = alpha;
(*desc_ptr)->beta = beta;
infiniopStatus_t status = STATUS_EXECUTION_FAILED;
auto info = new MatmulInfo(c_desc, a_desc, b_desc, &status, false);
if (status != STATUS_SUCCESS) {
return status;
}
(*desc_ptr)->info = info;
auto &cDesc = (*desc_ptr)->cDesc;
auto &aDesc = (*desc_ptr)->aDesc;
auto &bDesc = (*desc_ptr)->bDesc;
// Treat A, B, C as 2D matrices; the same aclnnTensorDescriptors are reused across the batched operation
CHECK_STATUS(cDesc->setDescriptor(toAclDataType(c_desc->dt), {info->c_matrix.rows, info->c_matrix.cols}, {info->c_matrix.row_stride, info->c_matrix.col_stride}), STATUS_SUCCESS);
CHECK_STATUS(aDesc->setDescriptor(toAclDataType(a_desc->dt), {info->a_matrix.rows, info->a_matrix.cols}, {info->a_matrix.row_stride, info->a_matrix.col_stride}), STATUS_SUCCESS);
CHECK_STATUS(bDesc->setDescriptor(toAclDataType(b_desc->dt), {info->b_matrix.rows, info->b_matrix.cols}, {info->b_matrix.row_stride, info->b_matrix.col_stride}), STATUS_SUCCESS);
CHECK_STATUS(cDesc->createTensor(), STATUS_SUCCESS);
CHECK_STATUS(aDesc->createTensor(), STATUS_SUCCESS);
CHECK_STATUS(bDesc->createTensor(), STATUS_SUCCESS);
auto &workspaceSize = (*desc_ptr)->workspaceSize;
auto &executor = (*desc_ptr)->executor;
aclTensor *tc = cDesc->t;
aclTensor *ta = aDesc->t;
aclTensor *tb = bDesc->t;
aclnnStatus ret;
int64_t transA = 0;
int64_t transB = 0;
// aclnnGemm supports C = alpha * A @ B + beta * C
// see https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, (*desc_ptr)->alpha, (*desc_ptr)->beta, transA, transB, tc,
(*desc_ptr)->mt, &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return STATUS_EXECUTION_FAILED);
aclSetAclOpExecutorRepeatable(executor);
return STATUS_SUCCESS;
}
infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc,
uint64_t *size) {
*size = desc->workspaceSize;
return STATUS_SUCCESS;
}
infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
void *stream) {
auto &cDesc = desc->cDesc;
auto &aDesc = desc->aDesc;
auto &bDesc = desc->bDesc;
aclTensor *tc = cDesc->t;
aclTensor *ta = aDesc->t;
aclTensor *tb = bDesc->t;
auto batch = desc->info->batch;
auto &executor = desc->executor;
auto &workspaceSize = desc->workspaceSize;
// Set running on the handle's device
aclrtSetDevice(desc->device_id);
for (int i = 0; i < batch; i++) {
AclSetTensorAddr(executor, 0, ta, (char *) (a) + i * desc->info->a_matrix.stride * desc->dtype.size);
AclSetTensorAddr(executor, 1, tb, (char *) (b) + i * desc->info->b_matrix.stride * desc->dtype.size);
AclSetTensorAddr(executor, 2, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size);
AclSetTensorAddr(executor, 3, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size);
aclnnStatus ret = aclnnGemm(workspace,
workspaceSize,
executor,
stream);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
return STATUS_EXECUTION_FAILED);
}
return STATUS_SUCCESS;
}
infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc) {
delete desc->cDesc;
delete desc->bDesc;
delete desc->aDesc;
delete desc->info;
aclDestroyAclOpExecutor(desc->executor);
delete desc;
return STATUS_SUCCESS;
}
#ifndef __ACLNN_MATMUL_H__
#define __ACLNN_MATMUL_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "../blas.h"
#include "operators.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <aclnnop/level2/aclnn_gemm.h>
#include <aclnnop/aclnn_matmul.h>
struct MatmulAclnnDescriptor {
Device device;
int device_id;
aclOpExecutor* executor;
MatmulInfo* info;
DT dtype;
aclnnTensorDescriptor_t cDesc, aDesc, bDesc;
// cubeMathType
// see doc: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
float alpha;
float beta;
int8_t mt;
uint64_t workspaceSize;
MatmulAclnnDescriptor(Device _device);
};
typedef struct MatmulAclnnDescriptor *MatmulAclnnDescriptor_t;
infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle,
MatmulAclnnDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta,
int8_t cubeMathType);
infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc,
uint64_t *size);
infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream);
infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc);
#endif
#include "matmul_cnnl.h"
#include "../../../devices/bang/bang_handle.h"
#include "../../../devices/bang/common_bang.h"
#include "../../utils.h"
#include "cnrt.h"
infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
MatmulBangDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta) {
infiniopStatus_t status = STATUS_EXECUTION_FAILED;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, false);
if (status != STATUS_SUCCESS) {
return status;
}
cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
cnnlCreateTensorDescriptor(&aDesc);
cnnlCreateTensorDescriptor(&bDesc);
cnnlCreateTensorDescriptor(&cDesc);
setMatrixTensorEx(aDesc, info.a_matrix);
setMatrixTensorEx(bDesc, info.b_matrix);
setMatrixTensorEx(cDesc, info.c_matrix);
cnnlMatMulDescriptor_t opDesc;
cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult;
cnnlMatMulDescCreate(&opDesc);
cnnlMatMulAlgoCreate(&algo);
cnnlCreateMatMulHeuristicResult(&algoResult);
int32_t use_stride = true;
cnnlSetMatMulDescAttr(opDesc, CNNL_MATMUL_USE_STRIDE, &use_stride,
sizeof(int32_t));
*desc_ptr = new MatmulBangDescriptor{
handle->device,
handle->device_id,
info,
alpha,
beta,
c_desc->dt,
handle->cnnl_handles,
aDesc,
bDesc,
cDesc,
opDesc,
algo,
algoResult};
return STATUS_SUCCESS;
}
infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size) {
*size = 0;
return STATUS_SUCCESS;
}
infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc) {
desc->cnnl_handles = nullptr;
cnnlDestroyTensorDescriptor(desc->aDesc);
cnnlDestroyTensorDescriptor(desc->bDesc);
cnnlDestroyTensorDescriptor(desc->cDesc);
cnnlMatMulDescDestroy(desc->opDesc);
cnnlMatMulAlgoDestroy(desc->algo);
cnnlDestroyMatMulHeuristicResult(desc->algoResult);
delete desc;
return STATUS_SUCCESS;
}
void matmul_cnnl_f16(MatmulBangDescriptor_t desc, void *workspace, void *c, float beta, void const *a, void const *b, float alpha, void *stream) {
auto info = desc->info;
if (info.is_transed) {
std::swap(a, b);
}
use_cnnl(desc->cnnl_handles, desc->device_id, (cnrtQueue_t) stream,
[&](cnnlHandle_t handle) {
int count = 0;
cnnlGetBatchMatMulAlgoHeuristic(handle, desc->opDesc, desc->aDesc,
desc->bDesc, desc->cDesc,
NULL, 1, &desc->algoResult, &count);
size_t wsSize;
cnnlGetBatchMatMulHeuristicResult(desc->algoResult, desc->algo, &wsSize);
cnrtMalloc(&workspace, wsSize);
cnnlBatchMatMulBCast_v2(handle, desc->opDesc, desc->algo,
&alpha, desc->aDesc, a,
desc->bDesc, b,
&beta, desc->cDesc, c,
workspace, wsSize);
});
}
infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream) {
if (cnrtSetDevice(desc->device_id) != cnrtSuccess) {
return STATUS_BAD_DEVICE;
}
float alpha = desc->alpha;
float beta = desc->beta;
if (dtype_eq(desc->dtype, F16)) {
matmul_cnnl_f16(desc, workspace, c, beta, a, b, alpha, stream);
cnrtQueueSync((cnrtQueue_t)stream);
return STATUS_SUCCESS;
}
return STATUS_BAD_TENSOR_DTYPE;
}
#ifndef __CNNL_MATMUL_H__
#define __CNNL_MATMUL_H__
#include "../../../devices/bang/bang_handle.h"
#include "../blas.h"
#include "cnnl.h"
#include "cnnl_extra.h"
#include "operators.h"
struct MatmulBangDescriptor {
Device device;
int device_id;
MatmulInfo info;
float alpha;
float beta;
DT dtype;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles;
cnnlTensorDescriptor_t aDesc;
cnnlTensorDescriptor_t bDesc;
cnnlTensorDescriptor_t cDesc;
cnnlMatMulDescriptor_t opDesc;
cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult;
};
typedef struct MatmulBangDescriptor *MatmulBangDescriptor_t;
infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
MatmulBangDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta);
infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size);
infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream);
infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc);
inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &matrix, bool trans = false) {
int ndim = matrix.ndim;
int batch = matrix.batch;
int stride = static_cast<int>(matrix.stride);
int rows = matrix.rows;
int cols = matrix.cols;
int row_stride = matrix.row_stride;
int col_stride = matrix.col_stride;
if (ndim == 3) {
std::vector<int> dim_size = {batch, rows, cols};
std::vector<int> dim_stride = {stride, row_stride, col_stride};
cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
dim_size.size(), dim_size.data(), dim_stride.data());
} else if (ndim == 2) {
std::vector<int> dim_size = {rows, cols};
std::vector<int> dim_stride = {row_stride, col_stride};
cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
dim_size.size(), dim_size.data(), dim_stride.data());
}
}
#endif// __CNNL_MATMUL_H__
#ifndef __BLAS_H__
#define __BLAS_H__
#include "../utils.h"
#include "infiniop/operator.h"
#include <algorithm>
#include <stdint.h>
typedef struct BlasMatrix {
size_t ndim;
size_t batch;
int64_t stride;
size_t rows;
size_t cols;
int64_t row_stride;
int64_t col_stride;
BlasMatrix() {}
BlasMatrix(infiniopTensorDescriptor_t layout, infiniopStatus_t *status) {
if (layout->ndim == 2) {
this->ndim = 2;
this->batch = 1;
this->stride = 0;
this->rows = layout->shape[0];
this->cols = layout->shape[1];
this->row_stride = layout->strides[0];
this->col_stride = layout->strides[1];
} else if (layout->ndim == 3) {
this->ndim = 3;
this->batch = layout->shape[0];
this->stride = this->batch == 1 ? 0 : layout->strides[0];
this->rows = layout->shape[1];
this->cols = layout->shape[2];
this->row_stride = layout->strides[1];
this->col_stride = layout->strides[2];
} else {
*status = INFINIOP_STATUS_BAD_TENSOR_SHAPE;
return;
}
if (this->row_stride != 1 && this->col_stride != 1) {
*status = INFINIOP_STATUS_BAD_TENSOR_STRIDES;
return;
}
*status = INFINIOP_STATUS_SUCCESS;
}
bool match_batch(size_t batch) const {
return this->batch == batch || this->batch == 1;
}
void transpose() {
std::swap(rows, cols);
std::swap(row_stride, col_stride);
}
int ld() const {
if (this->row_stride == 1) {
return this->col_stride;
} else {
return this->row_stride;
}
}
} BlasMatrix;
struct MatmulInfo {
BlasMatrix a_matrix;
BlasMatrix b_matrix;
BlasMatrix c_matrix;
size_t m, n, k, batch;
bool is_transed = false;
MatmulInfo(infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc, infiniopStatus_t *status, bool col_major = true) {
a_matrix = BlasMatrix(a_desc, status);
if (*status != INFINIOP_STATUS_SUCCESS) {
return;
}
b_matrix = BlasMatrix(b_desc, status);
if (*status != INFINIOP_STATUS_SUCCESS) {
return;
}
c_matrix = BlasMatrix(c_desc, status);
if (*status != INFINIOP_STATUS_SUCCESS) {
return;
}
if (c_matrix.rows != a_matrix.rows || c_matrix.cols != b_matrix.cols || a_matrix.cols != b_matrix.rows){
*status = INFINIOP_STATUS_BAD_TENSOR_SHAPE;
return;
}
batch = c_matrix.batch;
if (!a_matrix.match_batch(batch) || !b_matrix.match_batch(batch)) {
*status = INFINIOP_STATUS_BAD_TENSOR_SHAPE;
return;
}
if ((col_major && c_matrix.col_stride == 1) || (!col_major && c_matrix.row_stride == 1)) {
c_matrix.transpose();
b_matrix.transpose();
a_matrix.transpose();
std::swap(a_matrix, b_matrix);
is_transed = true;
}
m = c_matrix.rows;
n = c_matrix.cols;
k = a_matrix.cols;
}
};
#endif// __BLAS_H__
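// --- usage sketch (not part of this commit): how MatmulInfo normalizes
// row-major operands for a column-major BLAS (col_major = true, the default).
// For a contiguous row-major C the constructor transposes A, B and C and
// swaps A with B, so the backend effectively computes C^T = B^T * A^T into
// the same buffer; is_transed tells the caller to swap the a/b data pointers
// as well. Assumes the header above is reachable as "blas.h".
#include "blas.h"
#include "infiniop/tensor_descriptor.h"
#include <cstdio>

int main() {
    // C[4x5] = A[4x3] * B[3x5], all contiguous row-major (default strides).
    size_t a_shape[2] = {4, 3}, b_shape[2] = {3, 5}, c_shape[2] = {4, 5};
    infiniopTensorDescriptor_t a, b, c;
    infiniopCreateTensorDescriptor(&a, 2, a_shape, nullptr, INFINI_DTYPE_F32);
    infiniopCreateTensorDescriptor(&b, 2, b_shape, nullptr, INFINI_DTYPE_F32);
    infiniopCreateTensorDescriptor(&c, 2, c_shape, nullptr, INFINI_DTYPE_F32);

    infiniopStatus_t status;
    MatmulInfo info(c, a, b, &status);
    if (status == INFINIOP_STATUS_SUCCESS) {
        // With row-major inputs: is_transed == true, m == 5, n == 4, k == 3.
        std::printf("m=%zu n=%zu k=%zu transed=%d\n",
                    info.m, info.n, info.k, (int) info.is_transed);
    }

    infiniopDestroyTensorDescriptor(a);
    infiniopDestroyTensorDescriptor(b);
    infiniopDestroyTensorDescriptor(c);
    return 0;
}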
#include "./matmul_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../utils.h"
#include <cmath>
infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle,
MatmulCpuDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
*desc_ptr = new MatmulCpuDescriptor{
INFINI_DEVICE_CPU,
dtype,
info};
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size) {
*size = 0;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc) {
delete desc;
return INFINIOP_STATUS_SUCCESS;
}
template<typename Tdata>
infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha) {
auto info = desc->info;
if (info.is_transed) {
std::swap(a, b);
}
for (int i = 0; i < info.batch; ++i) {
for (int m_ = 0; m_ < info.m; ++m_) {
for (int n_ = 0; n_ < info.n; ++n_) {
auto c_ = reinterpret_cast<Tdata *>(c) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride;
float sum = 0;
for (int k_ = 0; k_ < info.k; ++k_) {
auto a_ = reinterpret_cast<Tdata const *>(a) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride;
auto b_ = reinterpret_cast<Tdata const *>(b) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride;
if constexpr (std::is_same<Tdata, uint16_t>::value) {
sum += f16_to_f32(*a_) * f16_to_f32(*b_);
} else {
sum += *a_ * (*b_);
}
}
if constexpr (std::is_same<Tdata, uint16_t>::value) {
if (beta == 0) {
*c_ = f32_to_f16(alpha * sum);
} else {
*c_ = f32_to_f16(beta * f16_to_f32(*c_) + alpha * sum);
}
} else {
*c_ = beta * (*c_) + alpha * sum;
}
}
}
}
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
float alpha,
float beta) {
if (desc->dtype == INFINI_DTYPE_F16) {
return matmul_cpu<uint16_t>(desc, c, beta, a, b, alpha);
}
if (desc->dtype == INFINI_DTYPE_F32) {
return matmul_cpu<float>(desc, c, beta, a, b, alpha);
}
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
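// --- usage sketch (not part of this commit): why the F16 path above stores
// elements as uint16_t. Half-precision values are kept as raw 16-bit patterns
// and converted with the f16_to_f32 / f32_to_f16 helpers so the accumulation
// runs in float, exactly as matmul_cpu<uint16_t> does. Assumes common_cpu.h
// (included by matmul_cpu.cc above) is on the include path; the helper
// signatures are inferred from their use in that file.
#include "common_cpu.h"
#include <cstdint>
#include <cstdio>

int main() {
    float a = 1.5f, b = -2.25f;
    uint16_t ha = f32_to_f16(a), hb = f32_to_f16(b);
    // Accumulate in f32, then round the result back to f16 storage.
    float sum = f16_to_f32(ha) * f16_to_f32(hb);
    uint16_t hc = f32_to_f16(sum);
    std::printf("%f stored as 0x%04x\n", f16_to_f32(hc), (unsigned) hc); // -3.375
    return 0;
}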
#ifndef __INFINIOP_MATMUL_CPU_H__
#define __INFINIOP_MATMUL_CPU_H__
#include "../../../devices/cpu/cpu_handle.h"
#include "../blas.h"
#include "infiniop/operator.h"
typedef struct MatmulCpuDescriptor {
infiniDevice_t device;
infiniDtype_t dtype;
MatmulInfo info;
} MatmulCpuDescriptor;
typedef struct MatmulCpuDescriptor *MatmulCpuDescriptor_t;
infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle,
MatmulCpuDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);
infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size);
infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
float alpha,
float beta);
infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc);
#endif// __INFINIOP_MATMUL_CPU_H__
#include "./matmul_cuda.cuh"
#include "../../utils.h"
infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
infiniopMatmulCudaDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
*desc_ptr = new InfiniopMatmulCudaDescriptor{
handle->device,
dtype,
handle->device_id,
info,
handle->cublas_handles_t};
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cudaGetMatmulWorkspaceSize(infiniopMatmulCudaDescriptor_t desc, uint64_t *size) {
*size = 0;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cudaDestroyMatmulDescriptor(infiniopMatmulCudaDescriptor_t desc) {
desc->cublas_handles_t = nullptr;
delete desc;
return INFINIOP_STATUS_SUCCESS;
}
#ifndef __INFINIOP_MATMUL_CUDA_H__
#define __INFINIOP_MATMUL_CUDA_H__
#include "matmul_cuda_api.h"
#include "../../../devices/cuda/common_cuda.cuh"
#include <memory>
#include "../blas.h"
typedef struct InfiniopMatmulCudaDescriptor {
infiniDevice_t device;
infiniDtype_t dtype;
int device_id;
MatmulInfo info;
std::shared_ptr<Pool<cublasHandle_t>> cublas_handles_t;
} InfiniopMatmulCudaDescriptor;
#endif// __INFINIOP_MATMUL_CUDA_H__
#ifndef __INFINIOP_MATMUL_CUDA_API_H__
#define __INFINIOP_MATMUL_CUDA_API_H__
#include "../../../devices/cuda/cuda_handle.h"
#include "infiniop/operator.h"
struct InfiniopMatmulCudaDescriptor;
typedef struct InfiniopMatmulCudaDescriptor *infiniopMatmulCudaDescriptor_t;
infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
infiniopMatmulCudaDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);
infiniopStatus_t cudaGetMatmulWorkspaceSize(infiniopMatmulCudaDescriptor_t desc, uint64_t *size);
infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
float alpha,
float beta,
void *stream);
infiniopStatus_t cudaDestroyMatmulDescriptor(infiniopMatmulCudaDescriptor_t desc);
#endif // __INFINIOP_MATMUL_CUDA_API_H__
#include "../../utils.h"
#include "./matmul_cuda.cuh"
template<typename Tdata>
infiniopStatus_t matmul_cuda(infiniopMatmulCudaDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha, void *stream) {
auto info = desc->info;
if (info.is_transed) {
std::swap(a, b);
}
cudaDataType a_type, b_type, c_type;
cublasComputeType_t compute_type;
if constexpr (std::is_same<Tdata, half>::value) {
a_type = b_type = c_type = CUDA_R_16F;
compute_type = CUBLAS_COMPUTE_32F;
} else {
a_type = b_type = c_type = CUDA_R_32F;
#ifdef ENABLE_SUGON_CUDA_API
compute_type = CUBLAS_COMPUTE_32F;
#else
compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
#endif
}
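// cuBLAS assumes column-major storage. MatmulInfo (constructed with the
// default col_major = true) has already transposed A, B and C and swapped
// A with B when C was row-major, so the C buffer can be written as-is.
// Each operand is then passed as OP_N when its row stride is 1 (already
// column-major in memory) and as OP_T otherwise; ld() returns the matching
// leading dimension.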
auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
use_cublas(desc->cublas_handles_t, desc->device_id, (cudaStream_t) stream,
[&](cublasHandle_t handle) { cublasGemmStridedBatchedEx(
handle,
op_a,
op_b,
info.m,
info.n,
info.k,
&alpha,
a,
a_type,
info.a_matrix.ld(),
info.a_matrix.stride,
b,
b_type,
info.b_matrix.ld(),
info.b_matrix.stride,
&beta,
c,
c_type,
info.c_matrix.ld(),
info.c_matrix.stride,
info.batch,
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP); });
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
float alpha,
float beta,
void *stream) {
if (desc->dtype == INFINI_DTYPE_F16) {
return matmul_cuda<half>(desc, c, beta, a, b, alpha, stream);
}
if (desc->dtype == INFINI_DTYPE_F32) {
return matmul_cuda<float>(desc, c, beta, a, b, alpha, stream);
}
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
#include "../utils.h"
#include "infiniop/ops/matmul.h"
#ifdef ENABLE_CPU_API
#include "cpu/matmul_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/matmul_cuda_api.h"
#endif
#ifdef ENABLE_CAMBRICON_MLU
#include "bang/matmul_cnnl.h"
#endif
#ifdef ENABLE_ASCEND_NPU
#include "ascend/matmul_aclnn.h"
#endif
__C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle,
infiniopMatmulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
switch (handle->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return cpuCreateMatmulDescriptor((infiniopCpuHandle_t) handle, (MatmulCpuDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return cudaCreateMatmulDescriptor((infiniopCudaHandle_t) handle, (infiniopMatmulCudaDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateMatmulDescriptor((BangHandle_t) handle, (MatmulBangDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
// alpha and beta are baked into the Ascend descriptor at creation time;
// use the defaults alpha = 1, beta = 0 here.
return aclnnCreateMatmulDescriptor((AscendHandle_t) handle,
(MatmulAclnnDescriptor_t *) desc_ptr,
c_desc,
1.0f,
a_desc,
b_desc,
0.0f,
1);
}
#endif
}
return INFINIOP_STATUS_BAD_DEVICE;
}
__C infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, uint64_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return cpuGetMatmulWorkspaceSize((MatmulCpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return cudaGetMatmulWorkspaceSize((infiniopMatmulCudaDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangGetMatmulWorkspaceSize((MatmulBangDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t) desc,
size);
}
#endif
}
return INFINIOP_STATUS_BAD_DEVICE;
}
__C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, float alpha, float beta, void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return cpuMatmul((MatmulCpuDescriptor_t) desc, workspace, workspace_size, c, a, b, alpha, beta);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA:
return cudaMatmul((infiniopMatmulCudaDescriptor_t) desc, workspace, workspace_size, c, a, b, alpha, beta, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangMatmul((MatmulBangDescriptor_t) desc, workspace, workspace_size, c, a, b, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu:
return aclnnMatmul((MatmulAclnnDescriptor_t) desc,
workspace,
workspace_size,
c,
a,
b,
stream);
#endif
}
return INFINIOP_STATUS_BAD_DEVICE;
}
__C infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) {
switch (desc->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return cpuDestroyMatmulDescriptor((MatmulCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return cudaDestroyMatmulDescriptor((infiniopMatmulCudaDescriptor_t) desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroyMatmulDescriptor((MatmulBangDescriptor_t) desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t) desc);
}
#endif
}
return INFINIOP_STATUS_BAD_DEVICE;
}
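// --- usage sketch (not part of this commit): the full public matmul flow on
// the CPU backend. C[2x2] = 1.0 * A[2x3] @ B[3x2] + 0.0 * C, all contiguous
// row-major f32. Header paths follow the includes used in this commit.
#include "infiniop/handle.h"
#include "infiniop/ops/matmul.h"
#include "infiniop/tensor_descriptor.h"
#include <cstdio>

int main() {
    infiniopHandle_t handle;
    infiniopCreateHandle(&handle, INFINI_DEVICE_CPU, 0);

    size_t a_shape[2] = {2, 3}, b_shape[2] = {3, 2}, c_shape[2] = {2, 2};
    infiniopTensorDescriptor_t a_desc, b_desc, c_desc;
    infiniopCreateTensorDescriptor(&a_desc, 2, a_shape, nullptr, INFINI_DTYPE_F32);
    infiniopCreateTensorDescriptor(&b_desc, 2, b_shape, nullptr, INFINI_DTYPE_F32);
    infiniopCreateTensorDescriptor(&c_desc, 2, c_shape, nullptr, INFINI_DTYPE_F32);

    infiniopMatmulDescriptor_t matmul_desc;
    infiniopCreateMatmulDescriptor(handle, &matmul_desc, c_desc, a_desc, b_desc);

    float a[6] = {1, 2, 3, 4, 5, 6};
    float b[6] = {1, 0, 0, 1, 1, 1};
    float c[4] = {0, 0, 0, 0};
    // No workspace is needed on the CPU path (cpuGetMatmulWorkspaceSize returns 0).
    infiniopMatmul(matmul_desc, nullptr, 0, c, a, b, 1.0f, 0.0f, nullptr);
    std::printf("%.0f %.0f %.0f %.0f\n", c[0], c[1], c[2], c[3]); // 4 5 10 11

    infiniopDestroyMatmulDescriptor(matmul_desc);
    infiniopDestroyTensorDescriptor(a_desc);
    infiniopDestroyTensorDescriptor(b_desc);
    infiniopDestroyTensorDescriptor(c_desc);
    infiniopDestroyHandle(handle);
    return 0;
}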
#ifndef __UTILS_H__
#define __UTILS_H__
#include "infiniop/tensor_descriptor.h"
#include <algorithm>
#include <iostream>
#include <numeric>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
/* This file contains some useful macros and helper functions */
#define ROUND_UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define CHECK_ERROR(call, target, errCode) \
do { \
if (auto value = (call); value == (target)) { \
std::cerr << "Error: got error value " << value \
<< " in file " << __FILE__ \
<< ", function " << __func__ \
<< ", line " << __LINE__ << std::endl; \
return (errCode); \
} \
} while (0)
#define CREATE_CHECK_ERROR(expr, value, target, errCode) \
expr; \
CHECK_ERROR(value, target, errCode)
#define CHECK_STATUS(call, target) \
do { \
if (auto value = (call); value != (target)) { \
std::cerr << "Error: expected " << (target) \
<< " but got " << value \
<< " in file " << __FILE__ \
<< ", function " << __func__ \
<< ", line " << __LINE__ << std::endl; \
return value; \
} \
} while (0)
inline std::vector<int64_t> get_byte_strides(infiniopTensorDescriptor_t desc) {
std::vector<int64_t> strides(desc->ndim);
for (uint64_t i = 0; i < desc->ndim; i++) {
strides[i] = desc->strides[i] * infini_sizeof(desc->dtype);
}
return strides;
}
// calculate the broadcasted shape for two tensors
inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
const uint64_t *shape2, uint64_t ndim2,
uint64_t *broadcast_shape, uint64_t *padded_shape1,
uint64_t *padded_shape2, uint64_t max_rank) {
// prepending and initializing
std::fill(padded_shape1, padded_shape1 + max_rank, 1);
std::fill(padded_shape2, padded_shape2 + max_rank, 1);
std::copy(shape1, shape1 + ndim1, padded_shape1 + max_rank - ndim1);
std::copy(shape2, shape2 + ndim2, padded_shape2 + max_rank - ndim2);
// compute broadcasted shape
for (size_t i = 0; i < max_rank; ++i) {
if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) {
broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]);
} else {
return false;
}
}
return true;
}
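// Example: shape1 = {2, 1, 3} (ndim1 = 3), shape2 = {4, 1} (ndim2 = 2),
// max_rank = 3. shape2 is right-aligned and padded to {1, 4, 1}, and the
// result is broadcast_shape = {2, 4, 3}. A pair such as {2, 3} and {4, 3}
// is rejected (returns false) because 2 and 4 differ and neither is 1.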
// check if the shape of tensor c is valid after broadcasting tensors a and b and also get the broadcasted shapes
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c,
uint64_t broadcast_ndim) {
std::vector<uint64_t>
broadcast_shape_(broadcast_ndim),
padded_shape1_(broadcast_ndim),
padded_shape2_(broadcast_ndim);
auto broadcast_shape = broadcast_shape_.data(),
padded_shape1 = padded_shape1_.data(),
padded_shape2 = padded_shape2_.data();
if (broadcast_ndim != c->ndim || !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) {
return false;
}
return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim, c->shape);
}
// check if the shape of tensor src can be validly broadcasted to that of the tensor dst
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensorDescriptor_t src) {
if (dst->ndim < src->ndim) {
return false;
}
std::vector<size_t> padded_shape_(dst->ndim);
auto padded_shape = padded_shape_.data();
std::fill(padded_shape, padded_shape + dst->ndim, 1);
std::copy(src->shape, src->shape + src->ndim, padded_shape + dst->ndim - src->ndim);
for (size_t i = 0; i < dst->ndim; ++i) {
if (padded_shape[i] != dst->shape[i] && padded_shape[i] != 1) {
return false;
}
}
return true;
}
// check if the shape of tensor c is valid after broadcasting tensors a and b
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c) {
return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim));
}
inline size_t get_byte_size(infiniopTensorDescriptor_t desc) {
size_t size = 1;
for (size_t i = 0; i < desc->ndim; i++) {
size *= desc->shape[i];
}
return size * infini_sizeof(desc->dtype);
}
// permute the dimensions of a tensor descriptor
inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, const std::vector<size_t> &order) {
size_t ndim = desc->ndim;
if (order.size() != ndim) {
return nullptr;
}
size_t *shape = new size_t[ndim];
int64_t *strides = new int64_t[ndim];
for (size_t i = 0; i < ndim; i++) {
if (std::find(order.begin(), order.end(), i) == order.end()) {
// invalid permutation: free the partially built arrays before bailing out
delete[] shape;
delete[] strides;
return nullptr;
}
shape[i] = desc->shape[order[i]];
strides[i] = desc->strides[order[i]];
}
return new InfiniopTensorDescriptor{
desc->dtype, ndim, shape, strides};
}
// check if the dimensions [dim_start, dim_end] of a tensor descriptor are contiguous
inline bool is_contiguous(const infiniopTensorDescriptor_t &desc, size_t dim_start, size_t dim_end) {
for (size_t i = dim_start + 1; i <= dim_end; i++) {
if (desc->strides[i - 1] != static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
return false;
}
}
return true;
}
inline bool is_contiguous(const infiniopTensorDescriptor_t &desc) {
if (desc->ndim == 0) {
return true;
}
return is_contiguous(desc, 0, desc->ndim - 1);
}
// merge the dimensions [dim_start, dim_end] of a tensor descriptor
inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, size_t dim_start, size_t dim_end) {
size_t ndim = desc->ndim;
if (dim_start > dim_end || dim_end >= ndim) {
return nullptr;
}
// the merged dimensions must be contiguous in memory
if (!is_contiguous(desc, dim_start, dim_end)) {
return nullptr;
}
size_t new_ndim = ndim - (dim_end - dim_start);
size_t *new_shape = new size_t[new_ndim];
int64_t *new_strides = new int64_t[new_ndim];
size_t index = 0;
for (size_t i = 0; i < dim_start; i++) {
new_shape[index] = desc->shape[i];
new_strides[index] = desc->strides[i];
index++;
}
new_shape[index] = 1;
for (size_t i = dim_start; i <= dim_end; i++) {
new_shape[index] *= desc->shape[i];
}
new_strides[index] = desc->strides[dim_end];
index++;
for (size_t i = dim_end + 1; i < ndim; i++) {
new_shape[index] = desc->shape[i];
new_strides[index] = desc->strides[i];
index++;
}
return new InfiniopTensorDescriptor{
desc->dtype, new_ndim, new_shape, new_strides};
}
// split the dimension dim of a tensor descriptor into multiple dimensions
inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, size_t dim, const std::vector<size_t> &dims) {
size_t ndim = desc->ndim;
if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies{})) {
return nullptr;
}
size_t new_ndim = ndim + dims.size() - 1;
size_t *new_shape = new size_t[new_ndim];
int64_t *new_strides = new int64_t[new_ndim];
size_t index = 0;
for (size_t i = 0; i < dim; i++) {
new_shape[index] = desc->shape[i];
new_strides[index] = desc->strides[i];
index++;
}
for (size_t i = 0; i < dims.size(); i++) {
new_shape[index] = dims[i];
new_strides[index] = desc->strides[dim] * desc->shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, (size_t) 1, std::multiplies<size_t>());
index++;
}
for (size_t i = dim + 1; i < ndim; i++) {
new_shape[index] = desc->shape[i];
new_strides[index] = desc->strides[i];
index++;
}
return new InfiniopTensorDescriptor{
desc->dtype, new_ndim, new_shape, new_strides};
}
#endif// __UTILS_H__
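// --- usage sketch (not part of this commit): the descriptor helpers above on
// a contiguous {2, 3, 4} f32 tensor. permute reorders shape/strides without
// touching data, dim_merge collapses contiguous axes, and dim_split expands
// one axis; each returns nullptr when the request is invalid. Assumes the
// header above is reachable as "utils.h".
#include "utils.h"
#include <cstdio>

int main() {
    size_t shape[3] = {2, 3, 4};
    infiniopTensorDescriptor_t desc;
    infiniopCreateTensorDescriptor(&desc, 3, shape, nullptr, INFINI_DTYPE_F32);

    auto permuted = permute(desc, {2, 0, 1}); // shape {4, 2, 3}, strides {1, 12, 4}
    auto merged = dim_merge(desc, 1, 2);      // shape {2, 12}, strides {12, 1}
    auto split = dim_split(desc, 2, {2, 2});  // shape {2, 3, 2, 2}
    std::printf("%d %d %d\n", permuted != nullptr, merged != nullptr, split != nullptr);

    infiniopDestroyTensorDescriptor(desc);
    return 0;
}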
#include "infiniop/tensor_descriptor.h"
#include <cstring>
__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, size_t const *shape_, int64_t const *strides_, infiniDtype_t datatype) {
size_t *shape = new size_t[ndim];
int64_t *strides = new int64_t[ndim];
std::memcpy(shape, shape_, ndim * sizeof(size_t));
if (strides_) {
std::memcpy(strides, strides_, ndim * sizeof(int64_t));
} else {
int64_t dsize = 1;
for (size_t i = ndim; i-- > 0;) {
strides[i] = dsize;
dsize *= shape[i];
}
}
*desc_ptr = new InfiniopTensorDescriptor{datatype, ndim, shape, strides};
return INFINIOP_STATUS_SUCCESS;
}
__C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc) {
delete[] desc->shape;
delete[] desc->strides;
delete desc;
return INFINIOP_STATUS_SUCCESS;
}
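// --- usage sketch (not part of this commit): passing strides_ == nullptr to
// infiniopCreateTensorDescriptor produces contiguous row-major strides,
// e.g. shape {2, 3, 4} -> strides {12, 4, 1}.
#include "infiniop/tensor_descriptor.h"
#include <cstdio>

int main() {
    size_t shape[3] = {2, 3, 4};
    infiniopTensorDescriptor_t desc;
    infiniopCreateTensorDescriptor(&desc, 3, shape, nullptr, INFINI_DTYPE_F32);
    for (size_t i = 0; i < desc->ndim; i++) {
        std::printf("%lld ", (long long) desc->strides[i]); // 12 4 1
    }
    std::printf("\n");
    infiniopDestroyTensorDescriptor(desc);
    return 0;
}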