Commit 63d9c4a1 authored by YdrMaster's avatar YdrMaster
Browse files

issue/87/refactor: 修改所有 handle 和 matmul op 命名空间,并重构 cuda handle


Signed-off-by: default avatarYdrMaster <ydrml@hotmail.com>
parent 601defcb
#include "cpu_handle.h" #include "cpu_handle.h"
namespace infiniop::cpu { namespace device::cpu {
Handle::Handle() : InfiniopHandle{INFINI_DEVICE_CPU, 0} {} Handle::Handle() : InfiniopHandle{INFINI_DEVICE_CPU, 0} {}
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr) { infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int) {
*handle_ptr = new Handle{}; *handle_ptr = new Handle{};
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace infiniop::cpu } // namespace device::cpu
...@@ -3,13 +3,15 @@ ...@@ -3,13 +3,15 @@
#include "../../handle.h" #include "../../handle.h"
namespace infiniop::cpu { namespace device::cpu {
class Handle : public InfiniopHandle { class Handle : public InfiniopHandle {
Handle(); Handle();
public: public:
static infiniStatus_t create(InfiniopHandle **handle_ptr); static infiniStatus_t create(InfiniopHandle **handle_ptr, int);
}; };
} // namespace infiniop::cpu
} // namespace device::cpu
#endif #endif
#ifndef __INFINIOP_COMMON_CUDA_H__
#define __INFINIOP_COMMON_CUDA_H__
// Hardware launch limits used when sizing kernel launches.
// NOTE(review): 1024 threads/block and warp size 32 hold for all current
// NVIDIA GPUs, but are device properties — confirm against cudaDeviceProp.
#define MAX_THREADS_PER_BLOCK 1024
#define MAX_WARP_PER_BLOCK 32
#define WARP_SIZE 32
#include "../../../utils.h"
#include <iostream>
// Error-checking wrappers built on the project-wide CHECK_* macros in utils.h:
// CHECK_CUDA_OR_RETURN returns ERROR from the enclosing function on failure;
// CHECK_CUDA / CHECK_CUDNN compare against the library's success status.
#define CHECK_CUDA_OR_RETURN(API, ERROR) CHECK_API_OR(API, cudaSuccess, return ERROR)
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
#include "../pool.h"
#include "cuda_handle.h"
#include "infinicore.h"
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cudnn.h>
#include <memory>
// Per-device CUDA state shared by operator implementations: identity of the
// device plus pooled cuBLAS/cuDNN handles so descriptors can borrow handles
// instead of creating one per call.
struct InfiniopCudaHandle {
infiniDevice_t device;  // backend type this handle was created for (e.g. NVIDIA)
int device_id;          // CUDA device ordinal
std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool;  // reusable cuBLAS handles
std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handle_pool;    // reusable cuDNN handles
cudaDeviceProp prop;    // cached device properties (filled at handle creation)
int compute_capability_major;
int compute_capability_minor;
};
// Borrow a cuBLAS handle from `pool` (creating one if the pool is empty),
// bind it to `stream`, invoke `f(handle)`, then return the handle to the
// pool for reuse.
//
// T: callable taking a cublasHandle_t.
// NOTE(review): cublasCreate's status is not checked here, matching the
// surrounding code's style — a failed creation passes a null handle to `f`.
template <typename T>
void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> &pool, cudaStream_t stream, const T &f) {
    auto handle = pool->pop();
    if (!handle) {
        // Pool exhausted: create a fresh handle and assign it into the
        // wrapper. The previous code wrote through *handle while the wrapper
        // was empty, which is undefined behavior for std::optional-like types.
        cublasHandle_t new_handle = nullptr;
        cublasCreate(&new_handle);
        handle = new_handle;
    }
    cublasSetStream(*handle, stream);
    f(*handle);
    pool->push(std::move(*handle));
}
// Borrow a cuDNN handle from `pool` (creating one if the pool is empty),
// bind it to `stream`, invoke `f(handle)`, then return the handle to the
// pool for reuse.
//
// T: callable taking a cudnnHandle_t.
// NOTE(review): cudnnCreate's status is not checked here, matching the
// surrounding code's style — a failed creation passes a null handle to `f`.
template <typename T>
void use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> &pool, cudaStream_t stream, const T &f) {
    auto handle = pool->pop();
    if (!handle) {
        // Pool exhausted: create a fresh handle and assign it into the
        // wrapper. The previous code wrote through *handle while the wrapper
        // was empty, which is undefined behavior for std::optional-like types.
        cudnnHandle_t new_handle = nullptr;
        cudnnCreate(&new_handle);
        handle = new_handle;
    }
    cudnnSetStream(*handle, stream);
    f(*handle);
    pool->push(std::move(*handle));
}
// Translate an infiniDtype_t into the matching cuDNN data type.
// Dtypes with no cuDNN counterpart silently map to CUDNN_DATA_FLOAT,
// preserving the original fallback behavior.
inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
    if (dt == INFINI_DTYPE_F16) {
        return CUDNN_DATA_HALF;
    }
    if (dt == INFINI_DTYPE_F64) {
        return CUDNN_DATA_DOUBLE;
    }
    if (dt == INFINI_DTYPE_BF16) {
        return CUDNN_DATA_BFLOAT16;
    }
    if (dt == INFINI_DTYPE_I8) {
        return CUDNN_DATA_INT8;
    }
    if (dt == INFINI_DTYPE_I32) {
        return CUDNN_DATA_INT32;
    }
    if (dt == INFINI_DTYPE_I64) {
        return CUDNN_DATA_INT64;
    }
    if (dt == INFINI_DTYPE_U8) {
        return CUDNN_DATA_UINT8;
    }
    // INFINI_DTYPE_F32 and any unrecognized dtype both resolve to float.
    return CUDNN_DATA_FLOAT;
}
// Given the flat index of an element in a broadcasted tensor, compute the
// memory offset of the corresponding element in the original (pre-broadcast)
// tensor. `broadcasted_strides` are the contiguous strides of the broadcasted
// shape; `target_strides` are the original tensor's strides (0 on broadcast
// dims maps all coordinates of that dim to the same element).
inline __device__ __host__ size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remainder = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Coordinate along this dimension, then strip it from the remainder.
        const size_t coord = remainder / broadcasted_strides[dim];
        remainder -= coord * broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
// Convert a flat (row-major) element index into a memory offset for a tensor
// with the given shape and (possibly non-contiguous) strides.
inline __device__ __host__ size_t indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    // Peel coordinates from the innermost dimension outward.
    for (size_t dim = ndim; dim > 0; --dim) {
        const size_t i = dim - 1;
        offset += (remaining % shape[i]) * strides[i];
        remaining /= shape[i];
    }
    return offset;
}
#endif // __INFINIOP_COMMON_CUDA_H__
#include "common_cuda.cuh" #include "cuda_handle.cuh"
infiniStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, infiniDevice_t cuda_device_type) {
// Create a new cublas handle pool
int device_id = 0;
CHECK_CUDA_OR_RETURN(cudaGetDevice(&device_id), INFINI_STATUS_DEVICE_NOT_INITIALIZED);
auto pool = std::make_shared<Pool<cublasHandle_t>>();
cublasHandle_t handle;
cublasCreate(&handle);
pool->push(std::move(handle));
// create a cudnn handle pool
auto cudnn_pool = std::make_shared<Pool<cudnnHandle_t>>();
cudnnHandle_t cudnn_handle;
CHECK_CUDNN(cudnnCreate(&cudnn_handle));
cudnn_pool->push(std::move(cudnn_handle));
// set CUDA device property
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, device_id);
// set device compute capability numbers
int capability_major;
int capability_minor;
cudaDeviceGetAttribute(&capability_major, cudaDevAttrComputeCapabilityMajor, device_id);
cudaDeviceGetAttribute(&capability_minor, cudaDevAttrComputeCapabilityMinor, device_id);
*handle_ptr = new InfiniopCudaHandle{
cuda_device_type,
device_id,
std::move(pool),
std::move(cudnn_pool),
std::move(prop),
capability_major,
capability_minor,
};
return INFINI_STATUS_SUCCESS; namespace device::cuda {
Handle::Handle(infiniDevice_t device, int device_id)
: InfiniopHandle{device, device_id},
_internal(std::make_shared<Handle::Internal>()) {}
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
return _internal;
} }
infiniStatus_t destroyCudaHandle(infiniopCudaHandle_t handle_ptr) { template <typename T>
handle_ptr->cublas_handle_pool = nullptr; using Fn = std::function<void(T)>;
handle_ptr->cudnn_handle_pool = nullptr;
delete handle_ptr; void Handle::Internal::use_cublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const {
auto handle = blas_handles.pop();
if (!handle) {
cublasCreate(&(*handle));
}
cublasSetStream(*handle, stream);
f(*handle);
blas_handles.push(std::move(*handle));
}
void Handle::Internal::use_cudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const {
auto handle = dnn_handles.pop();
if (!handle) {
cudnnCreate(&(*handle));
}
cudnnSetStream(*handle, stream);
f(*handle);
dnn_handles.push(std::move(*handle));
}
cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
switch (dt) {
case INFINI_DTYPE_F16:
return CUDNN_DATA_HALF;
case INFINI_DTYPE_F32:
return CUDNN_DATA_FLOAT;
case INFINI_DTYPE_F64:
return CUDNN_DATA_DOUBLE;
case INFINI_DTYPE_BF16:
return CUDNN_DATA_BFLOAT16;
case INFINI_DTYPE_I8:
return CUDNN_DATA_INT8;
case INFINI_DTYPE_I32:
return CUDNN_DATA_INT32;
case INFINI_DTYPE_I64:
return CUDNN_DATA_INT64;
case INFINI_DTYPE_U8:
return CUDNN_DATA_UINT8;
default:
return CUDNN_DATA_FLOAT;
}
}
namespace nvidia {
Handle::Handle(int device_id)
: cuda::Handle(INFINI_DEVICE_NVIDIA, device_id) {}
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
*handle_ptr = new Handle(device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace nvidia
} // namespace device::cuda
#ifndef __INFINIOP_CUDA_INTERNAL_H__
#define __INFINIOP_CUDA_INTERNAL_H__
#include "../pool.h"
#include "cuda_handle.h"
#include <cublas_v2.h>
#include <cudnn.h>
#include <functional>
namespace device::cuda {
// Shared, lazily-populated pools of cuBLAS and cuDNN handles owned by a
// device::cuda::Handle; operator descriptors borrow handles through the
// use_* methods instead of creating their own.
class Handle::Internal {
Pool<cublasHandle_t> blas_handles;
Pool<cudnnHandle_t> dnn_handles;
public:
// Borrow a cuBLAS handle (creating one if the pool is empty), bind it to
// `stream`, invoke `f`, then return the handle to the pool.
void use_cublas(cudaStream_t stream, const std::function<void(cublasHandle_t)> &f) const;
// Same borrow/use/return pattern for a cuDNN handle.
void use_cudnn(cudaStream_t stream, const std::function<void(cudnnHandle_t)> &f) const;
};
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
// Given the flat index of an element in a broadcasted tensor, compute the
// memory offset of the corresponding element in the original (pre-broadcast)
// tensor. `broadcasted_strides` are the contiguous strides of the broadcasted
// shape; `target_strides` are the original tensor's strides (0 on broadcast
// dims maps all coordinates of that dim to the same element).
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remainder = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Coordinate along this dimension, then strip it from the remainder.
        const size_t coord = remainder / broadcasted_strides[dim];
        remainder -= coord * broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
// Convert a flat (row-major) element index into a memory offset for a tensor
// with the given shape and (possibly non-contiguous) strides.
__forceinline__ __device__ __host__ size_t
indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    // Peel coordinates from the innermost dimension outward.
    for (size_t dim = ndim; dim > 0; --dim) {
        const size_t i = dim - 1;
        offset += (remaining % shape[i]) * strides[i];
        remaining /= shape[i];
    }
    return offset;
}
} // namespace device::cuda
#endif // __INFINIOP_CUDA_INTERNAL_H__
...@@ -2,12 +2,32 @@ ...@@ -2,12 +2,32 @@
#define __INFINIOP_CUDA_HANDLE_H__ #define __INFINIOP_CUDA_HANDLE_H__
#include "../../handle.h" #include "../../handle.h"
#include <memory>
struct InfiniopCudaHandle; namespace device::cuda {
typedef struct InfiniopCudaHandle *infiniopCudaHandle_t;
infiniStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, infiniDevice_t cuda_device_type); struct Handle : public InfiniopHandle {
class Internal;
auto internal() const -> const std::shared_ptr<Internal> &;
infiniStatus_t destroyCudaHandle(infiniopCudaHandle_t handle_ptr); protected:
Handle(infiniDevice_t device, int device_id);
#endif private:
std::shared_ptr<Internal> _internal;
};
namespace nvidia {
class Handle : public cuda::Handle {
Handle(int device_id);
public:
static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
};
} // namespace nvidia
} // namespace device::cuda
#endif // __INFINIOP_CUDA_HANDLE_H__
...@@ -27,15 +27,16 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) { ...@@ -27,15 +27,16 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
int device_id; int device_id;
CHECK_STATUS(infinirtGetDevice(&device, &device_id)); CHECK_STATUS(infinirtGetDevice(&device, &device_id));
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return device::NAMESPACE::Handle::create(handle_ptr, device_id)
switch (device) { switch (device) {
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU: CREATE(INFINI_DEVICE_CPU, cpu);
return infiniop::cpu::Handle::create(handle_ptr);
#endif #endif
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: { CREATE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
return createCudaHandle((infiniopCudaHandle_t *)handle_ptr, device);
}
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: { case INFINI_DEVICE_CAMBRICON: {
...@@ -52,25 +53,27 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) { ...@@ -52,25 +53,27 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
return createKunlunHandle((infiniopKunlunHandle_t *)handle_ptr); return createKunlunHandle((infiniopKunlunHandle_t *)handle_ptr);
} }
#endif #endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
} }
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
#undef CREATE
} }
__C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
#define DELETE(CASE, NAMESPACE) \ #define DELETE(CASE, NAMESPACE) \
case CASE: \ case CASE: \
delete reinterpret_cast<infiniop::NAMESPACE::Handle *>(handle); \ delete reinterpret_cast<device::NAMESPACE::Handle *>(handle); \
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS
switch (handle->device) { switch (handle->device) {
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu) DELETE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: { DELETE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
return destroyCudaHandle((infiniopCudaHandle_t)handle);
}
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: { case INFINI_DEVICE_CAMBRICON: {
...@@ -90,5 +93,6 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { ...@@ -90,5 +93,6 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
} }
#undef DELETE #undef DELETE
} }
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <aclnnop/aclnn_matmul.h> #include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h> #include <aclnnop/level2/aclnn_gemm.h>
namespace matmul::ascend { namespace op::matmul::ascend {
struct Descriptor::Opaque { struct Descriptor::Opaque {
mutable aclOpExecutor *executor; mutable aclOpExecutor *executor;
...@@ -135,4 +135,4 @@ infiniStatus_t Descriptor::calculate( ...@@ -135,4 +135,4 @@ infiniStatus_t Descriptor::calculate(
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace matmul::ascend } // namespace op::matmul::ascend
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "../../../devices/bang/common_bang.h" #include "../../../devices/bang/common_bang.h"
#include <cnnl_extra.h> #include <cnnl_extra.h>
namespace matmul::bang { namespace op::matmul::bang {
struct Descriptor::Opaque { struct Descriptor::Opaque {
cnnlMatMulDescriptor_t op; cnnlMatMulDescriptor_t op;
...@@ -157,4 +157,4 @@ infiniStatus_t Descriptor::calculate( ...@@ -157,4 +157,4 @@ infiniStatus_t Descriptor::calculate(
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace matmul::bang } // namespace op::matmul::bang
...@@ -5,7 +5,8 @@ ...@@ -5,7 +5,8 @@
#include "../../tensor.h" #include "../../tensor.h"
#include <algorithm> #include <algorithm>
namespace matmul { namespace op::matmul {
struct BlasMatrix { struct BlasMatrix {
size_t ndim; size_t ndim;
size_t batch; size_t batch;
...@@ -118,6 +119,7 @@ struct MatmulInfo { ...@@ -118,6 +119,7 @@ struct MatmulInfo {
k = a_matrix.cols; k = a_matrix.cols;
} }
}; };
} // namespace matmul
} // namespace op::matmul
#endif // __BLAS_H__ #endif // __BLAS_H__
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#include "../../../devices/cpu/common_cpu.h" #include "../../../devices/cpu/common_cpu.h"
#include "../../../devices/cpu/cpu_handle.h" #include "../../../devices/cpu/cpu_handle.h"
namespace matmul::cpu { namespace op::matmul::cpu {
Descriptor::~Descriptor() = default; Descriptor::~Descriptor() = default;
...@@ -12,7 +12,7 @@ infiniStatus_t Descriptor::create( ...@@ -12,7 +12,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) { infiniopTensorDescriptor_t b_desc) {
auto handle = reinterpret_cast<infiniop::cpu::Handle *>(handle_); auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = c_desc->dtype(); auto dtype = c_desc->dtype();
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) { if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
...@@ -96,4 +96,4 @@ infiniStatus_t Descriptor::calculate( ...@@ -96,4 +96,4 @@ infiniStatus_t Descriptor::calculate(
} }
} }
} // namespace matmul::cpu } // namespace op::matmul::cpu
#include "../../../devices/cuda/common_cuda.cuh" #include "../../../devices/cuda/cuda_handle.cuh"
#include "matmul_cuda.cuh" #include "matmul_cuda.cuh"
namespace matmul::cuda { namespace op::matmul::cuda {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool; std::shared_ptr<device::cuda::Handle::Internal> internal;
}; };
Descriptor::~Descriptor() { Descriptor::~Descriptor() {
...@@ -17,7 +17,7 @@ infiniStatus_t Descriptor::create( ...@@ -17,7 +17,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) { infiniopTensorDescriptor_t b_desc) {
auto handle = reinterpret_cast<infiniopCudaHandle_t>(handle_); auto handle = reinterpret_cast<device::cuda::nvidia::Handle *>(handle_);
auto dtype = c_desc->dtype(); auto dtype = c_desc->dtype();
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) { if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
...@@ -32,7 +32,7 @@ infiniStatus_t Descriptor::create( ...@@ -32,7 +32,7 @@ infiniStatus_t Descriptor::create(
*desc_ptr = new Descriptor( *desc_ptr = new Descriptor(
dtype, info, 0, dtype, info, 0,
new Opaque{handle->cublas_handle_pool}, new Opaque{handle->internal()},
handle->device, handle->device_id); handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -76,35 +76,35 @@ infiniStatus_t Descriptor::calculate( ...@@ -76,35 +76,35 @@ infiniStatus_t Descriptor::calculate(
auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
use_cublas(_opaque->cublas_handle_pool, _opaque->internal->use_cublas(
(cudaStream_t)stream, (cudaStream_t)stream,
[&](cublasHandle_t handle) { [&](cublasHandle_t handle) {
cublasGemmStridedBatchedEx( cublasGemmStridedBatchedEx(
handle, handle,
op_a, op_a,
op_b, op_b,
static_cast<int>(_info.m), static_cast<int>(_info.m),
static_cast<int>(_info.n), static_cast<int>(_info.n),
static_cast<int>(_info.k), static_cast<int>(_info.k),
&alpha, &alpha,
a, a,
a_type, a_type,
static_cast<int>(_info.a_matrix.ld()), static_cast<int>(_info.a_matrix.ld()),
_info.a_matrix.stride, _info.a_matrix.stride,
b, b,
b_type, b_type,
static_cast<int>(_info.b_matrix.ld()), static_cast<int>(_info.b_matrix.ld()),
_info.b_matrix.stride, _info.b_matrix.stride,
&beta, &beta,
c, c,
c_type, c_type,
static_cast<int>(_info.c_matrix.ld()), static_cast<int>(_info.c_matrix.ld()),
_info.c_matrix.stride, _info.c_matrix.stride,
static_cast<int>(_info.batch), static_cast<int>(_info.batch),
compute_type, compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP); CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}); });
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace matmul::cuda } // namespace op::matmul::cuda
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#include "../../../devices/kunlun/common_kunlun.h" #include "../../../devices/kunlun/common_kunlun.h"
#include "../../utils.h" #include "../../utils.h"
namespace matmul::kunlun { namespace op::matmul::kunlun {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handle_pool; std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handle_pool;
...@@ -110,4 +110,4 @@ infiniStatus_t Descriptor::calculate( ...@@ -110,4 +110,4 @@ infiniStatus_t Descriptor::calculate(
} }
} }
} // namespace matmul::kunlun } // namespace op::matmul::kunlun
...@@ -46,7 +46,7 @@ ...@@ -46,7 +46,7 @@
#define DESCRIPTOR(NAMESPACE) \ #define DESCRIPTOR(NAMESPACE) \
\ \
namespace matmul::NAMESPACE { \ namespace op::matmul::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \ class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \ struct Opaque; \
Opaque *_opaque; \ Opaque *_opaque; \
......
...@@ -25,13 +25,13 @@ __C infiniStatus_t infiniopCreateMatmulDescriptor( ...@@ -25,13 +25,13 @@ __C infiniStatus_t infiniopCreateMatmulDescriptor(
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) { infiniopTensorDescriptor_t b_desc) {
#define CREATE(CASE, NAMESPACE) \ #define CREATE(CASE, NAMESPACE) \
case CASE: \ case CASE: \
return matmul::NAMESPACE::Descriptor::create( \ return op::matmul::NAMESPACE::Descriptor::create( \
handle, \ handle, \
reinterpret_cast<matmul::NAMESPACE::Descriptor **>(desc_ptr), \ reinterpret_cast<op::matmul::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \ c_desc, \
a_desc, \ a_desc, \
b_desc) b_desc)
switch (handle->device) { switch (handle->device) {
...@@ -64,9 +64,9 @@ infiniopGetMatmulWorkspaceSize( ...@@ -64,9 +64,9 @@ infiniopGetMatmulWorkspaceSize(
infiniopMatmulDescriptor_t desc, infiniopMatmulDescriptor_t desc,
size_t *size) { size_t *size) {
#define GET(CASE, NAMESPACE) \ #define GET(CASE, NAMESPACE) \
case CASE: \ case CASE: \
*size = reinterpret_cast<const matmul::NAMESPACE::Descriptor *>(desc)->workspace_size; \ *size = reinterpret_cast<const op::matmul::NAMESPACE::Descriptor *>(desc)->workspace_size; \
return INFINI_STATUS_SUCCESS return INFINI_STATUS_SUCCESS
switch (desc->device_type) { switch (desc->device_type) {
...@@ -104,12 +104,12 @@ __C infiniStatus_t infiniopMatmul( ...@@ -104,12 +104,12 @@ __C infiniStatus_t infiniopMatmul(
float beta, float beta,
void *stream) { void *stream) {
#define CALCULATE(CASE, NAMESPACE) \ #define CALCULATE(CASE, NAMESPACE) \
case CASE: \ case CASE: \
return reinterpret_cast<const matmul::NAMESPACE::Descriptor *>(desc) \ return reinterpret_cast<const op::matmul::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, \ ->calculate(workspace, workspace_size, \
c, beta, \ c, beta, \
a, b, alpha, \ a, b, alpha, \
stream) stream)
switch (desc->device_type) { switch (desc->device_type) {
...@@ -140,9 +140,9 @@ __C infiniStatus_t infiniopMatmul( ...@@ -140,9 +140,9 @@ __C infiniStatus_t infiniopMatmul(
__C infiniStatus_t __C infiniStatus_t
infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) { infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \ #define DELETE(CASE, NAMESPACE) \
case CASE: \ case CASE: \
delete reinterpret_cast<const matmul::NAMESPACE::Descriptor *>(desc); \ delete reinterpret_cast<const op::matmul::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
switch (desc->device_type) { switch (desc->device_type) {
......
...@@ -24,7 +24,7 @@ class CTensor: ...@@ -24,7 +24,7 @@ class CTensor:
self.descriptor = desc self.descriptor = desc
self.torch_tensor_ = torch_tensor self.torch_tensor_ = torch_tensor
self.data = torch_tensor.data_ptr() self.data = torch_tensor.data_ptr()
def destroyDesc(self, lib_): def destroyDesc(self, lib_):
lib_.infiniopDestroyTensorDescriptor(self.descriptor) lib_.infiniopDestroyTensorDescriptor(self.descriptor)
self.descriptor = None self.descriptor = None
...@@ -37,6 +37,20 @@ class Handle(Structure): ...@@ -37,6 +37,20 @@ class Handle(Structure):
infiniopHandle_t = POINTER(Handle) infiniopHandle_t = POINTER(Handle)
class InfiniLib:
    """Facade merging the runtime (librt) and operator (libop) shared
    libraries into one object.

    Attribute lookups that are not instance attributes are forwarded to
    ``libop`` first, then ``librt`` — ``libop`` wins on name conflicts.
    """

    def __init__(self, librt, libop):
        self.librt = librt
        self.libop = libop

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails. Use EAFP getattr
        # instead of hasattr()+getattr(): the original resolved the symbol
        # twice per access, which doubles dlsym lookups on ctypes CDLLs.
        for lib in (self.libop, self.librt):
            try:
                return getattr(lib, name)
            except AttributeError:
                continue
        raise AttributeError(f"Attribute {name} not found in library")
# Open operators library # Open operators library
def open_lib(): def open_lib():
def find_library_in_ld_path(subdir, library_name): def find_library_in_ld_path(subdir, library_name):
...@@ -51,14 +65,22 @@ def open_lib(): ...@@ -51,14 +65,22 @@ def open_lib():
system_name = platform.system() system_name = platform.system()
# Load the library # Load the library
if system_name == "Windows": if system_name == "Windows":
library_path = find_library_in_ld_path("bin", "infiniop.dll") libop_path = find_library_in_ld_path("bin", "infiniop.dll")
librt_path = find_library_in_ld_path("bin", "infinirt.dll")
elif system_name == "Linux": elif system_name == "Linux":
library_path = find_library_in_ld_path("lib", "libinfiniop.so") libop_path = find_library_in_ld_path("lib", "libinfiniop.so")
librt_path = find_library_in_ld_path("lib", "libinfinirt.so")
assert ( assert (
library_path is not None libop_path is not None
), f"Cannot find infiniop.dll or libinfiniop.so. Check if INFINI_ROOT is set correctly." ), f"Cannot find infiniop.dll or libinfiniop.so. Check if INFINI_ROOT is set correctly."
lib = ctypes.CDLL(library_path) assert (
librt_path is not None
), f"Cannot find infinirt.dll or libinfinirt.so. Check if INFINI_ROOT is set correctly."
librt = ctypes.CDLL(librt_path)
libop = ctypes.CDLL(libop_path)
lib = InfiniLib(librt, libop)
lib.infiniopCreateTensorDescriptor.argtypes = [ lib.infiniopCreateTensorDescriptor.argtypes = [
POINTER(infiniopTensorDescriptor_t), POINTER(infiniopTensorDescriptor_t),
c_uint64, c_uint64,
...@@ -69,9 +91,11 @@ def open_lib(): ...@@ -69,9 +91,11 @@ def open_lib():
lib.infiniopCreateTensorDescriptor.restype = c_int lib.infiniopCreateTensorDescriptor.restype = c_int
lib.infiniopDestroyTensorDescriptor.argtypes = [infiniopTensorDescriptor_t] lib.infiniopDestroyTensorDescriptor.argtypes = [infiniopTensorDescriptor_t]
lib.infiniopDestroyTensorDescriptor.restype = c_int lib.infiniopDestroyTensorDescriptor.restype = c_int
lib.infiniopCreateHandle.argtypes = [POINTER(infiniopHandle_t), c_int, c_int] lib.infiniopCreateHandle.argtypes = [POINTER(infiniopHandle_t)]
lib.infiniopCreateHandle.restype = c_int lib.infiniopCreateHandle.restype = c_int
lib.infiniopDestroyHandle.argtypes = [infiniopHandle_t] lib.infiniopDestroyHandle.argtypes = [infiniopHandle_t]
lib.infiniopDestroyHandle.restype = c_int lib.infiniopDestroyHandle.restype = c_int
lib.infinirtSetDevice.argtypes = [c_int, c_int]
lib.infinirtSetDevice.restype = c_int
return lib return lib
...@@ -30,7 +30,7 @@ def to_tensor(tensor, lib): ...@@ -30,7 +30,7 @@ def to_tensor(tensor, lib):
InfiniDtype.BF16 if tensor.dtype == torch.bfloat16 else InfiniDtype.BF16 if tensor.dtype == torch.bfloat16 else
InfiniDtype.F32 if tensor.dtype == torch.float32 else InfiniDtype.F32 if tensor.dtype == torch.float32 else
InfiniDtype.F64 if tensor.dtype == torch.float64 else InfiniDtype.F64 if tensor.dtype == torch.float64 else
# TODO: These following types may not be supported by older # TODO: These following types may not be supported by older
# versions of PyTorch. # versions of PyTorch.
InfiniDtype.U16 if tensor.dtype == torch.uint16 else InfiniDtype.U16 if tensor.dtype == torch.uint16 else
InfiniDtype.U32 if tensor.dtype == torch.uint32 else InfiniDtype.U32 if tensor.dtype == torch.uint32 else
...@@ -57,9 +57,9 @@ def create_workspace(size, torch_device): ...@@ -57,9 +57,9 @@ def create_workspace(size, torch_device):
return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device) return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device)
def create_handle(lib, device, id=0): def create_handle(lib):
handle = infiniopHandle_t() handle = infiniopHandle_t()
check_error(lib.infiniopCreateHandle(ctypes.byref(handle), device, id)) check_error(lib.infiniopCreateHandle(ctypes.byref(handle)))
return handle return handle
...@@ -392,7 +392,8 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes): ...@@ -392,7 +392,8 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
to be passed to `test_func`. to be passed to `test_func`.
- tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test. - tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test.
""" """
handle = create_handle(lib, device) lib.infinirtSetDevice(device, ctypes.c_int(0))
handle = create_handle(lib)
try: try:
for test_case in test_cases: for test_case in test_cases:
for tensor_dtype in tensor_dtypes: for tensor_dtype in tensor_dtypes:
...@@ -435,6 +436,7 @@ def get_test_devices(args): ...@@ -435,6 +436,7 @@ def get_test_devices(args):
devices_to_test.append(InfiniDeviceEnum.ASCEND) devices_to_test.append(InfiniDeviceEnum.ASCEND)
if args.kunlun: if args.kunlun:
import torch_xmlir import torch_xmlir
devices_to_test.append(InfiniDeviceEnum.KUNLUN) devices_to_test.append(InfiniDeviceEnum.KUNLUN)
if not devices_to_test: if not devices_to_test:
devices_to_test = [InfiniDeviceEnum.CPU] devices_to_test = [InfiniDeviceEnum.CPU]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment