jerrrrry / infinicore · Commit 46da1a27

feat: cpu and cuda matmul

authored Feb 11, 2025 by PanZezhongQY
Showing 20 changed files with 1469 additions and 0 deletions.
src/infiniop/devices/cpu/cpu_handle.h (+10 -0)
src/infiniop/devices/cuda/common_cuda.cuh (+123 -0)
src/infiniop/devices/cuda/cuda_handle.cu (+55 -0)
src/infiniop/devices/cuda/cuda_handle.h (+13 -0)
src/infiniop/devices/handle.cc (+75 -0)
src/infiniop/devices/pool.h (+50 -0)
src/infiniop/ops/matmul/ascend/matmul_aclnn.cc (+137 -0)
src/infiniop/ops/matmul/ascend/matmul_aclnn.h (+55 -0)
src/infiniop/ops/matmul/bang/matmul_cnnl.cc (+103 -0)
src/infiniop/ops/matmul/bang/matmul_cnnl.h (+63 -0)
src/infiniop/ops/matmul/blas.h (+116 -0)
src/infiniop/ops/matmul/cpu/matmul_cpu.cc (+93 -0)
src/infiniop/ops/matmul/cpu/matmul_cpu.h (+35 -0)
src/infiniop/ops/matmul/cuda/matmul_cuda.cu (+39 -0)
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh (+17 -0)
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h (+32 -0)
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu (+73 -0)
src/infiniop/ops/matmul/operator.cc (+133 -0)
src/infiniop/ops/utils.h (+221 -0)
src/infiniop/tensor_descriptor.cc (+26 -0)
src/infiniop/devices/cpu/cpu_handle.h (new file)

#ifndef __INFINIOP_CPU_HANDLE_H__
#define __INFINIOP_CPU_HANDLE_H__

#include "infiniop/handle.h"

typedef infiniopHandle_t infiniopCpuHandle_t;

infiniopStatus_t createCpuHandle(infiniopCpuHandle_t *handle_ptr);

#endif
src/infiniop/devices/cuda/common_cuda.cuh (new file)

#ifndef __INFINIOP_COMMON_CUDA_H__
#define __INFINIOP_COMMON_CUDA_H__

#define MAX_THREADS_PER_BLOCK 1024
#define MAX_WARP_PER_BLOCK 32
#define WARP_SIZE 32

#include <iostream>

#define checkCudaErrorWithCode(call, errorCode)                        \
    do {                                                               \
        if (auto status = call; status != cudaSuccess) {               \
            std::cerr << "CUDA error: " << cudaGetErrorString(status)  \
                      << " in file " << __FILE__                       \
                      << ", function " << __func__                     \
                      << ", line " << __LINE__ << std::endl;           \
            return errorCode;                                          \
        }                                                              \
    } while (0)

#define checkCudaError(call) checkCudaErrorWithCode(call, INFINIOP_STATUS_BAD_DEVICE)

#define checkCudnnError(call)                                           \
    do {                                                                \
        if (auto status = call; status != CUDNN_STATUS_SUCCESS) {       \
            std::cerr << "CUDNN error: " << cudnnGetErrorString(status) \
                      << " in file " << __FILE__                        \
                      << ", function " << __func__                      \
                      << ", line " << __LINE__ << std::endl;            \
            return INFINIOP_STATUS_INTERNAL_ERROR;                      \
        }                                                               \
    } while (0)

#include "infinicore.h"
#include <cudnn.h>
#include <cublas_v2.h>
#include <memory>
#include "../pool.h"
#include "cuda_handle.h"
#include <cuda_fp16.h>

struct InfiniopCudaHandle {
    infiniDevice_t device;
    int device_id;
    std::shared_ptr<Pool<cublasHandle_t>> cublas_handles_t;
    std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handles_t;
    cudaDeviceProp prop;
    int compute_capability_major;
    int compute_capability_minor;
};

template <typename T>
void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> cublas_handles_t,
                int device_id, cudaStream_t stream, T const &f) {
    auto handle = cublas_handles_t->pop();
    if (!handle) {
        // Pool is empty: lazily create a handle on the target device.
        // Engage the optional first so it is safe to write through it.
        cudaSetDevice(device_id);
        handle.emplace();
        cublasCreate(&(*handle));
    }
    cublasSetStream(*handle, (cudaStream_t)stream);
    f(*handle);
    cublas_handles_t->push(std::move(*handle));
}

template <typename T>
cudnnStatus_t use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handles_t,
                        int device_id, cudaStream_t stream, T const &f) {
    auto handle = cudnn_handles_t->pop();
    if (!handle) {
        cudaSetDevice(device_id);
        handle.emplace();
        cudnnCreate(&(*handle));
    }
    cudnnSetStream(*handle, stream);
    cudnnStatus_t status = f(*handle);
    cudnn_handles_t->push(std::move(*handle));
    return status;
}

inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
    switch (dt) {
        case INFINI_DTYPE_F16:
            return CUDNN_DATA_HALF;
        case INFINI_DTYPE_F32:
            return CUDNN_DATA_FLOAT;
        case INFINI_DTYPE_F64:
            return CUDNN_DATA_DOUBLE;
        case INFINI_DTYPE_BF16:
            return CUDNN_DATA_BFLOAT16;
        case INFINI_DTYPE_I8:
            return CUDNN_DATA_INT8;
        case INFINI_DTYPE_I32:
            return CUDNN_DATA_INT32;
        case INFINI_DTYPE_I64:
            return CUDNN_DATA_INT64;
        case INFINI_DTYPE_U8:
            return CUDNN_DATA_UINT8;
        default:
            return CUDNN_DATA_FLOAT;
    }
}

// return the memory offset in the original tensor, given the flattened index
// of the broadcasted tensor
inline __device__ __host__ size_t indexToReducedOffset(
    size_t flat_index, size_t ndim,
    int64_t const *broadcasted_strides, int64_t const *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        flat_index %= broadcasted_strides[i];
    }
    return res;
}

// get the memory offset of the given element in a tensor given its flat index
inline __device__ __host__ size_t indexToOffset(
    size_t flat_index, size_t ndim,
    size_t const *shape, int64_t const *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}

#endif // __INFINIOP_COMMON_CUDA_H__
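As a quick illustration of the two index helpers above, here is a minimal host-only sketch of the same arithmetic (illustrative, not part of the commit). A contiguous 2x3 tensor has strides {3, 1}; a {1, 3} row broadcast along the first dimension has target strides {0, 1}, so flat index 4 of the broadcast view is element (1, 1): offset 4 in the full tensor but offset 1 in the broadcast source.

#include <cstdint>
#include <cstdio>

// Same arithmetic as indexToOffset: walk the dims from last to first.
static size_t offset(size_t flat, size_t ndim, const size_t *shape, const int64_t *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat % shape[i]) * strides[i];
        flat /= shape[i];
    }
    return res;
}

// Same arithmetic as indexToReducedOffset: broadcasted_strides here are the
// contiguous strides of the broadcast shape, used as positional divisors.
static size_t reduced_offset(size_t flat, size_t ndim, const int64_t *bcast, const int64_t *target) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        res += flat / bcast[i] * target[i];
        flat %= bcast[i];
    }
    return res;
}

int main() {
    size_t shape[2] = {2, 3};
    int64_t strides[2] = {3, 1};  // contiguous row-major 2x3
    std::printf("%zu\n", offset(4, 2, shape, strides));        // prints 4: element (1,1)

    int64_t bcast[2] = {3, 1};    // contiguous strides of broadcast shape {2,3}
    int64_t target[2] = {0, 1};   // a {1,3} tensor broadcast along dim 0
    std::printf("%zu\n", reduced_offset(4, 2, bcast, target)); // prints 1
    return 0;
}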
src/infiniop/devices/cuda/cuda_handle.cu (new file)

#include "./common_cuda.cuh"

infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, int device_id, infiniDevice_t cuda_device_type) {
    // Check if device_id is valid
    int device_count;
    cudaGetDeviceCount(&device_count);
    if (device_id >= device_count) {
        return INFINIOP_STATUS_BAD_DEVICE;
    }

    // Create a new cublas handle pool
    auto pool = std::make_shared<Pool<cublasHandle_t>>();
    if (cudaSetDevice(device_id) != cudaSuccess) {
        return INFINIOP_STATUS_BAD_DEVICE;
    }
    cublasHandle_t handle;
    cublasCreate(&handle);
    pool->push(std::move(handle));

    // create a cudnn handle pool
    auto cudnn_pool = std::make_shared<Pool<cudnnHandle_t>>();
    cudnnHandle_t cudnn_handle;
    checkCudnnError(cudnnCreate(&cudnn_handle));
    cudnn_pool->push(std::move(cudnn_handle));

    // set CUDA device property
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device_id);

    // set device compute capability numbers
    int capability_major;
    int capability_minor;
    cudaDeviceGetAttribute(&capability_major, cudaDevAttrComputeCapabilityMajor, device_id);
    cudaDeviceGetAttribute(&capability_minor, cudaDevAttrComputeCapabilityMinor, device_id);

    *handle_ptr = new InfiniopCudaHandle{
        cuda_device_type,
        device_id,
        std::move(pool),
        std::move(cudnn_pool),
        std::move(prop),
        capability_major,
        capability_minor,
    };

    return INFINIOP_STATUS_SUCCESS;
}

infiniopStatus_t deleteCudaHandle(infiniopCudaHandle_t handle_ptr) {
    handle_ptr->cublas_handles_t = nullptr;
    handle_ptr->cudnn_handles_t = nullptr;
    delete handle_ptr;
    return INFINIOP_STATUS_SUCCESS;
}
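A sketch of how the pooled handles are meant to be consumed, assuming the headers above are on the include path (the function name and the empty lambda body are illustrative only):

#include "./common_cuda.cuh"

infiniopStatus_t exampleUseCublasOnDevice0() {
    infiniopCudaHandle_t handle;
    infiniopStatus_t status = createCudaHandle(&handle, 0, INFINI_DEVICE_NVIDIA);
    if (status != INFINIOP_STATUS_SUCCESS) {
        return status;
    }
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // Borrow a cublasHandle_t from the pool, bind it to the stream, run the
    // lambda, and return the handle to the pool for reuse.
    use_cublas(handle->cublas_handles_t, handle->device_id, stream,
               [&](cublasHandle_t h) {
                   // any cuBLAS call bound to `stream` goes here,
                   // e.g. cublasGemmStridedBatchedEx(h, ...)
               });
    cudaStreamDestroy(stream);
    return deleteCudaHandle(handle);
}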
src/infiniop/devices/cuda/cuda_handle.h (new file)

#ifndef __INFINIOP_CUDA_HANDLE_H__
#define __INFINIOP_CUDA_HANDLE_H__

#include "infiniop/handle.h"

struct InfiniopCudaHandle;
typedef struct InfiniopCudaHandle *infiniopCudaHandle_t;

infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, int device_id, infiniDevice_t cuda_device_type);

infiniopStatus_t deleteCudaHandle(infiniopCudaHandle_t handle_ptr);

#endif
src/infiniop/devices/handle.cc (new file)

#include "infiniop/handle.h"

#ifdef ENABLE_CPU_API
#include "./cpu/cpu_handle.h"
#endif
#ifdef ENABLE_CUDA_API
#include "./cuda/cuda_handle.h"
#endif
#ifdef ENABLE_CAMBRICON_MLU
#include "./bang/bang_handle.h"
#endif
#ifdef ENABLE_ASCEND_NPU
#include "./ascend/ascend_handle.h"
#endif

__C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, infiniDevice_t device, int device_id) {
    if (handle_ptr == nullptr) {
        return INFINIOP_STATUS_NULL_POINTER;
    }
    if (device_id < 0) {
        return INFINIOP_STATUS_BAD_DEVICE;
    }

    switch (device) {
#ifdef ENABLE_CPU_API
    case INFINI_DEVICE_CPU:
        return createCpuHandle((infiniopCpuHandle_t *)handle_ptr);
#endif
#ifdef ENABLE_CUDA_API
    case INFINI_DEVICE_NVIDIA: {
        return createCudaHandle((infiniopCudaHandle_t *)handle_ptr, device_id, device);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return createBangHandle((infiniopBangHandle_t *)handle_ptr, device_id);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return createAscendHandle((infiniopAscendHandle_t *)handle_ptr, device_id);
    }
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}

__C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
    switch (handle->device) {
#ifdef ENABLE_CPU_API
    case INFINI_DEVICE_CPU:
        delete handle;
        return INFINIOP_STATUS_SUCCESS;
#endif
#ifdef ENABLE_CUDA_API
    case INFINI_DEVICE_NVIDIA: {
        return deleteCudaHandle((infiniopCudaHandle_t)handle);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        delete (infiniopBangHandle_t)handle;
        return INFINIOP_STATUS_SUCCESS;
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return deleteAscendHandle((infiniopAscendHandle_t)handle);
    }
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
src/infiniop/devices/pool.h (new file)

#ifndef __POOL_H__
#define __POOL_H__

#include <atomic>
#include <mutex>
#include <optional>

// A lock-free LIFO pool built on an atomic Treiber stack.
template <class T>
class Pool {
public:
    Pool() : _head(nullptr) {}

    Pool(const Pool &) = delete;

    Pool(Pool &&pool) noexcept : _head(pool._head.exchange(nullptr)) {}

    ~Pool() {
        while (this->pop()) {}
    }

    void push(T &&val) const {
        Node<T> *new_node = new Node<T>(std::move(val));
        new_node->next = _head.load();
        while (!_head.compare_exchange_weak(new_node->next, new_node)) {}
    }

    std::optional<T> pop() const {
        Node<T> *top = _head.load();
        Node<T> *new_head = nullptr;
        do {
            if (!top) {
                return std::nullopt;
            }
            new_head = top->next;
        } while (!_head.compare_exchange_weak(top, new_head));
        return {std::move(top->data)};
    }

private:
    template <class U>
    struct Node {
        U data;
        Node<U> *next;
        Node(U &&data) : data(std::move(data)), next(nullptr) {}
    };

    mutable std::atomic<Node<T> *> _head;
};

#endif // __POOL_H__
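A minimal usage sketch for Pool<T> (illustrative, not part of the commit); this is the same borrow-and-return pattern the CUDA handle pool relies on:

#include <cassert>
#include <optional>
#include "pool.h"

int main() {
    Pool<int> pool;
    pool.push(1);
    pool.push(2);

    std::optional<int> top = pool.pop();  // LIFO: yields 2
    assert(top && *top == 2);

    pool.push(std::move(*top));  // hand the value back for reuse
    assert(*pool.pop() == 2);
    assert(*pool.pop() == 1);
    assert(!pool.pop());         // empty pool yields std::nullopt
    return 0;
}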
src/infiniop/ops/matmul/ascend/matmul_aclnn.cc (new file)

#include "matmul_aclnn.h"

MatmulAclnnDescriptor::MatmulAclnnDescriptor(Device _device) {
    device = _device;
    device_id = 0;
    executor = nullptr;
    info = nullptr;
    cDesc = new aclnnTensorDescriptor();
    aDesc = new aclnnTensorDescriptor();
    bDesc = new aclnnTensorDescriptor();
    alpha = 1.0;
    beta = 0;
    mt = 1;
    workspaceSize = 0;
}

infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle,
                                             MatmulAclnnDescriptor_t *desc_ptr,
                                             infiniopTensorDescriptor_t c_desc,
                                             float alpha,
                                             infiniopTensorDescriptor_t a_desc,
                                             infiniopTensorDescriptor_t b_desc,
                                             float beta,
                                             int8_t mt) {
    DT dtype = c_desc->dt;
    if (dtype != F16 && dtype != F32) {
        return STATUS_BAD_TENSOR_DTYPE;
    }

    *desc_ptr = new MatmulAclnnDescriptor(handle->device);
    (*desc_ptr)->device_id = handle->device_id;
    (*desc_ptr)->dtype = dtype;
    (*desc_ptr)->mt = mt;
    (*desc_ptr)->alpha = alpha;
    (*desc_ptr)->beta = beta;

    infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED};
    auto info = new MatmulInfo(c_desc, a_desc, b_desc, status, false);
    if (*status != STATUS_SUCCESS) {
        return *status;
    }
    (*desc_ptr)->info = info;

    auto &cDesc = (*desc_ptr)->cDesc;
    auto &aDesc = (*desc_ptr)->aDesc;
    auto &bDesc = (*desc_ptr)->bDesc;

    // Treat A, B, C as 2D matrices; reuse each aclnnTensorDescriptor for the batched operation
    CHECK_STATUS(cDesc->setDescriptor(toAclDataType(c_desc->dt),
                                      {info->c_matrix.rows, info->c_matrix.cols},
                                      {info->c_matrix.row_stride, info->c_matrix.col_stride}),
                 STATUS_SUCCESS);
    CHECK_STATUS(aDesc->setDescriptor(toAclDataType(a_desc->dt),
                                      {info->a_matrix.rows, info->a_matrix.cols},
                                      {info->a_matrix.row_stride, info->a_matrix.col_stride}),
                 STATUS_SUCCESS);
    CHECK_STATUS(bDesc->setDescriptor(toAclDataType(b_desc->dt),
                                      {info->b_matrix.rows, info->b_matrix.cols},
                                      {info->b_matrix.row_stride, info->b_matrix.col_stride}),
                 STATUS_SUCCESS);

    CHECK_STATUS(cDesc->createTensor(), STATUS_SUCCESS);
    CHECK_STATUS(aDesc->createTensor(), STATUS_SUCCESS);
    CHECK_STATUS(bDesc->createTensor(), STATUS_SUCCESS);

    auto &workspaceSize = (*desc_ptr)->workspaceSize;
    auto &executor = (*desc_ptr)->executor;

    aclTensor *tc = cDesc->t;
    aclTensor *ta = aDesc->t;
    aclTensor *tb = bDesc->t;

    aclnnStatus ret;
    int64_t transA = 0;
    int64_t transB = 0;

    // aclnnGemm supports C = alpha * A @ B + beta * C
    // see https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
    ret = aclnnGemmGetWorkspaceSize(ta, tb, tc,
                                    (*desc_ptr)->alpha, (*desc_ptr)->beta,
                                    transA, transB, tc,
                                    (*desc_ptr)->mt,
                                    &workspaceSize, &executor);
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
              return STATUS_EXECUTION_FAILED);
    aclSetAclOpExecutorRepeatable(executor);

    return STATUS_SUCCESS;
}

infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc, uint64_t *size) {
    *size = desc->workspaceSize;
    return STATUS_SUCCESS;
}

infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc,
                             void *workspace, uint64_t workspace_size,
                             void *c, void const *a, void const *b,
                             void *stream) {
    auto &cDesc = desc->cDesc;
    auto &aDesc = desc->aDesc;
    auto &bDesc = desc->bDesc;

    aclTensor *tc = cDesc->t;
    aclTensor *ta = aDesc->t;
    aclTensor *tb = bDesc->t;

    auto batch = desc->info->batch;
    auto &executor = desc->executor;
    auto &workspaceSize = desc->workspaceSize;

    // Run on the device recorded in the handle
    aclrtSetDevice(desc->device_id);

    for (int i = 0; i < batch; i++) {
        AclSetTensorAddr(executor, 0, ta, (char *)(a) + i * desc->info->a_matrix.stride * desc->dtype.size);
        AclSetTensorAddr(executor, 1, tb, (char *)(b) + i * desc->info->b_matrix.stride * desc->dtype.size);
        AclSetTensorAddr(executor, 2, tc, (char *)(c) + i * desc->info->c_matrix.stride * desc->dtype.size);
        AclSetTensorAddr(executor, 3, tc, (char *)(c) + i * desc->info->c_matrix.stride * desc->dtype.size);

        aclnnStatus ret = aclnnGemm(workspace, workspaceSize, executor, stream);
        CHECK_RET(ret == ACL_SUCCESS,
                  LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
                  return STATUS_EXECUTION_FAILED);
    }
    return STATUS_SUCCESS;
}

infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc) {
    delete desc->cDesc;
    delete desc->bDesc;
    delete desc->aDesc;
    delete desc->info;
    aclDestroyAclOpExecutor(desc->executor);
    delete desc;
    return STATUS_SUCCESS;
}
src/infiniop/ops/matmul/ascend/matmul_aclnn.h (new file)

#ifndef __ACLNN_MATMUL_H__
#define __ACLNN_MATMUL_H__

#include "../../../devices/ascend/ascend_handle.h"
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "../blas.h"
#include "operators.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <aclnnop/level2/aclnn_gemm.h>
#include <aclnnop/aclnn_matmul.h>

struct MatmulAclnnDescriptor {
    Device device;
    int device_id;
    aclOpExecutor *executor;
    MatmulInfo *info;
    DT dtype;
    aclnnTensorDescriptor_t cDesc, aDesc, bDesc;
    float alpha;
    float beta;
    // cubeMathType
    // see doc: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
    int8_t mt;
    uint64_t workspaceSize;

    MatmulAclnnDescriptor(Device _device);
};

typedef struct MatmulAclnnDescriptor *MatmulAclnnDescriptor_t;

infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle,
                                             MatmulAclnnDescriptor_t *desc_ptr,
                                             infiniopTensorDescriptor_t c_desc,
                                             float alpha,
                                             infiniopTensorDescriptor_t a_desc,
                                             infiniopTensorDescriptor_t b_desc,
                                             float beta,
                                             int8_t cubeMathType);

infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc, uint64_t *size);

infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc,
                             void *workspace, uint64_t workspace_size,
                             void *c, const void *a, const void *b,
                             void *stream);

infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc);

#endif
src/infiniop/ops/matmul/bang/matmul_cnnl.cc (new file)

#include "matmul_cnnl.h"
#include "../../../devices/bang/bang_handle.h"
#include "../../../devices/bang/common_bang.h"
#include "../../utils.h"
#include "cnrt.h"

infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
                                            MatmulBangDescriptor_t *desc_ptr,
                                            infiniopTensorDescriptor_t c_desc,
                                            float alpha,
                                            infiniopTensorDescriptor_t a_desc,
                                            infiniopTensorDescriptor_t b_desc,
                                            float beta) {
    infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED};
    auto info = MatmulInfo(c_desc, a_desc, b_desc, status, false);
    if (*status != STATUS_SUCCESS) {
        return *status;
    }

    cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
    cnnlCreateTensorDescriptor(&aDesc);
    cnnlCreateTensorDescriptor(&bDesc);
    cnnlCreateTensorDescriptor(&cDesc);
    setMatrixTensorEx(aDesc, info.a_matrix);
    setMatrixTensorEx(bDesc, info.b_matrix);
    setMatrixTensorEx(cDesc, info.c_matrix);

    cnnlMatMulDescriptor_t opDesc;
    cnnlMatMulAlgo_t algo;
    cnnlMatMulHeuristicResult_t algoResult;
    cnnlMatMulDescCreate(&opDesc);
    cnnlMatMulAlgoCreate(&algo);
    cnnlCreateMatMulHeuristicResult(&algoResult);

    int32_t use_stride = true;
    cnnlSetMatMulDescAttr(opDesc, CNNL_MATMUL_USE_STRIDE, &use_stride, sizeof(int32_t));

    *desc_ptr = new MatmulBangDescriptor{
        handle->device,
        handle->device_id,
        info,
        alpha,
        beta,
        c_desc->dt,
        handle->cnnl_handles,
        aDesc,
        bDesc,
        cDesc,
        opDesc,
        algo,
        algoResult};
    return STATUS_SUCCESS;
}

infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size) {
    *size = 0;
    return STATUS_SUCCESS;
}

infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc) {
    desc->cnnl_handles = nullptr;
    cnnlDestroyTensorDescriptor(desc->aDesc);
    cnnlDestroyTensorDescriptor(desc->bDesc);
    cnnlDestroyTensorDescriptor(desc->cDesc);
    cnnlMatMulDescDestroy(desc->opDesc);
    cnnlMatMulAlgoDestroy(desc->algo);
    cnnlDestroyMatMulHeuristicResult(desc->algoResult);
    delete desc;
    return STATUS_SUCCESS;
}

void matmul_cnnl_f16(MatmulBangDescriptor_t desc, void *workspace,
                     void *c, float beta, void const *a, void const *b, float alpha,
                     void *stream) {
    auto info = desc->info;

    if (info.is_transed) {
        std::swap(a, b);
    }

    use_cnnl(desc->cnnl_handles, desc->device_id, (cnrtQueue_t)stream,
             [&](cnnlHandle_t handle) {
                 int count = 0;
                 cnnlGetBatchMatMulAlgoHeuristic(handle, desc->opDesc,
                                                 desc->aDesc, desc->bDesc, desc->cDesc,
                                                 NULL, 1, &desc->algoResult, &count);
                 size_t wsSize;
                 cnnlGetBatchMatMulHeuristicResult(desc->algoResult, desc->algo, &wsSize);
                 cnrtMalloc(&workspace, wsSize);
                 cnnlBatchMatMulBCast_v2(handle, desc->opDesc, desc->algo,
                                         &alpha, desc->aDesc, a, desc->bDesc, b,
                                         &beta, desc->cDesc, c,
                                         workspace, wsSize);
             });
}

infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc,
                            void *workspace, uint64_t workspace_size,
                            void *c, void const *a, void const *b,
                            void *stream) {
    if (cnrtSetDevice(desc->device_id) != cnrtSuccess) {
        return STATUS_BAD_DEVICE;
    }
    float alpha = desc->alpha;
    float beta = desc->beta;
    if (dtype_eq(desc->dtype, F16)) {
        matmul_cnnl_f16(desc, workspace, c, beta, a, b, alpha, stream);
        cnrtQueueSync((cnrtQueue_t)stream);
        return STATUS_SUCCESS;
    }
    return STATUS_BAD_TENSOR_DTYPE;
}
src/infiniop/ops/matmul/bang/matmul_cnnl.h (new file)

#ifndef __CNNL_MATMUL_H__
#define __CNNL_MATMUL_H__

#include "../../../devices/bang/bang_handle.h"
#include "../blas.h"
#include "cnnl.h"
#include "cnnl_extra.h"
#include "operators.h"

struct MatmulBangDescriptor {
    Device device;
    int device_id;
    MatmulInfo info;
    float alpha;
    float beta;
    DT dtype;
    std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles;
    cnnlTensorDescriptor_t aDesc;
    cnnlTensorDescriptor_t bDesc;
    cnnlTensorDescriptor_t cDesc;
    cnnlMatMulDescriptor_t opDesc;
    cnnlMatMulAlgo_t algo;
    cnnlMatMulHeuristicResult_t algoResult;
};

typedef struct MatmulBangDescriptor *MatmulBangDescriptor_t;

infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
                                            MatmulBangDescriptor_t *desc_ptr,
                                            infiniopTensorDescriptor_t c_desc,
                                            float alpha,
                                            infiniopTensorDescriptor_t a_desc,
                                            infiniopTensorDescriptor_t b_desc,
                                            float beta);

infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size);

infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc,
                            void *workspace, uint64_t workspace_size,
                            void *c, void const *a, void const *b,
                            void *stream);

infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc);

inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &matrix, bool trans = false) {
    int ndim = matrix.ndim;
    int batch = matrix.batch;
    int stride = static_cast<int>(matrix.stride);
    int rows = matrix.rows;
    int cols = matrix.cols;
    int row_stride = matrix.row_stride;
    int col_stride = matrix.col_stride;

    if (ndim == 3) {
        std::vector<int> dim_size = {batch, rows, cols};
        std::vector<int> dim_stride = {stride, row_stride, col_stride};
        cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
                                  dim_size.size(), dim_size.data(), dim_stride.data());
    } else if (ndim == 2) {
        std::vector<int> dim_size = {rows, cols};
        std::vector<int> dim_stride = {row_stride, col_stride};
        cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
                                  dim_size.size(), dim_size.data(), dim_stride.data());
    }
}

#endif // __CNNL_MATMUL_H__
src/infiniop/ops/matmul/blas.h (new file)

#ifndef __BLAS_H__
#define __BLAS_H__

#include "../utils.h"
#include "infiniop/operator.h"
#include <algorithm>
#include <stdint.h>

typedef struct BlasMatrix {
    size_t ndim;
    size_t batch;
    int64_t stride;
    size_t rows;
    size_t cols;
    int64_t row_stride;
    int64_t col_stride;

    BlasMatrix() {}

    BlasMatrix(infiniopTensorDescriptor_t layout, infiniopStatus_t *status) {
        if (layout->ndim == 2) {
            this->ndim = 2;
            this->batch = 1;
            this->stride = 0;
            this->rows = layout->shape[0];
            this->cols = layout->shape[1];
            this->row_stride = layout->strides[0];
            this->col_stride = layout->strides[1];
        } else if (layout->ndim == 3) {
            this->ndim = 3;
            this->batch = layout->shape[0];
            this->stride = this->batch == 1 ? 0 : layout->strides[0];
            this->rows = layout->shape[1];
            this->cols = layout->shape[2];
            this->row_stride = layout->strides[1];
            this->col_stride = layout->strides[2];
        } else {
            *status = INFINIOP_STATUS_BAD_TENSOR_SHAPE;
            return;
        }

        if (this->row_stride != 1 && this->col_stride != 1) {
            *status = INFINIOP_STATUS_BAD_TENSOR_STRIDES;
            return;
        }

        *status = INFINIOP_STATUS_SUCCESS;
    }

    bool match_batch(int batch) const {
        return this->batch == batch || this->batch == 1;
    }

    void transpose() {
        std::swap(rows, cols);
        std::swap(row_stride, col_stride);
    }

    int ld() const {
        if (this->row_stride == 1) {
            return this->col_stride;
        } else {
            return this->row_stride;
        }
    }
} BlasMatrix;

struct MatmulInfo {
    BlasMatrix a_matrix;
    BlasMatrix b_matrix;
    BlasMatrix c_matrix;

    size_t m, n, k, batch;

    bool is_transed = false;

    MatmulInfo(infiniopTensorDescriptor_t c_desc,
               infiniopTensorDescriptor_t a_desc,
               infiniopTensorDescriptor_t b_desc,
               infiniopStatus_t *status,
               bool col_major = true) {
        a_matrix = BlasMatrix(a_desc, status);
        if (*status != INFINIOP_STATUS_SUCCESS) {
            return;
        }
        b_matrix = BlasMatrix(b_desc, status);
        if (*status != INFINIOP_STATUS_SUCCESS) {
            return;
        }
        c_matrix = BlasMatrix(c_desc, status);
        if (*status != INFINIOP_STATUS_SUCCESS) {
            return;
        }

        if (c_matrix.rows != a_matrix.rows || c_matrix.cols != b_matrix.cols || a_matrix.cols != b_matrix.rows) {
            *status = INFINIOP_STATUS_BAD_TENSOR_SHAPE;
            return;
        }

        batch = c_matrix.batch;
        if (!a_matrix.match_batch(batch) || !b_matrix.match_batch(batch)) {
            *status = INFINIOP_STATUS_BAD_TENSOR_SHAPE;
            return;
        }

        if ((col_major && c_matrix.col_stride == 1) || (!col_major && c_matrix.row_stride == 1)) {
            c_matrix.transpose();
            b_matrix.transpose();
            a_matrix.transpose();
            std::swap(a_matrix, b_matrix);
            is_transed = true;
        }

        m = c_matrix.rows;
        n = c_matrix.cols;
        k = a_matrix.cols;
    }
};

#endif // __BLAS_H__
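The transpose-and-swap branch in MatmulInfo is the standard trick for handing a row-major C to a column-major BLAS (or vice versa) without copying: reinterpreting each buffer as its own transpose and swapping the operands preserves the product, since

    C = alpha * A @ B + beta * C   <=>   C^T = alpha * B^T @ A^T + beta * C^T

The is_transed flag records that the swap happened so the execution paths (matmul_cpu.cc, matmul_cuda_kernel.cu, matmul_cnnl.cc) can swap the a and b pointers to match.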
src/infiniop/ops/matmul/cpu/matmul_cpu.cc (new file)

#include "./matmul_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../utils.h"
#include <cmath>

infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle,
                                           MatmulCpuDescriptor_t *desc_ptr,
                                           infiniopTensorDescriptor_t c_desc,
                                           infiniopTensorDescriptor_t a_desc,
                                           infiniopTensorDescriptor_t b_desc) {
    infiniDtype_t dtype = c_desc->dtype;
    if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
        return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
    }

    infiniopStatus_t status;
    auto info = MatmulInfo(c_desc, a_desc, b_desc, &status);
    if (status != INFINIOP_STATUS_SUCCESS) {
        return status;
    }

    *desc_ptr = new MatmulCpuDescriptor{
        INFINI_DEVICE_CPU,
        dtype,
        info};
    return INFINIOP_STATUS_SUCCESS;
}

infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size) {
    *size = 0;
    return INFINIOP_STATUS_SUCCESS;
}

infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc) {
    delete desc;
    return INFINIOP_STATUS_SUCCESS;
}

template <typename Tdata>
infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta,
                            void const *a, void const *b, float alpha) {
    auto info = desc->info;

    if (info.is_transed) {
        std::swap(a, b);
    }

    for (int i = 0; i < info.batch; ++i) {
        for (int m_ = 0; m_ < info.m; ++m_) {
            for (int n_ = 0; n_ < info.n; ++n_) {
                auto c_ = reinterpret_cast<Tdata *>(c)
                        + i * info.c_matrix.stride
                        + m_ * info.c_matrix.row_stride
                        + n_ * info.c_matrix.col_stride;
                float sum = 0;
                for (int k_ = 0; k_ < info.k; ++k_) {
                    auto a_ = reinterpret_cast<Tdata const *>(a)
                            + i * info.a_matrix.stride
                            + m_ * info.a_matrix.row_stride
                            + k_ * info.a_matrix.col_stride;
                    auto b_ = reinterpret_cast<Tdata const *>(b)
                            + i * info.b_matrix.stride
                            + n_ * info.b_matrix.col_stride
                            + k_ * info.b_matrix.row_stride;
                    if constexpr (std::is_same<Tdata, uint16_t>::value) {
                        sum += f16_to_f32(*a_) * f16_to_f32(*b_);
                    } else {
                        sum += *a_ * (*b_);
                    }
                }
                if constexpr (std::is_same<Tdata, uint16_t>::value) {
                    if (beta == 0) {
                        *c_ = f32_to_f16(alpha * sum);
                    } else {
                        *c_ = f32_to_f16(beta * f16_to_f32(*c_) + alpha * sum);
                    }
                } else {
                    *c_ = beta * (*c_) + alpha * sum;
                }
            }
        }
    }
    return INFINIOP_STATUS_SUCCESS;
}

infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc,
                           void *workspace, uint64_t workspace_size,
                           void *c, void const *a, void const *b,
                           float alpha, float beta) {
    if (desc->dtype == INFINI_DTYPE_F16) {
        return matmul_cpu<uint16_t>(desc, c, beta, a, b, alpha);
    }
    if (desc->dtype == INFINI_DTYPE_F32) {
        return matmul_cpu<float>(desc, c, beta, a, b, alpha);
    }
    return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
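For reference, each batch entry i in the triple loop above computes the standard strided-batched GEMM update

    c[i][m][n] = beta * c[i][m][n] + alpha * sum_k a[i][m][k] * b[i][k][n]

with the accumulation always carried in float: the uint16_t specialization holds raw IEEE-754 half bits and converts through f16_to_f32 / f32_to_f16 around every multiply-add, and the beta == 0 branch skips reading the (possibly uninitialized) output element.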
src/infiniop/ops/matmul/cpu/matmul_cpu.h (new file)

#ifndef __INFINIOP_MATMUL_CPU_H__
#define __INFINIOP_MATMUL_CPU_H__

#include "../../../devices/cpu/cpu_handle.h"
#include "../blas.h"
#include "infiniop/operator.h"

typedef struct MatmulCpuDescriptor {
    infiniDevice_t device;
    infiniDtype_t dtype;
    MatmulInfo info;
} MatmulCpuDescriptor;

typedef struct MatmulCpuDescriptor *MatmulCpuDescriptor_t;

infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle,
                                           MatmulCpuDescriptor_t *desc_ptr,
                                           infiniopTensorDescriptor_t c_desc,
                                           infiniopTensorDescriptor_t a_desc,
                                           infiniopTensorDescriptor_t b_desc);

infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size);

infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc,
                           void *workspace, uint64_t workspace_size,
                           void *c, void const *a, void const *b,
                           float alpha, float beta);

infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc);

#endif // __INFINIOP_MATMUL_CPU_H__
src/infiniop/ops/matmul/cuda/matmul_cuda.cu (new file)

#include "./matmul_cuda.cuh"
#include "../../utils.h"

infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
                                            infiniopMatmulCudaDescriptor_t *desc_ptr,
                                            infiniopTensorDescriptor_t c_desc,
                                            infiniopTensorDescriptor_t a_desc,
                                            infiniopTensorDescriptor_t b_desc) {
    infiniDtype_t dtype = c_desc->dtype;
    if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
        return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
    }

    infiniopStatus_t status;
    auto info = MatmulInfo(c_desc, a_desc, b_desc, &status);
    if (status != INFINIOP_STATUS_SUCCESS) {
        return status;
    }

    *desc_ptr = new InfiniopMatmulCudaDescriptor{
        handle->device,
        dtype,
        handle->device_id,
        info,
        handle->cublas_handles_t};
    return INFINIOP_STATUS_SUCCESS;
}

infiniopStatus_t cudaGetMatmulWorkspaceSize(infiniopMatmulCudaDescriptor_t desc, uint64_t *size) {
    *size = 0;
    return INFINIOP_STATUS_SUCCESS;
}

infiniopStatus_t cudaDestroyMatmulDescriptor(infiniopMatmulCudaDescriptor_t desc) {
    desc->cublas_handles_t = nullptr;
    delete desc;
    return INFINIOP_STATUS_SUCCESS;
}
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh (new file)

#ifndef __INFINIOP_MATMUL_CUDA_H__
#define __INFINIOP_MATMUL_CUDA_H__

#include "matmul_cuda_api.h"
#include "../../../devices/cuda/common_cuda.cuh"
#include <memory>
#include "../blas.h"

typedef struct InfiniopMatmulCudaDescriptor {
    infiniDevice_t device;
    infiniDtype_t dtype;
    int device_id;
    MatmulInfo info;
    std::shared_ptr<Pool<cublasHandle_t>> cublas_handles_t;
} InfiniopMatmulCudaDescriptor;

#endif // __INFINIOP_MATMUL_CUDA_H__
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h (new file)

#ifndef __INFINIOP_MATMUL_CUDA_API_H__
#define __INFINIOP_MATMUL_CUDA_API_H__

#include "../../../devices/cuda/cuda_handle.h"
#include "infiniop/operator.h"

struct InfiniopMatmulCudaDescriptor;
typedef struct InfiniopMatmulCudaDescriptor *infiniopMatmulCudaDescriptor_t;

infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
                                            infiniopMatmulCudaDescriptor_t *desc_ptr,
                                            infiniopTensorDescriptor_t c_desc,
                                            infiniopTensorDescriptor_t a_desc,
                                            infiniopTensorDescriptor_t b_desc);

infiniopStatus_t cudaGetMatmulWorkspaceSize(infiniopMatmulCudaDescriptor_t desc, uint64_t *size);

infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
                            void *workspace, uint64_t workspace_size,
                            void *c, void const *a, void const *b,
                            float alpha, float beta, void *stream);

infiniopStatus_t cudaDestroyMatmulDescriptor(infiniopMatmulCudaDescriptor_t desc);

#endif // __INFINIOP_MATMUL_CUDA_API_H__
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu (new file)

#include "../../utils.h"
#include "./matmul_cuda.cuh"

template <typename Tdata>
infiniopStatus_t matmul_cuda(infiniopMatmulCudaDescriptor_t desc,
                             void *c, float beta,
                             void const *a, void const *b,
                             float alpha, void *stream) {
    auto info = desc->info;

    if (info.is_transed) {
        std::swap(a, b);
    }

    cudaDataType a_type, b_type, c_type;
    cublasComputeType_t compute_type;
    if constexpr (std::is_same<Tdata, half>::value) {
        a_type = b_type = c_type = CUDA_R_16F;
        compute_type = CUBLAS_COMPUTE_32F;
    } else {
        a_type = b_type = c_type = CUDA_R_32F;
#ifdef ENABLE_SUGON_CUDA_API
        compute_type = CUBLAS_COMPUTE_32F;
#else
        compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
#endif
    }

    // MatmulInfo has already normalized the problem to column-major (see
    // blas.h), so an operand with row_stride == 1 is column-contiguous and can
    // be consumed as-is; otherwise cuBLAS reads it transposed, with the
    // leading dimension taken from ld().
    auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
    auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;

    use_cublas(desc->cublas_handles_t, desc->device_id, (cudaStream_t)stream,
               [&](cublasHandle_t handle) {
                   cublasGemmStridedBatchedEx(
                       handle, op_a, op_b,
                       info.m, info.n, info.k,
                       &alpha,
                       a, a_type, info.a_matrix.ld(), info.a_matrix.stride,
                       b, b_type, info.b_matrix.ld(), info.b_matrix.stride,
                       &beta,
                       c, c_type, info.c_matrix.ld(), info.c_matrix.stride,
                       info.batch,
                       compute_type,
                       CUBLAS_GEMM_DEFAULT_TENSOR_OP);
               });
    return INFINIOP_STATUS_SUCCESS;
}

infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
                            void *workspace, uint64_t workspace_size,
                            void *c, void const *a, void const *b,
                            float alpha, float beta, void *stream) {
    if (desc->dtype == INFINI_DTYPE_F16) {
        return matmul_cuda<half>(desc, c, beta, a, b, alpha, stream);
    }
    if (desc->dtype == INFINI_DTYPE_F32) {
        return matmul_cuda<float>(desc, c, beta, a, b, alpha, stream);
    }
    return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
src/infiniop/ops/matmul/operator.cc (new file)

#include "../utils.h"
#include "infiniop/ops/matmul.h"

#ifdef ENABLE_CPU_API
#include "cpu/matmul_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/matmul_cuda_api.h"
#endif
#ifdef ENABLE_CAMBRICON_MLU
#include "bang/matmul_cnnl.h"
#endif
#ifdef ENABLE_ASCEND_NPU
#include "ascend/matmul_aclnn.h"
#endif

__C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle,
                                                    infiniopMatmulDescriptor_t *desc_ptr,
                                                    infiniopTensorDescriptor_t c_desc,
                                                    infiniopTensorDescriptor_t a_desc,
                                                    infiniopTensorDescriptor_t b_desc) {
    switch (handle->device) {
#ifdef ENABLE_CPU_API
    case INFINI_DEVICE_CPU:
        return cpuCreateMatmulDescriptor((infiniopCpuHandle_t)handle, (MatmulCpuDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_CUDA_API
    case INFINI_DEVICE_NVIDIA: {
        return cudaCreateMatmulDescriptor((infiniopCudaHandle_t)handle, (infiniopMatmulCudaDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        // The legacy Bang backend bakes alpha/beta into the descriptor;
        // default to 1.0 / 0.0 here to match its signature.
        return bangCreateMatmulDescriptor((BangHandle_t)handle, (MatmulBangDescriptor_t *)desc_ptr, c_desc, 1.0f, a_desc, b_desc, 0.0f);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        // Likewise for the legacy Ascend backend; the trailing 1 is cubeMathType.
        return aclnnCreateMatmulDescriptor((AscendHandle_t)handle, (MatmulAclnnDescriptor_t *)desc_ptr, c_desc, 1.0f, a_desc, b_desc, 0.0f, 1);
    }
#endif
    }
    return INFINIOP_STATUS_BAD_DEVICE;
}

__C infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, uint64_t *size) {
    switch (desc->device) {
#ifdef ENABLE_CPU_API
    case INFINI_DEVICE_CPU:
        return cpuGetMatmulWorkspaceSize((MatmulCpuDescriptor_t)desc, size);
#endif
#ifdef ENABLE_CUDA_API
    case INFINI_DEVICE_NVIDIA: {
        return cudaGetMatmulWorkspaceSize((infiniopMatmulCudaDescriptor_t)desc, size);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangGetMatmulWorkspaceSize((MatmulBangDescriptor_t)desc, size);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t)desc, size);
    }
#endif
    }
    return INFINIOP_STATUS_BAD_DEVICE;
}

__C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc,
                                    void *workspace, uint64_t workspace_size,
                                    void *c, void const *a, void const *b,
                                    float alpha, float beta, void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU_API
    case INFINI_DEVICE_CPU:
        return cpuMatmul((MatmulCpuDescriptor_t)desc, workspace, workspace_size, c, a, b, alpha, beta);
#endif
#ifdef ENABLE_CUDA_API
    case INFINI_DEVICE_NVIDIA:
        return cudaMatmul((infiniopMatmulCudaDescriptor_t)desc, workspace, workspace_size, c, a, b, alpha, beta, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        // alpha/beta were fixed at descriptor creation on this backend and are
        // not forwarded here.
        return bangMatmul((MatmulBangDescriptor_t)desc, workspace, workspace_size, c, a, b, stream);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu:
        return aclnnMatmul((MatmulAclnnDescriptor_t)desc, workspace, workspace_size, c, a, b, stream);
#endif
    }
    return INFINIOP_STATUS_BAD_DEVICE;
}

__C infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU_API
    case INFINI_DEVICE_CPU:
        return cpuDestroyMatmulDescriptor((MatmulCpuDescriptor_t)desc);
#endif
#ifdef ENABLE_CUDA_API
    case INFINI_DEVICE_NVIDIA: {
        return cudaDestroyMatmulDescriptor((infiniopMatmulCudaDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangDestroyMatmulDescriptor((MatmulBangDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t)desc);
    }
#endif
    }
    return INFINIOP_STATUS_BAD_DEVICE;
}
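To tie the pieces together, here is a hedged end-to-end sketch of the public API on the CPU path (F32, error handling elided; assumes a build with ENABLE_CPU_API and that infiniopMatmulDescriptor_t is declared in "infiniop/ops/matmul.h", which this file includes):

#include <stddef.h>
#include <stdint.h>
#include "infiniop/handle.h"
#include "infiniop/ops/matmul.h"
#include "infiniop/tensor_descriptor.h"

int runMatmulExample() {
    // c[2x4] = 1.0 * a[2x3] @ b[3x4] + 0.0 * c
    size_t shape_a[2] = {2, 3}, shape_b[2] = {3, 4}, shape_c[2] = {2, 4};
    float a[6] = {1, 2, 3, 4, 5, 6}, b[12] = {0}, c[8] = {0};

    infiniopHandle_t handle;
    infiniopCreateHandle(&handle, INFINI_DEVICE_CPU, 0);

    infiniopTensorDescriptor_t da, db, dc;
    infiniopCreateTensorDescriptor(&da, 2, shape_a, nullptr, INFINI_DTYPE_F32); // nullptr -> contiguous strides
    infiniopCreateTensorDescriptor(&db, 2, shape_b, nullptr, INFINI_DTYPE_F32);
    infiniopCreateTensorDescriptor(&dc, 2, shape_c, nullptr, INFINI_DTYPE_F32);

    infiniopMatmulDescriptor_t matmul;
    infiniopCreateMatmulDescriptor(handle, &matmul, dc, da, db);

    uint64_t ws_size = 0;
    infiniopGetMatmulWorkspaceSize(matmul, &ws_size); // 0 on the CPU path

    infiniopMatmul(matmul, nullptr, ws_size, c, a, b, 1.0f, 0.0f, nullptr); // stream is unused on CPU

    infiniopDestroyMatmulDescriptor(matmul);
    infiniopDestroyTensorDescriptor(da);
    infiniopDestroyTensorDescriptor(db);
    infiniopDestroyTensorDescriptor(dc);
    infiniopDestroyHandle(handle);
    return 0;
}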
src/infiniop/ops/utils.h (new file)

#ifndef __UTILS_H__
#define __UTILS_H__

#include "infiniop/tensor_descriptor.h"
#include <algorithm>
#include <iostream>
#include <numeric>
#include <stdio.h>
#include <stdlib.h>
#include <vector>

/* This file contains some useful macros and helper functions */

#define ROUND_UP_DIV(x, y) (((x) + (y) - 1) / (y))

#define CHECK_ERROR(call, target, errCode)                \
    do {                                                  \
        if (auto value = (call); value == (target)) {     \
            std::cerr << "Error: expected " << (target)   \
                      << " but got " << value             \
                      << " in file " << __FILE__          \
                      << ", function " << __func__        \
                      << ", line " << __LINE__ << std::endl; \
            return (errCode);                             \
        }                                                 \
    } while (0)

#define CREATE_CHECK_ERROR(expr, value, target, errCode) \
    expr;                                                \
    CHECK_ERROR(value, target, errCode)

#define CHECK_STATUS(call, target)                        \
    do {                                                  \
        if (auto value = (call); value != (target)) {     \
            std::cerr << "Error: expected " << (target)   \
                      << " but got " << value             \
                      << " in file " << __FILE__          \
                      << ", function " << __func__        \
                      << ", line " << __LINE__ << std::endl; \
            return value;                                 \
        }                                                 \
    } while (0)
inline std::vector<int64_t> get_byte_strides(infiniopTensorDescriptor_t desc) {
    std::vector<int64_t> strides(desc->ndim);
    for (uint64_t i = 0; i < desc->ndim; i++) {
        strides[i] = desc->strides[i] * infini_sizeof(desc->dtype);
    }
    return strides;
}

// calculate the broadcasted shape for two tensors
inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
                              const uint64_t *shape2, uint64_t ndim2,
                              uint64_t *broadcast_shape,
                              uint64_t *padded_shape1, uint64_t *padded_shape2,
                              uint64_t max_rank) {
    // prepending and initializing
    std::fill(padded_shape1, padded_shape1 + max_rank, 1);
    std::fill(padded_shape2, padded_shape2 + max_rank, 1);
    std::copy(shape1, shape1 + ndim1, padded_shape1 + max_rank - ndim1);
    std::copy(shape2, shape2 + ndim2, padded_shape2 + max_rank - ndim2);

    // compute broadcasted shape
    for (size_t i = 0; i < max_rank; ++i) {
        if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) {
            broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]);
        } else {
            return false;
        }
    }
    return true;
}

// check if the shape of tensor c is valid after broadcasting tensors a and b and also get the broadcasted shapes
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
                                  infiniopTensorDescriptor_t b,
                                  infiniopTensorDescriptor_t c,
                                  uint64_t broadcast_ndim) {
    std::vector<uint64_t> broadcast_shape_(broadcast_ndim), padded_shape1_(broadcast_ndim), padded_shape2_(broadcast_ndim);
    auto broadcast_shape = broadcast_shape_.data(),
         padded_shape1 = padded_shape1_.data(),
         padded_shape2 = padded_shape2_.data();
    if (broadcast_ndim != c->ndim ||
        !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim,
                           broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) {
        return false;
    }
    return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim, c->shape);
}

// check if the shape of tensor src can be validly broadcasted to that of the tensor dst
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensorDescriptor_t src) {
    if (dst->ndim < src->ndim) {
        return false;
    }
    std::vector<size_t> padded_shape_(dst->ndim);
    auto padded_shape = padded_shape_.data();
    std::fill(padded_shape, padded_shape + dst->ndim, 1);
    std::copy(src->shape, src->shape + src->ndim, padded_shape + dst->ndim - src->ndim);
    for (size_t i = 0; i < dst->ndim; ++i) {
        if (padded_shape[i] != dst->shape[i] && padded_shape[i] != 1) {
            return false;
        }
    }
    return true;
}

// check if the shape of tensor c is valid after broadcasting tensors a and b
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
                                  infiniopTensorDescriptor_t b,
                                  infiniopTensorDescriptor_t c) {
    return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim));
}

inline size_t get_byte_size(infiniopTensorDescriptor_t desc) {
    size_t size = 1;
    for (size_t i = 0; i < desc->ndim; i++) {
        size *= desc->shape[i];
    }
    return size * infini_sizeof(desc->dtype);
}

// permute the dimensions of a tensor descriptor
inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, const std::vector<size_t> &order) {
    size_t ndim = desc->ndim;
    if (order.size() != ndim) {
        return nullptr;
    }
    size_t *shape = new size_t[ndim];
    int64_t *strides = new int64_t[ndim];
    for (size_t i = 0; i < ndim; i++) {
        if (std::find(order.begin(), order.end(), i) == order.end()) {
            return nullptr;
        }
        shape[i] = desc->shape[order[i]];
        strides[i] = desc->strides[order[i]];
    }
    return new InfiniopTensorDescriptor{desc->dtype, ndim, shape, strides};
}

// check if the dimensions [dim_start, dim_end] of a tensor descriptor are contiguous
inline bool is_contiguous(const infiniopTensorDescriptor_t &desc, size_t dim_start, size_t dim_end) {
    for (size_t i = dim_start + 1; i <= dim_end; i++) {
        if (desc->strides[i - 1] != static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
            return false;
        }
    }
    return true;
}

inline bool is_contiguous(const infiniopTensorDescriptor_t &desc) {
    if (desc->ndim == 0) {
        return true;
    }
    return is_contiguous(desc, 0, desc->ndim - 1);
}

// merge the dimensions [dim_start, dim_end] of a tensor descriptor
inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, size_t dim_start, size_t dim_end) {
    size_t ndim = desc->ndim;
    if (dim_start > dim_end || dim_end >= ndim) {
        return nullptr;
    }
    size_t new_ndim = ndim - (dim_end - dim_start);
    size_t *new_shape = new size_t[new_ndim];
    int64_t *new_strides = new int64_t[new_ndim];

    size_t index = 0;
    for (size_t i = 0; i < dim_start; i++) {
        new_shape[index] = desc->shape[i];
        new_strides[index] = desc->strides[i];
        index++;
    }
    if (!is_contiguous(desc, dim_start, dim_end)) {
        return nullptr;
    }
    new_shape[index] = 1;
    for (size_t i = dim_start; i <= dim_end; i++) {
        new_shape[index] *= desc->shape[i];
    }
    new_strides[index] = desc->strides[dim_end];
    index++;
    for (size_t i = dim_end + 1; i < ndim; i++) {
        new_shape[index] = desc->shape[i];
        new_strides[index] = desc->strides[i];
        index++;
    }

    return new InfiniopTensorDescriptor{desc->dtype, new_ndim, new_shape, new_strides};
}

// split the dimension dim of a tensor descriptor into multiple dimensions
inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, size_t dim, const std::vector<size_t> &dims) {
    size_t ndim = desc->ndim;
    if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies{})) {
        return nullptr;
    }
    size_t new_ndim = ndim + dims.size() - 1;
    size_t *new_shape = new size_t[new_ndim];
    int64_t *new_strides = new int64_t[new_ndim];

    size_t index = 0;
    for (size_t i = 0; i < dim; i++) {
        new_shape[index] = desc->shape[i];
        new_strides[index] = desc->strides[i];
        index++;
    }
    for (size_t i = 0; i < dims.size(); i++) {
        new_shape[index] = dims[i];
        new_strides[index] = desc->strides[dim] * desc->shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, 1, std::multiplies<size_t>());
        index++;
    }
    for (size_t i = dim + 1; i < ndim; i++) {
        new_shape[index] = desc->shape[i];
        new_strides[index] = desc->strides[i];
        index++;
    }

    return new InfiniopTensorDescriptor{desc->dtype, new_ndim, new_shape, new_strides};
}

#endif // __UTILS_H__
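A small illustrative check of getBroadcastShape (assuming this header and its include, "infiniop/tensor_descriptor.h", are on the include path; not part of the commit): shapes {2, 1, 3} and {4, 3} pad to {2, 1, 3} / {1, 4, 3} and broadcast to {2, 4, 3}, while mismatched non-1 extents fail.

#include <cassert>
#include <cstdint>
#include "utils.h"

int main() {
    uint64_t s1[3] = {2, 1, 3};
    uint64_t s2[2] = {4, 3};
    uint64_t out[3], p1[3], p2[3];
    assert(getBroadcastShape(s1, 3, s2, 2, out, p1, p2, 3));
    assert(out[0] == 2 && out[1] == 4 && out[2] == 3);

    uint64_t bad[2] = {5, 3};  // 4 vs 5 on the same axis cannot broadcast
    assert(!getBroadcastShape(s2, 2, bad, 2, out, p1, p2, 2));
    return 0;
}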
src/infiniop/tensor_descriptor.cc (new file)

#include "infiniop/tensor_descriptor.h"
#include <cstring>

__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr,
                                                             size_t ndim,
                                                             size_t const *shape_,
                                                             int64_t const *strides_,
                                                             infiniDtype_t datatype) {
    size_t *shape = new size_t[ndim];
    int64_t *strides = new int64_t[ndim];
    std::memcpy(shape, shape_, ndim * sizeof(size_t));
    if (strides_) {
        std::memcpy(strides, strides_, ndim * sizeof(int64_t));
    } else {
        int64_t dsize = 1;
        for (int i = ndim - 1; i >= 0; i--) {
            strides[i] = dsize;
            dsize *= shape[i];
        }
    }
    *desc_ptr = new InfiniopTensorDescriptor{datatype, ndim, shape, strides};
    return INFINIOP_STATUS_SUCCESS;
}

__C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc) {
    delete[] desc->shape;
    delete[] desc->strides;
    delete desc;
    return INFINIOP_STATUS_SUCCESS;
}
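A short sketch of the default-stride path above (illustrative; assumes the public header is on the include path): passing a null strides pointer yields contiguous row-major strides, computed from the innermost dimension outward.

#include <cassert>
#include "infiniop/tensor_descriptor.h"

int checkDefaultStrides() {
    size_t shape[3] = {2, 3, 4};
    infiniopTensorDescriptor_t desc;
    infiniopCreateTensorDescriptor(&desc, 3, shape, nullptr, INFINI_DTYPE_F32);
    // dsize accumulates 1, 4, 12 from the last dimension up,
    // so shape {2, 3, 4} gets strides {12, 4, 1}.
    assert(desc->strides[0] == 12 && desc->strides[1] == 4 && desc->strides[2] == 1);
    infiniopDestroyTensorDescriptor(desc);
    return 0;
}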