jerrrrry / infinicore · Commits · e77735ef

Commit e77735ef (unverified), authored Feb 25, 2025 by PanZezhong1725, committed by GitHub on Feb 25, 2025.

Merge pull request #65 from YdrMaster/main

issue/63: Refactor operator definition file structure and apply style changes

Parents: b7893d65, 3144cc9c

Changes: 40 files in this commit. This page lists the first 20 changed files, with 249 additions and 263 deletions (+249 -263).
include/infiniop/handle.h                            +1   -1
include/infiniop/operator.h                          +6   -3
include/infiniop/ops/attention.h                     +5   -5
include/infiniop/ops/mlp.h                           +5   -5
include/infiniop/tensor_descriptor.h                 +2   -2
scripts/format.py                                    +2   -2
src/infiniop/devices/ascend/tensor_aclnn.h           +1   -1
src/infiniop/devices/cpu/common_cpu.cc               +19  -9
src/infiniop/devices/cpu/common_cpu.h                +7   -6
src/infiniop/devices/cpu/cpu_handle.cc               +1   -1
src/infiniop/devices/cuda/common_cuda.cuh            +17  -14
src/infiniop/devices/cuda/cuda_handle.cu             +1   -1
src/infiniop/devices/handle.cc                       +5   -5
src/infiniop/operator.cc                             +15  -0
src/infiniop/ops/causal_softmax/operator.cc          +3   -3
src/infiniop/ops/matmul/ascend/matmul_aclnn.cc       +0   -151
src/infiniop/ops/matmul/ascend/matmul_aclnn.h        +0   -29
src/infiniop/ops/matmul/ascend/matmul_aclnn_api.h    +0   -25
src/infiniop/ops/matmul/ascend/matmul_ascend.cc      +151 -0
src/infiniop/ops/matmul/ascend/matmul_ascend.h       +8   -0
include/infiniop/handle.h  (+1 -1)

@@ -2,7 +2,7 @@
 #define __INFINIOP_HANDLE__
 
 #include "../infinicore.h"
-#include "./status.h"
+#include "status.h"
 
 typedef struct InfiniopHandle {
     infiniDevice_t device;
include/infiniop/operator.h  (+6 -3)

 #ifndef __INFINIOP_OPERATOR___
 #define __INFINIOP_OPERATOR___
 
-#include "./handle.h"
-#include "./tensor_descriptor.h"
+#include "handle.h"
+#include "tensor_descriptor.h"
 
 // Base descriptor for all operators
 typedef struct InfiniopDescriptor {
-    infiniDevice_t device;
+    infiniDevice_t device_type;
+    int device_id;
 } InfiniopDescriptor;
 
+__C __export infiniopStatus_t infiniopGetDescriptorDeviceType(const InfiniopDescriptor *desc_ptr, infiniDevice_t *device_type);
+__C __export infiniopStatus_t infiniopGetDescriptorDeviceId(const InfiniopDescriptor *desc_ptr, int *device_id);
 
 #endif //__INFINIOP_OPERATOR___
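The two new getters let callers read a descriptor's placement through the C API instead of reaching into the struct. A minimal sketch of a caller, assuming a descriptor has already been created by one of the infiniopCreate*Descriptor functions (the variable names and the dispatch comment are illustrative, not part of this commit):

    // Hypothetical caller: query where an existing operator descriptor lives.
    infiniDevice_t device_type;
    int device_id;
    if (infiniopGetDescriptorDeviceType((const InfiniopDescriptor *) desc, &device_type) == INFINIOP_STATUS_SUCCESS
        && infiniopGetDescriptorDeviceId((const InfiniopDescriptor *) desc, &device_id) == INFINIOP_STATUS_SUCCESS) {
        // e.g. pick the matching device queue before launching the operator
    }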
include/infiniop/ops/attention.h  (+5 -5)

@@ -2,8 +2,8 @@
 #define __INFINIOP_ATTENTION_H__
 
 #include "../operator.h"
-#include "./matmul.h"
-#include "./swiglu.h"
+#include "matmul.h"
+#include "swiglu.h"
 
 typedef InfiniopDescriptor *infiniopAttentionDescriptor_t;

@@ -23,9 +23,9 @@ __C __export infiniopStatus_t infiniopAttention(infiniopAttentionDescriptor_t de
                                                 void *workspace,
                                                 size_t workspace_size,
                                                 void *out,
-                                                void const *q,
-                                                void const *k,
-                                                void const *v,
+                                                const void *q,
+                                                const void *k,
+                                                const void *v,
                                                 void *k_cache,
                                                 void *v_cache,
                                                 void *stream);
include/infiniop/ops/mlp.h  (+5 -5)

@@ -2,8 +2,8 @@
 #define __INFINIOP_MLP_H__
 
 #include "../operator.h"
-#include "./matmul.h"
-#include "./swiglu.h"
+#include "matmul.h"
+#include "swiglu.h"
 
 typedef InfiniopDescriptor *infiniopMLPDescriptor_t;

@@ -22,9 +22,9 @@ __C __export infiniopStatus_t infiniopMLP(infiniopMLPDescriptor_t desc,
                                           void *workspace,
                                           size_t workspace_size,
                                           void *y,
-                                          void const *x,
-                                          void const *w12,
-                                          void const *w3,
+                                          const void *x,
+                                          const void *w12,
+                                          const void *w3,
                                           void *stream);
 
 __C __export infiniopStatus_t infiniopDestroyMLPDescriptor(infiniopMLPDescriptor_t desc);
include/infiniop/tensor_descriptor.h  (+2 -2)

@@ -2,7 +2,7 @@
 #define __INFINIOP_TENSOR_DESCRIPTOR__
 
 #include "../infinicore.h"
-#include "./status.h"
+#include "status.h"
 
 struct InfiniopTensorDescriptor {
     // Datatype

@@ -17,7 +17,7 @@ struct InfiniopTensorDescriptor {
 
 typedef struct InfiniopTensorDescriptor *infiniopTensorDescriptor_t;
 
-__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, size_t const *shape, ptrdiff_t const *strides, infiniDtype_t dtype);
+__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, const size_t *shape, const ptrdiff_t *strides, infiniDtype_t dtype);
 
 __C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc);
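The create call now takes const-qualified shape and stride arrays, so callers can hand in read-only data without a cast. A minimal sketch of a caller, using a made-up 4x8 row-major FP32 tensor as input (the values are illustrative only):

    // Hypothetical example: describe a contiguous 4x8 FP32 matrix.
    const size_t shape[2] = {4, 8};
    const ptrdiff_t strides[2] = {8, 1}; // row-major: a row advances by 8 elements
    infiniopTensorDescriptor_t tensor_desc;
    if (infiniopCreateTensorDescriptor(&tensor_desc, 2, shape, strides, INFINI_DTYPE_F32) == INFINIOP_STATUS_SUCCESS) {
        // ... pass tensor_desc to an operator's create function ...
        infiniopDestroyTensorDescriptor(tensor_desc);
    }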
scripts/format.py  (+2 -2)

@@ -91,7 +91,7 @@ def git_added_files():
     try:
         # Use `git diff --cached --name-only` to get all files that have been staged
         result = subprocess.run(
-            ["git", "diff", "--cached", "--name-only"],
+            ["git", "diff", "--cached", "--diff-filter=AMR", "--name-only"],
            capture_output=True,
            text=True,
            check=True,

@@ -162,7 +162,7 @@ def main():
     if args.ref is None and args.path is None:
         # Last commit.
-        print("{Fore.GREEN}Formating git added files.{Style.RESET_ALL}")
+        print(f"{Fore.GREEN}Formating git added files.{Style.RESET_ALL}")
         files = git_added_files()
     else:
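Two small behavior notes on this file: `--diff-filter=AMR` limits the staged-file listing to Added, Modified, and Renamed paths, so entries for deleted files are no longer handed to the formatter; and the second change adds the missing f-string prefix so the Fore/Style color codes are interpolated instead of being printed literally.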
src/infiniop/devices/ascend/tensor_aclnn.h  (+1 -1)

 #ifndef __ACLNN_TENSOR__
 #define __ACLNN_TENSOR__
 
-#include "./common_ascend.h"
+#include "common_ascend.h"
 #include "infiniop/operator.h"
 
 #include <acl/acl.h>
 #include <acl/acl_base.h>
src/infiniop/devices/cpu/common_cpu.cc  (+19 -9)

-#include "./common_cpu.h"
+#include "common_cpu.h"
 
 float f16_to_f32(uint16_t h) {
     uint32_t sign = (h & 0x8000) << 16;

@@ -59,9 +59,11 @@ uint16_t f32_to_f16(float val) {
     }
 }
 
-size_t indexToReducedOffset(size_t flat_index, size_t ndim,
-                            ptrdiff_t const *broadcasted_strides,
-                            ptrdiff_t const *target_strides) {
+size_t indexToReducedOffset(
+    size_t flat_index,
+    size_t ndim,
+    const ptrdiff_t *broadcasted_strides,
+    const ptrdiff_t *target_strides) {
     size_t res = 0;
     for (size_t i = 0; i < ndim; ++i) {
         res += flat_index / broadcasted_strides[i] * target_strides[i];

@@ -70,8 +72,11 @@ size_t indexToReducedOffset(size_t flat_index, size_t ndim,
     return res;
 }
 
-size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape,
-                     ptrdiff_t const *strides) {
+size_t indexToOffset(
+    size_t flat_index,
+    size_t ndim,
+    const size_t *shape,
+    const ptrdiff_t *strides) {
     size_t res = 0;
     for (size_t i = ndim; i-- >= 0;) {
         res += (flat_index % shape[i]) * strides[i];

@@ -80,7 +85,10 @@ size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape,
     return res;
 }
 
-size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads) {
+size_t getPaddedSize(
+    size_t ndim,
+    size_t *shape,
+    const size_t *pads) {
     size_t total_size = 1;
     for (size_t i = 0; i < ndim; ++i) {
         total_size *= shape[i] + (i < 2 ? 0 : 2 * pads[i - 2]);

@@ -88,8 +96,10 @@ size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads) {
     return total_size;
 }
 
-std::vector<size_t> getPaddedShape(size_t ndim, size_t const *shape,
-                                   size_t const *pads) {
+std::vector<size_t> getPaddedShape(
+    size_t ndim,
+    const size_t *shape,
+    const size_t *pads) {
     std::vector<size_t> padded_shape(ndim);
     memcpy(padded_shape.data(), shape, ndim * sizeof(size_t));
     for (size_t i = 2; i < ndim; ++i) {
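indexToOffset turns a flat element index into a memory offset by decomposing it into per-dimension coordinates and weighting each coordinate by the tensor's stride. A small worked example as a hypothetical caller (the shape, strides, and index are made up; the function is the one declared in common_cpu.h):

    // For a 2x3 tensor with row-major strides {3, 1}, flat index 4 is element (1, 1),
    // so its offset is 1 * 3 + 1 * 1 = 4 elements from the start of the buffer.
    const size_t shape[2] = {2, 3};
    const ptrdiff_t strides[2] = {3, 1};
    size_t offset = indexToOffset(4, 2, shape, strides); // expected: 4 for this contiguous layout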
src/infiniop/devices/cpu/common_cpu.h  (+7 -6)

-#ifndef __INFINIOP__COMMON_CPU_H__
-#define __INFINIOP__COMMON_CPU_H__
+#ifndef __INFINIOP_COMMON_CPU_H__
+#define __INFINIOP_COMMON_CPU_H__
 
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
 #include <vector>

@@ -13,18 +14,18 @@ float f16_to_f32(uint16_t code);
 uint16_t f32_to_f16(float val);
 
 // return the memory offset of original tensor, given the flattened index of broadcasted tensor
-size_t indexToReducedOffset(size_t flat_index, size_t ndim, ptrdiff_t const *broadcasted_strides, ptrdiff_t const *target_strides);
+size_t indexToReducedOffset(size_t flat_index, size_t ndim, const ptrdiff_t *broadcasted_strides, const ptrdiff_t *target_strides);
 
 // return the memory offset a tensor given flattened index
-size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape, ptrdiff_t const *strides);
+size_t indexToOffset(size_t flat_index, size_t ndim, const size_t *shape, const ptrdiff_t *strides);
 
 /**
  * get the total array size (element count) after applying padding for a
  * ndim-ary tensor with the given shape
  */
-size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads);
+size_t getPaddedSize(size_t ndim, size_t *shape, const size_t *pads);
 
 // calculate the padded shape and store the result in padded_shape
-std::vector<size_t> getPaddedShape(size_t ndim, size_t const *shape, size_t const *pads);
+std::vector<size_t> getPaddedShape(size_t ndim, const size_t *shape, const size_t *pads);
 
 #endif // __INFINIOP__COMMON_CPU_H__
src/infiniop/devices/cpu/cpu_handle.cc  (+1 -1)

-#include "./cpu_handle.h"
+#include "cpu_handle.h"
 
 infiniopStatus_t createCpuHandle(infiniopCpuHandle_t *handle_ptr) {
     *handle_ptr = new InfiniopHandle{INFINI_DEVICE_CPU, 0};
src/infiniop/devices/cuda/common_cuda.cuh  (+17 -14)

@@ -48,26 +48,25 @@ struct InfiniopCudaHandle {
 };
 
 template <typename T>
-void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool, int device_id, cudaStream_t stream, T const &f) {
-    auto handle = cublas_handle_pool->pop();
+void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> &pool, cudaStream_t stream, const T &f) {
+    auto handle = pool->pop();
     if (!handle) {
         cublasCreate(&(*handle));
     }
-    cublasSetStream(*handle, (cudaStream_t) stream);
+    cublasSetStream(*handle, stream);
     f(*handle);
-    cublas_handle_pool->push(std::move(*handle));
+    pool->push(std::move(*handle));
 }
 
 template <typename T>
-cudnnStatus_t use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handle_pool, int device_id, cudaStream_t stream, T const &f) {
-    auto handle = cudnn_handle_pool->pop();
+void use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> &pool, cudaStream_t stream, const T &f) {
+    auto handle = pool->pop();
     if (!handle) {
         cudnnCreate(&(*handle));
     }
     cudnnSetStream(*handle, stream);
-    cudnnStatus_t status = f(*handle);
-    cudnn_handle_pool->push(std::move(*handle));
-    return status;
+    f(*handle);
+    pool->push(std::move(*handle));
 }
 
 inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {

@@ -96,8 +95,10 @@ inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
 // return the memory offset of original tensor, given the flattened index of
 // broadcasted tensor
 inline __device__ __host__ size_t indexToReducedOffset(
-    size_t flat_index, size_t ndim, ptrdiff_t const *broadcasted_strides,
-    ptrdiff_t const *target_strides) {
+    size_t flat_index,
+    size_t ndim,
+    const ptrdiff_t *broadcasted_strides,
+    const ptrdiff_t *target_strides) {
     size_t res = 0;
     for (size_t i = 0; i < ndim; ++i) {
         res += flat_index / broadcasted_strides[i] * target_strides[i];

@@ -107,9 +108,11 @@ inline __device__ __host__ size_t indexToReducedOffset(
 }
 
 // get the memory offset of the given element in a tensor given its flat index
-inline __device__ __host__ size_t indexToOffset(size_t flat_index, size_t ndim,
-                                                size_t const *shape,
-                                                ptrdiff_t const *strides) {
+inline __device__ __host__ size_t indexToOffset(
+    size_t flat_index,
+    size_t ndim,
+    const size_t *shape,
+    const ptrdiff_t *strides) {
     size_t res = 0;
     for (size_t i = ndim; i-- > 0;) {
         res += (flat_index % shape[i]) * strides[i];
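After the refactor, both helpers take the handle pool by reference and no longer need a device id, so a caller supplies just the pool, the stream, and a callable that receives the borrowed handle. A minimal sketch of a caller built on the new use_cublas signature; the wrapper function name, the SGEMM call, and the column-major layout are assumptions for illustration, not code from this commit:

    // Hypothetical helper: borrow a cuBLAS handle from the pool, run one SGEMM, return it.
    void sgemm_example(std::shared_ptr<Pool<cublasHandle_t>> &pool, cudaStream_t stream,
                       int m, int n, int k, const float *A, const float *B, float *C) {
        float alpha = 1.0f, beta = 0.0f;
        use_cublas(pool, stream, [&](cublasHandle_t h) {
            // Column-major C (m x n) = A (m x k) * B (k x n), all buffers on the device
            cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                        &alpha, A, m, B, k, &beta, C, m);
        });
    }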
src/infiniop/devices/cuda/cuda_handle.cu  (+1 -1)

-#include "./common_cuda.cuh"
+#include "common_cuda.cuh"
 
 infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, infiniDevice_t cuda_device_type) {
     // Create a new cublas handle pool
src/infiniop/devices/handle.cc  (+5 -5)

 #include "infiniop/handle.h"
 
 #ifdef ENABLE_CPU_API
-#include "./cpu/cpu_handle.h"
+#include "cpu/cpu_handle.h"
 #endif
 #ifdef ENABLE_CUDA_API
-#include "./cuda/cuda_handle.h"
+#include "cuda/cuda_handle.h"
 #endif
 #ifdef ENABLE_CAMBRICON_API
-#include "./bang/bang_handle.h"
+#include "bang/bang_handle.h"
 #endif
 #ifdef ENABLE_ASCEND_API
-#include "./ascend/ascend_handle.h"
+#include "ascend/ascend_handle.h"
 #endif
 
 __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr,
src/infiniop/operator.cc  (new file, +15 -0)

#include "infiniop/operator.h"

infiniopStatus_t infiniopGetDescriptorDeviceType(const InfiniopDescriptor *desc_ptr, infiniDevice_t *device_type) {
    *device_type = desc_ptr->device_type;
    return INFINIOP_STATUS_SUCCESS;
}

infiniopStatus_t infiniopGetDescriptorDeviceId(const InfiniopDescriptor *desc_ptr, int *device_id) {
    *device_id = desc_ptr->device_id;
    return INFINIOP_STATUS_SUCCESS;
}
src/infiniop/ops/causal_softmax/operator.cc  (+3 -3)

@@ -41,7 +41,7 @@ __C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(
 }
 
 __C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, size_t *size) {
-    switch (desc->device) {
+    switch (desc->device_type) {
 #ifdef ENABLE_CPU
     case DevCpu:
         return cpuGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCpuDescriptor_t) desc, size);

@@ -79,7 +79,7 @@ __C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmax
 }
 
 __C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, void *workspace, size_t workspace_size, void *data, void *stream) {
-    switch (desc->device) {
+    switch (desc->device_type) {
 #ifdef ENABLE_CPU
     case DevCpu:
         return cpuCausalSoftmax((CausalSoftmaxCpuDescriptor_t) desc, workspace, workspace_size, data, stream);

@@ -116,7 +116,7 @@ __C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t des
 }
 
 __C infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc) {
-    switch (desc->device) {
+    switch (desc->device_type) {
 #ifdef ENABLE_CPU
     case DevCpu:
         return cpuDestroyCausalSoftmaxDescriptor((CausalSoftmaxCpuDescriptor_t) desc);
src/infiniop/ops/matmul/ascend/matmul_aclnn.cc  (deleted, +0 -151)

#include "matmul_aclnn.h"

InfiniopMatmulAclnnDescriptor::InfiniopMatmulAclnnDescriptor(infiniDevice_t _device) {
    device = _device;
    device_id = 0;
    executor = nullptr;
    info = nullptr;
    cDesc = new aclnnTensorDescriptor();
    aDesc = new aclnnTensorDescriptor();
    bDesc = new aclnnTensorDescriptor();
    mt = 1;
    workspaceSize = 0;
}

infiniopStatus_t aclnnCreateMatmulDescriptor(infiniopAscendHandle_t handle,
                                             MatmulAclnnDescriptor_t *desc_ptr,
                                             infiniopTensorDescriptor_t c_desc,
                                             infiniopTensorDescriptor_t a_desc,
                                             infiniopTensorDescriptor_t b_desc,
                                             int8_t mt) {
    infiniDtype_t dtype = c_desc->dtype;
    if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
        return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
    }

    *desc_ptr = new InfiniopMatmulAclnnDescriptor(handle->device);
    (*desc_ptr)->device_id = handle->device_id;
    (*desc_ptr)->dtype = dtype;
    (*desc_ptr)->mt = mt;

    infiniopStatus_t status;
    auto info = new MatmulInfo(c_desc, a_desc, b_desc, &status, false);
    if (status != INFINIOP_STATUS_SUCCESS) {
        return status;
    }
    (*desc_ptr)->info = info;

    auto &cDesc = (*desc_ptr)->cDesc;
    auto &aDesc = (*desc_ptr)->aDesc;
    auto &bDesc = (*desc_ptr)->bDesc;

    // Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched
    // operation
    CHECK_STATUS(cDesc->setDescriptor(toAclDataType(c_desc->dtype),
                                      {static_cast<int64_t>(info->c_matrix.rows), static_cast<int64_t>(info->c_matrix.cols)},
                                      {info->c_matrix.row_stride, info->c_matrix.col_stride}),
                 INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(aDesc->setDescriptor(toAclDataType(a_desc->dtype),
                                      {static_cast<int64_t>(info->a_matrix.rows), static_cast<int64_t>(info->a_matrix.cols)},
                                      {info->a_matrix.row_stride, info->a_matrix.col_stride}),
                 INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(bDesc->setDescriptor(toAclDataType(b_desc->dtype),
                                      {static_cast<int64_t>(info->b_matrix.rows), static_cast<int64_t>(info->b_matrix.cols)},
                                      {info->b_matrix.row_stride, info->b_matrix.col_stride}),
                 INFINIOP_STATUS_SUCCESS);

    CHECK_STATUS(cDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(aDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(bDesc->createTensor(), INFINIOP_STATUS_SUCCESS);

    auto &workspaceSize = (*desc_ptr)->workspaceSize;
    auto &executor = (*desc_ptr)->executor;

    aclTensor *tc = cDesc->t;
    aclTensor *ta = aDesc->t;
    aclTensor *tb = bDesc->t;

    aclnnStatus ret;
    int64_t transA = 0;
    int64_t transB = 0;

    // aclnnGemm support C = alpha * A @ B + beta * C
    // see
    // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
    // use alpha = 0.5, beta = 0.5 temporarily
    ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, 0.5f, 0.5f, transA, transB, tc,
                                    (*desc_ptr)->mt, &workspaceSize, &executor);
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
              return INFINIOP_STATUS_INTERNAL_ERROR);
    aclSetAclOpExecutorRepeatable(executor);

    return INFINIOP_STATUS_SUCCESS;
}

infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc, size_t *size) {
    *size = desc->workspaceSize;
    return INFINIOP_STATUS_SUCCESS;
}

infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, void *workspace,
                             size_t workspace_size, void *c, void const *a,
                             void const *b, float alpha, float beta, void *stream) {
    auto &cDesc = desc->cDesc;
    auto &aDesc = desc->aDesc;
    auto &bDesc = desc->bDesc;

    aclTensor *tc = cDesc->t;
    aclTensor *ta = aDesc->t;
    aclTensor *tb = bDesc->t;

    auto batch = desc->info->batch;

    size_t workspaceSize;
    aclnnStatus ret;
    ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, alpha, beta, 0, 0, tc, desc->mt,
                                    &workspaceSize, &(desc->executor));
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
              return INFINIOP_STATUS_INTERNAL_ERROR);
    if (workspace_size < workspaceSize) {
        return INFINIOP_STATUS_INSUFFICIENT_WORKSPACE;
    }
    aclSetAclOpExecutorRepeatable(desc->executor);

    for (size_t i = 0; i < batch; i++) {
        AclSetTensorAddr(desc->executor, 0, ta, (char *) (a) + i * desc->info->a_matrix.stride * infiniSizeof(desc->dtype));
        AclSetTensorAddr(desc->executor, 1, tb, (char *) (b) + i * desc->info->b_matrix.stride * infiniSizeof(desc->dtype));
        AclSetTensorAddr(desc->executor, 2, tc, (char *) (c) + i * desc->info->c_matrix.stride * infiniSizeof(desc->dtype));
        AclSetTensorAddr(desc->executor, 3, tc, (char *) (c) + i * desc->info->c_matrix.stride * infiniSizeof(desc->dtype));

        ret = aclnnGemm(workspace, workspaceSize, desc->executor, stream);
        CHECK_RET(ret == ACL_SUCCESS,
                  LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
                  return INFINIOP_STATUS_INTERNAL_ERROR);
    }

    return INFINIOP_STATUS_SUCCESS;
}

infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc) {
    delete desc->cDesc;
    delete desc->bDesc;
    delete desc->aDesc;
    delete desc->info;

    aclDestroyAclOpExecutor(desc->executor);
    delete desc;

    return INFINIOP_STATUS_SUCCESS;
}
src/infiniop/ops/matmul/ascend/matmul_aclnn.h  (deleted, +0 -29)

#ifndef __ACLNN_MATMUL_H__
#define __ACLNN_MATMUL_H__

#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "../blas.h"
#include "matmul_aclnn_api.h"

#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>

struct InfiniopMatmulAclnnDescriptor {
    infiniDevice_t device;
    int device_id;
    aclOpExecutor *executor;
    MatmulInfo *info;
    infiniDtype_t dtype;
    aclnnTensorDescriptor_t cDesc, aDesc, bDesc;
    // cubeMathType
    // see doc:
    // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
    int8_t mt;
    size_t workspaceSize;

    InfiniopMatmulAclnnDescriptor(infiniDevice_t _device);
};

#endif
src/infiniop/ops/matmul/ascend/matmul_aclnn_api.h  (deleted, +0 -25)

#ifndef __INFINIOP_MATMUL_ACLNN_API_H__
#define __INFINIOP_MATMUL_ACLNN_API_H__

#include "../../../devices/ascend/ascend_handle.h"
#include "infiniop/operator.h"

struct InfiniopMatmulAclnnDescriptor;
typedef struct InfiniopMatmulAclnnDescriptor *MatmulAclnnDescriptor_t;

infiniopStatus_t aclnnCreateMatmulDescriptor(infiniopAscendHandle_t handle,
                                             MatmulAclnnDescriptor_t *desc_ptr,
                                             infiniopTensorDescriptor_t c_desc,
                                             infiniopTensorDescriptor_t a_desc,
                                             infiniopTensorDescriptor_t b_desc,
                                             int8_t cubeMathType);

infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc, size_t *size);

infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, void *workspace,
                             size_t workspace_size, void *c, const void *a,
                             const void *b, float alpha, float beta, void *stream);

infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc);

#endif // __INFINIOP_MATMUL_ACLNN_API_H__
src/infiniop/ops/matmul/ascend/matmul_ascend.cc  (new file, +151 -0)

#include "matmul_ascend.h"
#include "../../../devices/ascend/ascend_handle.h"
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>

namespace matmul::ascend {

struct Descriptor::Opaque {
    mutable aclOpExecutor *executor;
    aclnnTensorDescriptor_t c, a, b;
    // cubeMathType
    // see doc:
    // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
    int8_t mt;

    ~Opaque() {
        delete c;
        delete a;
        delete b;
        aclDestroyAclOpExecutor(executor);
    }
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniopStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {
    auto handle = reinterpret_cast<infiniopAscendHandle_t>(handle_);
    auto dtype = c_desc->dtype;

    if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
        return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
    }

    infiniopStatus_t status;
    auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR);
    if (status != INFINIOP_STATUS_SUCCESS) {
        return status;
    }

    auto c = new aclnnTensorDescriptor(),
         a = new aclnnTensorDescriptor(),
         b = new aclnnTensorDescriptor();
    // Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched
    // operation
    CHECK_STATUS(c->setDescriptor(toAclDataType(c_desc->dtype),
                                  {static_cast<int64_t>(info.c_matrix.rows), static_cast<int64_t>(info.c_matrix.cols)},
                                  {info.c_matrix.row_stride, info.c_matrix.col_stride}),
                 INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(a->setDescriptor(toAclDataType(a_desc->dtype),
                                  {static_cast<int64_t>(info.a_matrix.rows), static_cast<int64_t>(info.a_matrix.cols)},
                                  {info.a_matrix.row_stride, info.a_matrix.col_stride}),
                 INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(b->setDescriptor(toAclDataType(b_desc->dtype),
                                  {static_cast<int64_t>(info.b_matrix.rows), static_cast<int64_t>(info.b_matrix.cols)},
                                  {info.b_matrix.row_stride, info.b_matrix.col_stride}),
                 INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(c->createTensor(), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(a->createTensor(), INFINIOP_STATUS_SUCCESS);
    CHECK_STATUS(b->createTensor(), INFINIOP_STATUS_SUCCESS);

    auto tc = c->t, ta = a->t, tb = b->t;

    aclOpExecutor *executor;
    size_t workspace_size;
    // aclnnGemm support C = alpha * A @ B + beta * C
    // see
    // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
    // use alpha = 0.5, beta = 0.5 temporarily
    int8_t mt = 1;
    auto ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, .5, .5, 0, 0, tc, mt,
                                         &workspace_size, &executor);
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
              return INFINIOP_STATUS_INTERNAL_ERROR);
    aclSetAclOpExecutorRepeatable(executor);

    *desc_ptr = new Descriptor(
        dtype, info, workspace_size,
        new Opaque{
            executor,
            c,
            a,
            b,
            mt,
        },
        handle->device, handle->device_id);
    return INFINIOP_STATUS_SUCCESS;
}

infiniopStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspaceSize_,
    void *c,
    float beta,
    const void *a,
    const void *b,
    float alpha,
    void *stream) const {
    auto tc = _opaque->c->t,
         ta = _opaque->a->t,
         tb = _opaque->b->t;

    size_t workspace_size;
    auto ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, alpha, beta, 0, 0, tc,
                                         _opaque->mt, &workspace_size,
                                         &(_opaque->executor));
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
              return INFINIOP_STATUS_INTERNAL_ERROR);
    if (workspaceSize_ < workspace_size) {
        return INFINIOP_STATUS_INSUFFICIENT_WORKSPACE;
    }
    aclSetAclOpExecutorRepeatable(_opaque->executor);

    auto unit = infiniSizeof(_dtype);
    for (size_t i = 0; i < _info.batch; ++i) {
        AclSetTensorAddr(_opaque->executor, 0, ta, ((char *) a) + i * _info.a_matrix.stride * unit);
        AclSetTensorAddr(_opaque->executor, 1, tb, ((char *) b) + i * _info.b_matrix.stride * unit);
        AclSetTensorAddr(_opaque->executor, 2, tc, ((char *) c) + i * _info.c_matrix.stride * unit);
        AclSetTensorAddr(_opaque->executor, 3, tc, ((char *) c) + i * _info.c_matrix.stride * unit);

        ret = aclnnGemm(workspace, workspace_size, _opaque->executor, stream);
        CHECK_RET(ret == ACL_SUCCESS,
                  LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
                  return INFINIOP_STATUS_INTERNAL_ERROR);
    }

    return INFINIOP_STATUS_SUCCESS;
}

} // namespace matmul::ascend
src/infiniop/ops/matmul/ascend/matmul_ascend.h  (new file, +8 -0)

#ifndef __MATMUL_ASCEND_H__
#define __MATMUL_ASCEND_H__

#include "../matmul.h"

DESCRIPTOR(ascend)

#endif // __MATMUL_ASCEND_H__
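Taken together, the matmul changes replace the three aclnn-specific files (matmul_aclnn.cc, matmul_aclnn.h, matmul_aclnn_api.h), whose descriptor struct exposed the aclOpExecutor and aclnn tensor descriptors directly, with a single matmul_ascend.cc/.h pair: the backend state now lives in a private matmul::ascend::Descriptor::Opaque struct that cleans up after itself in its destructor, and the public header shrinks to a DESCRIPTOR(ascend) instantiation (the macro itself is defined in ../matmul.h, which is not shown on this page). The free functions aclnnCreateMatmulDescriptor / aclnnMatmul / aclnnDestroyMatmulDescriptor become Descriptor::create, Descriptor::calculate, and the destructor, so no Ascend types leak into the shared operator interface.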