Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
8e83bdca
Commit
8e83bdca
authored
Feb 12, 2025
by
PanZezhong
Browse files
feat: cambricon matmul, add fp32
parent
58c0de0c
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
350 additions
and
257 deletions
+350
-257
src/infiniop/devices/bang/bang_handle.cc
src/infiniop/devices/bang/bang_handle.cc
+8
-1
src/infiniop/devices/bang/bang_handle.h
src/infiniop/devices/bang/bang_handle.h
+4
-23
src/infiniop/devices/bang/common_bang.h
src/infiniop/devices/bang/common_bang.h
+71
-32
src/infiniop/devices/handle.cc
src/infiniop/devices/handle.cc
+5
-6
src/infiniop/ops/matmul/bang/matmul_cnnl.cc
src/infiniop/ops/matmul/bang/matmul_cnnl.cc
+59
-62
src/infiniop/ops/matmul/bang/matmul_cnnl.h
src/infiniop/ops/matmul/bang/matmul_cnnl.h
+15
-30
src/infiniop/ops/matmul/bang/matmul_cnnl_api.h
src/infiniop/ops/matmul/bang/matmul_cnnl_api.h
+26
-0
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
+28
-24
src/infiniop/ops/matmul/cpu/matmul_cpu.h
src/infiniop/ops/matmul/cpu/matmul_cpu.h
+12
-17
src/infiniop/ops/matmul/operator.cc
src/infiniop/ops/matmul/operator.cc
+71
-60
xmake.lua
xmake.lua
+2
-2
xmake/bang.lua
xmake/bang.lua
+49
-0
No files found.
src/infiniop/devices/bang/bang_handle.cc
View file @
8e83bdca
#include "bang_handle.h"
#include "common_bang.h"
#include <memory>
#include "../pool.h"
infiniopStatus_t
createBangHandle
(
infiniopBangHandle_t
*
handle_ptr
,
int
device_id
)
{
unsigned
int
device_count
;
...
...
@@ -19,3 +21,8 @@ infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_i
return
INFINIOP_STATUS_SUCCESS
;
}
// Releases a Bang device handle.
//
// The handle object is freed with `delete` and the call always reports
// INFINIOP_STATUS_SUCCESS.
// NOTE(review): presumably `handle` was allocated with `new` by
// createBangHandle -- confirm against that function's body.
infiniopStatus_t deleteBangHandle(infiniopBangHandle_t handle) {
    delete handle;
    return INFINIOP_STATUS_SUCCESS;
}
src/infiniop/devices/bang/bang_handle.h
View file @
8e83bdca
#ifndef BANG_HANDLE_H
#define BANG_HANDLE_H
#include "../pool.h"
#include "cnnl.h"
#include "cnrt.h"
#include "infiniop/handle.h"
#include <memory>
struct
InfiniopBangHandle
{
infiniDevice_t
device
;
int
device_id
;
std
::
shared_ptr
<
Pool
<
cnnlHandle_t
>>
cnnl_handles
;
};
struct
InfiniopBangHandle
;
typedef
struct
InfiniopBangHandle
*
infiniopBangHandle_t
;
infiniopStatus_t
createBangHandle
(
infiniopBangHandle_t
*
handle_ptr
,
int
device_id
);
template
<
typename
T
>
void
use_cnnl
(
std
::
shared_ptr
<
Pool
<
cnnlHandle_t
>>
&
pool
,
int
device_id
,
cnrtQueue_t
queue
,
T
const
&
f
)
{
auto
handle
=
pool
->
pop
();
if
(
!
handle
)
{
cnrtSetDevice
(
device_id
);
cnnlCreate
(
&
(
*
handle
));
}
cnnlSetQueue
(
*
handle
,
(
cnrtQueue_t
)
queue
);
f
(
*
handle
);
pool
->
push
(
std
::
move
(
*
handle
));
}
infiniopStatus_t
createBangHandle
(
infiniopBangHandle_t
*
handle_ptr
,
int
device_id
);
infiniopStatus_t
deleteBangHandle
(
infiniopBangHandle_t
handle
);
#endif
src/infiniop/devices/bang/common_bang.h
View file @
8e83bdca
#ifndef __COMMON_BANG_H__
#define __COMMON_BANG_H__
#include "../pool.h"
#include "bang_handle.h"
#include "cnnl.h"
#include "infinicore.h"
#include "cnrt.h"
#include "infiniop/tensor_descriptor.h"
#include <memory>
#include <vector>
const
int
NRAM_MAX_SIZE
=
1024
*
256
;
//the maximum NRAM memory is 1024 * 768
const
int
GD
RAM_MAX_SIZE
=
1024
*
1024
*
1024
;
//
the maximum NRAM memory is 1024 * 768
#define N
RAM_MAX_SIZE
(
1024 *
256)
// set cnnl tensor descriptor without strides11
inline
void
setCnnlTensor
(
cnnlTensorDescriptor_t
desc
,
const
TensorDescriptor
*
layout
)
{
std
::
vector
<
int
>
dims
(
layout
->
ndim
);
for
(
uint64_t
i
=
0
;
i
<
layout
->
ndim
;
i
++
)
{
dims
[
i
]
=
static_cast
<
int
>
(
layout
->
shape
[
i
]);
}
cnnlSetTensorDescriptor
(
desc
,
CNNL_LAYOUT_ARRAY
,
CNNL_DTYPE_HALF
,
dims
.
size
(),
dims
.
data
());
}
#define GDRAM_MAX_SIZE (1024 * 1024 * 1024)
// set cnnl tensor descriptor with strides
inline
void
setCnnlTensorEx
(
cnnlTensorDescriptor_t
desc
,
const
TensorDescriptor
*
layout
)
{
std
::
vector
<
int
>
dim_size
(
layout
->
ndim
),
dim_stride
(
layout
->
ndim
);
for
(
uint64_t
i
=
0
;
i
<
layout
->
ndim
;
i
++
)
{
dim_size
[
i
]
=
static_cast
<
int
>
(
layout
->
shape
[
i
]);
dim_stride
[
i
]
=
static_cast
<
int
>
(
layout
->
strides
[
i
]
/
layout
->
dt
.
size
);
}
cnnlSetTensorDescriptorEx
(
desc
,
CNNL_LAYOUT_ARRAY
,
CNNL_DTYPE_HALF
,
dim_size
.
size
(),
dim_size
.
data
(),
dim_stride
.
data
());
}
struct
InfiniopBangHandle
{
infiniDevice_t
device
;
int
device_id
;
std
::
shared_ptr
<
Pool
<
cnnlHandle_t
>>
cnnl_handles
;
};
inline
cnnlDataType_t
cnnlDataTypeConvert
(
infiniDtype_t
dataType
)
{
if
(
dtype_eq
(
dataType
,
INFINI_DTYPE_F32
))
{
switch
(
dataType
)
{
case
INFINI_DTYPE_F32
:
return
CNNL_DTYPE_FLOAT
;
}
else
if
(
dtype_eq
(
dataType
,
INFINI_DTYPE_F64
))
{
case
INFINI_DTYPE_F64
:
return
CNNL_DTYPE_DOUBLE
;
}
else
if
(
dtype_eq
(
dataType
,
INFINI_DTYPE_F16
))
{
case
INFINI_DTYPE_F16
:
return
CNNL_DTYPE_HALF
;
}
else
if
(
dtype_eq
(
dataType
,
INFINI_DTYPE_I8
))
{
case
INFINI_DTYPE_I8
:
return
CNNL_DTYPE_INT8
;
}
else
if
(
dtype_eq
(
dataType
,
INFINI_DTYPE_I32
))
{
case
INFINI_DTYPE_I32
:
return
CNNL_DTYPE_INT32
;
}
else
if
(
dtype_eq
(
dataType
,
INFINI_DTYPE_U8
))
{
case
INFINI_DTYPE_U8
:
return
CNNL_DTYPE_UINT8
;
}
else
if
(
dtype_eq
(
dataType
,
INFINI_DTYPE_BF16
))
{
case
INFINI_DTYPE_BF16
:
return
CNNL_DTYPE_BFLOAT16
;
}
else
if
(
dtype_eq
(
dataType
,
INFINI_DTYPE_I64
))
{
case
INFINI_DTYPE_I64
:
return
CNNL_DTYPE_INT64
;
}
else
{
default:
return
CNNL_DTYPE_INVALID
;
}
}
#endif// __COMMON_BANG_H__
// Runs `f` with a cnnl handle borrowed from `pool` and bound to `queue`.
//
// A handle is popped from the pool; when the pool has none to offer, a new
// one is created with cnnlCreate. The handle is attached to `queue` with
// cnnlSetQueue, handed to `f`, and finally pushed back into the pool.
//
//   pool  - shared pool of cnnl handles; it is dereferenced, so it must not
//           be null.
//   queue - cnrt queue the handle is bound to before `f` runs.
//   f     - callable invoked as f(cnnlHandle_t).
//
// NOTE(review): the status codes returned by cnnlCreate / cnnlSetQueue are
// not checked here; this function returns void, so a failure cannot be
// reported to the caller without an interface change.
template <typename T>
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, cnrtQueue_t queue, T const &f) {
    auto pooled = pool->pop();
    cnnlHandle_t handle{};
    if (pooled) {
        handle = *pooled;
    } else {
        // Create the handle in a real local object. The empty result of
        // pop() must never be dereferenced to obtain storage for it.
        cnnlCreate(&handle);
    }
    cnnlSetQueue(handle, queue);
    f(handle);
    pool->push(std::move(handle));
}
// Runs `f` with a cnnl handle borrowed from `pool`, without binding a queue.
//
// A handle is popped from the pool; when the pool has none to offer, a new
// one is created with cnnlCreate. The handle is handed to `f` and then pushed
// back into the pool.
//
//   pool - shared pool of cnnl handles; it is dereferenced, so it must not
//          be null.
//   f    - callable invoked as f(cnnlHandle_t).
//
// NOTE(review): the status code returned by cnnlCreate is not checked here;
// this function returns void, so a failure cannot be reported to the caller
// without an interface change.
template <typename T>
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, T const &f) {
    auto pooled = pool->pop();
    cnnlHandle_t handle{};
    if (pooled) {
        handle = *pooled;
    } else {
        // Create the handle in a real local object. The empty result of
        // pop() must never be dereferenced to obtain storage for it.
        cnnlCreate(&handle);
    }
    f(handle);
    pool->push(std::move(handle));
}
// Set a cnnl tensor descriptor without strides.
//
// Copies every extent of `layout->shape` into an int vector and configures
// `desc` as a CNNL_LAYOUT_ARRAY tensor whose element type is the cnnl
// equivalent of `layout->dtype` (as mapped by cnnlDataTypeConvert).
//
//   desc   - cnnl descriptor to configure.
//   layout - tensor descriptor supplying ndim, shape and dtype.
inline void setCnnlTensor(cnnlTensorDescriptor_t desc, const infiniopTensorDescriptor_t layout) {
    std::vector<int> extents(layout->ndim);
    for (size_t axis = 0; axis < layout->ndim; ++axis) {
        // cnnl takes int dimensions, so each extent is narrowed explicitly.
        extents[axis] = static_cast<int>(layout->shape[axis]);
    }
    const cnnlDataType_t element_type = cnnlDataTypeConvert(layout->dtype);
    cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY, element_type,
                            extents.size(), extents.data());
}
// Set a cnnl tensor descriptor with strides.
//
// Copies every extent of `layout->shape` and every entry of `layout->strides`
// into int vectors and configures `desc` as a CNNL_LAYOUT_ARRAY tensor whose
// element type is the cnnl equivalent of `layout->dtype` (as mapped by
// cnnlDataTypeConvert).
//
//   desc   - cnnl descriptor to configure.
//   layout - tensor descriptor supplying ndim, shape, strides and dtype.
//
// NOTE(review): strides are forwarded unchanged (only cast to int), so they
// are assumed to already be in the unit cnnl expects -- confirm.
inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc, const infiniopTensorDescriptor_t layout) {
    std::vector<int> extents(layout->ndim);
    std::vector<int> steps(layout->ndim);
    for (size_t axis = 0; axis < layout->ndim; ++axis) {
        // cnnl takes int dimensions and strides, so both are narrowed.
        extents[axis] = static_cast<int>(layout->shape[axis]);
        steps[axis] = static_cast<int>(layout->strides[axis]);
    }
    const cnnlDataType_t element_type = cnnlDataTypeConvert(layout->dtype);
    cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, element_type,
                              extents.size(), extents.data(), steps.data());
}
#endif // __COMMON_BANG_H__
src/infiniop/devices/handle.cc
View file @
8e83bdca
...
...
@@ -5,7 +5,7 @@
#ifdef ENABLE_CUDA_API
#include "./cuda/cuda_handle.h"
#endif
#ifdef ENABLE_CAMBRICON_
MLU
#ifdef ENABLE_CAMBRICON_
API
#include "./bang/bang_handle.h"
#endif
#ifdef ENABLE_ASCEND_API
...
...
@@ -32,7 +32,7 @@ __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, infiniDe
}
#endif
#ifdef ENABLE_CAMBRICON_API
case
DevCambriconMlu
:
{
case
INFINI_DEVICE_CAMBRICON
:
{
return
createBangHandle
((
infiniopBangHandle_t
*
)
handle_ptr
,
device_id
);
}
#endif
...
...
@@ -58,10 +58,9 @@ __C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
return
deleteCudaHandle
((
infiniopCudaHandle_t
)
handle
);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
delete
(
infiniopBangHandle_t
)
handle
;
return
STATUS_SUCCESS
;
#ifdef ENABLE_CAMBRICON_API
case
INFINI_DEVICE_CAMBRICON
:
{
return
deleteBangHandle
((
infiniopBangHandle_t
)
handle
);
}
#endif
#ifdef ENABLE_ASCEND_API
...
...
src/infiniop/ops/matmul/bang/matmul_cnnl.cc
View file @
8e83bdca
#
include
"matmul_cnnl.h"
#include "../../../devices/bang/bang_handle.h"
#include "../../../devices/bang/common_bang.h"
#include "../../utils.h"
#include "cnrt.h"
infiniopStatus_t
bangCreateMatmulDescriptor
(
BangHandle_t
handle
,
MatmulBangDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
float
alpha
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
,
float
beta
)
{
infiniopStatus_t
*
status
=
new
infiniopStatus_t
{
STATUS_EXECUTION_FAILED
};
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
status
,
false
);
if
(
*
status
!=
STATUS_SUCCESS
)
{
return
*
status
;
#include "matmul_cnnl_api.h"
infiniopStatus_t
bangCreateMatmulDescriptor
(
infiniopBangHandle_t
handle
,
infiniopMatmulBangDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
)
{
infiniopStatus_t
status
;
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
false
);
if
(
status
!=
INFINIOP_STATUS_SUCCESS
)
{
return
status
;
}
cnnlTensorDescriptor_t
aDesc
,
bDesc
,
cDesc
;
cnnlCreateTensorDescriptor
(
&
aDesc
);
cnnlCreateTensorDescriptor
(
&
bDesc
);
cnnlCreateTensorDescriptor
(
&
cDesc
);
setMatrixTensorEx
(
aDesc
,
info
.
a_matrix
);
setMatrixTensorEx
(
bDesc
,
info
.
b_matrix
);
setMatrixTensorEx
(
cDesc
,
info
.
c_matrix
);
setMatrixTensorEx
(
aDesc
,
info
.
a_matrix
,
a_desc
->
dtype
);
setMatrixTensorEx
(
bDesc
,
info
.
b_matrix
,
b_desc
->
dtype
);
setMatrixTensorEx
(
cDesc
,
info
.
c_matrix
,
c_desc
->
dtype
);
cnnlMatMulDescriptor_t
opDesc
;
cnnlMatMulAlgo_t
algo
;
...
...
@@ -33,28 +30,37 @@ infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
int32_t
use_stride
=
true
;
cnnlSetMatMulDescAttr
(
opDesc
,
CNNL_MATMUL_USE_STRIDE
,
&
use_stride
,
sizeof
(
int32_t
));
*
desc_ptr
=
new
MatmulBangDescriptor
{
handle
->
device
,
int
count
=
0
;
use_cnnl
(
handle
->
cnnl_handles
,
[
&
](
cnnlHandle_t
_handle
)
{
cnnlGetBatchMatMulAlgoHeuristic
(
_handle
,
opDesc
,
aDesc
,
bDesc
,
cDesc
,
NULL
,
1
,
&
algoResult
,
&
count
);
});
size_t
workspace_size
;
cnnlGetBatchMatMulHeuristicResult
(
algoResult
,
algo
,
&
workspace_size
);
*
desc_ptr
=
new
InfiniopMatmulBangDescriptor
{
handle
->
device
,
handle
->
device_id
,
info
,
alpha
,
beta
,
c_desc
->
dt
,
c_desc
->
dtype
,
handle
->
cnnl_handles
,
aDesc
,
bDesc
,
cDesc
,
opDesc
,
algo
,
algoResult
};
return
STATUS_SUCCESS
;
algoResult
,
workspace_size
};
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
bangGetMatmulWorkspaceSize
(
MatmulBangDescriptor_t
desc
,
uint64_t
*
size
)
{
*
size
=
0
;
return
STATUS_SUCCESS
;
infiniopStatus_t
bangGetMatmulWorkspaceSize
(
infiniopMatmulBangDescriptor_t
desc
,
size_t
*
size
)
{
*
size
=
desc
->
workspace_size
;
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
bangDestroyMatmulDescriptor
(
MatmulBangDescriptor_t
desc
)
{
infiniopStatus_t
bangDestroyMatmulDescriptor
(
infiniopMatmulBangDescriptor_t
desc
)
{
desc
->
cnnl_handles
=
nullptr
;
cnnlDestroyTensorDescriptor
(
desc
->
aDesc
);
cnnlDestroyTensorDescriptor
(
desc
->
bDesc
);
...
...
@@ -63,41 +69,32 @@ infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc) {
cnnlMatMulAlgoDestroy
(
desc
->
algo
);
cnnlDestroyMatMulHeuristicResult
(
desc
->
algoResult
);
delete
desc
;
return
STATUS_SUCCESS
;
return
INFINIOP_
STATUS_SUCCESS
;
}
void
matmul_cnnl_f16
(
MatmulBangDescriptor_t
desc
,
void
*
workspace
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
void
*
stream
)
{
void
matmul_cnnl
(
infiniopMatmulBangDescriptor_t
desc
,
void
*
workspace
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
void
*
stream
)
{
auto
info
=
desc
->
info
;
if
(
info
.
is_transed
)
{
std
::
swap
(
a
,
b
);
}
use_cnnl
(
desc
->
cnnl_handles
,
desc
->
device_id
,
(
cnrtQueue_t
)
stream
,
[
&
](
cnnlHandle_t
handle
)
{
int
count
=
0
;
cnnlGetBatchMatMulAlgoHeuristic
(
handle
,
desc
->
opDesc
,
desc
->
aDesc
,
desc
->
bDesc
,
desc
->
cDesc
,
NULL
,
1
,
&
desc
->
algoResult
,
&
count
);
size_t
wsSize
;
cnnlGetBatchMatMulHeuristicResult
(
desc
->
algoResult
,
desc
->
algo
,
&
wsSize
);
cnrtMalloc
(
&
workspace
,
wsSize
);
cnnlBatchMatMulBCast_v2
(
handle
,
desc
->
opDesc
,
desc
->
algo
,
&
alpha
,
desc
->
aDesc
,
a
,
desc
->
bDesc
,
b
,
&
beta
,
desc
->
cDesc
,
c
,
workspace
,
wsSize
);
use_cnnl
(
desc
->
cnnl_handles
,
(
cnrtQueue_t
)
stream
,
[
&
](
cnnlHandle_t
handle
)
{
cnnlBatchMatMulBCast_v2
(
handle
,
desc
->
opDesc
,
desc
->
algo
,
&
alpha
,
desc
->
aDesc
,
a
,
desc
->
bDesc
,
b
,
&
beta
,
desc
->
cDesc
,
c
,
workspace
,
desc
->
workspace_size
);
});
}
infiniopStatus_t
bangMatmul
(
MatmulBangDescriptor_t
desc
,
void
*
workspace
,
uint64_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
void
*
stream
)
{
if
(
cnrtSetDevice
(
desc
->
device_id
)
!=
cnrtSuccess
)
{
return
STATUS_BAD_DEVICE
;
}
float
alpha
=
desc
->
alpha
;
float
beta
=
desc
->
beta
;
if
(
dtype_eq
(
desc
->
dtype
,
F16
))
{
matmul_cnnl_f16
(
desc
,
workspace
,
c
,
beta
,
a
,
b
,
alpha
,
stream
);
infiniopStatus_t
bangMatmul
(
infiniopMatmulBangDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
float
beta
,
void
*
stream
)
{
if
(
desc
->
dtype
==
INFINI_DTYPE_F16
||
desc
->
dtype
==
INFINI_DTYPE_F32
)
{
matmul_cnnl
(
desc
,
workspace
,
c
,
beta
,
a
,
b
,
alpha
,
stream
);
cnrtQueueSync
((
cnrtQueue_t
)
stream
);
return
STATUS_SUCCESS
;
return
INFINIOP_
STATUS_SUCCESS
;
}
return
STATUS_BAD_TENSOR_DTYPE
;
return
INFINIOP_
STATUS_BAD_TENSOR_DTYPE
;
}
src/infiniop/ops/matmul/bang/matmul_cnnl.h
View file @
8e83bdca
#ifndef __CNNL_MATMUL_H__
#define __CNNL_MATMUL_H__
#include "../../../devices/bang/
bang_handle
.h"
#include "../../../devices/bang/
common_bang
.h"
#include "../blas.h"
#include "cnnl.h"
#include "cnnl_extra.h"
#include "operators.h"
struct
MatmulBangDescriptor
{
Device
device
;
struct
Infiniop
MatmulBangDescriptor
{
infini
Device
_t
device
;
int
device_id
;
MatmulInfo
info
;
float
alpha
;
float
beta
;
DT
dtype
;
infiniDtype_t
dtype
;
std
::
shared_ptr
<
Pool
<
cnnlHandle_t
>>
cnnl_handles
;
cnnlTensorDescriptor_t
aDesc
;
cnnlTensorDescriptor_t
bDesc
;
...
...
@@ -20,24 +16,12 @@ struct MatmulBangDescriptor {
cnnlMatMulDescriptor_t
opDesc
;
cnnlMatMulAlgo_t
algo
;
cnnlMatMulHeuristicResult_t
algoResult
;
size_t
workspace_size
;
};
typedef
struct
MatmulBangDescriptor
*
MatmulBangDescriptor_t
;
infiniopStatus_t
bangCreateMatmulDescriptor
(
BangHandle_t
handle
,
MatmulBangDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
float
alpha
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
,
float
beta
);
infiniopStatus_t
bangGetMatmulWorkspaceSize
(
MatmulBangDescriptor_t
desc
,
uint64_t
*
size
);
infiniopStatus_t
bangMatmul
(
MatmulBangDescriptor_t
desc
,
void
*
workspace
,
uint64_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
void
*
stream
);
infiniopStatus_t
bangDestroyMatmulDescriptor
(
MatmulBangDescriptor_t
desc
);
inline
void
setMatrixTensorEx
(
cnnlTensorDescriptor_t
desc
,
const
BlasMatrix
&
matrix
,
bool
trans
=
false
)
{
inline
void
setMatrixTensorEx
(
cnnlTensorDescriptor_t
desc
,
const
BlasMatrix
&
matrix
,
infiniDtype_t
dtype
,
bool
trans
=
false
)
{
int
ndim
=
matrix
.
ndim
;
int
batch
=
matrix
.
batch
;
int
stride
=
static_cast
<
int
>
(
matrix
.
stride
);
...
...
@@ -49,15 +33,16 @@ inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &mat
if
(
ndim
==
3
)
{
std
::
vector
<
int
>
dim_size
=
{
batch
,
rows
,
cols
};
std
::
vector
<
int
>
dim_stride
=
{
stride
,
row_stride
,
col_stride
};
cnnlSetTensorDescriptorEx
(
desc
,
CNNL_LAYOUT_ARRAY
,
CNNL_DTYPE_HALF
,
dim_size
.
size
(),
dim_size
.
data
(),
dim_stride
.
data
());
cnnlSetTensorDescriptorEx
(
desc
,
CNNL_LAYOUT_ARRAY
,
cnnlDataTypeConvert
(
dtype
),
dim_size
.
size
(),
dim_size
.
data
(),
dim_stride
.
data
());
}
else
if
(
ndim
==
2
)
{
std
::
vector
<
int
>
dim_size
=
{
rows
,
cols
};
std
::
vector
<
int
>
dim_stride
=
{
row_stride
,
col_stride
};
cnnlSetTensorDescriptorEx
(
desc
,
CNNL_LAYOUT_ARRAY
,
CNNL_DTYPE_HALF
,
dim_size
.
size
(),
dim_size
.
data
(),
dim_stride
.
data
());
cnnlSetTensorDescriptorEx
(
desc
,
CNNL_LAYOUT_ARRAY
,
cnnlDataTypeConvert
(
dtype
),
dim_size
.
size
(),
dim_size
.
data
(),
dim_stride
.
data
());
}
}
#endif// __CNNL_MATMUL_H__
#endif // __CNNL_MATMUL_H__
src/infiniop/ops/matmul/bang/matmul_cnnl_api.h
0 → 100644
View file @
8e83bdca
#ifndef __CNNL_MATMUL_API_H__
#define __CNNL_MATMUL_API_H__

#include "../../../devices/bang/bang_handle.h"
#include "infiniop/operator.h"

// Matmul descriptor for the Bang (cnnl) backend. Only forward-declared here,
// so code including this header handles it through the pointer typedef.
struct InfiniopMatmulBangDescriptor;
typedef struct InfiniopMatmulBangDescriptor *infiniopMatmulBangDescriptor_t;

// Creates a matmul descriptor for the tensors described by c_desc, a_desc
// and b_desc and stores it in *desc_ptr.
infiniopStatus_t bangCreateMatmulDescriptor(infiniopBangHandle_t handle,
                                            infiniopMatmulBangDescriptor_t *desc_ptr,
                                            infiniopTensorDescriptor_t c_desc,
                                            infiniopTensorDescriptor_t a_desc,
                                            infiniopTensorDescriptor_t b_desc);

// Writes the workspace size associated with `desc` to *size.
infiniopStatus_t bangGetMatmulWorkspaceSize(infiniopMatmulBangDescriptor_t desc,
                                            size_t *size);

// Runs the matmul described by `desc` on `stream`, reading a and b and
// writing c, with the scalar factors alpha and beta.
// NOTE(review): presumably c = alpha * (a x b) + beta * c -- confirm against
// the cnnl call in the implementation.
infiniopStatus_t bangMatmul(infiniopMatmulBangDescriptor_t desc,
                            void *workspace,
                            size_t workspace_size,
                            void *c,
                            void const *a,
                            void const *b,
                            float alpha,
                            float beta,
                            void *stream);

// Destroys a descriptor created by bangCreateMatmulDescriptor.
infiniopStatus_t bangDestroyMatmulDescriptor(infiniopMatmulBangDescriptor_t desc);

#endif // __CNNL_MATMUL_API_H__
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
View file @
8e83bdca
...
...
@@ -3,10 +3,9 @@
#include "../../utils.h"
#include <cmath>
infiniopStatus_t
cpuCreateMatmulDescriptor
(
infiniopCpuHandle_t
handle
,
MatmulCpuDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopStatus_t
cpuCreateMatmulDescriptor
(
infiniopCpuHandle_t
handle
,
infiniopMatmulCpuDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
)
{
infiniDtype_t
dtype
=
c_desc
->
dtype
;
...
...
@@ -20,26 +19,27 @@ infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle,
return
status
;
}
*
desc_ptr
=
new
MatmulCpuDescriptor
{
INFINI_DEVICE_CPU
,
dtype
,
info
};
*
desc_ptr
=
new
MatmulCpuDescriptor
{
INFINI_DEVICE_CPU
,
dtype
,
info
};
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
cpuGetMatmulWorkspaceSize
(
MatmulCpuDescriptor_t
desc
,
uint64_t
*
size
)
{
infiniopStatus_t
cpuGetMatmulWorkspaceSize
(
infiniopMatmulCpuDescriptor_t
desc
,
uint64_t
*
size
)
{
*
size
=
0
;
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
cpuDestroyMatmulDescriptor
(
MatmulCpuDescriptor_t
desc
)
{
infiniopStatus_t
cpuDestroyMatmulDescriptor
(
infiniopMatmulCpuDescriptor_t
desc
)
{
delete
desc
;
return
INFINIOP_STATUS_SUCCESS
;
}
template
<
typename
Tdata
>
infiniopStatus_t
matmul_cpu
(
MatmulCpuDescriptor_t
desc
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
)
{
template
<
typename
Tdata
>
infiniopStatus_t
matmul_cpu
(
infiniopMatmulCpuDescriptor_t
desc
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
)
{
auto
info
=
desc
->
info
;
if
(
info
.
is_transed
)
{
...
...
@@ -49,11 +49,20 @@ infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, voi
for
(
int
i
=
0
;
i
<
info
.
batch
;
++
i
)
{
for
(
int
m_
=
0
;
m_
<
info
.
m
;
++
m_
)
{
for
(
int
n_
=
0
;
n_
<
info
.
n
;
++
n_
)
{
auto
c_
=
reinterpret_cast
<
Tdata
*>
(
c
)
+
i
*
info
.
c_matrix
.
stride
+
m_
*
info
.
c_matrix
.
row_stride
+
n_
*
info
.
c_matrix
.
col_stride
;
auto
c_
=
reinterpret_cast
<
Tdata
*>
(
c
)
+
i
*
info
.
c_matrix
.
stride
+
m_
*
info
.
c_matrix
.
row_stride
+
n_
*
info
.
c_matrix
.
col_stride
;
float
sum
=
0
;
for
(
int
k_
=
0
;
k_
<
info
.
k
;
++
k_
)
{
auto
a_
=
reinterpret_cast
<
Tdata
const
*>
(
a
)
+
i
*
info
.
a_matrix
.
stride
+
m_
*
info
.
a_matrix
.
row_stride
+
k_
*
info
.
a_matrix
.
col_stride
;
auto
b_
=
reinterpret_cast
<
Tdata
const
*>
(
b
)
+
i
*
info
.
b_matrix
.
stride
+
n_
*
info
.
b_matrix
.
col_stride
+
k_
*
info
.
b_matrix
.
row_stride
;
auto
a_
=
reinterpret_cast
<
Tdata
const
*>
(
a
)
+
i
*
info
.
a_matrix
.
stride
+
m_
*
info
.
a_matrix
.
row_stride
+
k_
*
info
.
a_matrix
.
col_stride
;
auto
b_
=
reinterpret_cast
<
Tdata
const
*>
(
b
)
+
i
*
info
.
b_matrix
.
stride
+
n_
*
info
.
b_matrix
.
col_stride
+
k_
*
info
.
b_matrix
.
row_stride
;
if
constexpr
(
std
::
is_same
<
Tdata
,
uint16_t
>::
value
)
{
sum
+=
f16_to_f32
(
*
a_
)
*
f16_to_f32
(
*
b_
);
}
else
{
...
...
@@ -75,14 +84,9 @@ infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, voi
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
cpuMatmul
(
MatmulCpuDescriptor_t
desc
,
void
*
workspace
,
uint64_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
float
beta
)
{
infiniopStatus_t
cpuMatmul
(
infiniopMatmulCpuDescriptor_t
desc
,
void
*
workspace
,
uint64_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
float
beta
)
{
if
(
desc
->
dtype
==
INFINI_DTYPE_F16
)
{
return
matmul_cpu
<
uint16_t
>
(
desc
,
c
,
beta
,
a
,
b
,
alpha
);
}
...
...
src/infiniop/ops/matmul/cpu/matmul_cpu.h
View file @
8e83bdca
...
...
@@ -11,25 +11,20 @@ typedef struct MatmulCpuDescriptor {
MatmulInfo
info
;
}
MatmulCpuDescriptor
;
typedef
struct
MatmulCpuDescriptor
*
MatmulCpuDescriptor_t
;
typedef
struct
MatmulCpuDescriptor
*
infiniop
MatmulCpuDescriptor_t
;
infiniopStatus_t
cpuCreateMatmulDescriptor
(
infiniopCpuHandle_t
handle
,
MatmulCpuDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopStatus_t
cpuCreateMatmulDescriptor
(
infiniopCpuHandle_t
handle
,
infiniopMatmulCpuDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
);
infiniopStatus_t
cpuGetMatmulWorkspaceSize
(
MatmulCpuDescriptor_t
desc
,
uint64_t
*
size
);
infiniopStatus_t
cpuGetMatmulWorkspaceSize
(
infiniopMatmulCpuDescriptor_t
desc
,
uint64_t
*
size
);
infiniopStatus_t
cpuMatmul
(
MatmulCpuDescriptor_t
desc
,
void
*
workspace
,
uint64_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
float
beta
);
infiniopStatus_t
cpuMatmul
(
infiniopMatmulCpuDescriptor_t
desc
,
void
*
workspace
,
uint64_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
float
beta
);
infiniopStatus_t
cpuDestroyMatmulDescriptor
(
MatmulCpuDescriptor_t
desc
);
infiniopStatus_t
cpuDestroyMatmulDescriptor
(
infiniop
MatmulCpuDescriptor_t
desc
);
#endif// __INFINIOP_MATMUL_CPU_H__
#endif
// __INFINIOP_MATMUL_CPU_H__
src/infiniop/ops/matmul/operator.cc
View file @
8e83bdca
...
...
@@ -7,38 +7,43 @@
#ifdef ENABLE_CUDA_API
#include "cuda/matmul_cuda_api.h"
#endif
#ifdef ENABLE_CAMBRICON_
MLU
#include "bang/matmul_cnnl.h"
#ifdef ENABLE_CAMBRICON_
API
#include "bang/matmul_cnnl
_api
.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/matmul_aclnn_api.h"
#endif
__C
infiniopStatus_t
infiniopCreateMatmulDescriptor
(
infiniopHandle_t
handle
,
infiniopMatmulDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
__C
infiniopStatus_t
infiniopCreateMatmulDescriptor
(
infiniopHandle_t
handle
,
infiniopMatmulDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
)
{
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU_API
case
INFINI_DEVICE_CPU
:
return
cpuCreateMatmulDescriptor
((
infiniopCpuHandle_t
)
handle
,
(
MatmulCpuDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
);
return
cpuCreateMatmulDescriptor
(
(
infiniopCpuHandle_t
)
handle
,
(
infiniopMatmulCpuDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
);
#endif
#ifdef ENABLE_CUDA_API
case
INFINI_DEVICE_NVIDIA
:
{
return
cudaCreateMatmulDescriptor
((
infiniopCudaHandle_t
)
handle
,
(
infiniopMatmulCudaDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
);
return
cudaCreateMatmulDescriptor
(
(
infiniopCudaHandle_t
)
handle
,
(
infiniopMatmulCudaDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangCreateMatmulDescriptor
((
BangHandle_t
)
handle
,
(
MatmulBangDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
);
#ifdef ENABLE_CAMBRICON_API
case
INFINI_DEVICE_CAMBRICON
:
{
return
bangCreateMatmulDescriptor
(
(
infiniopBangHandle_t
)
handle
,
(
infiniopMatmulBangDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
);
}
#endif
#ifdef ENABLE_ASCEND_API
case
INFINI_DEVICE_ASCEND
:
{
return
aclnnCreateMatmulDescriptor
(
(
infiniopAscendHandle_t
)
handle
,
(
MatmulAclnnDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
,
1
);
return
aclnnCreateMatmulDescriptor
(
(
infiniopAscendHandle_t
)
handle
,
(
MatmulAclnnDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
,
1
);
}
#endif
}
...
...
@@ -50,23 +55,25 @@ infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, size_t *size) {
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU_API
case
INFINI_DEVICE_CPU
:
return
cpuGetMatmulWorkspaceSize
((
MatmulCpuDescriptor_t
)
desc
,
size
);
return
cpuGetMatmulWorkspaceSize
((
infiniopMatmulCpuDescriptor_t
)
desc
,
size
);
#endif
#ifdef ENABLE_CUDA_API
case
INFINI_DEVICE_NVIDIA
:
{
return
cudaGetMatmulWorkspaceSize
((
infiniopMatmulCudaDescriptor_t
)
desc
,
size
);
return
cudaGetMatmulWorkspaceSize
((
infiniopMatmulCudaDescriptor_t
)
desc
,
size
);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangGetMatmulWorkspaceSize
((
MatmulBangDescriptor_t
)
desc
,
size
);
#ifdef ENABLE_CAMBRICON_API
case
INFINI_DEVICE_CAMBRICON
:
{
return
bangGetMatmulWorkspaceSize
((
infiniopMatmulBangDescriptor_t
)
desc
,
size
);
}
#endif
#ifdef ENABLE_ASCEND_API
case
INFINI_DEVICE_ASCEND
:
{
return
aclnnGetMatmulWorkspaceSize
((
MatmulAclnnDescriptor_t
)
desc
,
size
);
return
aclnnGetMatmulWorkspaceSize
((
MatmulAclnnDescriptor_t
)
desc
,
size
);
}
#endif
}
...
...
@@ -80,15 +87,17 @@ __C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc,
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU_API
case
INFINI_DEVICE_CPU
:
return
cpuMatmul
((
MatmulCpuDescriptor_t
)
desc
,
workspace
,
workspace_size
,
c
,
a
,
b
,
alpha
,
beta
);
return
cpuMatmul
((
infiniopMatmulCpuDescriptor_t
)
desc
,
workspace
,
workspace_size
,
c
,
a
,
b
,
alpha
,
beta
);
#endif
#ifdef ENABLE_CUDA_API
case
INFINI_DEVICE_NVIDIA
:
return
cudaMatmul
((
infiniopMatmulCudaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
c
,
a
,
b
,
alpha
,
beta
,
stream
);
return
cudaMatmul
((
infiniopMatmulCudaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
c
,
a
,
b
,
alpha
,
beta
,
stream
);
#endif
#ifdef ENABLE_CAMBRICON_
MLU
case
DevCambriconMlu
:
{
return
bangMatmul
((
MatmulBangDescriptor_t
)
desc
,
workspace
,
#ifdef ENABLE_CAMBRICON_
API
case
INFINI_DEVICE_CAMBRICON
:
{
return
bangMatmul
((
infiniop
MatmulBangDescriptor_t
)
desc
,
workspace
,
workspace_size
,
c
,
a
,
b
,
alpha
,
beta
,
stream
);
}
#endif
...
...
@@ -101,26 +110,28 @@ __C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc,
return
INFINIOP_STATUS_BAD_DEVICE
;
}
__C
infiniopStatus_t
infiniopDestroyMatmulDescriptor
(
infiniopMatmulDescriptor_t
desc
)
{
__C
infiniopStatus_t
infiniopDestroyMatmulDescriptor
(
infiniopMatmulDescriptor_t
desc
)
{
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU_API
case
INFINI_DEVICE_CPU
:
return
cpuDestroyMatmulDescriptor
((
MatmulCpuDescriptor_t
)
desc
);
return
cpuDestroyMatmulDescriptor
((
infiniop
MatmulCpuDescriptor_t
)
desc
);
#endif
#ifdef ENABLE_CUDA_API
case
INFINI_DEVICE_NVIDIA
:
{
return
cudaDestroyMatmulDescriptor
((
infiniopMatmulCudaDescriptor_t
)
desc
);
return
cudaDestroyMatmulDescriptor
(
(
infiniopMatmulCudaDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_CAMBRICON_
MLU
case
DevCambriconMlu
:
{
return
bangDestroyMatmulDescriptor
((
MatmulBangDescriptor_t
)
desc
);
#ifdef ENABLE_CAMBRICON_
API
case
INFINI_DEVICE_CAMBRICON
:
{
return
bangDestroyMatmulDescriptor
((
infiniop
MatmulBangDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_ASCEND_API
case
INFINI_DEVICE_ASCEND
:
{
return
aclnnDestroyMatmulDescriptor
((
MatmulAclnnDescriptor_t
)
desc
);
return
aclnnDestroyMatmulDescriptor
((
MatmulAclnnDescriptor_t
)
desc
);
}
#endif
}
...
...
xmake.lua
View file @
8e83bdca
...
...
@@ -50,6 +50,7 @@ option_end()
if
has_config
(
"cambricon-mlu"
)
then
add_defines
(
"ENABLE_CAMBRICON_API"
)
includes
(
"xmake/bang.lua"
)
end
-- 华为昇腾
...
...
@@ -124,7 +125,7 @@ target("infiniop")
end
if
has_config
(
"cambricon-mlu"
)
then
add_deps
(
"cambricon
-mlu
"
)
add_deps
(
"
infiniop-
cambricon"
)
end
if
has_config
(
"ascend-npu"
)
then
add_deps
(
"infiniop-ascend"
)
...
...
@@ -143,5 +144,4 @@ target("infiniop")
add_installfiles
(
"include/infiniop/*.h"
,
{
prefixdir
=
"include/infiniop"
})
add_installfiles
(
"include/infiniop.h"
,
{
prefixdir
=
"include"
})
add_installfiles
(
"include/infinicore.h"
,
{
prefixdir
=
"include"
})
target_end
()
xmake/bang.lua
0 → 100644
View file @
8e83bdca
-- Root of the Cambricon Neuware SDK: taken from the NEUWARE_HOME environment
-- variable, falling back to /usr/local/neuware when it is not set.
local NEUWARE_HOME = os.getenv("NEUWARE_HOME") or "/usr/local/neuware"

-- SDK headers, library search paths (lib64 first, then lib) and the SDK
-- libraries to link against.
add_includedirs(path.join(NEUWARE_HOME, "include"))
add_linkdirs(path.join(NEUWARE_HOME, "lib64"), path.join(NEUWARE_HOME, "lib"))
add_links("libcnrt.so", "libcnnl.so", "libcnnl_extra.so", "libcnpapi.so")
-- Build rule for source files with the ".mlu" extension: each file is
-- compiled to an object file by invoking the `cncc` compiler directly.
rule("mlu")
    set_extensions(".mlu")

    -- Make the project's public headers visible to targets using this rule.
    on_load(function (target)
        target:add("includedirs", path.join(os.projectdir(), "include"))
    end)

    -- Compile one .mlu file and register the resulting object with the target.
    on_build_file(function (target, sourcefile)
        local objectfile = target:objectfile(sourcefile)
        os.mkdir(path.directory(objectfile))

        local cc = "cncc"
        local args = {
            "-c", sourcefile,
            "-o", objectfile,
            "--bang-mlu-arch=mtp_592",
            "-O3",
            "-fPIC",
            "-Wall",
            "-Werror",
            "-std=c++17",
            "-pthread"
        }

        -- target:get() may hand back a bare string (single value) or nil
        -- (no value); table.wrap normalizes both to a list so that ipairs
        -- is always safe here.
        for _, includedir in ipairs(table.wrap(target:get("includedirs"))) do
            table.insert(args, "-I" .. includedir)
        end

        os.execv(cc, args)
        table.insert(target:objectfiles(), objectfile)
    end)
rule_end()
-- Root of the infiniop sources inside the project tree.
local src_dir = path.join(os.projectdir(), "src", "infiniop")

-- Static library holding the Cambricon (bang) device code and operators.
target("infiniop-cambricon")
    set_kind("static")
    set_languages("cxx17")

    -- No-op install hook.
    -- NOTE(review): presumably this keeps the helper library out of the
    -- install step -- confirm.
    on_install(function (target) end)

    add_files(src_dir .. "/devices/bang/*.cc", src_dir .. "/ops/*/bang/*.cc")

    -- .mlu sources go through the custom "mlu" rule, and only when any exist.
    local mlu_sources = os.files(src_dir .. "/ops/*/bang/*.mlu")
    if #mlu_sources > 0 then
        add_files(mlu_sources, {rule = "mlu"})
    end

    add_cxflags("-lstdc++ -Wall -Werror -fPIC")
target_end()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment