gaoqiong / composable_kernel · Commits

Commit db775824, authored Apr 24, 2022 by Jehandad Khan

    Merge branch 'develop' into jd/dev_pkg

Parents: 74397984, 7c0b1498
Changes: 44

Showing 20 changed files with 825 additions and 380 deletions (+825 / -380).
Changed files:

  Dockerfile                                                                         +1    -0
  Jenkinsfile                                                                        +1    -1
  example/06_conv2d_fwd_bias_relu/CMakeLists.txt                                     +1    -0
  example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt                                 +1    -0
  example/09_convnd_fwd/CMakeLists.txt                                               +3    -0
  example/10_conv2d_bwd_data/CMakeLists.txt                                          +1    -0
  example/11_conv2d_bwd_weight/CMakeLists.txt                                        +1    -0
  example/12_reduce/reduce_blockwise.cpp                                             +5    -13
  example/17_convnd_bwd_data_xdl/CMakeLists.txt                                      +1    -0
  include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp                    +11   -9
  include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp   +6    -3
  include/ck/utility/math_v2.hpp                                                     +53   -3
  include/ck/utility/reduction_common.hpp                                            +2    -2
  include/ck/utility/transpose_vectors.hpp                                           +82   -1
  library/CMakeLists.txt                                                             +1    -0
  library/include/ck/library/host_tensor/host_reduce_util.hpp                        +7    -30
  library/include/ck/library/host_tensor/host_reduction.hpp                          +13   -12
  library/include/ck/library/utility/conv_fwd_util.hpp                               +323  -306
  library/include/ck/library/utility/fill.hpp                                        +81   -0
  library/include/ck/library/utility/op_instance_engine.hpp                          +231  -0
Dockerfile

@@ -48,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     vim \
     zlib1g-dev \
     openssh-server \
+    clang-format-10 \
     kmod && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
Jenkinsfile

@@ -204,7 +204,7 @@ pipeline {
     stage('Clang Format')
     {
         agent{ label rocmnode("nogpu") }
         environment{
-            execute_cmd = "find . -iname \'*.h\' \
+            execute_cmd = "find .. -iname \'*.h\' \
                             -o -iname \'*.hpp\' \
                             -o -iname \'*.cpp\' \
                             -o -iname \'*.h.in\' \
example/06_conv2d_fwd_bias_relu/CMakeLists.txt

 add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp)
+target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_fwd_util)

example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt

 add_example_executable(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
+target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_fwd_util)

example/09_convnd_fwd/CMakeLists.txt

 add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp)
+target_link_libraries(example_convnd_fwd_xdl PRIVATE conv_fwd_util)

 add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
+target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_fwd_util)

 add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
+target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_fwd_util)

example/10_conv2d_bwd_data/CMakeLists.txt

 add_example_executable(example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp)
+target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_fwd_util)

example/11_conv2d_bwd_weight/CMakeLists.txt

 add_example_executable(example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp)
+target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_fwd_util)
example/12_reduce/reduce_blockwise.cpp

@@ -3,7 +3,6 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <getopt.h>
-#include <half.hpp>
 #include "check_err.hpp"
 #include "config.hpp"

@@ -27,10 +26,6 @@ using InDataType = ck::half_t;
 using OutDataType = ck::half_t;
 using AccDataType = float;

-using HostInDataType  = half_float::half;
-using HostOutDataType = half_float::half;
-using HostAccDataType = float;
-
 constexpr int Rank         = 4;
 constexpr int NumReduceDim = 3;

@@ -306,9 +301,9 @@ int main(int argc, char* argv[])
     if(args.do_verification)
     {
-        ReductionHost<HostInDataType,
-                      HostAccDataType,
-                      HostOutDataType,
+        ReductionHost<InDataType,
+                      AccDataType,
+                      OutDataType,
                       ReduceOpId,
                       Rank,
                       NumReduceDim,

@@ -316,11 +311,8 @@ int main(int argc, char* argv[])
                       NeedIndices>
             hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

-        hostReduce.Run(alpha,
-                       reinterpret_cast<const HostInDataType*>(in.mData.data()),
-                       beta,
-                       reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
-                       out_indices_ref.mData.data());
+        hostReduce.Run(
+            alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
     };

     const auto i_inLengths = to_int_vector(args.inLengths);
example/17_convnd_bwd_data_xdl/CMakeLists.txt

 add_example_executable(example_convnd_bwd_data_xdl convnd_bwd_data_xdl.cpp)
+target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_fwd_util)
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp

@@ -39,6 +39,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
     static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack>{};

+    static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
+
     static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
     static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);

@@ -71,7 +73,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
         const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex();

-        return make_tuple(0, waveId_m, xdlops_a_idx[I1], Number<KPack>{} * xdlops_a_idx[I0]);
+        return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPerThread * xdlops_a_idx[I0]);
     }

     __device__ static auto CalculateBThreadOriginDataIndex()

@@ -82,7 +84,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
         const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex();

-        return make_tuple(0, waveId_n, xdlops_b_idx[I1], Number<KPack>{} * xdlops_b_idx[I0]);
+        return make_tuple(0, waveId_n, xdlops_b_idx[I1], KPerThread * xdlops_b_idx[I0]);
     }

     template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>

@@ -273,7 +275,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                            make_tuple(I0, I0, I0, I0),
                            b_thread_buf);

-        static_for<0, KPerBlock, KPack * xdlops_gemm.K0PerXdlops>{}([&](auto k) {
+        static_for<0, KPerThread, KPack>{}([&](auto k) {
             vector_type<FloatAB, KPack> a_thread_vec;
             vector_type<FloatAB, KPack> b_thread_vec;

@@ -300,13 +302,13 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
     }

     private:
-    // A[M0, M1, M2, KPerBlock]
+    // A[M0, M1, M2, KPerThread]
     static constexpr auto a_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerBlock>{}));
+        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));

-    // B[N0, N1, N2, KPerBlock]
+    // B[N0, N1, N2, KPerThread]
     static constexpr auto b_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerBlock>{}));
+        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));

     // C[M, N, NumRegXdlops]
     static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(

@@ -316,7 +318,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                          FloatAB,
                                                          decltype(a_block_desc_m0_m1_m2_k),
                                                          decltype(a_thread_desc_),
-                                                         Sequence<1, 1, 1, KPerBlock>,
+                                                         Sequence<1, 1, 1, KPerThread>,
                                                          Sequence<0, 1, 2, 3>,
                                                          3,
                                                          A_K1,

@@ -326,7 +328,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                          FloatAB,
                                                          decltype(b_block_desc_n0_n1_n2_k),
                                                          decltype(b_thread_desc_),
-                                                         Sequence<1, 1, 1, KPerBlock>,
+                                                         Sequence<1, 1, 1, KPerThread>,
                                                          Sequence<0, 1, 2, 3>,
                                                          3,
                                                          B_K1,
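The net effect of this hunk is easier to see with concrete numbers: KPerThread = KPerBlock / K0PerXdlops is the slice of the block's K extent that one thread actually owns, and the rewritten static_for iterates over that slice in KPack steps. A standalone sketch (all constants are illustrative, chosen by us, not taken from the library):

#include <cstdio>

int main()
{
    // Illustrative constants only (not taken from the library).
    const int KPerBlock   = 16;
    const int K0PerXdlops = 4;
    const int KPack       = 2;

    // The new per-thread K extent introduced by this hunk.
    const int KPerThread = KPerBlock / K0PerXdlops; // 4

    // Old loop: static_for<0, KPerBlock, KPack * K0PerXdlops>
    int old_trips = 0;
    for(int k = 0; k < KPerBlock; k += KPack * K0PerXdlops)
        ++old_trips;

    // New loop: static_for<0, KPerThread, KPack>
    int new_trips = 0;
    for(int k = 0; k < KPerThread; k += KPack)
        ++new_trips;

    // Same trip count, but k now counts in per-thread units, which is what
    // lets the thread descriptors shrink from Number<KPerBlock> to
    // Number<KPerThread>.
    std::printf("KPerThread=%d old_trips=%d new_trips=%d\n", KPerThread, old_trips, new_trips);
    return 0;
}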
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp

@@ -277,9 +277,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
         // TODO make this logic more generic for more sub-dword datatype
         if constexpr(SrcVectorDim != DstVectorDim &&
-                     is_same<half_t, remove_cvref_t<SrcData>>::value &&
-                     is_same<half_t, remove_cvref_t<DstData>>::value &&
-                     SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0)
+                     ((is_same<half_t, remove_cvref_t<SrcData>>::value &&
+                       is_same<half_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) ||
+                      (is_same<int8_t, remove_cvref_t<SrcData>>::value &&
+                       is_same<int8_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
         {
             // each transpose does
             // DstScalarPerVector # of src vectors in src_thread_scratch_
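Read as a predicate, the widened guard says: do the packed sub-dword transpose only when the vector dimensions differ and both sides move whole 32-bit words, i.e. fp16 in pairs and int8 in quads. A standalone restatement (the function name and the byte-size framing are ours, not the library's):

#include <cstdint>

// Standalone restatement of the widened guard: a packed sub-dword
// transpose is legal when both sides move whole 32-bit words, which means
// 2-element groups for 16-bit types and 4-element groups for 8-bit types.
constexpr bool can_subdword_transpose(int elem_bytes,
                                      int src_scalars_per_vec,
                                      int dst_scalars_per_vec)
{
    const int group = 4 / elem_bytes; // elements per 32-bit word
    return (elem_bytes == 1 || elem_bytes == 2) &&
           src_scalars_per_vec % group == 0 && dst_scalars_per_vec % group == 0;
}

static_assert(can_subdword_transpose(2, 2, 2), "fp16 pairs (half_t case)");
static_assert(can_subdword_transpose(1, 4, 4), "int8 quads (new int8_t case)");
static_assert(!can_subdword_transpose(1, 2, 2), "int8 needs 4-element groups");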
include/ck/utility/math_v2.hpp

 #ifndef CK_MATH_V2_HPP
 #define CK_MATH_V2_HPP

+#include <cmath>
 #include "data_type.hpp"
+#include "half.hpp"

 namespace ck {
 namespace math {

+static inline __device__ half_t abs(half_t x) { return __habs(x); };
+static inline __host__ float abs(float x) { return std::abs(x); };
+static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); };
+static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
+static inline __host__ double abs(double x) { return std::abs(x); };
+
+static inline __host__ int8_t abs(int8_t x)
+{
+    int8_t sgn = x >> (8 - 1);
+    return (x ^ sgn) - sgn;
+};
+
+static inline __host__ int32_t abs(int32_t x)
+{
+    int32_t sgn = x >> (32 - 1);
+    return (x ^ sgn) - sgn;
+};
+
+static inline __host__ half_t abs(half_t x)
+{
+    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+    half_float::half abs_xx = half_float::abs(xx);
+    half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx);
+    return abs_x;
+};
+
+static inline __host__ float isnan(float x) { return std::isnan(x); };
+
+static inline __host__ double isnan(double x) { return std::isnan(x); };
+
+static inline __host__ int8_t isnan(int8_t x)
+{
+    (void)x;
+    return false;
+};
+
+static inline __host__ int32_t isnan(int32_t x)
+{
+    (void)x;
+    return false;
+};
+
+static inline __host__ bool isnan(half_t x)
+{
+    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+    return half_float::isnan(xx);
+};

 } // namespace math
 } // namespace ck
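The integer abs overloads above use the classic branchless sign-extension trick: the arithmetic right shift yields 0 for non-negative inputs and all-ones (-1) for negative ones, so (x ^ sgn) - sgn is either x unchanged or ~x + 1 == -x. A minimal host-side demonstration (our own snippet, not part of the commit):

#include <cassert>
#include <cstdint>

// Same trick as ck::math::abs(int32_t) above.
int32_t branchless_abs(int32_t x)
{
    int32_t sgn = x >> 31; // arithmetic shift: 0 for x >= 0, -1 for x < 0
    return (x ^ sgn) - sgn;
}

int main()
{
    assert(branchless_abs(-7) == 7);  // negative: (~x) - (-1) == -x
    assert(branchless_abs(42) == 42); // non-negative: unchanged
    assert(branchless_abs(0) == 0);
    return 0;
}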
include/ck/utility/reduction_common.hpp

@@ -33,7 +33,7 @@ namespace ck {
 struct float_equal_one
 {
     template <class T>
-    __device__ inline bool operator()(T x)
+    __host__ __device__ inline bool operator()(T x)
     {
         return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f);
     };

@@ -42,7 +42,7 @@ struct float_equal_one
 struct float_equal_zero
 {
     template <class T>
-    __device__ inline bool operator()(T x)
+    __host__ __device__ inline bool operator()(T x)
     {
         return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f);
     };
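Aside: the double inequality in these functors is exact equality with 1 (or 0) spelled without operator==, which keeps warnings such as -Wfloat-equal quiet while behaving identically for the values that matter here. A minimal illustration under our own names, not the library's:

#include <cassert>

// Same comparison shape as float_equal_one::operator() above.
template <class T>
bool equal_one(T x)
{
    return x <= static_cast<T>(1.0f) && x >= static_cast<T>(1.0f);
}

int main()
{
    assert(equal_one(1.0f));
    assert(!equal_one(1.0f + 1e-6f)); // representably different from 1.0f
    assert(equal_one(1.0));           // works for double via the template
    return 0;
}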
include/ck/utility/transpose_vectors.hpp

@@ -49,7 +49,7 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
 template <index_t NX, index_t NY>
 struct transpose_vectors<half_t, NX, NY>
 {
-    // we got [NY * NX] ammount of S data to be transposed
+    // we got [NY * NX] amount of S data to be transposed
     static constexpr index_t s_per_x = NY;
     static constexpr index_t s_per_y = NX;

@@ -83,5 +83,86 @@ struct transpose_vectors<half_t, NX, NY>
     }
 };

+// transpose int8 4x4
+__device__ void transpose_int8_4x4(const int8x4_t& x0,
+                                   const int8x4_t& x1,
+                                   const int8x4_t& x2,
+                                   const int8x4_t& x3,
+                                   int8x4_t& y0,
+                                   int8x4_t& y1,
+                                   int8x4_t& y2,
+                                   int8x4_t& y3)
+{
+    int32_t t0, t1;
+    int32_t z0, z1, z2, z3;
+
+    constexpr int32_t m0 = 0x05010400;
+    constexpr int32_t m1 = 0x05040100;
+    constexpr int32_t m2 = 0x07060302;
+    constexpr int32_t m3 = 0x07030602;
+
+    // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
+    //                   -- -- -- --      -- -- -- --      -  -  -  -
+    //     index         7  6  5  4       3  2  1  0      33 77 44 88
+    // index is reversed because of little endianness (least significant bits first)
+    // clang-format off
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m0));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m0));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z0) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z1) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m3));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m3));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z2) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z3) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
+    // clang-format on
+
+    y0 = bit_cast<int8x4_t>(z0);
+    y1 = bit_cast<int8x4_t>(z1);
+    y2 = bit_cast<int8x4_t>(z2);
+    y3 = bit_cast<int8x4_t>(z3);
+}
+
+template <index_t NX, index_t NY>
+struct transpose_vectors<int8_t, NX, NY>
+{
+    // we got [NY * NX] amount of S data to be transposed
+    static constexpr index_t s_per_x = NY;
+    static constexpr index_t s_per_y = NX;
+
+    using S  = int8_t;
+    using VX = vector_type<int8_t, s_per_x>;
+    using VY = vector_type<int8_t, s_per_y>;
+
+    __device__ void operator()(const StaticallyIndexedArray<const VX&, NX>& vx_tuple,
+                               StaticallyIndexedArray<VY&, NY>& vy_tuple)
+    {
+        static constexpr auto I1 = Number<1>{};
+        static constexpr auto I2 = Number<2>{};
+        static constexpr auto I3 = Number<3>{};
+        static constexpr auto I4 = Number<4>{};
+
+        static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");
+
+        // loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
+        static_for<0, NY, 4>{}([&](auto iy) {
+            static_for<0, NX, 4>{}([&](auto ix) {
+                // reference to 4 int8 data from vx_tuple
+                const auto& x_s4_0 = vx_tuple[ix].template AsType<int8x4_t>()[iy / I4];
+                const auto& x_s4_1 = vx_tuple[ix + I1].template AsType<int8x4_t>()[iy / I4];
+                const auto& x_s4_2 = vx_tuple[ix + I2].template AsType<int8x4_t>()[iy / I4];
+                const auto& x_s4_3 = vx_tuple[ix + I3].template AsType<int8x4_t>()[iy / I4];
+
+                // reference to 4 int8 data from vy_tuple
+                auto& y_s4_0 = vy_tuple(iy).template AsType<int8x4_t>()(ix / I4);
+                auto& y_s4_1 = vy_tuple(iy + I1).template AsType<int8x4_t>()(ix / I4);
+                auto& y_s4_2 = vy_tuple(iy + I2).template AsType<int8x4_t>()(ix / I4);
+                auto& y_s4_3 = vy_tuple(iy + I3).template AsType<int8x4_t>()(ix / I4);
+
+                // transpose
+                transpose_int8_4x4(x_s4_0, x_s4_1, x_s4_2, x_s4_3, y_s4_0, y_s4_1, y_s4_2, y_s4_3);
+            });
+        });
+    }
+};
+
 } // namespace ck
 #endif
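To sanity-check the four perm masks, v_perm_b32's byte-select semantics can be emulated on the host: selector byte i picks byte (sel & 7) out of the little-endian 64-bit concatenation {src0:src1}. The harness below is ours, not part of the commit; it replays the eight perms on rows whose byte k of row r is r*4+k and asserts that rows come out as columns:

#include <cassert>
#include <cstdint>

// Host emulation of v_perm_b32 for selector bytes in the 0-7 range.
uint32_t perm_b32(uint32_t s0, uint32_t s1, uint32_t sel)
{
    uint64_t src = (uint64_t(s0) << 32) | s1; // s0 supplies bytes 4-7, s1 bytes 0-3
    uint32_t dst = 0;
    for(int i = 0; i < 4; ++i)
    {
        uint32_t s = (sel >> (8 * i)) & 0xFF;
        dst |= uint32_t((src >> (8 * (s & 7))) & 0xFF) << (8 * i);
    }
    return dst;
}

int main()
{
    // The worked example from the comment above.
    assert(perm_b32(0x11223344u, 0x55667788u, 0x05010400u) == 0x33774488u);

    // Rows a,b,c,d with identifiable bytes: element k of row r is r*4+k.
    uint32_t x0 = 0x03020100u, x1 = 0x07060504u, x2 = 0x0B0A0908u, x3 = 0x0F0E0D0Cu;

    uint32_t t0 = perm_b32(x1, x0, 0x05010400u); // interleave low halves of a,b
    uint32_t t1 = perm_b32(x3, x2, 0x05010400u); // interleave low halves of c,d
    assert(perm_b32(t1, t0, 0x05040100u) == 0x0C080400u); // column 0
    assert(perm_b32(t1, t0, 0x07060302u) == 0x0D090501u); // column 1

    t0 = perm_b32(x1, x0, 0x07030602u); // interleave high halves of a,b
    t1 = perm_b32(x3, x2, 0x07030602u); // interleave high halves of c,d
    assert(perm_b32(t1, t0, 0x05040100u) == 0x0E0A0602u); // column 2
    assert(perm_b32(t1, t0, 0x07060302u) == 0x0F0B0703u); // column 3
    return 0;
}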
library/CMakeLists.txt

 add_subdirectory(src/host_tensor)
 add_subdirectory(src/tensor_operation_instance/gpu)
+add_subdirectory(src/utility)
library/include/ck/library/host_tensor/host_reduce_util.hpp

@@ -26,7 +26,6 @@
 #ifndef GUARD_HOST_REDUCE_UTIL_HPP
 #define GUARD_HOST_REDUCE_UTIL_HPP

-#include <half.hpp>
 #include <limits>
 #include <cmath>
 #include <cassert>

@@ -34,6 +33,8 @@
 #include <string>

 #include "reduction_enums.hpp"
+#include "data_type.hpp"
+#include "math_v2.hpp"

 namespace ck {

@@ -42,34 +43,10 @@ namespace host_reduce {
 using ck::NanPropagation;
 using ck::ReduceTensorOp;

-template <typename T>
-static inline bool float_equal_one(T);
-
-static inline bool float_equal_one(float x) { return x == 1.0f; };
-
-static inline bool float_equal_one(double x) { return x == 1.0; };
-
-static inline bool float_equal_one(half_float::half x)
-{
-    return x == static_cast<half_float::half>(1.0f);
-};
-
-template <typename T>
-static inline bool float_equal_zero(T x);
-
-static inline bool float_equal_zero(float x) { return x == 0.0f; };
-
-static inline bool float_equal_zero(double x) { return x == 0.0; };
-
-static inline bool float_equal_zero(half_float::half x)
-{
-    return x == static_cast<half_float::half>(0.0f);
-};
-
 template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
 {
-    using std::abs;
+    using ck::math::abs;

     if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
     {

@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal()
     }
     else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
     {
-        return (std::numeric_limits<AccDataType>::max());
+        return (ck::NumericLimits<AccDataType>::Max());
     }
     else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
     {
-        return (std::numeric_limits<AccDataType>::lowest());
+        return (ck::NumericLimits<AccDataType>::Lowest());
     }
     else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
     {

@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
                      AccDataType& accuVal,
                      AccDataType currVal)
 {
-    using std::isnan;
+    using ck::math::isnan;

     if constexpr(!PropagateNan)
     {

@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
                       int& accuIndex,
                       int currIndex)
 {
-    using std::isnan;
+    using ck::math::isnan;

     if constexpr(!PropagateNan)
     {
library/include/ck/library/host_tensor/host_reduction.hpp

@@ -32,6 +32,7 @@
 #include <functional>

 #include "reduction_enums.hpp"
+#include "reduction_common.hpp"
 #include "host_reduce_util.hpp"
 #include "host_tensor.hpp"
 #include "data_type.hpp"

@@ -196,10 +197,10 @@ struct ReductionHost
                  OutDataType* out_data,
                  IndexDataType* out_indices)
     {
+        using ck::float_equal_one;
+        using ck::float_equal_zero;
         using ck::type_convert;
         using ck::host_reduce::binop_with_nan_check2;
-        using ck::host_reduce::float_equal_one;
-        using ck::host_reduce::float_equal_zero;
         using ck::host_reduce::ReduceOpFn2;
         using ck::host_reduce::ReduceOpZeroVal;

@@ -227,10 +228,10 @@ struct ReductionHost
             posUnaryOp(accuVal);

-            if(!float_equal_one(alpha))
+            if(!float_equal_one{}(alpha))
                 accuVal *= type_convert<AccDataType>(alpha);

-            if(!float_equal_zero(beta))
+            if(!float_equal_zero{}(beta))
                 accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);

             out_data[0] = type_convert<OutDataType>(accuVal);

@@ -263,13 +264,13 @@ struct ReductionHost
                 posUnaryOp(accuVal);

-                if(!float_equal_one(alpha))
+                if(!float_equal_one{}(alpha))
                     accuVal *= type_convert<AccDataType>(alpha);

                 auto dst_offset =
                     get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);

-                if(!float_equal_zero(beta))
+                if(!float_equal_zero{}(beta))
                     accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                                type_convert<AccDataType>(beta);

@@ -303,10 +304,10 @@ struct ReductionHost
     void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
     {
+        using ck::float_equal_one;
+        using ck::float_equal_zero;
         using ck::type_convert;
         using ck::host_reduce::binop_with_nan_check;
-        using ck::host_reduce::float_equal_one;
-        using ck::host_reduce::float_equal_zero;
         using ck::host_reduce::ReduceOpFn;
         using ck::host_reduce::ReduceOpZeroVal;

@@ -330,10 +331,10 @@ struct ReductionHost
             posUnaryOp(accuVal);

-            if(!float_equal_one(alpha))
+            if(!float_equal_one{}(alpha))
                 accuVal *= type_convert<AccDataType>(alpha);

-            if(!float_equal_zero(beta))
+            if(!float_equal_zero{}(beta))
                 accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);

             out_data[0] = type_convert<OutDataType>(accuVal);

@@ -361,13 +362,13 @@ struct ReductionHost
                 posUnaryOp(accuVal);

-                if(!float_equal_one(alpha))
+                if(!float_equal_one{}(alpha))
                     accuVal *= type_convert<AccDataType>(alpha);

                 auto dst_offset =
                     get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);

-                if(!float_equal_zero(beta))
+                if(!float_equal_zero{}(beta))
                     accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                                type_convert<AccDataType>(beta);
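The call-site change from float_equal_one(alpha) to float_equal_one{}(alpha) is the visible half of a migration: the per-type free functions deleted from host_reduce_util.hpp above are replaced by the single templated host/device functors in reduction_common.hpp, so each call now constructs a temporary functor. A standalone sketch of the pattern (the type name is ours):

#include <cassert>

// One templated functor replaces a family of per-type free functions;
// call sites construct a temporary: float_equal_zero_like{}(beta).
struct float_equal_zero_like
{
    template <class T>
    bool operator()(T x) const
    {
        return x <= static_cast<T>(0.0f) && x >= static_cast<T>(0.0f);
    }
};

int main()
{
    float beta = 0.0f;
    if(!float_equal_zero_like{}(beta))
        return 1; // the beta-scaled accumulate would happen here

    assert(float_equal_zero_like{}(0.0));   // double works via the template
    assert(!float_equal_zero_like{}(0.5f));
    return 0;
}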
library/include/ck/library/utility/conv_fwd_util.hpp

-#ifndef CONV_FWD_UTIL_HPP
-#define CONV_FWD_UTIL_HPP
+#pragma once

+#include <algorithm>
 #include <cstdlib>
 #include <functional>
 #include <iterator>
 #include <numeric>
 #include <sstream>
-#include <random>
 #include <tuple>
 #include <type_traits>
 #include <vector>

@@ -18,10 +15,50 @@
 #include "device_conv_fwd.hpp"
 #include "device_tensor.hpp"
 #include "element_wise_operation.hpp"
+#include "fill.hpp"
 #include "host_tensor.hpp"
+#include "op_instance_engine.hpp"
 #include "reference_conv_fwd.hpp"
 #include "tensor_layout.hpp"

+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr<element_wise::PassThrough,
+                                              element_wise::PassThrough,
+                                              element_wise::PassThrough>;
+
+namespace device_conv1d_fwd_instance {
+
+void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+
+} // namespace device_conv1d_fwd_instance
+
+namespace device_conv2d_fwd_instance {
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+
+} // namespace device_conv2d_fwd_instance
+
+namespace device_conv3d_fwd_instance {
+
+void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+
+} // namespace device_conv3d_fwd_instance
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+
 namespace ck {
 namespace utils {
 namespace conv {

@@ -47,20 +84,7 @@ std::size_t get_flops(ck::index_t N,
                       ck::index_t C,
                       ck::index_t K,
                       const std::vector<ck::index_t>& filter_spatial_lengths,
-                      const std::vector<ck::index_t>& output_spatial_lengths)
-{
-    // 2 * N * K * <output spatial lengths product> * C * <filter spatial lengths product>
-    return static_cast<std::size_t>(2) * N * K *
-           std::accumulate(std::begin(output_spatial_lengths),
-                           std::end(output_spatial_lengths),
-                           static_cast<std::size_t>(1),
-                           std::multiplies<std::size_t>()) *
-           C *
-           std::accumulate(std::begin(filter_spatial_lengths),
-                           std::end(filter_spatial_lengths),
-                           static_cast<std::size_t>(1),
-                           std::multiplies<std::size_t>());
-}
+                      const std::vector<ck::index_t>& output_spatial_lengths);
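For scale, here is the formula that the (now out-of-line) get_flops body implements, 2 * N * K * prod(output spatial) * C * prod(filter spatial), evaluated for the library's default ConvParams shape. The 36x36 output size follows from the output-length formula shown further down, and the arithmetic is ours, not part of the commit:

#include <cstdio>

int main()
{
    // Default ConvParams (see below): N=128, C=192, K=256, 3x3 filter,
    // 71x71 input, stride 2, dilation 1, pads 1/1 -> 36x36 output.
    unsigned long long flops = 2ULL * 128 * 256 * (36 * 36) * 192 * (3 * 3);
    std::printf("%llu\n", flops); // 146767085568, roughly 147 GFLOP per pass
    return 0;
}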
/**
 * @brief Calculate number of bytes read/write by convolution algorithm.

@@ -110,20 +134,7 @@ std::size_t get_btype(ck::index_t N,

 struct ConvParams
 {
-    ConvParams()
-        : num_dim_spatial(2),
-          N(128),
-          K(256),
-          C(192),
-          filter_spatial_lengths(2, 3),
-          input_spatial_lengths(2, 71),
-          conv_filter_strides(2, 2),
-          conv_filter_dilations(2, 1),
-          input_left_pads(2, 1),
-          input_right_pads(2, 1)
-    {
-    }
+    ConvParams();

     ConvParams(ck::index_t n_dim,
                ck::index_t n_batch,
                ck::index_t n_out_channels,

@@ -133,29 +144,7 @@ struct ConvParams
                const std::vector<ck::index_t>& strides,
                const std::vector<ck::index_t>& dilations,
                const std::vector<ck::index_t>& left_pads,
-               const std::vector<ck::index_t>& right_pads)
-        : num_dim_spatial(n_dim),
-          N(n_batch),
-          K(n_out_channels),
-          C(n_in_channels),
-          filter_spatial_lengths(filters_len),
-          input_spatial_lengths(input_len),
-          conv_filter_strides(strides),
-          conv_filter_dilations(dilations),
-          input_left_pads(left_pads),
-          input_right_pads(right_pads)
-    {
-        if(filter_spatial_lengths.size() != num_dim_spatial ||
-           input_spatial_lengths.size() != num_dim_spatial ||
-           conv_filter_strides.size() != num_dim_spatial ||
-           conv_filter_dilations.size() != num_dim_spatial ||
-           input_left_pads.size() != num_dim_spatial ||
-           input_right_pads.size() != num_dim_spatial)
-        {
-            throw(std::runtime_error("ConvParams::GetOutputSpatialLengths: "
-                                     "parameter size is different from number of declared dimensions!"));
-        }
-    }
+               const std::vector<ck::index_t>& right_pads);

     ck::index_t num_dim_spatial;
     ck::index_t N;
@@ -171,35 +160,11 @@ struct ConvParams
     std::vector<ck::index_t> input_left_pads;
     std::vector<ck::index_t> input_right_pads;

-    std::vector<ck::index_t> GetOutputSpatialLengths() const
-    {
-        if(filter_spatial_lengths.size() != num_dim_spatial ||
-           input_spatial_lengths.size() != num_dim_spatial ||
-           conv_filter_strides.size() != num_dim_spatial ||
-           conv_filter_dilations.size() != num_dim_spatial ||
-           input_left_pads.size() != num_dim_spatial ||
-           input_right_pads.size() != num_dim_spatial)
-        {
-            throw(std::runtime_error("ConvParams::GetOutputSpatialLengths: "
-                                     "parameter size is different from number of declared dimensions!"));
-        }
-
-        std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
-        for(ck::index_t i = 0; i < num_dim_spatial; ++i)
-        {
-            // XEff = (X - 1) * conv_dilation_w + 1;
-            // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-            const ck::index_t idx_eff =
-                (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
-            out_spatial_len[i] =
-                (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
-                    conv_filter_strides[i] + 1;
-        }
-        return out_spatial_len;
-    }
+    std::vector<ck::index_t> GetOutputSpatialLengths() const;
 };

+ConvParams parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[]);
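A quick check of the Wo formula from the body above against the default ConvParams shape (3x3 filter, 71x71 input, stride 2, dilation 1, pads 1/1); the standalone function is our restatement, not library code:

#include <cassert>

// XEff = (X - 1) * dilation + 1;  Wo = (Wi + pad_l + pad_r - XEff) / stride + 1
int out_len(int Wi, int X, int stride, int dilation, int pad_l, int pad_r)
{
    const int XEff = (X - 1) * dilation + 1;
    return (Wi + pad_l + pad_r - XEff) / stride + 1;
}

int main()
{
    assert(out_len(71, 3, 2, 1, 1, 1) == 36); // default params -> 36x36 output
    return 0;
}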
/**
 * @brief Gets the host tensor descriptor.
 *

@@ -221,13 +186,13 @@ HostTensorDescriptor get_host_tensor_descriptor(const std::vector<std::size_t>&
                  std::is_same<TensorLayout, ck::tensor_layout::convolution::NKW>::value)
     {
-        return HostTensorDescriptor(dims, std::vector<std::size_t>({C * dims[2], dims[2], 1}));
+        return HostTensorDescriptor(dims, std::vector<std::size_t>{C * dims[2], dims[2], 1});
     }
     else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NWC>::value ||
                       std::is_same<TensorLayout, ck::tensor_layout::convolution::KXC>::value ||
                       std::is_same<TensorLayout, ck::tensor_layout::convolution::NWK>::value)
     {
-        return HostTensorDescriptor(dims, std::vector<std::size_t>({C * dims[2], 1, C}));
+        return HostTensorDescriptor(dims, std::vector<std::size_t>{C * dims[2], 1, C});
     }
     // 2D
     else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCHW>::value ||
@@ -273,132 +238,14 @@ HostTensorDescriptor get_host_tensor_descriptor(const std::vector<std::size_t>&
         throw std::runtime_error(err_msg.str());
     }
 }

-template <typename InDataType  = float,
-          typename WeiDataType = float,
-          typename OutDataType = float,
-          typename InLayout    = ck::tensor_layout::convolution::NHWC,
-          typename WeiLayout   = ck::tensor_layout::convolution::KYXC,
-          typename OutLayout   = ck::tensor_layout::convolution::NHWK>
-auto get_host_tensors(const ConvParams& params, bool init = true)
-{
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
-    input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
-
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
-    filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
-
-    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
-    output_dims.insert(std::end(output_dims),
-                       std::begin(output_spatial_lengths),
-                       std::end(output_spatial_lengths));
-
-    Tensor<InDataType> input(ck::utils::conv::get_host_tensor_descriptor(input_dims, InLayout{}));
-    Tensor<WeiDataType> weights(ck::utils::conv::get_host_tensor_descriptor(filter_dims, WeiLayout{}));
-    Tensor<OutDataType> host_output(ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));
-    Tensor<OutDataType> device_output(ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));
-
-    if(init)
-    {
-        std::mt19937 gen(11939);
-        if constexpr(std::is_same<InDataType, uint8_t>::value)
-        {
-            std::uniform_int_distribution<> dis(-5, 5);
-            std::generate(input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
-            std::generate(weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
-        }
-        else
-        {
-            std::uniform_real_distribution<> dis(0.f, 1.f);
-            std::generate(input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
-            std::generate(weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
-        }
-        std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
-        std::fill(device_output.begin(), device_output.end(), OutDataType(0.f));
-    }
-
-    return std::make_tuple(input, weights, host_output, device_output);
-}
-
-HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                       int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: { return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWK{}); }
-    case 2: { return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWK{}); }
-    case 1: { return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWK{}); }
-    default: { throw std::runtime_error("Unsupported number of spatial dimensions provided!"); }
-    }
-}
+HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims,
+                                                       int num_dim_spatial = 2);

-HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                        int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: { return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KZYXC{}); }
-    case 2: { return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KYXC{}); }
-    case 1: { return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KXC{}); }
-    default: { throw std::runtime_error("Unsupported number of spatial dimensions provided!"); }
-    }
-}
+HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
+                                                        int num_dim_spatial = 2);

-HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                      int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: { return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{}); }
-    case 2: { return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{}); }
-    case 1: { return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{}); }
-    default: { throw std::runtime_error("Unsupported number of spatial dimensions provided!"); }
-    }
-}
+HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
+                                                      int num_dim_spatial = 2);

 template <ck::index_t NDim,
           typename InDataType = float,
...
@@ -432,123 +279,293 @@ void run_reference_convolution_forward(const ConvParams& params,
...
@@ -432,123 +279,293 @@ void run_reference_convolution_forward(const ConvParams& params,
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
}
}
template
<
ck
::
index_t
NDim
,
template
<
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
>
typename
InDataType
=
float
,
struct
ConvolutionFwdInstances
;
typename
WeiDataType
=
float
,
typename
OutDataType
=
float
,
template
<
>
template
<
ck
::
index_t
,
typename
,
typename
,
typename
>
struct
ConvolutionFwdInstances
<
float
,
float
,
float
>
class
DeviceConvNDFwdInstance
>
void
run_convolution_forward
(
const
ConvParams
&
params
,
const
Tensor
<
InDataType
>&
input
,
const
Tensor
<
WeiDataType
>&
weights
,
Tensor
<
OutDataType
>&
output
)
{
{
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
template
<
int
NumDimSpatial
,
typename
std
::
enable_if
<
NumDimSpatial
>
=
1
&&
NumDimSpatial
<=
3
,
bool
>::
type
=
false
>
static
std
::
vector
<
DeviceConvFwdNoOpPtr
>
Get
()
{
std
::
vector
<
DeviceConvFwdNoOpPtr
>
conv_ptrs
;
if
constexpr
(
NumDimSpatial
==
1
)
{
ck
::
tensor_operation
::
device
::
device_conv1d_fwd_instance
::
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
2
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
3
)
{
ck
::
tensor_operation
::
device
::
device_conv3d_fwd_instance
::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances
(
conv_ptrs
);
}
return
conv_ptrs
;
}
};
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
input
.
mDesc
.
GetElementSpace
());
template
<
>
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
weights
.
mDesc
.
GetElementSpace
());
struct
ConvolutionFwdInstances
<
half_t
,
half_t
,
half_t
>
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
output
.
mDesc
.
GetElementSpace
());
{
template
<
int
NumDimSpatial
,
in_device_buf
.
ToDevice
(
input
.
mData
.
data
());
typename
std
::
enable_if
<
NumDimSpatial
>
=
1
&&
NumDimSpatial
<=
3
,
bool
>::
type
=
false
>
wei_device_buf
.
ToDevice
(
weights
.
mData
.
data
());
static
std
::
vector
<
DeviceConvFwdNoOpPtr
>
Get
()
const
std
::
vector
<
ck
::
index_t
>&
output_spatial_lengths
=
params
.
GetOutputSpatialLengths
();
auto
conv
=
DeviceConvNDFwdInstance
<
NDim
,
InDataType
,
WeiDataType
,
OutDataType
>
();
auto
invoker
=
conv
.
MakeInvoker
();
auto
argument
=
conv
.
MakeArgument
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
WeiDataType
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
params
.
N
,
params
.
K
,
params
.
C
,
params
.
input_spatial_lengths
,
params
.
filter_spatial_lengths
,
output_spatial_lengths
,
params
.
conv_filter_strides
,
params
.
conv_filter_dilations
,
params
.
input_left_pads
,
params
.
input_right_pads
,
PassThrough
{},
PassThrough
{},
PassThrough
{});
if
(
!
conv
.
IsSupportedArgument
(
argument
))
{
{
throw
std
::
runtime_error
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>
conv_ptrs
;
"Error! device_conv with the specified compilation parameters does "
if
constexpr
(
NumDimSpatial
==
1
)
"not support this Conv problem"
);
{
ck
::
tensor_operation
::
device
::
device_conv1d_fwd_instance
::
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances
(
conv_ptrs
);
return
conv_ptrs
;
}
else
if
constexpr
(
NumDimSpatial
==
2
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances
(
conv_ptrs
);
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_instance
::
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
3
)
{
ck
::
tensor_operation
::
device
::
device_conv3d_fwd_instance
::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances
(
conv_ptrs
);
}
return
conv_ptrs
;
}
}
};
invoker
.
Run
(
argument
);
template
<
>
out_device_buf
.
FromDevice
(
output
.
mData
.
data
());
struct
ConvolutionFwdInstances
<
bhalf_t
,
bhalf_t
,
bhalf_t
>
}
{
template
<
int
NumDimSpatial
,
typename
std
::
enable_if
<
NumDimSpatial
>
=
1
&&
NumDimSpatial
<=
3
,
bool
>::
type
=
false
>
static
std
::
vector
<
DeviceConvFwdNoOpPtr
>
Get
()
{
std
::
vector
<
DeviceConvFwdNoOpPtr
>
conv_ptrs
;
if
constexpr
(
NumDimSpatial
==
1
)
{
ck
::
tensor_operation
::
device
::
device_conv1d_fwd_instance
::
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
2
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
3
)
{
ck
::
tensor_operation
::
device
::
device_conv3d_fwd_instance
::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances
(
conv_ptrs
);
}
return
conv_ptrs
;
}
};
template
<
ck
::
index_t
NDim
,
template
<
>
typename
InDataType
=
float
,
struct
ConvolutionFwdInstances
<
int8_t
,
int8_t
,
int8_t
>
typename
WeiDataType
=
float
,
typename
OutDataType
=
float
>
bool
run_convolution_forward_instances
(
const
ConvParams
&
params
,
const
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
conv_ptrs
,
const
Tensor
<
InDataType
>&
input
,
const
Tensor
<
WeiDataType
>&
weights
,
Tensor
<
OutDataType
>&
output
,
const
Tensor
<
OutDataType
>&
host_output
)
{
{
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
template
<
int
NumDimSpatial
,
typename
std
::
enable_if
<
NumDimSpatial
>
=
1
&&
NumDimSpatial
<=
3
,
bool
>::
type
=
false
>
static
std
::
vector
<
DeviceConvFwdNoOpPtr
>
Get
()
{
std
::
vector
<
DeviceConvFwdNoOpPtr
>
conv_ptrs
;
if
constexpr
(
NumDimSpatial
==
1
)
{
ck
::
tensor_operation
::
device
::
device_conv1d_fwd_instance
::
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
2
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
3
)
{
ck
::
tensor_operation
::
device
::
device_conv3d_fwd_instance
::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances
(
conv_ptrs
);
}
return
conv_ptrs
;
}
};
template
<
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
,
typename
InLayout
=
ck
::
tensor_layout
::
convolution
::
NHWC
,
typename
WeiLayout
=
ck
::
tensor_layout
::
convolution
::
KYXC
,
typename
OutLayout
=
ck
::
tensor_layout
::
convolution
::
NHWK
,
typename
InElementwiseOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
typename
WeiElementwiseOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
typename
OutElementwiseOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
typename
InputInitFun
=
FillUniform
<
InDataType
>,
typename
WeightsInitFun
=
FillUniform
<
WeiDataType
>>
class
ConvFwdOpInstance
:
public
ck
::
utils
::
OpInstance
<
OutDataType
,
InDataType
,
WeiDataType
>
{
using
DeviceConvFwdOp
=
tensor_operation
::
device
::
DeviceConvFwd
<
InElementwiseOp
,
WeiElementwiseOp
,
OutElementwiseOp
>
;
using
DeviceMemPtr
=
std
::
unique_ptr
<
DeviceMem
>
;
using
DeviceBuffers
=
std
::
vector
<
DeviceMemPtr
>
;
using
BaseType
=
ck
::
utils
::
OpInstance
<
OutDataType
,
InDataType
,
WeiDataType
>
;
template
<
typename
T
>
using
TensorPtr
=
std
::
unique_ptr
<
Tensor
<
T
>>
;
using
InTensorsTuple
=
std
::
tuple
<
TensorPtr
<
InDataType
>
,
TensorPtr
<
WeiDataType
>>
;
public:
ConvFwdOpInstance
()
=
delete
;
ConvFwdOpInstance
(
const
ConvFwdOpInstance
&
)
=
default
;
ConvFwdOpInstance
&
operator
=
(
const
ConvFwdOpInstance
&
)
=
default
;
ConvFwdOpInstance
(
const
ConvParams
&
params
,
bool
do_init
=
true
,
const
InputInitFun
&
input_init_f
=
InputInitFun
{},
const
WeightsInitFun
&
weights_init_f
=
WeightsInitFun
{})
:
BaseType
(),
params_
{
params
},
output_spatial_lengths_
{
params
.
GetOutputSpatialLengths
()},
do_init_
{
do_init
},
input_init_f_
{
input_init_f
},
weights_init_f_
{
weights_init_f
}
{
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
input
.
mDesc
.
GetElementSpace
());
virtual
~
ConvFwdOpInstance
()
override
{};
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
weights
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
output
.
mDesc
.
GetElementSpace
());
in_device_buf
.
ToDevice
(
input
.
mData
.
data
());
virtual
InTensorsTuple
GetInputTensors
()
const
override
wei_device_buf
.
ToDevice
(
weights
.
mData
.
data
());
{
const
std
::
vector
<
ck
::
index_t
>&
output_spatial_lengths
=
params
.
GetOutputSpatialLengths
();
std
::
vector
<
std
::
size_t
>
input_dims
{
static_cast
<
std
::
size_t
>
(
params_
.
N
),
static_cast
<
std
::
size_t
>
(
params_
.
C
)};
input_dims
.
insert
(
std
::
end
(
input_dims
),
std
::
begin
(
params_
.
input_spatial_lengths
),
std
::
end
(
params_
.
input_spatial_lengths
));
std
::
vector
<
std
::
size_t
>
filter_dims
{
static_cast
<
std
::
size_t
>
(
params_
.
K
),
static_cast
<
std
::
size_t
>
(
params_
.
C
)};
filter_dims
.
insert
(
std
::
end
(
filter_dims
),
std
::
begin
(
params_
.
filter_spatial_lengths
),
std
::
end
(
params_
.
filter_spatial_lengths
));
auto
input
=
std
::
make_unique
<
Tensor
<
InDataType
>>
(
get_host_tensor_descriptor
(
input_dims
,
InLayout
{}));
auto
weights
=
std
::
make_unique
<
Tensor
<
WeiDataType
>>
(
get_host_tensor_descriptor
(
filter_dims
,
WeiLayout
{}));
if
(
do_init_
)
{
input_init_f_
(
input
->
begin
(),
input
->
end
());
weights_init_f_
(
weights
->
begin
(),
weights
->
end
());
}
bool
res
{
true
};
return
std
::
make_tuple
(
std
::
move
(
input
),
std
::
move
(
weights
));
for
(
auto
&
conv_ptr
:
conv_ptrs
)
}
virtual
TensorPtr
<
OutDataType
>
GetOutputTensor
()
const
override
{
{
auto
invoker
=
conv_ptr
->
MakeInvokerPointer
();
std
::
vector
<
std
::
size_t
>
output_dims
{
static_cast
<
std
::
size_t
>
(
params_
.
N
),
auto
argument
=
conv_ptr
->
MakeArgumentPointer
(
static_cast
<
std
::
size_t
>
(
params_
.
K
)};
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
output_dims
.
insert
(
std
::
end
(
output_dims
),
static_cast
<
WeiDataType
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
std
::
begin
(
output_spatial_lengths_
),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
std
::
end
(
output_spatial_lengths_
));
params
.
N
,
auto
output
=
std
::
make_unique
<
Tensor
<
OutDataType
>>
(
params
.
K
,
get_host_tensor_descriptor
(
output_dims
,
OutLayout
{}));
params
.
C
,
params
.
input_spatial_lengths
,
if
(
do_init_
)
params
.
filter_spatial_lengths
,
output_spatial_lengths
,
params
.
conv_filter_strides
,
params
.
conv_filter_dilations
,
params
.
input_left_pads
,
params
.
input_right_pads
,
PassThrough
{},
PassThrough
{},
PassThrough
{});
if
(
conv_ptr
->
IsSupportedArgument
(
argument
.
get
()))
{
{
float
atol
{
1e-5
f
};
std
::
fill
(
output
->
begin
(),
output
->
end
(),
OutDataType
(
0.
f
));
float
rtol
{
1e-4
f
};
if
constexpr
(
std
::
is_same_v
<
InDataType
,
ck
::
half_t
>
)
{
atol
=
1e-4
f
;
rtol
=
2.5e-3
f
;
}
invoker
->
Run
(
argument
.
get
());
out_device_buf
.
FromDevice
(
output
.
mData
.
data
());
res
=
res
&&
ck
::
utils
::
check_err
(
output
.
mData
,
host_output
.
mData
,
"Error: incorrect results!"
,
atol
,
rtol
);
hipGetErrorString
(
hipMemset
(
out_device_buf
.
GetDeviceBuffer
(),
0
,
out_device_buf
.
mMemSize
));
}
}
return
output
;
}
}
return
res
;
}
virtual
std
::
unique_ptr
<
tensor_operation
::
device
::
BaseInvoker
>
MakeInvokerPointer
(
tensor_operation
::
device
::
BaseOperator
*
op_ptr
)
const
override
{
static_assert
(
std
::
is_same_v
<
InElementwiseOp
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
);
static_assert
(
std
::
is_same_v
<
OutElementwiseOp
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
);
static_assert
(
std
::
is_same_v
<
WeiElementwiseOp
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
);
auto
conv_ptr
=
dynamic_cast
<
DeviceConvFwdOp
*>
(
op_ptr
);
if
(
!
conv_ptr
)
{
throw
std
::
runtime_error
(
"[ConvFwdOpInstance]: couldn't cast op_ptr to DeviceConvFwdNoOpPtr type!"
);
}
return
conv_ptr
->
MakeInvokerPointer
();
}
    virtual std::unique_ptr<tensor_operation::device::BaseArgument>
    MakeArgumentPointer(tensor_operation::device::BaseOperator* op_ptr,
                        const DeviceBuffers& in_device_buffers,
                        const DeviceMemPtr& out_device_buffer) const override
    {
        static_assert(
            std::is_same_v<InElementwiseOp, ck::tensor_operation::element_wise::PassThrough>);
        static_assert(
            std::is_same_v<OutElementwiseOp, ck::tensor_operation::element_wise::PassThrough>);
        static_assert(
            std::is_same_v<WeiElementwiseOp, ck::tensor_operation::element_wise::PassThrough>);

        auto conv_ptr = dynamic_cast<DeviceConvFwdOp*>(op_ptr);
        if(!conv_ptr)
        {
            throw std::runtime_error(
                "[ConvFwdOpInstance]: couldn't cast op_ptr to DeviceConvFwdNoOpPtr type!");
        }
        return conv_ptr->MakeArgumentPointer(
            static_cast<InDataType*>(in_device_buffers[0]->GetDeviceBuffer()),
            static_cast<WeiDataType*>(in_device_buffers[1]->GetDeviceBuffer()),
            static_cast<OutDataType*>(out_device_buffer->GetDeviceBuffer()),
            params_.N,
            params_.K,
            params_.C,
            params_.input_spatial_lengths,
            params_.filter_spatial_lengths,
            output_spatial_lengths_,
            params_.conv_filter_strides,
            params_.conv_filter_dilations,
            params_.input_left_pads,
            params_.input_right_pads,
            InElementwiseOp{},
            WeiElementwiseOp{},
            OutElementwiseOp{});
    }
    virtual std::size_t GetFlops() const override
    {
        return get_flops(params_.N,
                         params_.C,
                         params_.K,
                         params_.filter_spatial_lengths,
                         output_spatial_lengths_);
    }

    virtual std::size_t GetBtype() const override
    {
        return get_btype<InDataType, WeiDataType, OutDataType>(params_.N,
                                                               params_.C,
                                                               params_.K,
                                                               params_.input_spatial_lengths,
                                                               params_.filter_spatial_lengths,
                                                               output_spatial_lengths_);
    }
    private:
    const ConvParams& params_;
    const std::vector<ck::index_t> output_spatial_lengths_;
    const bool do_init_;
    const InputInitFun& input_init_f_;
    const WeightsInitFun& weights_init_f_;
};

} // namespace conv
} // namespace utils
} // namespace ck

std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParams& p);

#endif
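For orientation, here is a minimal sketch of how this op-instance class is meant to be wired into the OpInstanceRunEngine introduced further down in this commit. The template-argument list and constructor signature of ConvFwdOpInstance are not visible in this hunk, so the arguments shown below are assumptions inferred from the members above, not the committed API:

// --- illustrative sketch, not part of this commit ---
// In/Wei/Out and the layout types are placeholder aliases; the exact template
// parameters and constructor of ConvFwdOpInstance are assumed, not confirmed here.
ck::utils::conv::ConvParams params; // problem description (N, K, C, spatial lengths, ...)
ck::utils::conv::ConvFwdOpInstance<In, Wei, Out, InLayout, WeiLayout, OutLayout>
    op_instance(params);

// Host tensors come back pre-initialized via the stored init functors...
auto [input, weights] = op_instance.GetInputTensors();
// ...and the output tensor is zero-filled when do_init_ is set.
auto output = op_instance.GetOutputTensor();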
library/include/ck/library/utility/fill.hpp
0 → 100644
View file @ db775824
#pragma once
#include <algorithm>
#include <random>
#include "data_type.hpp"
namespace ck {
namespace utils {
// template <typename T, class Enable = void>
// struct FillUniform;
// TODO: What is wrong with this specialization?
// Error: segmentation fault inside mt19937 - behaves like an infinite loop.
// template <typename T>
// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
//                                               !std::is_same<T, bhalf_t>::value>::type>
// {
//     int a_{0};
//     int b_{5};
//     // T a_ = T{0};
//     // T b_ = T{5};
//     template <typename ForwardIter>
//     void operator()(ForwardIter first, ForwardIter last) const
//     {
//         std::mt19937 gen{11939};
//         std::uniform_int_distribution<int> dis(a_, b_);
//         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
//     }
// };

// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
//                                               std::is_same<T, bhalf_t>::value>::type>
template <typename T>
struct FillUniform
{
    float a_{0};
    float b_{5};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::mt19937 gen{11939};
        std::uniform_real_distribution<> dis(a_, b_);
        std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
    }
};
template <typename T>
struct FillMonotonicSeq
{
    T init_value_{0};
    T step_{1};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::generate(first, last, [=, n = init_value_]() mutable {
            auto tmp = n;
            n += step_;
            return tmp;
        });
    }
};
template <typename T>
struct FillConstant
{
    T value_{0};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::fill(first, last, value_);
    }
};
} // namespace utils
} // namespace ck
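A quick usage sketch for the three fill functors above; a plain std::vector stands in for the Tensor class (which exposes begin()/end() as well):

// --- illustrative sketch, not part of this commit ---
std::vector<float> buf(16);
ck::utils::FillUniform<float>{0.f, 1.f}(buf.begin(), buf.end());       // uniform values in [0, 1)
ck::utils::FillMonotonicSeq<float>{0.f, 0.5f}(buf.begin(), buf.end()); // 0, 0.5, 1, 1.5, ...
ck::utils::FillConstant<float>{1.f}(buf.begin(), buf.end());           // all elements set to 1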
library/include/ck/library/utility/op_instance_engine.hpp
0 → 100644
View file @ db775824
#pragma once
#include <cstdlib>
#include <limits>
#include <memory>
#include <stdexcept>
#include <tuple>
#include <utility>
#include <vector>
#include "check_err.hpp"
#include "device_base.hpp"
#include "functional2.hpp"
namespace ck {
namespace utils {
struct ProfileBestConfig
{
    std::string best_op_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    // Start throughput metrics at zero so that the first measured instance wins
    // and faster instances (higher TFLOPS) replace it.
    float best_tflops     = 0.f;
    float best_gb_per_sec = 0.f;
};
/**
 * @brief Describes an operation instance (or family of instances).
 *
 * An op instance is a particular specialization of an operator template.
 * Knowing its specific input/output data types, data layouts and modifying
 * elementwise operations, it can create its input/output tensors, provide
 * pointers to device instances able to execute it, and supply all
 * operation-specific parameters.
 */
template <typename OutDataType, typename... InArgTypes>
class OpInstance
{
    public:
    template <typename T>
    using TensorPtr      = std::unique_ptr<Tensor<T>>;
    using InTensorsTuple = std::tuple<TensorPtr<InArgTypes>...>;
    using DeviceMemPtr   = std::unique_ptr<DeviceMem>;
    using DeviceBuffers  = std::vector<DeviceMemPtr>;

    OpInstance()                  = default;
    OpInstance(const OpInstance&) = default;
    OpInstance& operator=(const OpInstance&) = default;
    virtual ~OpInstance(){};

    virtual InTensorsTuple GetInputTensors() const         = 0;
    virtual TensorPtr<OutDataType> GetOutputTensor() const = 0;
    virtual std::unique_ptr<tensor_operation::device::BaseInvoker>
        MakeInvokerPointer(tensor_operation::device::BaseOperator*) const = 0;
    virtual std::unique_ptr<tensor_operation::device::BaseArgument>
        MakeArgumentPointer(tensor_operation::device::BaseOperator*,
                            const DeviceBuffers&,
                            const DeviceMemPtr&) const = 0;
    virtual std::size_t GetFlops() const = 0;
    virtual std::size_t GetBtype() const = 0;
};
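To make the contract concrete, here is a declaration-only skeleton of a derived class. The class name and single-input copy semantics are hypothetical; ConvFwdOpInstance in conv_fwd_util.hpp above is the real in-tree implementation of this interface:

// --- illustrative sketch, not part of this commit ---
template <typename DataType>
class HypotheticalCopyOpInstance : public ck::utils::OpInstance<DataType, DataType>
{
    using Base = ck::utils::OpInstance<DataType, DataType>;

    public:
    // Create the host-side input tensor(s) for the problem.
    typename Base::InTensorsTuple GetInputTensors() const override;
    // Create the host-side output tensor.
    typename Base::template TensorPtr<DataType> GetOutputTensor() const override;
    // Downcast the type-erased operator and forward to its factory methods.
    std::unique_ptr<ck::tensor_operation::device::BaseInvoker>
    MakeInvokerPointer(ck::tensor_operation::device::BaseOperator* op_ptr) const override;
    std::unique_ptr<ck::tensor_operation::device::BaseArgument>
    MakeArgumentPointer(ck::tensor_operation::device::BaseOperator* op_ptr,
                        const typename Base::DeviceBuffers& ins,
                        const typename Base::DeviceMemPtr& out) const override;
    // Work/traffic estimates the profiler uses to report TFLOPS and GB/s.
    std::size_t GetFlops() const override;
    std::size_t GetBtype() const override;
};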
/**
 * @brief A generic operation instance run engine.
 */
template <typename OutDataType, typename... InArgTypes>
class OpInstanceRunEngine
{
    public:
    using OpInstanceT = OpInstance<OutDataType, InArgTypes...>;

    template <typename T>
    using TensorPtr        = std::unique_ptr<Tensor<T>>;
    using DeviceMemPtr     = std::unique_ptr<DeviceMem>;
    using InTensorsTuple   = std::tuple<TensorPtr<InArgTypes>...>;
    using DeviceBuffers    = std::vector<DeviceMemPtr>;
    using InArgsTypesTuple = std::tuple<InArgTypes...>;

    OpInstanceRunEngine() = delete;
    template <typename ReferenceOp = std::function<void()>>
    OpInstanceRunEngine(const OpInstanceT& op_instance,
                        const ReferenceOp& reference_op = ReferenceOp{})
        : op_instance_{op_instance}
    {
        in_tensors_ = op_instance_.GetInputTensors();
        out_tensor_ = op_instance_.GetOutputTensor();

        if constexpr(std::is_invocable_v<ReferenceOp,
                                         const Tensor<InArgTypes>&...,
                                         Tensor<OutDataType>&>)
        {
            ref_output_ = op_instance_.GetOutputTensor();
            CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
        }

        AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
        out_device_buffer_ = std::make_unique<DeviceMem>(sizeof(OutDataType) *
                                                         out_tensor_->mDesc.GetElementSpace());
        out_device_buffer_->SetZero();
    }

    virtual ~OpInstanceRunEngine(){};
    template <typename OpInstancePtr>
    bool Test(const std::vector<OpInstancePtr>& op_ptrs)
    {
        bool res{true};
        for(auto& op_ptr : op_ptrs)
        {
            auto invoker  = op_instance_.MakeInvokerPointer(op_ptr.get());
            auto argument = op_instance_.MakeArgumentPointer(
                op_ptr.get(), in_device_buffers_, out_device_buffer_);
            if(op_ptr->IsSupportedArgument(argument.get()))
            {
                invoker->Run(argument.get());
                out_device_buffer_->FromDevice(out_tensor_->mData.data());
                if(!ref_output_)
                {
                    throw std::runtime_error(
                        "OpInstanceRunEngine::Test: Reference value not available."
                        " You have to provide a reference function.");
                }
                // TODO: enable flexible use of custom check_error functions
                res = res && check_err(out_tensor_->mData, ref_output_->mData);
                out_device_buffer_->SetZero();
            }
        }
        return res;
    }
    template <typename OpInstancePtr>
    ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
                              int nrepeat          = 100,
                              bool do_verification = false,
                              bool do_log          = false)
    {
        bool res{true};
        ProfileBestConfig best_config;

        for(auto& op_ptr : op_ptrs)
        {
            auto invoker  = op_instance_.MakeInvokerPointer(op_ptr.get());
            auto argument = op_instance_.MakeArgumentPointer(
                op_ptr.get(), in_device_buffers_, out_device_buffer_);
            if(op_ptr->IsSupportedArgument(argument.get()))
            {
                std::string op_name = op_ptr->GetTypeString();
                float avg_time      = invoker->Run(argument.get(), nrepeat);

                std::size_t flops     = op_instance_.GetFlops();
                std::size_t num_btype = op_instance_.GetBtype();
                float tflops          = static_cast<float>(flops) / 1.E9 / avg_time;
                float gb_per_sec      = num_btype / 1.E6 / avg_time;

                std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, "
                          << gb_per_sec << " GB/s, " << op_name << std::endl;

                // Keep the fastest (highest-TFLOPS) instance as the best configuration.
                if(tflops > best_config.best_tflops)
                {
                    best_config.best_op_name    = op_name;
                    best_config.best_tflops     = tflops;
                    best_config.best_gb_per_sec = gb_per_sec;
                    best_config.best_avg_time   = avg_time;
                }

                if(do_verification)
                {
                    out_device_buffer_->FromDevice(out_tensor_->mData.data());
                    if(!ref_output_)
                    {
                        throw std::runtime_error(
                            "OpInstanceRunEngine::Profile: Reference value not available."
                            " You have to provide a reference function.");
                    }
                    // TODO: enable flexible use of custom check_error functions
                    res = res && CheckErr(out_tensor_->mData, ref_output_->mData);
                    if(do_log) {}
                }
                out_device_buffer_->SetZero();
            }
        }
        return best_config;
    }
    void SetAtol(double a) { atol_ = a; }
    void SetRtol(double r) { rtol_ = r; }
    private:
    template <typename F, std::size_t... Is>
    void CallRefOpUnpackArgs(const F& f, std::index_sequence<Is...>) const
    {
        f(*std::get<Is>(in_tensors_)..., *ref_output_);
    }

    template <std::size_t... Is>
    void AllocateDeviceInputTensors(std::index_sequence<Is...>)
    {
        (AllocateDeviceInputTensorsImpl<Is>(), ...);
    }

    template <std::size_t Index>
    void AllocateDeviceInputTensorsImpl()
    {
        const auto& ts = std::get<Index>(in_tensors_);
        in_device_buffers_
            .emplace_back(std::make_unique<DeviceMem>(
                sizeof(std::tuple_element_t<Index, InArgsTypesTuple>) *
                ts->mDesc.GetElementSpace()))
            ->ToDevice(ts->mData.data());
    }
    static constexpr std::size_t kNInArgs_ = std::tuple_size_v<InTensorsTuple>;

    const OpInstanceT& op_instance_;
    double rtol_{1e-5};
    double atol_{1e-8};

    InTensorsTuple in_tensors_;
    TensorPtr<OutDataType> out_tensor_;
    TensorPtr<OutDataType> ref_output_;
    DeviceBuffers in_device_buffers_;
    DeviceMemPtr out_device_buffer_;
    template <typename T>
    bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
    {
        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
    }
};
} // namespace utils
} // namespace ck
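Finally, a usage sketch tying the engine together. Here op_instance, op_ptrs and reference_fn are placeholders for a concrete OpInstance subclass, a vector of device-op pointers, and a host reference implementation:

// --- illustrative sketch, not part of this commit ---
// reference_fn must be invocable as f(const Tensor<In>&..., Tensor<Out>&); otherwise the
// engine keeps no reference output and Test()/verification will throw at run time.
ck::utils::OpInstanceRunEngine<Out, In, Wei> engine(op_instance, reference_fn);
engine.SetAtol(1e-4);
engine.SetRtol(2.5e-3);

// Run every supported device instance and compare each result with the reference.
bool ok = engine.Test(op_ptrs);

// Time each instance, print per-instance perf, and return the best configuration found.
auto best = engine.Profile(op_ptrs, /*nrepeat=*/100, /*do_verification=*/true);
std::cout << "best: " << best.best_op_name << ", " << best.best_avg_time << " ms" << std::endl;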