gaoqiong / composable_kernel — Commit b054669b

update profiler for conv bwd data and weight

Authored Jul 14, 2022 by Chao Liu
Parent: 6b6360b1
Showing 20 changed files with 1080 additions and 1024 deletions (+1080 −1024)
example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp  (+4 −6)
example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp  (+4 −4)
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp  (+1 −1)
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp  (+1 −1)
example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp  (+4 −4)
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl.cpp  (+4 −4)
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp  (+4 −4)
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp  (+165 −78)
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp  (+167 −64)
profiler/CMakeLists.txt  (+1 −2)
profiler/include/profile_batched_gemm_reduce_impl.hpp  (+1 −1)
profiler/include/profile_conv_bwd_data_impl.hpp  (+143 −193)
profiler/include/profile_conv_bwd_weight_impl.hpp  (+331 −177)
profiler/include/profile_conv_fwd_impl.hpp  (+3 −3)
profiler/include/profile_convnd_bwd_weight_impl.hpp  (+0 −478)
profiler/include/profile_gemm_bias_add_reduce_impl.hpp  (+1 −1)
profiler/include/profile_gemm_reduce_impl.hpp  (+1 −1)
profiler/include/profile_grouped_gemm_impl.hpp  (+1 −1)
profiler/include/profile_normalization_impl.hpp  (+1 −1)
profiler/src/profile_conv_bwd_data.cpp  (+243 −0)
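The thread running through the example and profiler changes below is a single refactor: the convolution-parameter helper moves out of ck::utils::conv (header conv_util.hpp) into ck::tensor_operation::device (header convolution_parameter.hpp). A minimal before/after sketch of calling code — the names come from the diffs below, the field values are illustrative:

    // Before: ConvParams lived in ck::utils::conv (conv_util.hpp).
    //   #include "ck/library/utility/conv_util.hpp"
    //   ck::utils::conv::ConvParams params;

    // After: same struct, new namespace and header.
    #include "ck/library/utility/convolution_parameter.hpp"

    int main()
    {
        ck::tensor_operation::device::ConvParams params;

        params.num_dim_spatial_ = 2;   // 2-D convolution
        params.N_               = 128; // batch (value illustrative)
        params.K_               = 256; // output channels (value illustrative)
        params.C_               = 128; // input channels; several examples below default C_ to 128

        return 0;
    }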
example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp

@@ -12,7 +12,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"

@@ -106,7 +106,7 @@ void PrintUseMsg()
               << std::endl;
 }

-ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
+ck::tensor_operation::device::ConvParams ParseConvParams(int argc, char* argv[])
 {
     // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
     int num_dim_spatial = 2;

@@ -118,7 +118,7 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
         exit(0);
     }

-    ck::utils::conv::ConvParams params;
+    ck::tensor_operation::device::ConvParams params;

     int arg_idx             = 4;
     params.num_dim_spatial_ = num_dim_spatial;

@@ -164,14 +164,12 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
 int main(int argc, char* argv[])
 {
-    using namespace ck::utils::conv;
-
     bool do_verification = true;
     int init_method      = 1;
     bool time_kernel     = false;

     const int num_dim_spatial = 2;

-    ck::utils::conv::ConvParams params;
+    ck::tensor_operation::device::ConvParams params;

     if(argc >= 4)
     {
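The "(N, K, C) + num_dim_spatial * 6" comment above encodes the command-line contract: three tensor sizes followed by six groups of num_dim_spatial values each (filter, input, strides, dilations, left pads, right pads). A hedged sketch of a parsing loop that satisfies it — the loop itself is illustrative and assumes the params vectors are pre-sized to num_dim_spatial:

    // Sketch only: consume "(N, K, C) + num_dim_spatial * 6" trailing arguments.
    // For num_dim_spatial == 2: N K C  Y X  Hi Wi  Sy Sx  Dy Dx  LeftPy LeftPx  RightPy RightPx
    int arg_idx = 4; // this example starts the conv sizes at argv[4]

    params.N_ = std::stoi(argv[arg_idx++]);
    params.K_ = std::stoi(argv[arg_idx++]);
    params.C_ = std::stoi(argv[arg_idx++]);

    for(int d = 0; d < num_dim_spatial; ++d)
        params.filter_spatial_lengths_[d] = std::stoi(argv[arg_idx++]);
    for(int d = 0; d < num_dim_spatial; ++d)
        params.input_spatial_lengths_[d] = std::stoi(argv[arg_idx++]);
    for(int d = 0; d < num_dim_spatial; ++d)
        params.conv_filter_strides_[d] = std::stoi(argv[arg_idx++]);
    for(int d = 0; d < num_dim_spatial; ++d)
        params.conv_filter_dilations_[d] = std::stoi(argv[arg_idx++]);
    for(int d = 0; d < num_dim_spatial; ++d)
        params.input_left_pads_[d] = std::stoi(argv[arg_idx++]);
    for(int d = 0; d < num_dim_spatial; ++d)
        params.input_right_pads_[d] = std::stoi(argv[arg_idx++]);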
example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp

@@ -12,7 +12,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"

@@ -103,7 +103,7 @@ void PrintUseMsg()
               << std::endl;
 }

-ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
+ck::tensor_operation::device::ConvParams ParseConvParams(int argc, char* argv[])
 {
     // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
     int num_dim_spatial = 2;

@@ -115,7 +115,7 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
         exit(0);
     }

-    ck::utils::conv::ConvParams params;
+    ck::tensor_operation::device::ConvParams params;

     int arg_idx             = 4;
     params.num_dim_spatial_ = num_dim_spatial;

@@ -168,7 +168,7 @@ int main(int argc, char* argv[])
     bool time_kernel     = false;

     const int num_dim_spatial = 2;

-    ck::utils::conv::ConvParams params;
+    ck::tensor_operation::device::ConvParams params;

     if(argc >= 4)
     {
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp

@@ -12,7 +12,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp

@@ -12,7 +12,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp

@@ -12,7 +12,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"

@@ -100,10 +100,10 @@ void print_use_msg()
               << " <right padding>, (ie RightPy, RightPx for 2D)\n"
               << std::endl;
 }

-ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
+ck::tensor_operation::device::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
 {
     // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
-    ck::utils::conv::ConvParams params;
+    ck::tensor_operation::device::ConvParams params;

     int arg_idx             = 5;
     params.num_dim_spatial_ = num_dim_spatial;

@@ -171,7 +171,7 @@ int main(int argc, char* argv[])
     bool time_kernel = false;

     int num_dim_spatial = 2;

-    ck::utils::conv::ConvParams params;
+    ck::tensor_operation::device::ConvParams params;
     params.C_ = 128;

     if(argc == 4)
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl.cpp

@@ -12,7 +12,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"

@@ -107,10 +107,10 @@ void print_use_msg()
               << std::endl;
 }

-ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
+ck::tensor_operation::device::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
 {
     // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
-    ck::utils::conv::ConvParams params;
+    ck::tensor_operation::device::ConvParams params;

     int arg_idx             = 7;
     params.num_dim_spatial_ = num_dim_spatial;

@@ -180,7 +180,7 @@ int main(int argc, char* argv[])
     int do_log  = 0;
     int split_k = 1;

-    ck::utils::conv::ConvParams params;
+    ck::tensor_operation::device::ConvParams params;
     params.C_ = 128;

     if(argc == 6)
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp

@@ -13,7 +13,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"

@@ -127,10 +127,10 @@ void print_use_msg()
               << std::endl;
 }

-ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
+ck::tensor_operation::device::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
 {
     // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
-    ck::utils::conv::ConvParams params;
+    ck::tensor_operation::device::ConvParams params;

     int arg_idx             = 7;
     params.num_dim_spatial_ = num_dim_spatial;

@@ -200,7 +200,7 @@ int main(int argc, char* argv[])
     int do_log  = 0;
    int split_k = 2;

-    ck::utils::conv::ConvParams params;
+    ck::tensor_operation::device::ConvParams params;
     params.C_ = 128;

     if(argc == 6)
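The bf16 example above defaults split_k to 2. Split-K partitions the reduction dimension of the backward-weight GEMM (the N·Ho·Wo accumulation) across split_k batches that each produce a partial result, combined at the end; bf16 benefits because the partial sums can be kept in higher precision. A minimal CPU sketch of the idea — not the device implementation:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Each of the split_k batches reduces a disjoint slice of the reduction
    // dimension; the partial sums are then combined in one final pass.
    float split_k_dot(const std::vector<float>& a, const std::vector<float>& b, int split_k)
    {
        const std::size_t len   = a.size();
        const std::size_t chunk = (len + split_k - 1) / split_k;

        std::vector<float> partial(split_k, 0.f);

        for(int s = 0; s < split_k; ++s) // on the GPU these batches run concurrently
            for(std::size_t i = s * chunk; i < std::min(len, (s + 1) * chunk); ++i)
                partial[s] += a[i] * b[i];

        float acc = 0.f; // final cross-batch reduction
        for(float p : partial)
            acc += p;

        return acc;
    }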
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp

@@ -13,15 +13,17 @@ namespace ck {
 namespace tensor_operation {
 namespace host {

-// out[N, K, Ho, Wo] = in[N, C, Hi, Wi] * wei[K, C, Y, X]
-template <typename InDataType,
+template <ck::index_t NumDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InDataType,
           typename WeiDataType,
           typename OutDataType,
           typename InElementwiseOperation,
           typename WeiElementwiseOperation,
           typename OutElementwiseOperation,
-          ck::index_t NumDimSpatial = 2,
-          typename std::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
+          typename ck::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
 struct ReferenceConvBwdWeight : public device::BaseOperator
 {
     // Argument

@@ -69,158 +71,240 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
     {
         using Argument = ReferenceConvBwdWeight::Argument;

+        // FIXME: properly implement "TensorView" for doing transpose or refer to dimension by name
         float Run(const Argument& arg)
         {
+            // tensor descriptor in NCHW/KXYC/NKHW dimensional order
+            HostTensorDescriptor in_desc  = arg.input_.mDesc;
+            HostTensorDescriptor wei_desc = arg.weight_.mDesc;
+            HostTensorDescriptor out_desc = arg.output_.mDesc;
+
+            // input
+            if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NWC>)
+            {
+                in_desc = transpose_host_tensor_descriptor_given_new2old(
+                    in_desc, std::vector<std::size_t>{0, 2, 1});
+            }
+            else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NHWC>)
+            {
+                in_desc = transpose_host_tensor_descriptor_given_new2old(
+                    in_desc, std::vector<std::size_t>{0, 3, 1, 2});
+            }
+            else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NDHWC>)
+            {
+                in_desc = transpose_host_tensor_descriptor_given_new2old(
+                    in_desc, std::vector<std::size_t>{0, 4, 1, 2, 3});
+            }
+
+            // weight
+            if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC>)
+            {
+                wei_desc = transpose_host_tensor_descriptor_given_new2old(
+                    wei_desc, std::vector<std::size_t>{0, 2, 1});
+            }
+            else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC>)
+            {
+                wei_desc = transpose_host_tensor_descriptor_given_new2old(
+                    wei_desc, std::vector<std::size_t>{0, 3, 1, 2});
+            }
+            else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
+            {
+                wei_desc = transpose_host_tensor_descriptor_given_new2old(
+                    wei_desc, std::vector<std::size_t>{0, 4, 1, 2, 3});
+            }
+
+            // output
+            if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NWK>)
+            {
+                out_desc = transpose_host_tensor_descriptor_given_new2old(
+                    out_desc, std::vector<std::size_t>{0, 2, 1});
+            }
+            else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NHWK>)
+            {
+                out_desc = transpose_host_tensor_descriptor_given_new2old(
+                    out_desc, std::vector<std::size_t>{0, 3, 1, 2});
+            }
+            else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWK>)
+            {
+                out_desc = transpose_host_tensor_descriptor_given_new2old(
+                    out_desc, std::vector<std::size_t>{0, 4, 1, 2, 3});
+            }
+
             if constexpr(NumDimSpatial == 1)
             {
-                constexpr auto I0 = Number<0>{};
-
                 auto f_kcx = [&](auto k, auto c, auto x) {
                     float v_acc = 0;

-                    for(std::size_t n = 0; n < arg.output_.mDesc.GetLengths()[0]; ++n)
+                    for(std::size_t n = 0; n < out_desc.GetLengths()[0]; ++n)
                     {
-                        for(std::size_t wo = 0; wo < arg.output_.mDesc.GetLengths()[2]; ++wo)
+                        for(std::size_t wo = 0; wo < out_desc.GetLengths()[2]; ++wo)
                         {
                             auto wi =
-                                ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[I0]) +
-                                ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[I0]) -
-                                ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I0]);
+                                ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[0]) +
+                                ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[0]) -
+                                ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);

                             if(wi >= 0 &&
-                               ck::type_convert<std::size_t>(wi) <
-                                   arg.input_.mDesc.GetLengths()[2])
+                               ck::type_convert<std::size_t>(wi) < in_desc.GetLengths()[2])
                             {
                                 float v_out;
                                 float v_in;

-                                arg.out_element_op_(
-                                    v_out, ck::type_convert<float>(arg.output_(n, k, wo)));
-                                arg.in_element_op_(
-                                    v_in, ck::type_convert<float>(arg.input_(n, c, wi)));
+                                // FIXME hacky
+                                arg.out_element_op_(
+                                    v_out,
+                                    ck::type_convert<float>(
+                                        arg.output_
+                                            .mData[out_desc.GetOffsetFromMultiIndex(n, k, wo)]));
+
+                                // FIXME hacky
+                                arg.in_element_op_(
+                                    v_in,
+                                    ck::type_convert<float>(
+                                        arg.input_
+                                            .mData[in_desc.GetOffsetFromMultiIndex(n, c, wi)]));

                                 v_acc += v_out * v_in;
                             }
                         }
                     }

                     float v_wei;

                     arg.wei_element_op_(v_wei, v_acc);

-                    arg.weight_(k, c, x) = ck::type_convert<WeiDataType>(v_wei);
+                    // FIXME hacky
+                    arg.weight_.mData[wei_desc.GetOffsetFromMultiIndex(k, c, x)] =
+                        ck::type_convert<WeiDataType>(v_wei);
                 };

                 make_ParallelTensorFunctor(f_kcx,
-                                           arg.weight_.mDesc.GetLengths()[0],
-                                           arg.weight_.mDesc.GetLengths()[1],
-                                           arg.weight_.mDesc.GetLengths()[2])(
+                                           wei_desc.GetLengths()[0],
+                                           wei_desc.GetLengths()[1],
+                                           wei_desc.GetLengths()[2])(
                     std::thread::hardware_concurrency());

                 return 0;
             }
             else if constexpr(NumDimSpatial == 2)
             {
-                constexpr auto I0 = Number<0>{};
-                constexpr auto I1 = Number<1>{};
-
                 auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
                     float v_acc = 0;

-                    for(std::size_t n = 0; n < arg.output_.mDesc.GetLengths()[0]; ++n)
+                    for(std::size_t n = 0; n < out_desc.GetLengths()[0]; ++n)
                     {
-                        for(std::size_t ho = 0; ho < arg.output_.mDesc.GetLengths()[2]; ++ho)
+                        for(std::size_t ho = 0; ho < out_desc.GetLengths()[2]; ++ho)
                         {
                             auto hi =
-                                ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[I0]) +
-                                ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[I0]) -
-                                ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I0]);
+                                ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[0]) +
+                                ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[0]) -
+                                ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);

-                            for(std::size_t wo = 0; wo < arg.output_.mDesc.GetLengths()[3]; ++wo)
+                            for(std::size_t wo = 0; wo < out_desc.GetLengths()[3]; ++wo)
                             {
                                 auto wi =
-                                    ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[I1]) +
-                                    ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[I1]) -
-                                    ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I1]);
+                                    ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[1]) +
+                                    ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[1]) -
+                                    ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);

                                 if(hi >= 0 &&
-                                   ck::type_convert<std::size_t>(hi) <
-                                       arg.input_.mDesc.GetLengths()[2] &&
+                                   ck::type_convert<std::size_t>(hi) < in_desc.GetLengths()[2] &&
                                    wi >= 0 &&
-                                   ck::type_convert<std::size_t>(wi) <
-                                       arg.input_.mDesc.GetLengths()[3])
+                                   ck::type_convert<std::size_t>(wi) < in_desc.GetLengths()[3])
                                 {
                                     float v_out;
                                     float v_in;

-                                    arg.out_element_op_(
-                                        v_out, ck::type_convert<float>(arg.output_(n, k, ho, wo)));
-                                    arg.in_element_op_(
-                                        v_in, ck::type_convert<float>(arg.input_(n, c, hi, wi)));
+                                    // FIXME hacky
+                                    arg.out_element_op_(
+                                        v_out,
+                                        ck::type_convert<float>(
+                                            arg.output_.mData[out_desc.GetOffsetFromMultiIndex(
+                                                n, k, ho, wo)]));
+
+                                    // FIXME hacky
+                                    arg.in_element_op_(
+                                        v_in,
+                                        ck::type_convert<float>(
+                                            arg.input_.mData[in_desc.GetOffsetFromMultiIndex(
+                                                n, c, hi, wi)]));

                                     v_acc += v_out * v_in;
                                 }
                             }
                         }
                     }

                     float v_wei;

                     arg.wei_element_op_(v_wei, v_acc);

-                    arg.weight_(k, c, y, x) = ck::type_convert<WeiDataType>(v_wei);
+                    // FIXME hacky
+                    arg.weight_.mData[wei_desc.GetOffsetFromMultiIndex(k, c, y, x)] =
+                        ck::type_convert<WeiDataType>(v_wei);
                 };

                 make_ParallelTensorFunctor(f_kcyx,
-                                           arg.weight_.mDesc.GetLengths()[0],
-                                           arg.weight_.mDesc.GetLengths()[1],
-                                           arg.weight_.mDesc.GetLengths()[2],
-                                           arg.weight_.mDesc.GetLengths()[3])(
+                                           wei_desc.GetLengths()[0],
+                                           wei_desc.GetLengths()[1],
+                                           wei_desc.GetLengths()[2],
+                                           wei_desc.GetLengths()[3])(
                     std::thread::hardware_concurrency());

                 return 0;
             }
             else if constexpr(NumDimSpatial == 3)
             {
-                constexpr auto I0 = Number<0>{};
-                constexpr auto I1 = Number<1>{};
-                constexpr auto I2 = Number<2>{};
-
                 auto f_kczyx = [&](auto k, auto c, auto z, auto y, auto x) {
                     float v_acc = 0;

-                    for(std::size_t n = 0; n < arg.output_.mDesc.GetLengths()[0]; ++n)
+                    for(std::size_t n = 0; n < out_desc.GetLengths()[0]; ++n)
                     {
-                        for(std::size_t do_ = 0; do_ < arg.output_.mDesc.GetLengths()[2]; ++do_)
+                        for(std::size_t do_ = 0; do_ < out_desc.GetLengths()[2]; ++do_)
                         {
                             auto di =
-                                ck::type_convert<ck::long_index_t>(do_ * arg.conv_strides_[I0]) +
-                                ck::type_convert<ck::long_index_t>(z * arg.conv_dilations_[I0]) -
-                                ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I0]);
+                                ck::type_convert<ck::long_index_t>(do_ * arg.conv_strides_[0]) +
+                                ck::type_convert<ck::long_index_t>(z * arg.conv_dilations_[0]) -
+                                ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);

-                            for(std::size_t ho = 0; ho < arg.output_.mDesc.GetLengths()[3]; ++ho)
+                            for(std::size_t ho = 0; ho < out_desc.GetLengths()[3]; ++ho)
                             {
                                 auto hi =
-                                    ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[I1]) +
-                                    ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[I1]) -
-                                    ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I1]);
+                                    ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[1]) +
+                                    ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[1]) -
+                                    ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);

-                                for(std::size_t wo = 0; wo < arg.output_.mDesc.GetLengths()[4];
-                                    ++wo)
+                                for(std::size_t wo = 0; wo < out_desc.GetLengths()[4]; ++wo)
                                 {
                                     auto wi =
-                                        ck::type_convert<ck::long_index_t>(wo *
-                                                                           arg.conv_strides_[I2]) +
-                                        ck::type_convert<ck::long_index_t>(
-                                            x * arg.conv_dilations_[I2]) -
-                                        ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I2]);
+                                        ck::type_convert<ck::long_index_t>(wo *
+                                                                           arg.conv_strides_[2]) +
+                                        ck::type_convert<ck::long_index_t>(
+                                            x * arg.conv_dilations_[2]) -
+                                        ck::type_convert<ck::long_index_t>(arg.in_left_pads_[2]);

                                     if(di >= 0 &&
-                                       ck::type_convert<std::size_t>(di) <
-                                           arg.input_.mDesc.GetLengths()[2] &&
+                                       ck::type_convert<std::size_t>(di) <
+                                           in_desc.GetLengths()[2] &&
                                        hi >= 0 &&
-                                       ck::type_convert<std::size_t>(hi) <
-                                           arg.input_.mDesc.GetLengths()[3] &&
+                                       ck::type_convert<std::size_t>(hi) <
+                                           in_desc.GetLengths()[3] &&
                                        wi >= 0 &&
-                                       ck::type_convert<std::size_t>(wi) <
-                                           arg.input_.mDesc.GetLengths()[4])
+                                       ck::type_convert<std::size_t>(wi) <
+                                           in_desc.GetLengths()[4])
                                     {
                                         float v_out;
                                         float v_in;

-                                        arg.out_element_op_(
-                                            v_out,
-                                            ck::type_convert<float>(arg.output_(n, k, do_, ho, wo)));
-                                        arg.in_element_op_(
-                                            v_in,
-                                            ck::type_convert<float>(arg.input_(n, c, di, hi, wi)));
+                                        // FIXME hacky
+                                        arg.out_element_op_(
+                                            v_out,
+                                            ck::type_convert<float>(
+                                                arg.output_.mData[out_desc.GetOffsetFromMultiIndex(
+                                                    n, k, do_, ho, wo)]));
+
+                                        // FIXME hacky
+                                        arg.in_element_op_(
+                                            v_in,
+                                            ck::type_convert<float>(
+                                                arg.input_.mData[in_desc.GetOffsetFromMultiIndex(
+                                                    n, c, di, hi, wi)]));

                                         v_acc += v_out * v_in;
                                     }

@@ -228,19 +312,22 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                 }
                             }
                         }
                     }

                     float v_wei;

                     arg.wei_element_op_(v_wei, v_acc);

-                    arg.weight_(k, c, z, y, x) = ck::type_convert<WeiDataType>(v_wei);
+                    // FIXME hacky
+                    arg.weight_.mData[wei_desc.GetOffsetFromMultiIndex(k, c, z, y, x)] =
+                        ck::type_convert<WeiDataType>(v_wei);
                 };

                 make_ParallelTensorFunctor(f_kczyx,
-                                           arg.weight_.mDesc.GetLengths()[0],
-                                           arg.weight_.mDesc.GetLengths()[1],
-                                           arg.weight_.mDesc.GetLengths()[2],
-                                           arg.weight_.mDesc.GetLengths()[3],
-                                           arg.weight_.mDesc.GetLengths()[4])(
+                                           wei_desc.GetLengths()[0],
+                                           wei_desc.GetLengths()[1],
+                                           wei_desc.GetLengths()[2],
+                                           wei_desc.GetLengths()[3],
+                                           wei_desc.GetLengths()[4])(
                     std::thread::hardware_concurrency());

                 return 0;
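A note on the transposes introduced above: transpose_host_tensor_descriptor_given_new2old permutes only the descriptor's lengths and strides, so data stored as NHWC can be indexed in NCHW order without moving any elements. A small illustrative stand-in (not the CK implementation) showing what the new2old map {0, 3, 1, 2} does to the lengths:

    #include <cstddef>
    #include <vector>

    // Illustrative: new_lengths[i] = old_lengths[new2old[i]]. CK applies the
    // same permutation to strides, so offsets computed in the new dimension
    // order still land on the original NHWC storage.
    std::vector<std::size_t> permute_lengths(const std::vector<std::size_t>& old_lengths,
                                             const std::vector<std::size_t>& new2old)
    {
        std::vector<std::size_t> new_lengths(new2old.size());

        for(std::size_t i = 0; i < new2old.size(); ++i)
            new_lengths[i] = old_lengths[new2old[i]];

        return new_lengths;
    }

    // Example: NHWC lengths {N=2, H=8, W=8, C=16} with new2old {0, 3, 1, 2}
    // become NCHW-ordered lengths {2, 16, 8, 8}.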
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp

@@ -14,16 +14,17 @@ namespace ck {
 namespace tensor_operation {
 namespace host {

-// out[N, K, Ho, Wo] = in[N, C, Hi, Wi] * wei[K, C, Y, X]
-template <typename InDataType,
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InDataType,
           typename WeiDataType,
           typename OutDataType,
-          typename AccDataType,
           typename InElementwiseOperation,
           typename WeiElementwiseOperation,
           typename OutElementwiseOperation,
-          ck::index_t NumDimSpatial = 2,
-          typename ck::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
+          typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
 struct ReferenceConvBwdData : public device::BaseOperator
 {
     // Argument

@@ -71,38 +72,105 @@ struct ReferenceConvBwdData : public device::BaseOperator
     {
         using Argument = ReferenceConvBwdData::Argument;

+        // FIXME: properly implement "TensorView" for doing transpose or refer to dimension by name
         float Run(const Argument& arg)
         {
-            if constexpr(NumDimSpatial == 1)
+            // tensor descriptor in NCHW/KXYC/NKHW dimensional order
+            HostTensorDescriptor in_desc  = arg.input_.mDesc;
+            HostTensorDescriptor wei_desc = arg.weight_.mDesc;
+            HostTensorDescriptor out_desc = arg.output_.mDesc;
+
+            // input
+            if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NWC>)
+            {
+                in_desc = transpose_host_tensor_descriptor_given_new2old(
+                    in_desc, std::vector<std::size_t>{0, 2, 1});
+            }
+            else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NHWC>)
+            {
+                in_desc = transpose_host_tensor_descriptor_given_new2old(
+                    in_desc, std::vector<std::size_t>{0, 3, 1, 2});
+            }
+            else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NDHWC>)
+            {
+                in_desc = transpose_host_tensor_descriptor_given_new2old(
+                    in_desc, std::vector<std::size_t>{0, 4, 1, 2, 3});
+            }
+
+            // weight
+            if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC>)
+            {
+                wei_desc = transpose_host_tensor_descriptor_given_new2old(
+                    wei_desc, std::vector<std::size_t>{0, 2, 1});
+            }
+            else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC>)
+            {
+                wei_desc = transpose_host_tensor_descriptor_given_new2old(
+                    wei_desc, std::vector<std::size_t>{0, 3, 1, 2});
+            }
+            else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
+            {
+                wei_desc = transpose_host_tensor_descriptor_given_new2old(
+                    wei_desc, std::vector<std::size_t>{0, 4, 1, 2, 3});
+            }
+
+            // output
+            if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NWK>)
+            {
+                out_desc = transpose_host_tensor_descriptor_given_new2old(
+                    out_desc, std::vector<std::size_t>{0, 2, 1});
+            }
+            else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NHWK>)
+            {
+                out_desc = transpose_host_tensor_descriptor_given_new2old(
+                    out_desc, std::vector<std::size_t>{0, 3, 1, 2});
+            }
+            else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWK>)
+            {
+                out_desc = transpose_host_tensor_descriptor_given_new2old(
+                    out_desc, std::vector<std::size_t>{0, 4, 1, 2, 3});
+            }
+
+            if constexpr(NDimSpatial == 1)
             {
                 auto f_ncw = [&](auto n, auto c, auto wi) {
-                    std::size_t K = arg.weight_.mDesc.GetLengths()[0];
-                    std::size_t X = arg.weight_.mDesc.GetLengths()[2];
+                    std::size_t K = wei_desc.GetLengths()[0];
+                    std::size_t X = wei_desc.GetLengths()[2];

-                    std::size_t Wo = arg.output_.mDesc.GetLengths()[2];
+                    std::size_t Wo = out_desc.GetLengths()[2];

-                    AccDataType v_acc = 0;
+                    float v_acc = 0;

                     for(std::size_t x = 0; x < X; ++x)
                     {
                         auto w_tmp = ck::type_convert<ck::long_index_t>(wi) +
                                      ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]) -
                                      ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[0]);

                         if(w_tmp % arg.conv_strides_[0] == 0)
                         {
                             auto wo = ck::type_convert<ck::long_index_t>(w_tmp) /
                                       ck::type_convert<ck::long_index_t>(arg.conv_strides_[0]);

                             if(wo >= 0 && ck::type_convert<std::size_t>(wo) < Wo)
                             {
                                 for(std::size_t k = 0; k < K; ++k)
                                 {
-                                    AccDataType v_out = 0;
-                                    AccDataType v_wei = 0;
+                                    float v_out = 0;
+                                    float v_wei = 0;

-                                    arg.out_element_op_(
-                                        v_out,
-                                        ck::type_convert<AccDataType>(arg.output_(n, k, wo)));
-                                    arg.wei_element_op_(
-                                        v_wei,
-                                        ck::type_convert<AccDataType>(arg.weight_(k, c, x)));
+                                    // FIXME hacky
+                                    arg.out_element_op_(
+                                        v_out,
+                                        ck::type_convert<float>(
+                                            arg.output_
+                                                .mData[out_desc.GetOffsetFromMultiIndex(n, k, wo)]));
+
+                                    // FIXME hacky
+                                    arg.wei_element_op_(
+                                        v_wei,
+                                        ck::type_convert<float>(
+                                            arg.weight_
+                                                .mData[wei_desc.GetOffsetFromMultiIndex(k, c, x)]));

                                     v_acc += v_out * v_wei;
                                 }

@@ -110,29 +178,34 @@ struct ReferenceConvBwdData : public device::BaseOperator
                             }
                         }
                     }

-                    arg.in_element_op_(v_acc, v_acc);
-                    arg.input_(n, c, wi) = ck::type_convert<InDataType>(v_acc);
+                    float v_in;
+
+                    arg.in_element_op_(v_in, v_acc);
+
+                    // FIXME hacky
+                    arg.input_.mData[in_desc.GetOffsetFromMultiIndex(n, c, wi)] =
+                        ck::type_convert<InDataType>(v_acc);
                 };

                 make_ParallelTensorFunctor(f_ncw,
-                                           arg.input_.mDesc.GetLengths()[0],
-                                           arg.input_.mDesc.GetLengths()[1],
-                                           arg.input_.mDesc.GetLengths()[2])(
+                                           in_desc.GetLengths()[0],
+                                           in_desc.GetLengths()[1],
+                                           in_desc.GetLengths()[2])(
                     std::thread::hardware_concurrency());

                 return 0;
             }
-            else if constexpr(NumDimSpatial == 2)
+            else if constexpr(NDimSpatial == 2)
             {
                 auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
-                    std::size_t K = arg.weight_.mDesc.GetLengths()[0];
-                    std::size_t Y = arg.weight_.mDesc.GetLengths()[2];
-                    std::size_t X = arg.weight_.mDesc.GetLengths()[3];
+                    std::size_t K = wei_desc.GetLengths()[0];
+                    std::size_t Y = wei_desc.GetLengths()[2];
+                    std::size_t X = wei_desc.GetLengths()[3];

-                    std::size_t Ho = arg.output_.mDesc.GetLengths()[2];
-                    std::size_t Wo = arg.output_.mDesc.GetLengths()[3];
+                    std::size_t Ho = out_desc.GetLengths()[2];
+                    std::size_t Wo = out_desc.GetLengths()[3];

-                    AccDataType v_acc = 0;
+                    float v_acc = 0;

                     for(std::size_t y = 0; y < Y; ++y)
                     {

@@ -161,15 +234,24 @@ struct ReferenceConvBwdData : public device::BaseOperator
                             {
                                 for(std::size_t k = 0; k < K; ++k)
                                 {
-                                    AccDataType v_out = 0;
-                                    AccDataType v_wei = 0;
+                                    float v_out = 0;
+                                    float v_wei = 0;

-                                    arg.out_element_op_(
-                                        v_out,
-                                        ck::type_convert<AccDataType>(arg.output_(n, k, ho, wo)));
-                                    arg.wei_element_op_(
-                                        v_wei,
-                                        ck::type_convert<AccDataType>(arg.weight_(k, c, y, x)));
+                                    // FIXME hacky
+                                    arg.out_element_op_(
+                                        v_out,
+                                        ck::type_convert<float>(
+                                            arg.output_.mData[out_desc.GetOffsetFromMultiIndex(
+                                                n, k, ho, wo)]));
+
+                                    // FIXME hacky
+                                    arg.wei_element_op_(
+                                        v_wei,
+                                        ck::type_convert<float>(
+                                            arg.weight_.mData[wei_desc.GetOffsetFromMultiIndex(
+                                                k, c, y, x)]));

                                     v_acc += v_out * v_wei;
                                 }

@@ -180,33 +262,37 @@ struct ReferenceConvBwdData : public device::BaseOperator
                             }
                         }
                     }

-                    AccDataType v_in;
+                    float v_in;

                     arg.in_element_op_(v_in, v_acc);

-                    arg.input_(n, c, hi, wi) = ck::type_convert<InDataType>(v_in);
+                    // FIXME hacky
+                    arg.input_.mData[in_desc.GetOffsetFromMultiIndex(n, c, hi, wi)] =
+                        ck::type_convert<InDataType>(v_acc);
                 };

                 make_ParallelTensorFunctor(f_nchw,
-                                           arg.input_.mDesc.GetLengths()[0],
-                                           arg.input_.mDesc.GetLengths()[1],
-                                           arg.input_.mDesc.GetLengths()[2],
-                                           arg.input_.mDesc.GetLengths()[3])(
+                                           in_desc.GetLengths()[0],
+                                           in_desc.GetLengths()[1],
+                                           in_desc.GetLengths()[2],
+                                           in_desc.GetLengths()[3])(
                     std::thread::hardware_concurrency());

                 return 0;
             }
-            else if constexpr(NumDimSpatial == 3)
+            else if constexpr(NDimSpatial == 3)
             {
                 auto f_ncdhw = [&](auto n, auto c, auto di, auto hi, auto wi) {
-                    std::size_t K = arg.weight_.mDesc.GetLengths()[0];
-                    std::size_t Z = arg.weight_.mDesc.GetLengths()[2];
-                    std::size_t Y = arg.weight_.mDesc.GetLengths()[3];
-                    std::size_t X = arg.weight_.mDesc.GetLengths()[4];
+                    std::size_t K = wei_desc.GetLengths()[0];
+                    std::size_t Z = wei_desc.GetLengths()[2];
+                    std::size_t Y = wei_desc.GetLengths()[3];
+                    std::size_t X = wei_desc.GetLengths()[4];

-                    std::size_t Do = arg.output_.mDesc.GetLengths()[2];
-                    std::size_t Ho = arg.output_.mDesc.GetLengths()[3];
-                    std::size_t Wo = arg.output_.mDesc.GetLengths()[4];
+                    std::size_t Do = out_desc.GetLengths()[2];
+                    std::size_t Ho = out_desc.GetLengths()[3];
+                    std::size_t Wo = out_desc.GetLengths()[4];

-                    AccDataType v_acc = 0;
+                    float v_acc = 0;

                     for(std::size_t z = 0; z < Z; ++z)
                     {

@@ -241,6 +327,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
                                         arg.in_left_pads_[2]) -
-                                    ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[2]);
+                                    ck::type_convert<ck::long_index_t>(
+                                        x * arg.conv_dilations_[2]);

                                 if(w_tmp % arg.conv_strides_[2] == 0)
                                 {
                                     auto wo =

@@ -252,18 +339,30 @@ struct ReferenceConvBwdData : public device::BaseOperator
                                     {
                                         for(std::size_t k = 0; k < K; ++k)
                                         {
-                                            AccDataType v_out = 0;
-                                            AccDataType v_wei = 0;
+                                            float v_out = 0;
+                                            float v_wei = 0;

-                                            arg.out_element_op_(
-                                                v_out,
-                                                ck::type_convert<AccDataType>(
-                                                    arg.output_(n, k, do_, ho, wo)));
-                                            arg.wei_element_op_(
-                                                v_wei,
-                                                ck::type_convert<AccDataType>(
-                                                    arg.weight_(k, c, z, y, x)));
+                                            // FIXME hacky
+                                            arg.out_element_op_(
+                                                v_out,
+                                                ck::type_convert<float>(
+                                                    arg.output_.mData
+                                                        [out_desc.GetOffsetFromMultiIndex(
+                                                            n, k, do_, ho, wo)]));
+
+                                            // FIXME hacky
+                                            arg.wei_element_op_(
+                                                v_wei,
+                                                ck::type_convert<float>(
+                                                    arg.weight_.mData
+                                                        [wei_desc.GetOffsetFromMultiIndex(
+                                                            k, c, z, y, x)]));

                                             v_acc += v_out * v_wei;
                                         }

@@ -277,17 +376,21 @@ struct ReferenceConvBwdData : public device::BaseOperator
                             }
                         }
                     }

-                    AccDataType v_in;
+                    float v_in;

                     arg.in_element_op_(v_in, v_acc);

-                    arg.input_(n, c, di, hi, wi) = ck::type_convert<InDataType>(v_in);
+                    // FIXME hacky
+                    arg.input_.mData[in_desc.GetOffsetFromMultiIndex(n, c, wi)] =
+                        ck::type_convert<InDataType>(v_acc);
                 };

                 make_ParallelTensorFunctor(f_ncdhw,
-                                           arg.input_.mDesc.GetLengths()[0],
-                                           arg.input_.mDesc.GetLengths()[1],
-                                           arg.input_.mDesc.GetLengths()[2],
-                                           arg.input_.mDesc.GetLengths()[3],
-                                           arg.input_.mDesc.GetLengths()[4])(
+                                           in_desc.GetLengths()[0],
+                                           in_desc.GetLengths()[1],
+                                           in_desc.GetLengths()[2],
+                                           in_desc.GetLengths()[3],
+                                           in_desc.GetLengths()[4])(
                     std::thread::hardware_concurrency());

                 return 0;
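The bwd-data reference above inverts the forward index map: input position wi only receives contributions from filter taps x for which wi + pad_left - x*dilation is a non-negative multiple of the stride, which is what the w_tmp % conv_strides_ == 0 test implements. A worked 1-D sketch of that gather, mirroring the diff's index math — the flat out/wei layout here is an assumption for the example:

    #include <cstdint>

    // Accumulate the bwd-data value for one (n, c, wi) position, 1-D case.
    // out is indexed [k][wo] and wei is indexed [k][x] for a fixed n and c.
    float gather_bwd_data_1d(const float* out, const float* wei,
                             std::int64_t wi, std::int64_t K, std::int64_t X, std::int64_t Wo,
                             std::int64_t stride, std::int64_t dilation, std::int64_t pad_left)
    {
        float v_acc = 0.f;

        for(std::int64_t x = 0; x < X; ++x)
        {
            const std::int64_t w_tmp = wi + pad_left - x * dilation;

            if(w_tmp % stride != 0)
                continue; // the forward pass never wrote to wi from this tap

            const std::int64_t wo = w_tmp / stride;

            if(wo < 0 || wo >= Wo)
                continue; // outside the output extent

            for(std::int64_t k = 0; k < K; ++k) // reduce over output channels
                v_acc += out[k * Wo + wo] * wei[k * X + x];
        }

        return v_acc;
    }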
profiler/CMakeLists.txt

@@ -17,9 +17,8 @@ set(PROFILER_SOURCE
     src/profile_conv_fwd.cpp
     src/profile_conv_fwd_bias_relu.cpp
     src/profile_conv_fwd_bias_relu_add.cpp
-    src/profile_convnd_bwd_data.cpp
+    src/profile_conv_bwd_data.cpp
     src/profile_conv_bwd_weight.cpp
-    src/profile_convnd_bwd_weight.cpp
     src/profile_reduce.cpp
     src/profile_normalization.cpp
 )
profiler/include/profile_batched_gemm_reduce_impl.hpp

@@ -10,7 +10,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
profiler/include/profile_conv
nd
_bwd_data_impl.hpp
→
profiler/include/profile_conv_bwd_data_impl.hpp
View file @
b054669b
...
@@ -8,7 +8,8 @@
...
@@ -8,7 +8,8 @@
#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
...
@@ -63,72 +64,6 @@ namespace ck {
...
@@ -63,72 +64,6 @@ namespace ck {
namespace
profiler
{
namespace
profiler
{
using
DeviceConvBwdDataNoOpPtr
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceConvBwdDataNoOpPtr
;
using
DeviceConvBwdDataNoOpPtr
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceConvBwdDataNoOpPtr
;
template
<
typename
InLayout
>
HostTensorDescriptor
get_input_host_tensor_descriptor
(
const
std
::
vector
<
std
::
size_t
>&
dims
,
int
num_dim_spatial
=
2
)
{
namespace
tl
=
ck
::
tensor_layout
::
convolution
;
switch
(
num_dim_spatial
)
{
case
3
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
InLayout
{});
}
case
2
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
InLayout
{});
}
case
1
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
InLayout
{});
}
default:
{
throw
std
::
runtime_error
(
"Unsupported number of spatial dimensions provided!"
);
}
}
}
template
<
typename
WeiLayout
>
HostTensorDescriptor
get_filters_host_tensor_descriptor
(
const
std
::
vector
<
std
::
size_t
>&
dims
,
int
num_dim_spatial
=
2
)
{
namespace
tl
=
ck
::
tensor_layout
::
convolution
;
switch
(
num_dim_spatial
)
{
case
3
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
WeiLayout
{});
}
case
2
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
WeiLayout
{});
}
case
1
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
WeiLayout
{});
}
default:
{
throw
std
::
runtime_error
(
"Unsupported number of spatial dimensions provided!"
);
}
}
}
template
<
typename
OutLayout
>
HostTensorDescriptor
get_output_host_ensor_descriptor
(
const
std
::
vector
<
std
::
size_t
>&
dims
,
int
num_dim_spatial
=
2
)
{
namespace
tl
=
ck
::
tensor_layout
::
convolution
;
switch
(
num_dim_spatial
)
{
case
3
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
OutLayout
{});
}
case
2
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
OutLayout
{});
}
case
1
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
OutLayout
{});
}
default:
{
throw
std
::
runtime_error
(
"Unsupported number of spatial dimensions provided!"
);
}
}
}
template
<
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
>
template
<
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
>
void
get_device_conv_bwd_data_op_ptr
(
void
get_device_conv_bwd_data_op_ptr
(
InDataType
,
WeiDataType
,
OutDataType
,
std
::
vector
<
DeviceConvBwdDataNoOpPtr
>&
,
int
)
InDataType
,
WeiDataType
,
OutDataType
,
std
::
vector
<
DeviceConvBwdDataNoOpPtr
>&
,
int
)
...
@@ -221,21 +156,6 @@ void get_device_conv_bwd_data_op_ptr(
...
@@ -221,21 +156,6 @@ void get_device_conv_bwd_data_op_ptr(
}
}
}
}
template
<
typename
T
>
static
bool
check_out
(
const
Tensor
<
T
>&
ref
,
const
Tensor
<
T
>&
result
)
{
float
max_diff
=
1e-6
;
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
mData
.
size
();
++
i
)
{
float
diff
=
std
::
abs
(
double
(
ref
.
mData
[
i
])
-
double
(
result
.
mData
[
i
]));
if
(
max_diff
<
diff
)
{
return
false
;
}
}
return
true
;
}
template
<
typename
DataType
>
template
<
typename
DataType
>
void
show_data_nhwc_layout
(
Tensor
<
DataType
>&
nhwc
)
void
show_data_nhwc_layout
(
Tensor
<
DataType
>&
nhwc
)
{
{
...
@@ -263,27 +183,17 @@ void show_data_nhwc_layout(Tensor<DataType>& nhwc)
...
@@ -263,27 +183,17 @@ void show_data_nhwc_layout(Tensor<DataType>& nhwc)
}
}
template
<
int
NDimSpatial
,
template
<
int
NDimSpatial
,
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
,
typename
AccDataType
,
typename
InLayout
,
typename
InLayout
,
typename
WeiLayout
,
typename
WeiLayout
,
typename
OutLayout
>
typename
OutLayout
,
bool
profile_convnd_bwd_data_impl
(
int
do_verification
,
typename
InDataType
,
int
init_method
,
typename
WeiDataType
,
bool
do_log
,
typename
OutDataType
>
bool
time_kernel
,
bool
profile_conv_bwd_data_impl
(
int
do_verification
,
ck
::
index_t
N
,
int
init_method
,
ck
::
index_t
K
,
bool
do_log
,
ck
::
index_t
C
,
bool
time_kernel
,
const
std
::
vector
<
ck
::
index_t
>&
input_spatial_lengths
,
const
ck
::
tensor_operation
::
device
::
ConvParams
&
params
)
const
std
::
vector
<
ck
::
index_t
>&
filter_spatial_lengths
,
const
std
::
vector
<
ck
::
index_t
>&
output_spatial_lengths
,
const
std
::
vector
<
ck
::
index_t
>&
conv_filter_strides
,
const
std
::
vector
<
ck
::
index_t
>&
conv_filter_dilations
,
const
std
::
vector
<
ck
::
index_t
>&
input_left_pads
,
const
std
::
vector
<
ck
::
index_t
>&
input_right_pads
)
{
{
using
InElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
InElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
WeiElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
WeiElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
...
@@ -293,31 +203,83 @@ bool profile_convnd_bwd_data_impl(int do_verification,
...
@@ -293,31 +203,83 @@ bool profile_convnd_bwd_data_impl(int do_verification,
const
auto
wei_element_op
=
WeiElementOp
{};
const
auto
wei_element_op
=
WeiElementOp
{};
const
auto
out_element_op
=
OutElementOp
{};
const
auto
out_element_op
=
OutElementOp
{};
std
::
vector
<
std
::
size_t
>
input_dims
{
static_cast
<
std
::
size_t
>
(
N
),
static_cast
<
std
::
size_t
>
(
C
)};
// make host tensor descritpor
input_dims
.
insert
(
auto
f_nhwc_host_tensor_descriptor
=
std
::
end
(
input_dims
),
std
::
begin
(
input_spatial_lengths
),
std
::
end
(
input_spatial_lengths
));
[](
ck
::
index_t
n
,
ck
::
index_t
c
,
std
::
vector
<
ck
::
index_t
>
spatial_lengths
)
{
std
::
vector
<
std
::
size_t
>
nhwc_lengths
{
static_cast
<
std
::
size_t
>
(
n
),
std
::
vector
<
std
::
size_t
>
filter_dims
{
static_cast
<
std
::
size_t
>
(
K
),
static_cast
<
std
::
size_t
>
(
C
)};
static_cast
<
std
::
size_t
>
(
c
)};
filter_dims
.
insert
(
std
::
end
(
filter_dims
),
nhwc_lengths
.
insert
(
std
::
begin
(
filter_spatial_lengths
),
nhwc_lengths
.
begin
()
+
1
,
spatial_lengths
.
begin
(),
spatial_lengths
.
end
());
std
::
end
(
filter_spatial_lengths
));
return
HostTensorDescriptor
(
nhwc_lengths
);
std
::
vector
<
std
::
size_t
>
output_dims
{
static_cast
<
std
::
size_t
>
(
N
),
static_cast
<
std
::
size_t
>
(
K
)};
};
output_dims
.
insert
(
std
::
end
(
output_dims
),
std
::
begin
(
output_spatial_lengths
),
auto
f_nchw_host_tensor_descriptor
=
std
::
end
(
output_spatial_lengths
));
[](
ck
::
index_t
n
,
ck
::
index_t
c
,
std
::
vector
<
ck
::
index_t
>
spatial_lengths
)
{
std
::
vector
<
std
::
size_t
>
nchw_lengths
{
static_cast
<
std
::
size_t
>
(
n
),
Tensor
<
InDataType
>
input_host_result
(
static_cast
<
std
::
size_t
>
(
c
)};
get_input_host_tensor_descriptor
<
InLayout
>
(
input_dims
,
NDimSpatial
));
nchw_lengths
.
insert
(
nchw_lengths
.
end
(),
spatial_lengths
.
begin
(),
spatial_lengths
.
end
());
Tensor
<
InDataType
>
input_device_result
(
get_input_host_tensor_descriptor
<
InLayout
>
(
input_dims
,
NDimSpatial
));
return
HostTensorDescriptor
(
nchw_lengths
);
Tensor
<
WeiDataType
>
weights
(
};
get_filters_host_tensor_descriptor
<
WeiLayout
>
(
filter_dims
,
NDimSpatial
));
Tensor
<
OutDataType
>
output
(
HostTensorDescriptor
in_desc
,
wei_desc
,
out_desc
;
get_output_host_ensor_descriptor
<
OutLayout
>
(
output_dims
,
NDimSpatial
));
// FIXME: properly implement "make host descriptor" for different layout
if
constexpr
(
is_same_v
<
InLayout
,
ck
::
tensor_layout
::
convolution
::
NWC
>
||
is_same_v
<
InLayout
,
ck
::
tensor_layout
::
convolution
::
NHWC
>
||
is_same_v
<
InLayout
,
ck
::
tensor_layout
::
convolution
::
NDHWC
>
)
{
in_desc
=
f_nhwc_host_tensor_descriptor
(
params
.
N_
,
params
.
C_
,
params
.
input_spatial_lengths_
);
}
else
if
constexpr
(
is_same_v
<
InLayout
,
ck
::
tensor_layout
::
convolution
::
NCW
>
||
is_same_v
<
InLayout
,
ck
::
tensor_layout
::
convolution
::
NCHW
>
||
is_same_v
<
InLayout
,
ck
::
tensor_layout
::
convolution
::
NCDHW
>
)
{
in_desc
=
f_nchw_host_tensor_descriptor
(
params
.
N_
,
params
.
C_
,
params
.
input_spatial_lengths_
);
}
// FIXME: properly implement "make host descriptor" for different layout
if
constexpr
(
is_same_v
<
WeiLayout
,
ck
::
tensor_layout
::
convolution
::
KXC
>
||
is_same_v
<
WeiLayout
,
ck
::
tensor_layout
::
convolution
::
KYXC
>
||
is_same_v
<
WeiLayout
,
ck
::
tensor_layout
::
convolution
::
KZYXC
>
)
{
wei_desc
=
f_nhwc_host_tensor_descriptor
(
params
.
K_
,
params
.
C_
,
params
.
filter_spatial_lengths_
);
}
else
if
constexpr
(
is_same_v
<
WeiLayout
,
ck
::
tensor_layout
::
convolution
::
KCX
>
||
is_same_v
<
WeiLayout
,
ck
::
tensor_layout
::
convolution
::
KCYX
>
||
is_same_v
<
WeiLayout
,
ck
::
tensor_layout
::
convolution
::
KCZYX
>
)
{
wei_desc
=
f_nchw_host_tensor_descriptor
(
params
.
K_
,
params
.
C_
,
params
.
filter_spatial_lengths_
);
}
// FIXME: properly implement "make host descriptor" for different layout
if
constexpr
(
is_same_v
<
OutLayout
,
ck
::
tensor_layout
::
convolution
::
NWK
>
||
is_same_v
<
OutLayout
,
ck
::
tensor_layout
::
convolution
::
NHWK
>
||
is_same_v
<
OutLayout
,
ck
::
tensor_layout
::
convolution
::
NDHWK
>
)
{
out_desc
=
f_nhwc_host_tensor_descriptor
(
params
.
N_
,
params
.
K_
,
params
.
GetOutputSpatialLengths
());
}
else
if
constexpr
(
is_same_v
<
OutLayout
,
ck
::
tensor_layout
::
convolution
::
NKW
>
||
is_same_v
<
OutLayout
,
ck
::
tensor_layout
::
convolution
::
NKHW
>
||
is_same_v
<
OutLayout
,
ck
::
tensor_layout
::
convolution
::
NKDHW
>
)
{
out_desc
=
f_nchw_host_tensor_descriptor
(
params
.
N_
,
params
.
K_
,
params
.
GetOutputSpatialLengths
());
}
Tensor
<
InDataType
>
input_host_result
(
in_desc
);
Tensor
<
InDataType
>
input_device_result
(
in_desc
);
Tensor
<
WeiDataType
>
weight
(
wei_desc
);
Tensor
<
OutDataType
>
output
(
out_desc
);
std
::
cout
<<
"input: "
<<
input_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"input: "
<<
input_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"weight
s
: "
<<
weight
s
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"weight: "
<<
weight
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"output: "
<<
output
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"output: "
<<
output
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
switch
(
init_method
)
...
@@ -325,50 +287,46 @@ bool profile_convnd_bwd_data_impl(int do_verification,
...
@@ -325,50 +287,46 @@ bool profile_convnd_bwd_data_impl(int do_verification,
    case 0: break;
    case 1:
        output.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-       weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+       weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
        break;
    default:
        output.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
-       weights.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
+       weight.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
    }

    DeviceMem in_device_buf(sizeof(InDataType) * input_device_result.mDesc.GetElementSpace());
-   DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
+   DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());

    out_device_buf.ToDevice(output.mData.data());
-   wei_device_buf.ToDevice(weights.mData.data());
+   wei_device_buf.ToDevice(weight.mData.data());

-   // reset input to zero
-   in_device_buf.SetZero();

    if(do_verification)
    {
-       auto RunReference = [&](auto& ref_conv) {
-           auto ref_invoker  = ref_conv.MakeInvoker();
-           auto ref_argument = ref_conv.MakeArgument(input_host_result,
-                                                     weights,
-                                                     output,
-                                                     conv_filter_strides,
-                                                     conv_filter_dilations,
-                                                     input_left_pads,
-                                                     input_right_pads,
-                                                     InElementOp{},
-                                                     WeiElementOp{},
-                                                     OutElementOp{});
-           ref_invoker.Run(ref_argument);
-       };
-
-       auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
-                                                                        WeiDataType,
-                                                                        OutDataType,
-                                                                        AccDataType,
-                                                                        InElementOp,
-                                                                        WeiElementOp,
-                                                                        OutElementOp,
-                                                                        NDimSpatial>();
-       RunReference(ref_conv);
+       auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
+                                                                        InLayout,
+                                                                        WeiLayout,
+                                                                        OutLayout,
+                                                                        InDataType,
+                                                                        WeiDataType,
+                                                                        OutDataType,
+                                                                        InElementOp,
+                                                                        WeiElementOp,
+                                                                        OutElementOp>{};
+
+       auto ref_invoker = ref_conv.MakeInvoker();
+
+       auto ref_argument = ref_conv.MakeArgument(input_host_result,
+                                                 weight,
+                                                 output,
+                                                 params.conv_filter_strides_,
+                                                 params.conv_filter_dilations_,
+                                                 params.input_left_pads_,
+                                                 params.input_right_pads_,
+                                                 InElementOp{},
+                                                 WeiElementOp{},
+                                                 OutElementOp{});
+
+       ref_invoker.Run(ref_argument);
    }
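For intuition about what the reference op verifies here: backward data scatters each output-gradient element back through the filter taps that produced it in the forward pass. A self-contained 1-D, single-channel sketch of that relationship, using the same stride/dilation/left-pad convention as the profiler's parameters (this is an illustration, not the library's ReferenceConvBwdData):

    #include <cstddef>
    #include <vector>

    // din[wi] accumulates dout[wo] * wei[x] for every (wo, x) pair that read
    // in[wi] in the forward pass: wi = wo * stride + x * dilation - left_pad.
    std::vector<float> conv1d_bwd_data(const std::vector<float>& dout, // [Wo]
                                       const std::vector<float>& wei,  // [X]
                                       int Wi, int stride, int dilation, int left_pad)
    {
        std::vector<float> din(Wi, 0.f);
        for(std::size_t wo = 0; wo < dout.size(); ++wo)
        {
            for(std::size_t x = 0; x < wei.size(); ++x)
            {
                const int wi =
                    static_cast<int>(wo) * stride + static_cast<int>(x) * dilation - left_pad;
                if(wi >= 0 && wi < Wi)
                    din[wi] += dout[wo] * wei[x];
            }
        }
        return din;
    }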
    // add device Conv instances

@@ -381,29 +339,30 @@ bool profile_convnd_bwd_data_impl(int do_verification,
        throw std::runtime_error("wrong! no device Conv instance found");
    }

-   std::string best_conv_name;
-   float best_ave_time   = 0;
+   std::string best_op_name;
+   float best_avg_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;

    // profile device Conv instances
-   bool success = true;
+   bool pass = true;

    for(auto& conv_ptr : conv_ptrs)
    {
        auto argument_ptr = conv_ptr->MakeArgumentPointer(
            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-           N,
-           K,
-           C,
-           input_spatial_lengths,
-           filter_spatial_lengths,
-           output_spatial_lengths,
-           conv_filter_strides,
-           conv_filter_dilations,
-           input_left_pads,
-           input_right_pads,
+           params.N_,
+           params.K_,
+           params.C_,
+           params.input_spatial_lengths_,
+           params.filter_spatial_lengths_,
+           params.output_spatial_lengths_,
+           params.conv_filter_strides_,
+           params.conv_filter_dilations_,
+           params.input_left_pads_,
+           params.input_right_pads_,
            in_element_op,
            wei_element_op,
            out_element_op);
@@ -412,28 +371,28 @@ bool profile_convnd_bwd_data_impl(int do_verification,
        if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
        {
-           std::string conv_name = conv_ptr->GetTypeString();
+           // reset input to zero
+           in_device_buf.SetZero();
+
+           std::string op_name = conv_ptr->GetTypeString();

-           float ave_time =
+           float avg_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-           std::size_t flop = ck::utils::conv::get_flops(
-               N, C, K, filter_spatial_lengths, output_spatial_lengths);
+           std::size_t flop = params.GetFlops();

-           std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
-               N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths);
+           std::size_t num_btype = params.GetByte<InDataType, WeiDataType, OutDataType>();

-           float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-           float gb_per_sec = num_btype / 1.E6 / ave_time;
+           float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+           float gb_per_sec = num_btype / 1.E6 / avg_time;

-           std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+           std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                      << " GB/s" << std::endl;

            if(tflops > best_tflops)
            {
-               best_conv_name  = conv_name;
+               best_op_name    = op_name;
                best_tflops     = tflops;
-               best_ave_time   = ave_time;
+               best_avg_time   = avg_time;
                best_gb_per_sec = gb_per_sec;
            }
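On the performance arithmetic above: the invoker reports time in milliseconds, so flop / 1.E9 / avg_time is GFLOP per ms, i.e. TFLOP/s, and num_btype / 1.E6 / avg_time is MB per ms, i.e. GB/s. The flop count itself is conventionally two flops (one multiply-accumulate) per filter tap per output element; a sketch of what params.GetFlops() plausibly computes (an assumption on my part, since the member's definition is not shown in this diff):

    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    // 2 * N * K * C * prod(filter spatial) * prod(output spatial):
    // each output element costs C * prod(filter) multiply-accumulates.
    std::size_t conv_flops(std::size_t N, std::size_t K, std::size_t C,
                           const std::vector<std::size_t>& filter_lengths,
                           const std::vector<std::size_t>& output_lengths)
    {
        const auto prod = [](const std::vector<std::size_t>& v) {
            return std::accumulate(
                v.begin(), v.end(), std::size_t{1}, std::multiplies<std::size_t>{});
        };
        return 2 * N * K * C * prod(filter_lengths) * prod(output_lengths);
    }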
@@ -441,18 +400,8 @@ bool profile_convnd_bwd_data_impl(int do_verification,
        {
            in_device_buf.FromDevice(input_device_result.mData.data());

-           if(!check_out(input_host_result, input_device_result))
-           {
-               std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
-               success = false;
-           }
-           else
-           {
-               std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
-           }
-           check_error(input_host_result, input_device_result);
+           pass = pass & ck::utils::check_err(input_device_result.mData, input_host_result.mData);

            if(do_log)
            {

@@ -461,7 +410,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
                std::cout << std::endl;

                std::cout << "wei: ";
-               show_data_nhwc_layout(weights);
+               show_data_nhwc_layout(weight);
                std::cout << std::endl;

                std::cout << "out_host : ";

@@ -476,9 +425,10 @@ bool profile_convnd_bwd_data_impl(int do_verification,
        }
    }

-   std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-             << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
+   std::cout << "Best Perf: " << best_avg_time << " ms, " << best_tflops << " TFlops, "
+             << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

-   return success;
+   return pass;
}

} // namespace profiler

profiler/include/profile_conv_bwd_weight_impl.hpp
(This diff is collapsed.)
profiler/include/profile_conv_fwd_impl.hpp

@@ -25,7 +25,7 @@ namespace ck {
namespace profiler {

// FIXME: only support NCHW and NHWC layout, need to be more general
-template <ck::index_t NumDimSpatial,
+template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename OutLayout,

@@ -146,7 +146,7 @@ int profile_conv_fwd_impl(int do_verification,
    in_device_buf.ToDevice(input.mData.data());
    wei_device_buf.ToDevice(weight.mData.data());

-   using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NumDimSpatial,
+   using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NDimSpatial,
                                                                 InLayout,
                                                                 WeiLayout,
                                                                 OutLayout,

@@ -166,7 +166,7 @@ int profile_conv_fwd_impl(int do_verification,
    // run reference op
    if(do_verification)
    {
-       auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NumDimSpatial,
+       auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                     InLayout,
                                                                     WeiLayout,
                                                                     OutLayout,
profiler/include/profile_convnd_bwd_weight_impl.hpp (deleted, mode 100644)

#pragma once

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_backward_weight.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"

using F16  = ck::half_t;
using F32  = float;
using BF16 = ck::bhalf_t;

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using DeviceConvndBwdWeightNoOpPtr =
    DeviceConvBwdWeightPtr<ck::tensor_operation::element_wise::PassThrough,
                           ck::tensor_operation::element_wise::PassThrough,
                           ck::tensor_operation::element_wise::PassThrough>;

void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f32_instances(
    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(
    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
    std::vector<DeviceConvndBwdWeightNoOpPtr>&);

void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f16_instances(
    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(
    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f16_instances(
    std::vector<DeviceConvndBwdWeightNoOpPtr>&);

void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_bf16_instances(
    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_bf16_instances(
    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
    std::vector<DeviceConvndBwdWeightNoOpPtr>&);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
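These declarations form a small registry pattern: each add_device_* function appends type-erased operation pointers for one (dimension, layout, data type) combination to a caller-owned vector. A hedged usage sketch (it assumes the CK headers above are included; the wrapper function and the printing loop are mine):

    #include <iostream>
    #include <vector>

    // Collect all 2-D fp32 backward-weight instances and list their names.
    void list_convnd_bwd_weight_f32_instances()
    {
        std::vector<ck::tensor_operation::device::instance::DeviceConvndBwdWeightNoOpPtr> ptrs;
        ck::tensor_operation::device::instance::
            add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(ptrs);
        for(const auto& p : ptrs)
            std::cout << p->GetTypeString() << std::endl; // each instance names its tuning config
    }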
namespace ck {
namespace profiler {

using DeviceConvndBwdWeightNoOpPtr =
    ck::tensor_operation::device::instance::DeviceConvndBwdWeightNoOpPtr;

template <typename InLayout>
HostTensorDescriptor
get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims, int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
    }
    case 2: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
    }
    case 1: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

template <typename WeiLayout>
HostTensorDescriptor
get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims, int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
    }
    case 2: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
    }
    case 1: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

template <typename OutLayout>
HostTensorDescriptor
get_output_host_ensor_descriptor(const std::vector<std::size_t>& dims, int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
    }
    case 2: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
    }
    case 1: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}
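Note that the three case arms in each getter above return the identical expression, so the switch effectively only rejects unsupported dimension counts. An equivalent, shorter form (a sketch, not code from the repository):

    template <typename Layout>
    HostTensorDescriptor
    get_host_tensor_descriptor_checked(const std::vector<std::size_t>& dims,
                                       int num_dim_spatial = 2)
    {
        // validate the dimension count once, then delegate unconditionally
        if(num_dim_spatial < 1 || num_dim_spatial > 3)
            throw std::runtime_error("Unsupported number of spatial dimensions provided!");
        return ck::utils::conv::get_host_tensor_descriptor(dims, Layout{});
    }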
template <typename InDataType, typename WeiDataType, typename OutDataType>
void get_device_conv_bwd_weight_op_ptr(
    InDataType, WeiDataType, OutDataType, std::vector<DeviceConvndBwdWeightNoOpPtr>&, int)
{
    std::cout << "can not find device conv bwd weight" << std::endl;
    exit(1);
}

template <>
void get_device_conv_bwd_weight_op_ptr(
    F32, F32, F32, std::vector<DeviceConvndBwdWeightNoOpPtr>& conv_ptrs, int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
    case 1:
        ck::tensor_operation::device::instance::
            add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
        break;
    case 2:
        ck::tensor_operation::device::instance::
            add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
        break;
    case 3:
        ck::tensor_operation::device::instance::
            add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
        break;
    default: break;
    }
}

template <>
void get_device_conv_bwd_weight_op_ptr(
    F16, F16, F16, std::vector<DeviceConvndBwdWeightNoOpPtr>& conv_ptrs, int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
    case 1:
        ck::tensor_operation::device::instance::
            add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
        break;
    case 2:
        ck::tensor_operation::device::instance::
            add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
        break;
    case 3:
        ck::tensor_operation::device::instance::
            add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
        break;
    default: break;
    }
}

template <>
void get_device_conv_bwd_weight_op_ptr(
    BF16, BF16, BF16, std::vector<DeviceConvndBwdWeightNoOpPtr>& conv_ptrs, int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
    case 1:
        ck::tensor_operation::device::instance::
            add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
        break;
    case 2:
        ck::tensor_operation::device::instance::
            add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
        break;
    case 3:
        ck::tensor_operation::device::instance::
            add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
        break;
    default: break;
    }
}
template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{
    std::cout << "[";
    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
    {
        std::cout << "[";
        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
        {
            std::cout << "[";
            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
            {
                std::cout << "[";
                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
                {
                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << " ";
                }
                std::cout << "]";
            }
            std::cout << "]";
        }
        std::cout << "]";
    }
    std::cout << "]";
}
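show_data_nhwc_layout indexes the tensor as (n, c, hi, wi) but nests its loops N, H, W, C, so elements print in NHWC memory order while the descriptor's logical lengths stay (N, C, H, W). A small hypothetical call (the shape, generator value, and the HostTensorDescriptor-from-lengths construction are my assumptions, not taken from this page):

    Tensor<float> t(HostTensorDescriptor(std::vector<std::size_t>{1, 2, 2, 3})); // N=1, C=2, H=2, W=3
    t.GenerateTensorValue(GeneratorTensor_1<float>{1});
    show_data_nhwc_layout(t); // brackets nest 1 x 2 x 3 x 2; innermost loop runs over C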
template <int NDimSpatial,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename InLayout,
          typename WeiLayout,
          typename OutLayout>
bool profile_convnd_bwd_weight_impl(int do_verification,
                                    int init_method,
                                    bool do_log,
                                    bool time_kernel,
                                    ck::index_t N,
                                    ck::index_t K,
                                    ck::index_t C,
                                    std::vector<ck::index_t> input_spatial_lengths,
                                    std::vector<ck::index_t> filter_spatial_lengths,
                                    std::vector<ck::index_t> output_spatial_lengths,
                                    std::vector<ck::index_t> conv_filter_strides,
                                    std::vector<ck::index_t> conv_filter_dilations,
                                    std::vector<ck::index_t> input_left_pads,
                                    std::vector<ck::index_t> input_right_pads,
                                    ck::index_t split_k)
{
    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
    using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};

    std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
    input_dims.insert(
        std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));

    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
    filter_dims.insert(std::end(filter_dims),
                       std::begin(filter_spatial_lengths),
                       std::end(filter_spatial_lengths));

    std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

    Tensor<InDataType> input(get_input_host_tensor_descriptor<InLayout>(input_dims, NDimSpatial));
    Tensor<WeiDataType> weights_host_result(
        get_filters_host_tensor_descriptor<WeiLayout>(filter_dims, NDimSpatial));
    Tensor<WeiDataType> weights_device_result(
        get_filters_host_tensor_descriptor<WeiLayout>(filter_dims, NDimSpatial));
    Tensor<OutDataType> output(
        get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));

    std::cout << "input: " << input.mDesc << std::endl;
    std::cout << "weights: " << weights_host_result.mDesc << std::endl;
    std::cout << "output: " << output.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        input.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
        output.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
        break;
    default:
        input.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
        output.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
    }

    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights_device_result.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());

    in_device_buf.ToDevice(input.mData.data());
    out_device_buf.ToDevice(output.mData.data());

    // reset input to zero
    wei_device_buf.SetZero();

    if(do_verification)
    {
        auto RunReference = [&](auto& ref_conv) {
            auto ref_invoker  = ref_conv.MakeInvoker();
            auto ref_argument = ref_conv.MakeArgument(input,
                                                      weights_host_result,
                                                      output,
                                                      conv_filter_strides,
                                                      conv_filter_dilations,
                                                      input_left_pads,
                                                      input_right_pads,
                                                      InElementOp{},
                                                      WeiElementOp{},
                                                      OutElementOp{});
            ref_invoker.Run(ref_argument);
        };
        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<InDataType,
                                                                           WeiDataType,
                                                                           OutDataType,
                                                                           InElementOp,
                                                                           WeiElementOp,
                                                                           OutElementOp,
                                                                           NDimSpatial>();
        RunReference(ref_conv);
    }
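For intuition about what this reference computes: backward weight accumulates, for each filter tap, the product of the output gradient with the input element that tap touched in the forward pass. A self-contained 1-D, single-channel sketch under the same stride/dilation/left-pad convention (an illustration only, not ReferenceConvBwdWeight itself):

    #include <cstddef>
    #include <vector>

    // dwei[x] sums dout[wo] * in[wo * stride + x * dilation - left_pad]
    // over every output position wo whose forward read stayed in bounds.
    std::vector<float> conv1d_bwd_weight(const std::vector<float>& in,   // [Wi]
                                         const std::vector<float>& dout, // [Wo]
                                         int X, int stride, int dilation, int left_pad)
    {
        std::vector<float> dwei(X, 0.f);
        for(std::size_t wo = 0; wo < dout.size(); ++wo)
        {
            for(int x = 0; x < X; ++x)
            {
                const int wi = static_cast<int>(wo) * stride + x * dilation - left_pad;
                if(wi >= 0 && wi < static_cast<int>(in.size()))
                    dwei[x] += dout[wo] * in[wi];
            }
        }
        return dwei;
    }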
    // add device Conv instances
    std::vector<DeviceConvndBwdWeightNoOpPtr> conv_ptrs;
    get_device_conv_bwd_weight_op_ptr(
        InDataType{}, WeiDataType{}, OutDataType{}, conv_ptrs, NDimSpatial);

    if(conv_ptrs.size() <= 0)
    {
        throw std::runtime_error("wrong! no device Conv instance found");
    }

    std::string best_conv_name;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;

    // profile device Conv instances
    bool success = true;
    for(auto& conv_ptr : conv_ptrs)
    {
        // using atomic, so need to reset input, setzero is done in invoker
        // if(split_k > 1)
        //{
        //    wei_device_buf.SetZero();
        //}
        auto argument_ptr = conv_ptr->MakeArgumentPointer(
            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
            N,
            K,
            C,
            input_spatial_lengths,
            filter_spatial_lengths,
            output_spatial_lengths,
            conv_filter_strides,
            conv_filter_dilations,
            input_left_pads,
            input_right_pads,
            in_element_op,
            wei_element_op,
            out_element_op,
            split_k);

        if(!conv_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::cout << "wrong! device_conv with the specified compilation parameters does "
                         "not support this Conv problem"
                      << std::endl;
            continue;
        }

        auto invoker_ptr      = conv_ptr->MakeInvokerPointer();
        std::string conv_name = conv_ptr->GetTypeString();

        float ave_time = 0;
        if(std::is_same<InDataType, ck::bhalf_t>::value && split_k > 1)
        {
            // alloc work space
            size_t bwd_weight_workspace_size = conv_ptr->GetWorkSpaceSize(argument_ptr.get());
            if(bwd_weight_workspace_size <= 0)
            {
                printf("wrong work space size\n");
                exit(1);
            }
            DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
            wei_work_space_device_buf.SetZero();
            conv_ptr->SetWorkSpacePointer(argument_ptr.get(),
                                          wei_work_space_device_buf.GetDeviceBuffer());
            ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
        }
        else
        {
            ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
        }

        std::size_t flop = ck::utils::conv::get_flops(
            N, C, K, filter_spatial_lengths, output_spatial_lengths);
        std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
            N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths);

        float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
        float gb_per_sec = num_btype / 1.E6 / ave_time;

        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                  << " GB/s" << std::endl;

        if(tflops > best_tflops)
        {
            best_conv_name  = conv_name;
            best_tflops     = tflops;
            best_ave_time   = ave_time;
            best_gb_per_sec = gb_per_sec;
        }

        if(do_verification)
        {
            wei_device_buf.FromDevice(weights_device_result.mData.data());

            float max_error = check_error(weights_host_result, weights_device_result);
            if(max_error > 8)
            {
                std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
                success = false;
            }
            else
            {
                std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
            }
            check_error(weights_host_result, weights_device_result);

            if(do_log)
            {
                std::cout << "in : ";
                show_data_nhwc_layout(output);
                std::cout << std::endl;

                std::cout << "wei: ";
                show_data_nhwc_layout(weights_host_result);
                std::cout << std::endl;

                std::cout << "out : ";
                show_data_nhwc_layout(input);
                std::cout << std::endl;

                std::cout << "wei_device: ";
                show_data_nhwc_layout(weights_device_result);
                std::cout << std::endl;
            }
        }
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;

    return success;
}

} // namespace profiler
} // namespace ck
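On the bf16 split-k workspace branch above: when the reduction is split across workgroups, partial results must be accumulated, and doing that accumulation directly in bf16 would lose precision, so the kernel is handed a zeroed scratch buffer and the final weights are produced from it at the end. That rationale is my reading of the code, not a comment from the source; a minimal standalone sketch of the accumulate-then-convert pattern:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // float -> bf16 by truncating the low 16 mantissa bits (a real implementation
    // would round to nearest; truncation keeps the sketch short).
    std::uint16_t float_to_bf16(float x)
    {
        std::uint32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));
        return static_cast<std::uint16_t>(bits >> 16);
    }

    std::uint16_t reduce_splits_to_bf16(const std::vector<float>& partials)
    {
        float acc = 0.f; // fp32 workspace accumulation across the split-k partials
        for(const float p : partials)
            acc += p;
        return float_to_bf16(acc); // single down-conversion at the end
    }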
profiler/include/profile_gemm_bias_add_reduce_impl.hpp

@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

profiler/include/profile_gemm_reduce_impl.hpp

@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

profiler/include/profile_grouped_gemm_impl.hpp

@@ -11,7 +11,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

profiler/include/profile_normalization_impl.hpp

@@ -9,7 +9,7 @@
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
profiler/src/profile_conv_bwd_data.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>

#include "profiler/include/profile_conv_bwd_data_impl.hpp"

namespace {

enum struct ConvLayout
{
    NCHW_KYXC_NKHW, // 0
    NHWC_KYXC_NHWK, // 1
};

enum struct ConvDataType
{
    F32_F32_F32,    // 0
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
};

static void print_helper_msg()
{
    // clang-format off
    std::cout << "arg1: tensor operation (conv_bwd_data: Convolution Backward Data)\n"
              << "arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n"
              << "arg3: tensor layout (0: Input[N, C, Hi, Wi], Weight[K, C, Y, X], Output[N, K, Ho, Wo]\n"
              << "                     1: Input[N, Hi, Wi, C], Weight[K, Y, X, C], Output[N, Ho, Wo, K])\n"
              << "arg4: verification (0: no, 1: yes)\n"
              << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
              << "arg6: print tensor value (0: no; 1: yes)\n"
              << "arg7: time kernel (0: no, 1: yes)\n"
              << "arg8: N spatial dimensions\n"
              << "Following arguments (depending on number of spatial dims):\n"
              << " N, K, C, \n"
              << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
              << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
              << " <strides>, (ie Sy, Sx for 2D)\n"
              << " <dilations>, (ie Dy, Dx for 2D)\n"
              << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
              << " <right padding>, (ie RightPy, RightPx for 2D)\n"
              << std::endl;
    // clang-format on
}
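Put together, a 2-D fp16 NHWC run following the argument order above would look roughly like this (the binary name and the concrete sizes are illustrative, not taken from this page):

    ckProfiler conv_bwd_data 1 1 1 1 0 1 2  128 256 192  3 3  71 71  2 2  1 1  1 1  1 1

i.e. op, data type fp16, layout NHWC, verify, integer init, no tensor print, time the kernel, 2 spatial dims, then N K C, the Y X filter, Hi Wi input image, strides, dilations, left pads, and right pads.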
ck::tensor_operation::device::ConvParams
parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
{
    const ck::index_t N = std::stoi(argv[arg_idx++]);
    const ck::index_t K = std::stoi(argv[arg_idx++]);
    const ck::index_t C = std::stoi(argv[arg_idx++]);

    std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
    std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
    std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
    std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
    std::vector<ck::index_t> input_left_pads(num_dim_spatial);
    std::vector<ck::index_t> input_right_pads(num_dim_spatial);

    for(int i = 0; i < num_dim_spatial; ++i)
    {
        filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
        input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
        conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
        conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
        input_left_pads[i] = std::stoi(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
        input_right_pads[i] = std::stoi(argv[arg_idx++]);
    }

    return ck::tensor_operation::device::ConvParams{num_dim_spatial,
                                                    N,
                                                    K,
                                                    C,
                                                    filter_spatial_lengths,
                                                    input_spatial_lengths,
                                                    conv_filter_strides,
                                                    conv_filter_dilations,
                                                    input_left_pads,
                                                    input_right_pads};
}

} // namespace
int profile_conv_bwd_data(int argc, char* argv[])
{
    // 8 for control, 1 for num_dim_spatial
    if(argc < 9)
    {
        print_helper_msg();
        return 1;
    }

    const auto data_type       = static_cast<ConvDataType>(std::stoi(argv[2]));
    const auto layout          = static_cast<ConvLayout>(std::stoi(argv[3]));
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
    const bool time_kernel     = std::stoi(argv[7]);
    const int num_dim_spatial  = std::stoi(argv[8]);

    // 8 for control, 1 for num_dim_spatial, 3 for N/K/C, and 6 * num_dim_spatial
    if(argc != 8 + 4 + 6 * num_dim_spatial)
    {
        print_helper_msg();
        return 1;
    }

    const auto params = parse_conv_params(num_dim_spatial, 9, argv);

    using NWC   = ck::tensor_layout::convolution::NWC;
    using NHWC  = ck::tensor_layout::convolution::NHWC;
    using NDHWC = ck::tensor_layout::convolution::NDHWC;

    using KXC   = ck::tensor_layout::convolution::KXC;
    using KYXC  = ck::tensor_layout::convolution::KYXC;
    using KZYXC = ck::tensor_layout::convolution::KZYXC;

    using NWK   = ck::tensor_layout::convolution::NWK;
    using NHWK  = ck::tensor_layout::convolution::NHWK;
    using NDHWK = ck::tensor_layout::convolution::NDHWK;

    constexpr auto I1 = ck::Number<1>{};
    constexpr auto I2 = ck::Number<2>{};
    constexpr auto I3 = ck::Number<3>{};

    auto profile = [&](auto num_dim_spatial_tmp,
                       auto in_layout,
                       auto wei_layout,
                       auto out_layout,
                       auto in_type,
                       auto wei_type,
                       auto out_type) {
        constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;

        using InLayout  = decltype(in_layout);
        using WeiLayout = decltype(wei_layout);
        using OutLayout = decltype(out_layout);

        using InDataType  = decltype(in_type);
        using WeiDataType = decltype(wei_type);
        using OutDataType = decltype(out_type);

        bool pass = ck::profiler::profile_conv_bwd_data_impl<NDimSpatial,
                                                             InLayout,
                                                             WeiLayout,
                                                             OutLayout,
                                                             InDataType,
                                                             WeiDataType,
                                                             OutDataType>(
            do_verification, init_method, do_log, time_kernel, params);

        return pass ? 0 : 1;
    };
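The profile lambda above turns the runtime num_dim_spatial into a compile-time template argument by passing ck::Number<> tag values. The same trick with the standard library (a standalone analogue, not CK code):

    #include <iostream>
    #include <type_traits>

    template <typename IC>
    void dispatch(IC)
    {
        constexpr int N = IC::value; // compile-time constant, usable as a template argument
        std::cout << "instantiated for " << N << " spatial dims\n";
    }

    int main()
    {
        dispatch(std::integral_constant<int, 2>{}); // analogue of passing ck::Number<2>{}
    }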
    if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
            return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
            return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_BF16_BF16)
        {
            return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, BF16{}, BF16{});
        }
        else if(data_type == ConvDataType::INT8_INT8_INT8)
        {
            return profile(I1, NWC{}, KXC{}, NWK{}, INT8{}, INT8{}, INT8{});
        }
    }
    else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
            return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
            return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_BF16_BF16)
        {
            return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, BF16{}, BF16{});
        }
        else if(data_type == ConvDataType::INT8_INT8_INT8)
        {
            return profile(I2, NHWC{}, KYXC{}, NHWK{}, INT8{}, INT8{}, INT8{});
        }
    }
    else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_BF16_BF16)
        {
            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, BF16{}, BF16{});
        }
        else if(data_type == ConvDataType::INT8_INT8_INT8)
        {
            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, INT8{}, INT8{}, INT8{});
        }
    }

    std::cout << "this data_type & layout is not implemented" << std::endl;

    return 1;
}