"vscode:/vscode.git/clone" did not exist on "33d2b5b08764d2579dd4ea555821b8986cd2dcf9"
Commit b054669b authored by Chao Liu's avatar Chao Liu
Browse files

update profiler for conv bwd data and weight

parent 6b6360b1
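In short: the examples and profiler sources drop the old conv_util.hpp helpers in favor of the standalone convolution parameter header, and the parameter struct moves out of ck::utils::conv. A minimal before/after sketch of what callers see, taken from the hunks below (surrounding code elided):

    // before this commit
    #include "ck/library/utility/conv_util.hpp"
    ck::utils::conv::ConvParams params;

    // after this commit
    #include "ck/library/utility/convolution_parameter.hpp"
    ck::tensor_operation::device::ConvParams params;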
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
...@@ -106,7 +106,7 @@ void PrintUseMsg() ...@@ -106,7 +106,7 @@ void PrintUseMsg()
<< std::endl; << std::endl;
} }
ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[]) ck::tensor_operation::device::ConvParams ParseConvParams(int argc, char* argv[])
{ {
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
int num_dim_spatial = 2; int num_dim_spatial = 2;
...@@ -118,7 +118,7 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[]) ...@@ -118,7 +118,7 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
exit(0); exit(0);
} }
ck::utils::conv::ConvParams params; ck::tensor_operation::device::ConvParams params;
int arg_idx = 4; int arg_idx = 4;
params.num_dim_spatial_ = num_dim_spatial; params.num_dim_spatial_ = num_dim_spatial;
...@@ -164,14 +164,12 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[]) ...@@ -164,14 +164,12 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
using namespace ck::utils::conv;
bool do_verification = true; bool do_verification = true;
int init_method = 1; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
const int num_dim_spatial = 2; const int num_dim_spatial = 2;
ck::utils::conv::ConvParams params; ck::tensor_operation::device::ConvParams params;
if(argc >= 4) if(argc >= 4)
{ {
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
...@@ -103,7 +103,7 @@ void PrintUseMsg() ...@@ -103,7 +103,7 @@ void PrintUseMsg()
<< std::endl; << std::endl;
} }
ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[]) ck::tensor_operation::device::ConvParams ParseConvParams(int argc, char* argv[])
{ {
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
int num_dim_spatial = 2; int num_dim_spatial = 2;
...@@ -115,7 +115,7 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[]) ...@@ -115,7 +115,7 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
exit(0); exit(0);
} }
ck::utils::conv::ConvParams params; ck::tensor_operation::device::ConvParams params;
int arg_idx = 4; int arg_idx = 4;
params.num_dim_spatial_ = num_dim_spatial; params.num_dim_spatial_ = num_dim_spatial;
...@@ -168,7 +168,7 @@ int main(int argc, char* argv[]) ...@@ -168,7 +168,7 @@ int main(int argc, char* argv[])
bool time_kernel = false; bool time_kernel = false;
const int num_dim_spatial = 2; const int num_dim_spatial = 2;
ck::utils::conv::ConvParams params; ck::tensor_operation::device::ConvParams params;
if(argc >= 4) if(argc >= 4)
{ {
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
...@@ -100,10 +100,10 @@ void print_use_msg() ...@@ -100,10 +100,10 @@ void print_use_msg()
<< " <right padding>, (ie RightPy, RightPx for 2D)\n" << " <right padding>, (ie RightPy, RightPx for 2D)\n"
<< std::endl; << std::endl;
} }
ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[]) ck::tensor_operation::device::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
{ {
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
ck::utils::conv::ConvParams params; ck::tensor_operation::device::ConvParams params;
int arg_idx = 5; int arg_idx = 5;
params.num_dim_spatial_ = num_dim_spatial; params.num_dim_spatial_ = num_dim_spatial;
...@@ -171,7 +171,7 @@ int main(int argc, char* argv[]) ...@@ -171,7 +171,7 @@ int main(int argc, char* argv[])
bool time_kernel = false; bool time_kernel = false;
int num_dim_spatial = 2; int num_dim_spatial = 2;
ck::utils::conv::ConvParams params; ck::tensor_operation::device::ConvParams params;
params.C_ = 128; params.C_ = 128;
if(argc == 4) if(argc == 4)
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
...@@ -107,10 +107,10 @@ void print_use_msg() ...@@ -107,10 +107,10 @@ void print_use_msg()
<< std::endl; << std::endl;
} }
ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[]) ck::tensor_operation::device::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
{ {
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
ck::utils::conv::ConvParams params; ck::tensor_operation::device::ConvParams params;
int arg_idx = 7; int arg_idx = 7;
params.num_dim_spatial_ = num_dim_spatial; params.num_dim_spatial_ = num_dim_spatial;
...@@ -180,7 +180,7 @@ int main(int argc, char* argv[]) ...@@ -180,7 +180,7 @@ int main(int argc, char* argv[])
int do_log = 0; int do_log = 0;
int split_k = 1; int split_k = 1;
ck::utils::conv::ConvParams params; ck::tensor_operation::device::ConvParams params;
params.C_ = 128; params.C_ = 128;
if(argc == 6) if(argc == 6)
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
...@@ -127,10 +127,10 @@ void print_use_msg() ...@@ -127,10 +127,10 @@ void print_use_msg()
<< std::endl; << std::endl;
} }
ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[]) ck::tensor_operation::device::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
{ {
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
ck::utils::conv::ConvParams params; ck::tensor_operation::device::ConvParams params;
int arg_idx = 7; int arg_idx = 7;
params.num_dim_spatial_ = num_dim_spatial; params.num_dim_spatial_ = num_dim_spatial;
...@@ -200,7 +200,7 @@ int main(int argc, char* argv[]) ...@@ -200,7 +200,7 @@ int main(int argc, char* argv[])
int do_log = 0; int do_log = 0;
int split_k = 2; int split_k = 2;
ck::utils::conv::ConvParams params; ck::tensor_operation::device::ConvParams params;
params.C_ = 128; params.C_ = 128;
if(argc == 6) if(argc == 6)
......
...@@ -13,15 +13,17 @@ namespace ck { ...@@ -13,15 +13,17 @@ namespace ck {
namespace tensor_operation { namespace tensor_operation {
namespace host { namespace host {
// out[N, K, Ho, Wo] = in[N, C, Hi, Wi] * wei[K, C, Y, X] template <ck::index_t NumDimSpatial,
template <typename InDataType, typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType, typename WeiDataType,
typename OutDataType, typename OutDataType,
typename InElementwiseOperation, typename InElementwiseOperation,
typename WeiElementwiseOperation, typename WeiElementwiseOperation,
typename OutElementwiseOperation, typename OutElementwiseOperation,
ck::index_t NumDimSpatial = 2, typename std::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
typename ck::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
struct ReferenceConvBwdWeight : public device::BaseOperator struct ReferenceConvBwdWeight : public device::BaseOperator
{ {
// Argument // Argument
...@@ -69,158 +71,240 @@ struct ReferenceConvBwdWeight : public device::BaseOperator ...@@ -69,158 +71,240 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
{ {
using Argument = ReferenceConvBwdWeight::Argument; using Argument = ReferenceConvBwdWeight::Argument;
// FIXME: properly implement "TensorView" for doing transpose or refer to dimension by name
float Run(const Argument& arg) float Run(const Argument& arg)
{ {
// tensor descriptor in NCHW/KXYC/NKHW dimensional order
HostTensorDescriptor in_desc = arg.input_.mDesc;
HostTensorDescriptor wei_desc = arg.weight_.mDesc;
HostTensorDescriptor out_desc = arg.output_.mDesc;
// input
if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NWC>)
{
in_desc = transpose_host_tensor_descriptor_given_new2old(
in_desc, std::vector<std::size_t>{0, 2, 1});
}
else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NHWC>)
{
in_desc = transpose_host_tensor_descriptor_given_new2old(
in_desc, std::vector<std::size_t>{0, 3, 1, 2});
}
else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NDHWC>)
{
in_desc = transpose_host_tensor_descriptor_given_new2old(
in_desc, std::vector<std::size_t>{0, 4, 1, 2, 3});
}
// weight
if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC>)
{
wei_desc = transpose_host_tensor_descriptor_given_new2old(
wei_desc, std::vector<std::size_t>{0, 2, 1});
}
else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC>)
{
wei_desc = transpose_host_tensor_descriptor_given_new2old(
wei_desc, std::vector<std::size_t>{0, 3, 1, 2});
}
else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
{
wei_desc = transpose_host_tensor_descriptor_given_new2old(
wei_desc, std::vector<std::size_t>{0, 4, 1, 2, 3});
}
// output
if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NWK>)
{
out_desc = transpose_host_tensor_descriptor_given_new2old(
out_desc, std::vector<std::size_t>{0, 2, 1});
}
else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NHWK>)
{
out_desc = transpose_host_tensor_descriptor_given_new2old(
out_desc, std::vector<std::size_t>{0, 3, 1, 2});
}
else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWK>)
{
out_desc = transpose_host_tensor_descriptor_given_new2old(
out_desc, std::vector<std::size_t>{0, 4, 1, 2, 3});
}
if constexpr(NumDimSpatial == 1) if constexpr(NumDimSpatial == 1)
{ {
constexpr auto I0 = Number<0>{}; auto f_kcx = [&](auto k, auto c, auto x) {
auto f_kcx = [&](auto k, auto c, auto x) {
float v_acc = 0; float v_acc = 0;
for(std::size_t n = 0; n < arg.output_.mDesc.GetLengths()[0]; ++n)
for(std::size_t n = 0; n < out_desc.GetLengths()[0]; ++n)
{ {
for(std::size_t wo = 0; wo < arg.output_.mDesc.GetLengths()[2]; ++wo) for(std::size_t wo = 0; wo < out_desc.GetLengths()[2]; ++wo)
{ {
auto wi = auto wi =
ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[I0]) + ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[0]) +
ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[I0]) - ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[0]) -
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I0]); ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
if(wi >= 0 && if(wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.mDesc.GetLengths()[2]) ck::type_convert<std::size_t>(wi) < in_desc.GetLengths()[2])
{ {
float v_out; float v_out;
float v_in; float v_in;
arg.out_element_op_(v_out, // FIXME hacky
ck::type_convert<float>(arg.output_(n, k, wo))); arg.out_element_op_(
arg.in_element_op_(v_in, v_out,
ck::type_convert<float>(arg.input_(n, c, wi))); ck::type_convert<float>(
arg.output_
.mData[out_desc.GetOffsetFromMultiIndex(n, k, wo)]));
// FIXME hacky
arg.in_element_op_(
v_in,
ck::type_convert<float>(
arg.input_
.mData[in_desc.GetOffsetFromMultiIndex(n, c, wi)]));
v_acc += v_out * v_in; v_acc += v_out * v_in;
} }
} }
} }
float v_wei; float v_wei;
arg.wei_element_op_(v_wei, v_acc); arg.wei_element_op_(v_wei, v_acc);
arg.weight_(k, c, x) = ck::type_convert<WeiDataType>(v_wei); // FIXME hacky
arg.weight_.mData[wei_desc.GetOffsetFromMultiIndex(k, c, x)] =
ck::type_convert<WeiDataType>(v_wei);
}; };
make_ParallelTensorFunctor(f_kcx, make_ParallelTensorFunctor(f_kcx,
arg.weight_.mDesc.GetLengths()[0], wei_desc.GetLengths()[0],
arg.weight_.mDesc.GetLengths()[1], wei_desc.GetLengths()[1],
arg.weight_.mDesc.GetLengths()[2])( wei_desc.GetLengths()[2])(
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
return 0; return 0;
} }
else if constexpr(NumDimSpatial == 2) else if constexpr(NumDimSpatial == 2)
{ {
constexpr auto I0 = Number<0>{}; auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
constexpr auto I1 = Number<1>{};
auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
float v_acc = 0; float v_acc = 0;
for(std::size_t n = 0; n < arg.output_.mDesc.GetLengths()[0]; ++n)
for(std::size_t n = 0; n < out_desc.GetLengths()[0]; ++n)
{ {
for(std::size_t ho = 0; ho < arg.output_.mDesc.GetLengths()[2]; ++ho) for(std::size_t ho = 0; ho < out_desc.GetLengths()[2]; ++ho)
{ {
auto hi = auto hi =
ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[I0]) + ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[0]) +
ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[I0]) - ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[0]) -
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I0]); ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
for(std::size_t wo = 0; wo < arg.output_.mDesc.GetLengths()[3]; ++wo)
for(std::size_t wo = 0; wo < out_desc.GetLengths()[3]; ++wo)
{ {
auto wi = auto wi =
ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[I1]) + ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[1]) +
ck::type_convert<ck::long_index_t>(x * ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[1]) -
arg.conv_dilations_[I1]) - ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I1]);
if(hi >= 0 && if(hi >= 0 &&
ck::type_convert<std::size_t>(hi) < ck::type_convert<std::size_t>(hi) < in_desc.GetLengths()[2] &&
arg.input_.mDesc.GetLengths()[2] &&
wi >= 0 && wi >= 0 &&
ck::type_convert<std::size_t>(wi) < ck::type_convert<std::size_t>(wi) < in_desc.GetLengths()[3])
arg.input_.mDesc.GetLengths()[3])
{ {
float v_out; float v_out;
float v_in; float v_in;
// FIXME hacky
arg.out_element_op_( arg.out_element_op_(
v_out, ck::type_convert<float>(arg.output_(n, k, ho, wo))); v_out,
ck::type_convert<float>(
arg.output_.mData[out_desc.GetOffsetFromMultiIndex(
n, k, ho, wo)]));
// FIXME hacky
arg.in_element_op_( arg.in_element_op_(
v_in, ck::type_convert<float>(arg.input_(n, c, hi, wi))); v_in,
ck::type_convert<float>(
arg.input_.mData[in_desc.GetOffsetFromMultiIndex(
n, c, hi, wi)]));
v_acc += v_out * v_in; v_acc += v_out * v_in;
} }
} }
} }
} }
float v_wei; float v_wei;
arg.wei_element_op_(v_wei, v_acc); arg.wei_element_op_(v_wei, v_acc);
arg.weight_(k, c, y, x) = ck::type_convert<WeiDataType>(v_wei); // FIXME hacky
arg.weight_.mData[wei_desc.GetOffsetFromMultiIndex(k, c, y, x)] =
ck::type_convert<WeiDataType>(v_wei);
}; };
make_ParallelTensorFunctor(f_kcyx, make_ParallelTensorFunctor(f_kcyx,
arg.weight_.mDesc.GetLengths()[0], wei_desc.GetLengths()[0],
arg.weight_.mDesc.GetLengths()[1], wei_desc.GetLengths()[1],
arg.weight_.mDesc.GetLengths()[2], wei_desc.GetLengths()[2],
arg.weight_.mDesc.GetLengths()[3])( wei_desc.GetLengths()[3])(
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
return 0; return 0;
} }
else if constexpr(NumDimSpatial == 3) else if constexpr(NumDimSpatial == 3)
{ {
constexpr auto I0 = Number<0>{}; auto f_kczyx = [&](auto k, auto c, auto z, auto y, auto x) {
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
auto f_kczyx = [&](auto k, auto c, auto z, auto y, auto x) {
float v_acc = 0; float v_acc = 0;
for(std::size_t n = 0; n < arg.output_.mDesc.GetLengths()[0]; ++n) for(std::size_t n = 0; n < out_desc.GetLengths()[0]; ++n)
{ {
for(std::size_t do_ = 0; do_ < arg.output_.mDesc.GetLengths()[2]; ++do_) for(std::size_t do_ = 0; do_ < out_desc.GetLengths()[2]; ++do_)
{ {
auto di = auto di =
ck::type_convert<ck::long_index_t>(do_ * arg.conv_strides_[I0]) + ck::type_convert<ck::long_index_t>(do_ * arg.conv_strides_[0]) +
ck::type_convert<ck::long_index_t>(z * arg.conv_dilations_[I0]) - ck::type_convert<ck::long_index_t>(z * arg.conv_dilations_[0]) -
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I0]); ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
for(std::size_t ho = 0; ho < arg.output_.mDesc.GetLengths()[3]; ++ho) for(std::size_t ho = 0; ho < out_desc.GetLengths()[3]; ++ho)
{ {
auto hi = auto hi =
ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[I1]) + ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[1]) +
ck::type_convert<ck::long_index_t>(y * ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[1]) -
arg.conv_dilations_[I1]) - ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I1]); for(std::size_t wo = 0; wo < out_desc.GetLengths()[4]; ++wo)
for(std::size_t wo = 0; wo < arg.output_.mDesc.GetLengths()[4];
++wo)
{ {
auto wi = auto wi =
ck::type_convert<ck::long_index_t>(wo * ck::type_convert<ck::long_index_t>(wo *
arg.conv_strides_[I2]) + arg.conv_strides_[2]) +
ck::type_convert<ck::long_index_t>( ck::type_convert<ck::long_index_t>(x *
x * arg.conv_dilations_[I2]) - arg.conv_dilations_[2]) -
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I2]); ck::type_convert<ck::long_index_t>(arg.in_left_pads_[2]);
if(di >= 0 && if(di >= 0 &&
ck::type_convert<std::size_t>(di) < ck::type_convert<std::size_t>(di) <
arg.input_.mDesc.GetLengths()[2] && in_desc.GetLengths()[2] &&
hi >= 0 && hi >= 0 &&
ck::type_convert<std::size_t>(hi) < ck::type_convert<std::size_t>(hi) <
arg.input_.mDesc.GetLengths()[3] && in_desc.GetLengths()[3] &&
wi >= 0 && wi >= 0 &&
ck::type_convert<std::size_t>(wi) < ck::type_convert<std::size_t>(wi) < in_desc.GetLengths()[4])
arg.input_.mDesc.GetLengths()[4])
{ {
float v_out; float v_out;
float v_in; float v_in;
arg.out_element_op_(v_out, // FIXME hacky
ck::type_convert<float>( arg.out_element_op_(
arg.output_(n, k, do_, ho, wo))); v_out,
ck::type_convert<float>(
arg.output_.mData[out_desc.GetOffsetFromMultiIndex(
n, k, do_, ho, wo)]));
// FIXME hacky
arg.in_element_op_( arg.in_element_op_(
v_in, v_in,
ck::type_convert<float>(arg.input_(n, c, di, hi, wi))); ck::type_convert<float>(
arg.input_.mData[in_desc.GetOffsetFromMultiIndex(
n, c, di, hi, wi)]));
v_acc += v_out * v_in; v_acc += v_out * v_in;
} }
...@@ -228,19 +312,22 @@ struct ReferenceConvBwdWeight : public device::BaseOperator ...@@ -228,19 +312,22 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
} }
} }
} }
float v_wei; float v_wei;
arg.wei_element_op_(v_wei, v_acc); arg.wei_element_op_(v_wei, v_acc);
arg.weight_(k, c, z, y, x) = ck::type_convert<WeiDataType>(v_wei); // FIXME hacky
arg.weight_.mData[wei_desc.GetOffsetFromMultiIndex(k, c, z, y, x)] =
ck::type_convert<WeiDataType>(v_wei);
}; };
make_ParallelTensorFunctor(f_kczyx, make_ParallelTensorFunctor(f_kczyx,
arg.weight_.mDesc.GetLengths()[0], wei_desc.GetLengths()[0],
arg.weight_.mDesc.GetLengths()[1], wei_desc.GetLengths()[1],
arg.weight_.mDesc.GetLengths()[2], wei_desc.GetLengths()[2],
arg.weight_.mDesc.GetLengths()[3], wei_desc.GetLengths()[3],
arg.weight_.mDesc.GetLengths()[4])( wei_desc.GetLengths()[4])(
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
return 0; return 0;
......
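The backward-weight reference above now takes InLayout/WeiLayout/OutLayout as template parameters and, instead of indexing tensors directly with operator(), first transposes each host tensor descriptor into channels-second (NCHW / KCYX / NKHW) dimensional order and reads elements through GetOffsetFromMultiIndex. A minimal sketch of that pattern, assuming an NHWC input tensor; the helper function name here is hypothetical, while the CK calls are the ones used in the hunk:

    // Hypothetical helper: read input(n, c, hi, wi) from an NHWC-laid-out tensor
    // by reordering its descriptor into NCHW dimensional order first.
    float read_input_in_nchw_order(const Tensor<float>& input,
                                   std::size_t n, std::size_t c,
                                   std::size_t hi, std::size_t wi)
    {
        // new2old = {0, 3, 1, 2}: new dims (N, C, H, W) pick old NHWC dims (N=0, C=3, H=1, W=2)
        HostTensorDescriptor in_desc = transpose_host_tensor_descriptor_given_new2old(
            input.mDesc, std::vector<std::size_t>{0, 3, 1, 2});

        // Index through the reordered descriptor rather than via input(n, c, hi, wi).
        return ck::type_convert<float>(
            input.mData[in_desc.GetOffsetFromMultiIndex(n, c, hi, wi)]);
    }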
...@@ -14,16 +14,17 @@ namespace ck { ...@@ -14,16 +14,17 @@ namespace ck {
namespace tensor_operation { namespace tensor_operation {
namespace host { namespace host {
// out[N, K, Ho, Wo] = in[N, C, Hi, Wi] * wei[K, C, Y, X] template <ck::index_t NDimSpatial,
template <typename InDataType, typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType, typename WeiDataType,
typename OutDataType, typename OutDataType,
typename AccDataType,
typename InElementwiseOperation, typename InElementwiseOperation,
typename WeiElementwiseOperation, typename WeiElementwiseOperation,
typename OutElementwiseOperation, typename OutElementwiseOperation,
ck::index_t NumDimSpatial = 2, typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
typename ck::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
struct ReferenceConvBwdData : public device::BaseOperator struct ReferenceConvBwdData : public device::BaseOperator
{ {
// Argument // Argument
...@@ -71,38 +72,105 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -71,38 +72,105 @@ struct ReferenceConvBwdData : public device::BaseOperator
{ {
using Argument = ReferenceConvBwdData::Argument; using Argument = ReferenceConvBwdData::Argument;
// FIXME: properly implement "TensorView" for doing transpose or refer to dimension by name
float Run(const Argument& arg) float Run(const Argument& arg)
{ {
if constexpr(NumDimSpatial == 1) // tensor descriptor in NCHW/KXYC/NKHW dimensional order
HostTensorDescriptor in_desc = arg.input_.mDesc;
HostTensorDescriptor wei_desc = arg.weight_.mDesc;
HostTensorDescriptor out_desc = arg.output_.mDesc;
// input
if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NWC>)
{
in_desc = transpose_host_tensor_descriptor_given_new2old(
in_desc, std::vector<std::size_t>{0, 2, 1});
}
else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NHWC>)
{
in_desc = transpose_host_tensor_descriptor_given_new2old(
in_desc, std::vector<std::size_t>{0, 3, 1, 2});
}
else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NDHWC>)
{
in_desc = transpose_host_tensor_descriptor_given_new2old(
in_desc, std::vector<std::size_t>{0, 4, 1, 2, 3});
}
// weight
if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC>)
{
wei_desc = transpose_host_tensor_descriptor_given_new2old(
wei_desc, std::vector<std::size_t>{0, 2, 1});
}
else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC>)
{
wei_desc = transpose_host_tensor_descriptor_given_new2old(
wei_desc, std::vector<std::size_t>{0, 3, 1, 2});
}
else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
{
wei_desc = transpose_host_tensor_descriptor_given_new2old(
wei_desc, std::vector<std::size_t>{0, 4, 1, 2, 3});
}
// output
if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NWK>)
{
out_desc = transpose_host_tensor_descriptor_given_new2old(
out_desc, std::vector<std::size_t>{0, 2, 1});
}
else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NHWK>)
{
out_desc = transpose_host_tensor_descriptor_given_new2old(
out_desc, std::vector<std::size_t>{0, 3, 1, 2});
}
else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWK>)
{
out_desc = transpose_host_tensor_descriptor_given_new2old(
out_desc, std::vector<std::size_t>{0, 4, 1, 2, 3});
}
if constexpr(NDimSpatial == 1)
{ {
auto f_ncw = [&](auto n, auto c, auto wi) { auto f_ncw = [&](auto n, auto c, auto wi) {
std::size_t K = arg.weight_.mDesc.GetLengths()[0]; std::size_t K = wei_desc.GetLengths()[0];
std::size_t X = arg.weight_.mDesc.GetLengths()[2]; std::size_t X = wei_desc.GetLengths()[2];
std::size_t Wo = arg.output_.mDesc.GetLengths()[2]; std::size_t Wo = out_desc.GetLengths()[2];
AccDataType v_acc = 0; float v_acc = 0;
for(std::size_t x = 0; x < X; ++x) for(std::size_t x = 0; x < X; ++x)
{ {
auto w_tmp = ck::type_convert<ck::long_index_t>(wi) + auto w_tmp = ck::type_convert<ck::long_index_t>(wi) +
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]) - ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]) -
ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[0]); ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[0]);
if(w_tmp % arg.conv_strides_[0] == 0) if(w_tmp % arg.conv_strides_[0] == 0)
{ {
auto wo = ck::type_convert<ck::long_index_t>(w_tmp) / auto wo = ck::type_convert<ck::long_index_t>(w_tmp) /
ck::type_convert<ck::long_index_t>(arg.conv_strides_[0]); ck::type_convert<ck::long_index_t>(arg.conv_strides_[0]);
if(wo >= 0 && ck::type_convert<std::size_t>(wo) < Wo) if(wo >= 0 && ck::type_convert<std::size_t>(wo) < Wo)
{ {
for(std::size_t k = 0; k < K; ++k) for(std::size_t k = 0; k < K; ++k)
{ {
AccDataType v_out = 0; float v_out = 0;
AccDataType v_wei = 0; float v_wei = 0;
// FIXME hacky
arg.out_element_op_( arg.out_element_op_(
v_out, v_out,
ck::type_convert<AccDataType>(arg.output_(n, k, wo))); ck::type_convert<float>(
arg.output_.mData[out_desc.GetOffsetFromMultiIndex(
n, k, wo)]));
// FIXME hacky
arg.wei_element_op_( arg.wei_element_op_(
v_wei, ck::type_convert<AccDataType>(arg.weight_(k, c, x))); v_wei,
ck::type_convert<float>(
arg.weight_
.mData[wei_desc.GetOffsetFromMultiIndex(k, c, x)]));
v_acc += v_out * v_wei; v_acc += v_out * v_wei;
} }
...@@ -110,29 +178,34 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -110,29 +178,34 @@ struct ReferenceConvBwdData : public device::BaseOperator
} }
} }
arg.in_element_op_(v_acc, v_acc); float v_in;
arg.input_(n, c, wi) = ck::type_convert<InDataType>(v_acc);
arg.in_element_op_(v_in, v_acc);
// FIXME hacky
arg.input_.mData[in_desc.GetOffsetFromMultiIndex(n, c, wi)] =
ck::type_convert<InDataType>(v_acc);
}; };
make_ParallelTensorFunctor(f_ncw, make_ParallelTensorFunctor(f_ncw,
arg.input_.mDesc.GetLengths()[0], in_desc.GetLengths()[0],
arg.input_.mDesc.GetLengths()[1], in_desc.GetLengths()[1],
arg.input_.mDesc.GetLengths()[2])( in_desc.GetLengths()[2])(
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
return 0; return 0;
} }
else if constexpr(NumDimSpatial == 2) else if constexpr(NDimSpatial == 2)
{ {
auto f_nchw = [&](auto n, auto c, auto hi, auto wi) { auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
std::size_t K = arg.weight_.mDesc.GetLengths()[0]; std::size_t K = wei_desc.GetLengths()[0];
std::size_t Y = arg.weight_.mDesc.GetLengths()[2]; std::size_t Y = wei_desc.GetLengths()[2];
std::size_t X = arg.weight_.mDesc.GetLengths()[3]; std::size_t X = wei_desc.GetLengths()[3];
std::size_t Ho = arg.output_.mDesc.GetLengths()[2]; std::size_t Ho = out_desc.GetLengths()[2];
std::size_t Wo = arg.output_.mDesc.GetLengths()[3]; std::size_t Wo = out_desc.GetLengths()[3];
AccDataType v_acc = 0; float v_acc = 0;
for(std::size_t y = 0; y < Y; ++y) for(std::size_t y = 0; y < Y; ++y)
{ {
...@@ -161,15 +234,24 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -161,15 +234,24 @@ struct ReferenceConvBwdData : public device::BaseOperator
{ {
for(std::size_t k = 0; k < K; ++k) for(std::size_t k = 0; k < K; ++k)
{ {
AccDataType v_out = 0; float v_out = 0;
AccDataType v_wei = 0; float v_wei = 0;
arg.out_element_op_(v_out, // FIXME hacky
ck::type_convert<AccDataType>( arg.out_element_op_(
arg.output_(n, k, ho, wo))); v_out,
arg.wei_element_op_(v_wei, ck::type_convert<float>(
ck::type_convert<AccDataType>( arg.output_
arg.weight_(k, c, y, x))); .mData[out_desc.GetOffsetFromMultiIndex(
n, k, ho, wo)]));
// FIXME hacky
arg.wei_element_op_(
v_wei,
ck::type_convert<float>(
arg.weight_
.mData[wei_desc.GetOffsetFromMultiIndex(
k, c, y, x)]));
v_acc += v_out * v_wei; v_acc += v_out * v_wei;
} }
...@@ -180,33 +262,37 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -180,33 +262,37 @@ struct ReferenceConvBwdData : public device::BaseOperator
} }
} }
AccDataType v_in; float v_in;
arg.in_element_op_(v_in, v_acc); arg.in_element_op_(v_in, v_acc);
arg.input_(n, c, hi, wi) = ck::type_convert<InDataType>(v_in);
// FIXME hacky
arg.input_.mData[in_desc.GetOffsetFromMultiIndex(n, c, hi, wi)] =
ck::type_convert<InDataType>(v_acc);
}; };
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_nchw,
arg.input_.mDesc.GetLengths()[0], in_desc.GetLengths()[0],
arg.input_.mDesc.GetLengths()[1], in_desc.GetLengths()[1],
arg.input_.mDesc.GetLengths()[2], in_desc.GetLengths()[2],
arg.input_.mDesc.GetLengths()[3])( in_desc.GetLengths()[3])(
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
return 0; return 0;
} }
else if constexpr(NumDimSpatial == 3) else if constexpr(NDimSpatial == 3)
{ {
auto f_ncdhw = [&](auto n, auto c, auto di, auto hi, auto wi) { auto f_ncdhw = [&](auto n, auto c, auto di, auto hi, auto wi) {
std::size_t K = arg.weight_.mDesc.GetLengths()[0]; std::size_t K = wei_desc.GetLengths()[0];
std::size_t Z = arg.weight_.mDesc.GetLengths()[2]; std::size_t Z = wei_desc.GetLengths()[2];
std::size_t Y = arg.weight_.mDesc.GetLengths()[3]; std::size_t Y = wei_desc.GetLengths()[3];
std::size_t X = arg.weight_.mDesc.GetLengths()[4]; std::size_t X = wei_desc.GetLengths()[4];
std::size_t Do = arg.output_.mDesc.GetLengths()[2]; std::size_t Do = out_desc.GetLengths()[2];
std::size_t Ho = arg.output_.mDesc.GetLengths()[3]; std::size_t Ho = out_desc.GetLengths()[3];
std::size_t Wo = arg.output_.mDesc.GetLengths()[4]; std::size_t Wo = out_desc.GetLengths()[4];
AccDataType v_acc = 0; float v_acc = 0;
for(std::size_t z = 0; z < Z; ++z) for(std::size_t z = 0; z < Z; ++z)
{ {
...@@ -241,6 +327,7 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -241,6 +327,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
arg.in_left_pads_[2]) - arg.in_left_pads_[2]) -
ck::type_convert<ck::long_index_t>( ck::type_convert<ck::long_index_t>(
x * arg.conv_dilations_[2]); x * arg.conv_dilations_[2]);
if(w_tmp % arg.conv_strides_[2] == 0) if(w_tmp % arg.conv_strides_[2] == 0)
{ {
auto wo = auto wo =
...@@ -252,18 +339,30 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -252,18 +339,30 @@ struct ReferenceConvBwdData : public device::BaseOperator
{ {
for(std::size_t k = 0; k < K; ++k) for(std::size_t k = 0; k < K; ++k)
{ {
AccDataType v_out = 0; float v_out = 0;
AccDataType v_wei = 0; float v_wei = 0;
// FIXME hacky
arg.out_element_op_( arg.out_element_op_(
v_out, v_out,
ck::type_convert<AccDataType>( ck::type_convert<float>(
arg.output_( arg.output_.mData
n, k, do_, ho, wo))); [out_desc
.GetOffsetFromMultiIndex(
n,
k,
do_,
ho,
wo)]));
// FIXME hacky
arg.wei_element_op_( arg.wei_element_op_(
v_wei, v_wei,
ck::type_convert<AccDataType>( ck::type_convert<float>(
arg.weight_(k, c, z, y, x))); arg.weight_.mData
[wei_desc
.GetOffsetFromMultiIndex(
k, c, z, y, x)]));
v_acc += v_out * v_wei; v_acc += v_out * v_wei;
} }
...@@ -277,17 +376,21 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -277,17 +376,21 @@ struct ReferenceConvBwdData : public device::BaseOperator
} }
} }
AccDataType v_in; float v_in;
arg.in_element_op_(v_in, v_acc); arg.in_element_op_(v_in, v_acc);
arg.input_(n, c, di, hi, wi) = ck::type_convert<InDataType>(v_in);
// FIXME hacky
arg.input_.mData[in_desc.GetOffsetFromMultiIndex(n, c, wi)] =
ck::type_convert<InDataType>(v_acc);
}; };
make_ParallelTensorFunctor(f_ncdhw, make_ParallelTensorFunctor(f_ncdhw,
arg.input_.mDesc.GetLengths()[0], in_desc.GetLengths()[0],
arg.input_.mDesc.GetLengths()[1], in_desc.GetLengths()[1],
arg.input_.mDesc.GetLengths()[2], in_desc.GetLengths()[2],
arg.input_.mDesc.GetLengths()[3], in_desc.GetLengths()[3],
arg.input_.mDesc.GetLengths()[4])( in_desc.GetLengths()[4])(
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
return 0; return 0;
......
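The backward-data reference gets the same treatment, with its template parameters reordered to NDimSpatial, the three layouts, the data types, then the elementwise ops (the separate AccDataType parameter is gone; accumulation is done in float). A sketch of instantiating and running it the way the updated profiler does, assuming 2-D NHWC/KYXC/NHWK f32 tensors and PassThrough ops; the tensor and parameter variables are placeholders:

    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    namespace conv_layout = ck::tensor_layout::convolution;

    auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<2,
                                                                     conv_layout::NHWC,
                                                                     conv_layout::KYXC,
                                                                     conv_layout::NHWK,
                                                                     float,
                                                                     float,
                                                                     float,
                                                                     PassThrough,
                                                                     PassThrough,
                                                                     PassThrough>{};

    auto ref_invoker  = ref_conv.MakeInvoker();
    auto ref_argument = ref_conv.MakeArgument(input,          // Tensor<float>, written by the op
                                              weight,         // Tensor<float>
                                              output,         // Tensor<float>
                                              conv_strides,   // std::vector<ck::index_t>
                                              conv_dilations,
                                              in_left_pads,
                                              in_right_pads,
                                              PassThrough{},
                                              PassThrough{},
                                              PassThrough{});
    ref_invoker.Run(ref_argument);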
...@@ -17,9 +17,8 @@ set(PROFILER_SOURCE ...@@ -17,9 +17,8 @@ set(PROFILER_SOURCE
src/profile_conv_fwd.cpp src/profile_conv_fwd.cpp
src/profile_conv_fwd_bias_relu.cpp src/profile_conv_fwd_bias_relu.cpp
src/profile_conv_fwd_bias_relu_add.cpp src/profile_conv_fwd_bias_relu_add.cpp
src/profile_convnd_bwd_data.cpp src/profile_conv_bwd_data.cpp
src/profile_conv_bwd_weight.cpp src/profile_conv_bwd_weight.cpp
src/profile_convnd_bwd_weight.cpp
src/profile_reduce.cpp src/profile_reduce.cpp
src/profile_normalization.cpp src/profile_normalization.cpp
) )
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
......
...@@ -8,7 +8,8 @@ ...@@ -8,7 +8,8 @@
#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp" #include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
...@@ -63,72 +64,6 @@ namespace ck { ...@@ -63,72 +64,6 @@ namespace ck {
namespace profiler { namespace profiler {
using DeviceConvBwdDataNoOpPtr = ck::tensor_operation::device::instance::DeviceConvBwdDataNoOpPtr; using DeviceConvBwdDataNoOpPtr = ck::tensor_operation::device::instance::DeviceConvBwdDataNoOpPtr;
template <typename InLayout>
HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
int num_dim_spatial = 2)
{
namespace tl = ck::tensor_layout::convolution;
switch(num_dim_spatial)
{
case 3: {
return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
}
case 2: {
return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
}
case 1: {
return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
template <typename WeiLayout>
HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
int num_dim_spatial = 2)
{
namespace tl = ck::tensor_layout::convolution;
switch(num_dim_spatial)
{
case 3: {
return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
}
case 2: {
return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
}
case 1: {
return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
template <typename OutLayout>
HostTensorDescriptor get_output_host_ensor_descriptor(const std::vector<std::size_t>& dims,
int num_dim_spatial = 2)
{
namespace tl = ck::tensor_layout::convolution;
switch(num_dim_spatial)
{
case 3: {
return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
}
case 2: {
return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
}
case 1: {
return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
template <typename InDataType, typename WeiDataType, typename OutDataType> template <typename InDataType, typename WeiDataType, typename OutDataType>
void get_device_conv_bwd_data_op_ptr( void get_device_conv_bwd_data_op_ptr(
InDataType, WeiDataType, OutDataType, std::vector<DeviceConvBwdDataNoOpPtr>&, int) InDataType, WeiDataType, OutDataType, std::vector<DeviceConvBwdDataNoOpPtr>&, int)
...@@ -221,21 +156,6 @@ void get_device_conv_bwd_data_op_ptr( ...@@ -221,21 +156,6 @@ void get_device_conv_bwd_data_op_ptr(
} }
} }
template <typename T>
static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
{
float max_diff = 1e-6;
for(std::size_t i = 0; i < ref.mData.size(); ++i)
{
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
if(max_diff < diff)
{
return false;
}
}
return true;
}
template <typename DataType> template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc) void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{ {
...@@ -263,27 +183,17 @@ void show_data_nhwc_layout(Tensor<DataType>& nhwc) ...@@ -263,27 +183,17 @@ void show_data_nhwc_layout(Tensor<DataType>& nhwc)
} }
template <int NDimSpatial, template <int NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename AccDataType,
typename InLayout, typename InLayout,
typename WeiLayout, typename WeiLayout,
typename OutLayout> typename OutLayout,
bool profile_convnd_bwd_data_impl(int do_verification, typename InDataType,
int init_method, typename WeiDataType,
bool do_log, typename OutDataType>
bool time_kernel, bool profile_conv_bwd_data_impl(int do_verification,
ck::index_t N, int init_method,
ck::index_t K, bool do_log,
ck::index_t C, bool time_kernel,
const std::vector<ck::index_t>& input_spatial_lengths, const ck::tensor_operation::device::ConvParams& params)
const std::vector<ck::index_t>& filter_spatial_lengths,
const std::vector<ck::index_t>& output_spatial_lengths,
const std::vector<ck::index_t>& conv_filter_strides,
const std::vector<ck::index_t>& conv_filter_dilations,
const std::vector<ck::index_t>& input_left_pads,
const std::vector<ck::index_t>& input_right_pads)
{ {
using InElementOp = ck::tensor_operation::element_wise::PassThrough; using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
...@@ -293,31 +203,83 @@ bool profile_convnd_bwd_data_impl(int do_verification, ...@@ -293,31 +203,83 @@ bool profile_convnd_bwd_data_impl(int do_verification,
const auto wei_element_op = WeiElementOp{}; const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{}; const auto out_element_op = OutElementOp{};
std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)}; // make host tensor descritpor
input_dims.insert( auto f_nhwc_host_tensor_descriptor =
std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths)); [](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
std::vector<std::size_t> nhwc_lengths{static_cast<std::size_t>(n),
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)}; static_cast<std::size_t>(c)};
filter_dims.insert(std::end(filter_dims), nhwc_lengths.insert(
std::begin(filter_spatial_lengths), nhwc_lengths.begin() + 1, spatial_lengths.begin(), spatial_lengths.end());
std::end(filter_spatial_lengths));
return HostTensorDescriptor(nhwc_lengths);
std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)}; };
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths), auto f_nchw_host_tensor_descriptor =
std::end(output_spatial_lengths)); [](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
std::vector<std::size_t> nchw_lengths{static_cast<std::size_t>(n),
Tensor<InDataType> input_host_result( static_cast<std::size_t>(c)};
get_input_host_tensor_descriptor<InLayout>(input_dims, NDimSpatial)); nchw_lengths.insert(nchw_lengths.end(), spatial_lengths.begin(), spatial_lengths.end());
Tensor<InDataType> input_device_result(
get_input_host_tensor_descriptor<InLayout>(input_dims, NDimSpatial)); return HostTensorDescriptor(nchw_lengths);
Tensor<WeiDataType> weights( };
get_filters_host_tensor_descriptor<WeiLayout>(filter_dims, NDimSpatial));
Tensor<OutDataType> output( HostTensorDescriptor in_desc, wei_desc, out_desc;
get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));
// FIXME: properly implement "make host descriptor" for different layout
if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NWC> ||
is_same_v<InLayout, ck::tensor_layout::convolution::NHWC> ||
is_same_v<InLayout, ck::tensor_layout::convolution::NDHWC>)
{
in_desc =
f_nhwc_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_);
}
else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NCW> ||
is_same_v<InLayout, ck::tensor_layout::convolution::NCHW> ||
is_same_v<InLayout, ck::tensor_layout::convolution::NCDHW>)
{
in_desc =
f_nchw_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_);
}
// FIXME: properly implement "make host descriptor" for different layout
if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC> ||
is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC> ||
is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
{
wei_desc =
f_nhwc_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_);
}
else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KCX> ||
is_same_v<WeiLayout, ck::tensor_layout::convolution::KCYX> ||
is_same_v<WeiLayout, ck::tensor_layout::convolution::KCZYX>)
{
wei_desc =
f_nchw_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_);
}
// FIXME: properly implement "make host descriptor" for different layout
if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NWK> ||
is_same_v<OutLayout, ck::tensor_layout::convolution::NHWK> ||
is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWK>)
{
out_desc =
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths());
}
else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NKW> ||
is_same_v<OutLayout, ck::tensor_layout::convolution::NKHW> ||
is_same_v<OutLayout, ck::tensor_layout::convolution::NKDHW>)
{
out_desc =
f_nchw_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths());
}
Tensor<InDataType> input_host_result(in_desc);
Tensor<InDataType> input_device_result(in_desc);
Tensor<WeiDataType> weight(wei_desc);
Tensor<OutDataType> output(out_desc);
std::cout << "input: " << input_host_result.mDesc << std::endl; std::cout << "input: " << input_host_result.mDesc << std::endl;
std::cout << "weights: " << weights.mDesc << std::endl; std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << output.mDesc << std::endl; std::cout << "output: " << output.mDesc << std::endl;
switch(init_method) switch(init_method)
...@@ -325,50 +287,46 @@ bool profile_convnd_bwd_data_impl(int do_verification, ...@@ -325,50 +287,46 @@ bool profile_convnd_bwd_data_impl(int do_verification,
case 0: break; case 0: break;
case 1: case 1:
output.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}); output.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5}); weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break; break;
default: default:
output.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1}); output.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
weights.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1}); weight.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
} }
DeviceMem in_device_buf(sizeof(InDataType) * input_device_result.mDesc.GetElementSpace()); DeviceMem in_device_buf(sizeof(InDataType) * input_device_result.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace()); DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
out_device_buf.ToDevice(output.mData.data()); out_device_buf.ToDevice(output.mData.data());
wei_device_buf.ToDevice(weights.mData.data()); wei_device_buf.ToDevice(weight.mData.data());
// reset input to zero
in_device_buf.SetZero();
if(do_verification) if(do_verification)
{ {
auto RunReference = [&](auto& ref_conv) { auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
auto ref_invoker = ref_conv.MakeInvoker(); InLayout,
WeiLayout,
auto ref_argument = ref_conv.MakeArgument(input_host_result, OutLayout,
weights, InDataType,
output,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
};
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
AccDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp>{};
NDimSpatial>();
RunReference(ref_conv); auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input_host_result,
weight,
output,
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
} }
// add device Conv instances // add device Conv instances
...@@ -381,29 +339,30 @@ bool profile_convnd_bwd_data_impl(int do_verification, ...@@ -381,29 +339,30 @@ bool profile_convnd_bwd_data_impl(int do_verification,
throw std::runtime_error("wrong! no device Conv instance found"); throw std::runtime_error("wrong! no device Conv instance found");
} }
std::string best_conv_name; std::string best_op_name;
float best_ave_time = 0; float best_avg_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
// profile device Conv instances // profile device Conv instances
bool success = true; bool pass = true;
for(auto& conv_ptr : conv_ptrs) for(auto& conv_ptr : conv_ptrs)
{ {
auto argument_ptr = conv_ptr->MakeArgumentPointer( auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()), static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()), static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N, params.N_,
K, params.K_,
C, params.C_,
input_spatial_lengths, params.input_spatial_lengths_,
filter_spatial_lengths, params.filter_spatial_lengths_,
output_spatial_lengths, params.output_spatial_lengths_,
conv_filter_strides, params.conv_filter_strides_,
conv_filter_dilations, params.conv_filter_dilations_,
input_left_pads, params.input_left_pads_,
input_right_pads, params.input_right_pads_,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
...@@ -412,28 +371,28 @@ bool profile_convnd_bwd_data_impl(int do_verification, ...@@ -412,28 +371,28 @@ bool profile_convnd_bwd_data_impl(int do_verification,
if(conv_ptr->IsSupportedArgument(argument_ptr.get())) if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
std::string conv_name = conv_ptr->GetTypeString(); // reset input to zero
in_device_buf.SetZero();
std::string op_name = conv_ptr->GetTypeString();
float ave_time = float avg_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t flop = params.GetFlops();
ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths); std::size_t num_btype = params.GetByte<InDataType, WeiDataType, OutDataType>();
std::size_t num_btype =
ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time; float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / ave_time; float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s" << std::endl; << " GB/s" << std::endl;
if(tflops > best_tflops) if(tflops > best_tflops)
{ {
best_conv_name = conv_name; best_op_name = op_name;
best_tflops = tflops; best_tflops = tflops;
best_ave_time = ave_time; best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec; best_gb_per_sec = gb_per_sec;
} }
...@@ -441,18 +400,8 @@ bool profile_convnd_bwd_data_impl(int do_verification, ...@@ -441,18 +400,8 @@ bool profile_convnd_bwd_data_impl(int do_verification,
{ {
in_device_buf.FromDevice(input_device_result.mData.data()); in_device_buf.FromDevice(input_device_result.mData.data());
if(!check_out(input_host_result, input_device_result)) pass =
{ pass & ck::utils::check_err(input_device_result.mData, input_host_result.mData);
std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
success = false;
}
else
{
std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
}
check_error(input_host_result, input_device_result);
if(do_log) if(do_log)
{ {
...@@ -461,7 +410,7 @@ bool profile_convnd_bwd_data_impl(int do_verification, ...@@ -461,7 +410,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
std::cout << std::endl; std::cout << std::endl;
std::cout << "wei: "; std::cout << "wei: ";
show_data_nhwc_layout(weights); show_data_nhwc_layout(weight);
std::cout << std::endl; std::cout << std::endl;
std::cout << "out_host : "; std::cout << "out_host : ";
...@@ -476,9 +425,10 @@ bool profile_convnd_bwd_data_impl(int do_verification, ...@@ -476,9 +425,10 @@ bool profile_convnd_bwd_data_impl(int do_verification,
} }
} }
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_avg_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return success;
return pass;
} }
} // namespace profiler } // namespace profiler
......
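With the signature above, the per-problem geometry collapses into a single ck::tensor_operation::device::ConvParams. A hedged sketch of calling the new entry point for a 2-D NHWC f32 problem; the field values are illustrative only, and output_spatial_lengths_ may also need to be filled (or derived via GetOutputSpatialLengths()), depending on the ConvParams definition:

    ck::tensor_operation::device::ConvParams params;
    params.num_dim_spatial_        = 2;
    params.N_                      = 128;  // illustrative sizes, not from the commit
    params.K_                      = 256;
    params.C_                      = 192;
    params.input_spatial_lengths_  = {71, 71};
    params.filter_spatial_lengths_ = {3, 3};
    params.conv_filter_strides_    = {2, 2};
    params.conv_filter_dilations_  = {1, 1};
    params.input_left_pads_        = {1, 1};
    params.input_right_pads_       = {1, 1};

    bool pass = ck::profiler::profile_conv_bwd_data_impl<2,
                                                         ck::tensor_layout::convolution::NHWC,
                                                         ck::tensor_layout::convolution::KYXC,
                                                         ck::tensor_layout::convolution::NHWK,
                                                         float,
                                                         float,
                                                         float>(
        true /*do_verification*/, 1 /*init_method*/, false /*do_log*/, true /*time_kernel*/, params);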
...@@ -25,7 +25,7 @@ namespace ck { ...@@ -25,7 +25,7 @@ namespace ck {
namespace profiler { namespace profiler {
// FIXME: only support NCHW and NHWC layout, need to be more general // FIXME: only support NCHW and NHWC layout, need to be more general
template <ck::index_t NumDimSpatial, template <ck::index_t NDimSpatial,
typename InLayout, typename InLayout,
typename WeiLayout, typename WeiLayout,
typename OutLayout, typename OutLayout,
...@@ -146,7 +146,7 @@ int profile_conv_fwd_impl(int do_verification, ...@@ -146,7 +146,7 @@ int profile_conv_fwd_impl(int do_verification,
in_device_buf.ToDevice(input.mData.data()); in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weight.mData.data()); wei_device_buf.ToDevice(weight.mData.data());
using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NumDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
OutLayout, OutLayout,
...@@ -166,7 +166,7 @@ int profile_conv_fwd_impl(int do_verification, ...@@ -166,7 +166,7 @@ int profile_conv_fwd_impl(int do_verification,
// run reference op // run reference op
if(do_verification) if(do_verification)
{ {
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NumDimSpatial, auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
OutLayout, OutLayout,
......
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_backward_weight.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"
using F16 = ck::half_t;
using F32 = float;
using BF16 = ck::bhalf_t;
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using DeviceConvndBwdWeightNoOpPtr =
DeviceConvBwdWeightPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f32_instances(
std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(
std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f16_instances(
std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(
std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f16_instances(
std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_bf16_instances(
std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_bf16_instances(
std::vector<DeviceConvndBwdWeightNoOpPtr>&);
void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
std::vector<DeviceConvndBwdWeightNoOpPtr>&);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
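// The add_device_* declarations above are factory hooks defined in the pre-built device
// instance libraries; each call appends every available kernel instance for the given spatial
// rank, layout, and precision to the caller's vector so the profiler can time them one by one.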
namespace ck {
namespace profiler {
using DeviceConvndBwdWeightNoOpPtr =
ck::tensor_operation::device::instance::DeviceConvndBwdWeightNoOpPtr;
template <typename InLayout>
HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
int num_dim_spatial = 2)
{
namespace tl = ck::tensor_layout::convolution;
switch(num_dim_spatial)
{
case 3: {
return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
}
case 2: {
return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
}
case 1: {
return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
template <typename WeiLayout>
HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
int num_dim_spatial = 2)
{
namespace tl = ck::tensor_layout::convolution;
switch(num_dim_spatial)
{
case 3: {
return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
}
case 2: {
return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
}
case 1: {
return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
template <typename OutLayout>
HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims,
int num_dim_spatial = 2)
{
namespace tl = ck::tensor_layout::convolution;
switch(num_dim_spatial)
{
case 3: {
return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
}
case 2: {
return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
}
case 1: {
return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
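// Note: every case in the three helpers above forwards to the same
// ck::utils::conv::get_host_tensor_descriptor overload; the layout tag alone selects the stride
// order, and the switch only rejects unsupported spatial ranks. Illustrative call with
// hypothetical sizes, assuming a 2D NHWC input:
//   auto desc = get_input_host_tensor_descriptor<ck::tensor_layout::convolution::NHWC>(
//       {128, 192, 71, 71}, 2); // dims are {N, C, Hi, Wi}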
template <typename InDataType, typename WeiDataType, typename OutDataType>
void get_device_conv_bwd_weight_op_ptr(
InDataType, WeiDataType, OutDataType, std::vector<DeviceConvndBwdWeightNoOpPtr>&, int)
{
std::cout << "can not find device conv bwd weight" << std::endl;
exit(1);
}
template <>
void get_device_conv_bwd_weight_op_ptr(
F32, F32, F32, std::vector<DeviceConvndBwdWeightNoOpPtr>& conv_ptrs, int num_dim_spatial)
{
switch(num_dim_spatial)
{
case 1:
ck::tensor_operation::device::instance::
add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
break;
case 2:
ck::tensor_operation::device::instance::
add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
break;
case 3:
ck::tensor_operation::device::instance::
add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
break;
default: break;
}
}
template <>
void get_device_conv_bwd_weight_op_ptr(
F16, F16, F16, std::vector<DeviceConvndBwdWeightNoOpPtr>& conv_ptrs, int num_dim_spatial)
{
switch(num_dim_spatial)
{
case 1:
ck::tensor_operation::device::instance::
add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
break;
case 2:
ck::tensor_operation::device::instance::
add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
break;
case 3:
ck::tensor_operation::device::instance::
add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
break;
default: break;
}
}
template <>
void get_device_conv_bwd_weight_op_ptr(
BF16, BF16, BF16, std::vector<DeviceConvndBwdWeightNoOpPtr>& conv_ptrs, int num_dim_spatial)
{
switch(num_dim_spatial)
{
case 1:
ck::tensor_operation::device::instance::
add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
break;
case 2:
ck::tensor_operation::device::instance::
add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
break;
case 3:
ck::tensor_operation::device::instance::
add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
break;
default: break;
}
}
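// Dispatch helper: the primary template above bails out for unsupported type combinations,
// while the F32/F16/BF16 specializations fill the instance vector for 1D, 2D, or 3D problems.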
template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{
std::cout << "[";
for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
{
std::cout << "[";
for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
{
std::cout << "[";
for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
{
std::cout << "[";
for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
{
std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << " ";
}
std::cout << "]";
}
std::cout << "]";
}
std::cout << "]";
}
std::cout << "]";
}
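// show_data_nhwc_layout reads the descriptor's logical lengths {N, C, H, W} but prints in
// N -> H -> W -> C order, i.e. the physical NHWC layout used by these profiler tensors.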
template <int NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
bool profile_convnd_bwd_weight_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
ck::index_t split_k)
{
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
input_dims.insert(
std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
filter_dims.insert(std::end(filter_dims),
std::begin(filter_spatial_lengths),
std::end(filter_spatial_lengths));
std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
Tensor<InDataType> input(get_input_host_tensor_descriptor<InLayout>(input_dims, NDimSpatial));
Tensor<WeiDataType> weights_host_result(
get_filters_host_tensor_descriptor<WeiLayout>(filter_dims, NDimSpatial));
Tensor<WeiDataType> weights_device_result(
get_filters_host_tensor_descriptor<WeiLayout>(filter_dims, NDimSpatial));
Tensor<OutDataType> output(
get_output_host_tensor_descriptor<OutLayout>(output_dims, NDimSpatial));
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weights: " << weights_host_result.mDesc << std::endl;
std::cout << "output: " << output.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
output.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
break;
default:
input.GenerateTensorValue(GeneratorTensor_1<InDataType>{1});
output.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
}
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights_device_result.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
out_device_buf.ToDevice(output.mData.data());
// reset the weight-gradient buffer to zero (bwd-weight kernels may accumulate into it atomically)
wei_device_buf.SetZero();
if(do_verification)
{
auto RunReference = [&](auto& ref_conv) {
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input,
weights_host_result,
output,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
};
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
NDimSpatial>();
RunReference(ref_conv);
}
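// For reference, the CPU kernel above computes the weight gradient (2D case shown):
//   dW[k, c, y, x] = sum_{n, ho, wo} dY[n, k, ho, wo]
//                    * X[n, c, ho*stride_h + y*dilation_h - pad_h, wo*stride_w + x*dilation_w - pad_w]
// where out-of-bound input samples contribute zero.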
// add device Conv instances
std::vector<DeviceConvndBwdWeightNoOpPtr> conv_ptrs;
get_device_conv_bwd_weight_op_ptr(
InDataType{}, WeiDataType{}, OutDataType{}, conv_ptrs, NDimSpatial);
if(conv_ptrs.empty())
{
throw std::runtime_error("wrong! no device Conv instance found");
}
std::string best_conv_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device Conv instances
bool success = true;
for(auto& conv_ptr : conv_ptrs)
{
// using atomic, so need to reset input, setzero is done in invoker
// if(split_k > 1)
//{
// wei_device_buf.SetZero();
//}
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op,
split_k);
if(!conv_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::cout << "wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem"
<< std::endl;
continue;
}
auto invoker_ptr = conv_ptr->MakeInvokerPointer();
std::string conv_name = conv_ptr->GetTypeString();
float ave_time = 0;
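// For bf16 with split-K, partial results are presumably accumulated in a separate workspace
// (queried via GetWorkSpaceSize) rather than in the bf16 weight buffer, keeping the split-K
// reduction in higher precision; all other paths run the invoker directly.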
if(std::is_same<InDataType, ck::bhalf_t>::value && split_k > 1)
{
// alloc work space
size_t bwd_weight_workspace_size = conv_ptr->GetWorkSpaceSize(argument_ptr.get());
if(bwd_weight_workspace_size == 0)
{
printf("unexpected zero workspace size for bf16 split-K\n");
exit(1);
}
DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
wei_work_space_device_buf.SetZero();
conv_ptr->SetWorkSpacePointer(argument_ptr.get(),
wei_work_space_device_buf.GetDeviceBuffer());
ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
}
else
{
ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
}
std::size_t flop =
ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths);
std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
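// ave_time is in ms, so flop / 1e9 / ms yields TFlops and bytes / 1e6 / ms yields GB/s;
// get_flops presumably counts about 2 * N * K * C * prod(filter spatial) * prod(output spatial)
// multiply-accumulates (its exact definition lives in conv_util.hpp, not shown here).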
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s" << std::endl;
if(tflops > best_tflops)
{
best_conv_name = conv_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
wei_device_buf.FromDevice(weights_device_result.mData.data());
float max_error = check_error(weights_host_result, weights_device_result);
if(max_error > 8)
{
std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
success = false;
}
else
{
std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
}
if(do_log)
{
std::cout << "in : ";
show_data_nhwc_layout(output);
std::cout << std::endl;
std::cout << "wei: ";
show_data_nhwc_layout(weights_host_result);
std::cout << std::endl;
std::cout << "out : ";
show_data_nhwc_layout(input);
std::cout << std::endl;
std::cout << "wei_device: ";
show_data_nhwc_layout(weights_device_result);
std::cout << std::endl;
}
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
return success;
}
} // namespace profiler
} // namespace ck
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
#include "ck/tensor_operation/gpu/device/device_softmax.hpp" #include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_conv_bwd_data_impl.hpp"
namespace {
enum struct ConvLayout
{
NCHW_KCYX_NKHW, // 0
NHWC_KYXC_NHWK, // 1
};
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
static void print_helper_msg()
{
// clang-format off
std::cout
<< "arg1: tensor operation (conv_bwd_data: Convolution Backward Data)\n"
<< "arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n"
<< "arg3: tensor layout (0: Input[N, C, Hi, Wi], Weight[K, C, Y, X], Output[N, K, Ho, Wo]\n"
<< " 1: Input[N, Hi, Wi, C], Weight[K, Y, X, C], Output[N, Ho, Wo, "
"K])\n"
<< "arg4: verification (0: no, 1: yes)\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n"
<< "arg8: N spatial dimensions\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
<< " <right padding>, (ie RightPy, RightPx for 2D)\n"
<< std::endl;
// clang-format on
}
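// Hypothetical invocation matching the argument list above (2D NHWC fp16 case; shapes and the
// ckProfiler binary name are illustrative only):
//   ckProfiler conv_bwd_data 1 1 1 1 0 1 2  128 256 192  3 3  71 71  2 2  1 1  1 1  1 1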
ck::tensor_operation::device::ConvParams
parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
{
const ck::index_t N = std::stoi(argv[arg_idx++]);
const ck::index_t K = std::stoi(argv[arg_idx++]);
const ck::index_t C = std::stoi(argv[arg_idx++]);
std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
std::vector<ck::index_t> input_left_pads(num_dim_spatial);
std::vector<ck::index_t> input_right_pads(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
input_left_pads[i] = std::stoi(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
input_right_pads[i] = std::stoi(argv[arg_idx++]);
}
return ck::tensor_operation::device::ConvParams{num_dim_spatial,
N,
K,
C,
filter_spatial_lengths,
input_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads};
}
} // namespace
int profile_conv_bwd_data(int argc, char* argv[])
{
// 8 for control, 1 for num_dim_spatial
if(argc < 9)
{
print_helper_msg();
return 1;
}
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto layout = static_cast<ConvLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int num_dim_spatial = std::stoi(argv[8]);
// 8 for control, 1 for num_dim_spatial, 3 for N/K/C, and 6 * num_dim_spatial
if(argc != 8 + 4 + 6 * num_dim_spatial)
{
print_helper_msg();
return 1;
}
const auto params = parse_conv_params(num_dim_spatial, 9, argv);
using NWC = ck::tensor_layout::convolution::NWC;
using NHWC = ck::tensor_layout::convolution::NHWC;
using NDHWC = ck::tensor_layout::convolution::NDHWC;
using KXC = ck::tensor_layout::convolution::KXC;
using KYXC = ck::tensor_layout::convolution::KYXC;
using KZYXC = ck::tensor_layout::convolution::KZYXC;
using NWK = ck::tensor_layout::convolution::NWK;
using NHWK = ck::tensor_layout::convolution::NHWK;
using NDHWK = ck::tensor_layout::convolution::NDHWK;
constexpr auto I1 = ck::Number<1>{};
constexpr auto I2 = ck::Number<2>{};
constexpr auto I3 = ck::Number<3>{};
auto profile = [&](auto num_dim_spatial_tmp,
auto in_layout,
auto wei_layout,
auto out_layout,
auto in_type,
auto wei_type,
auto out_type) {
constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
using InLayout = decltype(in_layout);
using WeiLayout = decltype(wei_layout);
using OutLayout = decltype(out_layout);
using InDataType = decltype(in_type);
using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type);
bool pass = ck::profiler::profile_conv_bwd_data_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType>(
do_verification, init_method, do_log, time_kernel, params);
return pass ? 0 : 1;
};
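// The profile lambda turns the runtime (num_dim_spatial, layout, data type) selection below into
// compile-time template arguments for profile_conv_bwd_data_impl: ck::Number<>, layout tags, and
// data-type dummies are passed by value and their decltypes fix each template parameter.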
if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::INT8_INT8_INT8)
{
return profile(I1, NWC{}, KXC{}, NWK{}, INT8{}, INT8{}, INT8{});
}
}
else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::INT8_INT8_INT8)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, INT8{}, INT8{}, INT8{});
}
}
else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::INT8_INT8_INT8)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, INT8{}, INT8{}, INT8{});
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}