"mmdet3d/vscode:/vscode.git/clone" did not exist on "9419a722c3887436060a00223a27d14acced73e2"
Commit d92fb7e8 authored by rocking

Merge commit 'a3c910ac' into gemm_softmax

parents bfc80764 a3c910ac
#pragma once
namespace ck {
namespace profiler {
int profile_convnd_fwd(int argc, char* argv[]);
} // namespace profiler
} // namespace ck
@@ -7,7 +7,7 @@
 #include "tensor_layout.hpp"
 #include "device_tensor.hpp"
 #include "element_wise_operation.hpp"
-#include "element_wise_reduce_operation.hpp"
+#include "reduction_operator.hpp"
 #include "device_gemm_reduce.hpp"
 #include "reference_gemm.hpp"
@@ -20,8 +20,7 @@ using DeviceGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmReducePt
         ck::tensor_operation::element_wise::PassThrough,
         ck::tensor_operation::element_wise::PassThrough,
         ck::tensor_operation::element_wise::PassThrough,
-        ck::tensor_operation::element_wise::ReduceSum,
-        ck::tensor_operation::element_wise::ReduceSquareSum>;
+        ck::tensor_operation::element_wise::UnarySquare<float, float, false>>;
 void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances(
     std::vector<DeviceGemmReduceNoOpPtr>&);
@@ -113,17 +112,19 @@ bool profile_gemm_reduce_impl(int do_verification,
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
     }
     using AElementOp = ck::tensor_operation::element_wise::PassThrough;
     using BElementOp = ck::tensor_operation::element_wise::PassThrough;
     using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum;
-    using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum;
+    using D0ReduceOp = ck::reduce::Add<float>;
+    using D1ReduceOp = ck::reduce::Add<float>;
+    using D1ElementOp = ck::tensor_operation::element_wise::UnarySquare<float, float, false>;
     const auto a_element_op = AElementOp{};
     const auto b_element_op = BElementOp{};
     const auto c_element_op = CElementOp{};
     const auto d0_reduce_op = D0ReduceOp{};
     const auto d1_reduce_op = D1ReduceOp{};
+    const auto d1_element_op = D1ElementOp{};
     if(do_verification)
     {
@@ -140,17 +141,21 @@ bool profile_gemm_reduce_impl(int do_verification,
         for(int m = 0; m < M; ++m)
         {
-            float d0_acc = d0_reduce_op.GetReduceZeroValue();
-            float d1_acc = d1_reduce_op.GetReduceZeroValue();
+            float d0_acc = d0_reduce_op.GetReductionZeroVal();
+            float d1_acc = d1_reduce_op.GetReductionZeroVal();
             for(int n = 0; n < N; ++n)
             {
-                d0_reduce_op.Reduce(d0_acc, c_m_n_host_result(m, n));
-                d1_reduce_op.Reduce(d1_acc, c_m_n_host_result(m, n));
+                float d0_val = ck::type_convert<float>(c_m_n_host_result(m, n));
+                float d1_val;
+                d1_element_op(d1_val, d0_val);
+                d0_reduce_op(d0_acc, d0_val);
+                d1_reduce_op(d1_acc, d1_val);
             }
-            d0_m_host_result(m) = d0_acc;
-            d1_m_host_result(m) = d1_acc;
+            d0_m_host_result(m) = ck::type_convert<DDataType>(d0_acc);
+            d1_m_host_result(m) = ck::type_convert<DDataType>(d1_acc);
         }
     }
@@ -232,8 +237,7 @@ bool profile_gemm_reduce_impl(int do_verification,
             a_element_op,
             b_element_op,
             c_element_op,
-            d0_reduce_op,
-            d1_reduce_op);
+            d1_element_op);
         auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
......
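Aside: the verification change above replaces the dedicated ReduceSum/ReduceSquareSum operators with a single accumulation operator (ck::reduce::Add) plus a UnarySquare elementwise op applied before the second accumulation. A minimal self-contained sketch of that composition, using stand-in types rather than the real ck operator interfaces:

// Stand-ins for ck::reduce::Add and element_wise::UnarySquare (assumption:
// names and call shapes mirror the diff above, not the full ck interfaces).
struct Add
{
    float GetReductionZeroVal() const { return 0.0f; }
    void operator()(float& acc, float v) const { acc += v; }
};

struct UnarySquare
{
    void operator()(float& y, float x) const { y = x * x; }
};

// Row-wise sum (d0) and sum-of-squares (d1) of an M x N matrix, mirroring
// the host verification loop in profile_gemm_reduce_impl.
void reduce_rows(const float* c, int M, int N, float* d0, float* d1)
{
    Add add;
    UnarySquare square;
    for(int m = 0; m < M; ++m)
    {
        float d0_acc = add.GetReductionZeroVal();
        float d1_acc = add.GetReductionZeroVal();
        for(int n = 0; n < N; ++n)
        {
            float v  = c[m * N + n];
            float v2 = 0.0f;
            square(v2, v); // d1 accumulates squared values
            add(d0_acc, v);
            add(d1_acc, v2);
        }
        d0[m] = d0_acc;
        d1[m] = d1_acc;
    }
}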
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_conv_fwd_impl.hpp"
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
enum struct ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum struct ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum struct ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
int profile_conv_fwd(int argc, char* argv[])
{
if(argc != 25)
{
printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const auto wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const auto out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
const ck::index_t C = std::stoi(argv[12]);
const ck::index_t Y = std::stoi(argv[13]);
const ck::index_t X = std::stoi(argv[14]);
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]);
const ck::index_t conv_stride_h = std::stoi(argv[17]);
const ck::index_t conv_stride_w = std::stoi(argv[18]);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
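// Worked example of the output-size formula above (hypothetical sizes): with
// Hi = 71, Y = 3, stride 2, dilation 1 and left/right padding 1, the dilated
// filter is YEff = (3 - 1) * 1 + 1 = 3, so Ho = (71 + 1 + 1 - 3) / 2 + 1 = 36.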
if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_fwd_impl<2,
float,
float,
float,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_fwd_impl<2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else if(data_type == ConvDataType::BF16_BF16_BF16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_fwd_impl<2,
uint16_t,
uint16_t,
uint16_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else if(data_type == ConvDataType::INT8_INT8_INT8 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_fwd_impl<2,
int8_t,
int8_t,
int8_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else
{
throw std::runtime_error("wrong! this Conv data_type & layout is not implemented");
}
return 1;
}
@@ -7,6 +7,8 @@
 #include "profile_convnd_bwd_data_impl.hpp"
+namespace {
+
 enum struct ConvDataType
 {
     F32_F32_F32, // 0
@@ -76,6 +78,8 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[],
     return params;
 }
+} // namespace
+
 int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
 {
     const int preParams = 10;
......
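A note on the `namespace { ... }` wrapper added above: it gives the file-local enums and helpers internal linkage, so other profiler translation units that declare their own `ConvDataType` (as profile_conv_fwd.cpp and profile_convnd_fwd.cpp both do in this commit) cannot collide with it at link time. A minimal illustration with hypothetical names:

// a.cpp
namespace {
enum struct Mode { X, Y }; // internal linkage: visible only in this translation unit
} // namespace

// b.cpp may define a different `enum struct Mode` without breaking the
// one-definition rule, because neither definition is visible across TUs.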
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <half.hpp>
#include "conv_fwd_util.hpp"
#include "element_wise_operation.hpp"
#include "fill.hpp"
#include "profile_convnd_fwd.hpp"
#include "tensor_layout.hpp"
namespace {
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
enum struct ConvDataLayout
{
NCHW, // 0
NHWC, // 1
};
namespace ctl = ck::tensor_layout::convolution;
template <int NDim, ConvDataLayout DataLayout>
struct ConvolutionLayouts;
template <>
struct ConvolutionLayouts<1, ConvDataLayout::NHWC>
{
typedef ctl::NWC Input;
typedef ctl::KXC Weight;
typedef ctl::NWK Output;
};
template <>
struct ConvolutionLayouts<2, ConvDataLayout::NHWC>
{
typedef ctl::NHWC Input;
typedef ctl::KYXC Weight;
typedef ctl::NHWK Output;
};
template <>
struct ConvolutionLayouts<3, ConvDataLayout::NHWC>
{
typedef ctl::NDHWC Input;
typedef ctl::KZYXC Weight;
typedef ctl::NDHWK Output;
};
template <>
struct ConvolutionLayouts<1, ConvDataLayout::NCHW>
{
typedef ctl::NCW Input;
typedef ctl::KCX Weight;
typedef ctl::NKW Output;
};
template <>
struct ConvolutionLayouts<2, ConvDataLayout::NCHW>
{
typedef ctl::NCHW Input;
typedef ctl::KCYX Weight;
typedef ctl::NKHW Output;
};
template <>
struct ConvolutionLayouts<3, ConvDataLayout::NCHW>
{
typedef ctl::NCDHW Input;
typedef ctl::KCZYX Weight;
typedef ctl::NKDHW Output;
};
void print_use_msg()
{
std::cout << "arg1: tensor operation (conv_fwd: ForwardConvolution)\n"
<< "arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n"
<< "arg3: data layout (0: NCHW; 1: NHWC)\n"
<< "arg4: verification (0=no, 1=yes)\n"
<< "arg5: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: run kernel # of times (>1)\n"
<< "arg8: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
<< " <right padding>, (ie RightPy, RightPx for 2D)\n"
<< std::endl;
}
ck::utils::conv::ConvParams parse_params(int num_dim_spatial, int argc, char* argv[])
{
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + 9;
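// e.g. for num_dim_spatial == 2: conv_args = 3 + 2 * 6 = 15, so a valid
// command line has cmdline_nargs = 15 + 9 = 24 arguments in total.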
if(cmdline_nargs != argc)
{
print_use_msg();
exit(1);
}
int arg_idx = 9;
return ck::utils::conv::parse_conv_params(num_dim_spatial, arg_idx, argv);
}
template <int NDim,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename ConvLayouts>
void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
bool do_verification,
bool do_log,
int nrepeat,
int init_method,
ConvLayouts)
{
using namespace std::placeholders;
using namespace ck::utils;
std::unique_ptr<OpInstance<OutDataType, InDataType, WeiDataType>> conv_instance;
switch(init_method)
{
case 0:
conv_instance =
std::make_unique<conv::ConvFwdOpInstance<InDataType,
WeiDataType,
OutDataType,
typename ConvLayouts::Input,
typename ConvLayouts::Weight,
typename ConvLayouts::Output>>(params, false);
break;
case 1:
conv_instance = std::make_unique<
conv::ConvFwdOpInstance<InDataType,
WeiDataType,
OutDataType,
typename ConvLayouts::Input,
typename ConvLayouts::Weight,
typename ConvLayouts::Output,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::utils::FillUniform<int>,
ck::utils::FillUniform<int>>>(
params, true, ck::utils::FillUniform<int>{}, ck::utils::FillUniform<int>{});
break;
case 2:
conv_instance = std::make_unique<
conv::ConvFwdOpInstance<InDataType,
WeiDataType,
OutDataType,
typename ConvLayouts::Input,
typename ConvLayouts::Weight,
typename ConvLayouts::Output,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::utils::FillUniform<InDataType>,
ck::utils::FillUniform<WeiDataType>>>(
params,
true,
ck::utils::FillUniform<InDataType>{},
ck::utils::FillUniform<WeiDataType>{});
break;
default: throw std::runtime_error("Unsupported init method!");
}
auto reference_conv_fwd_fun = std::bind(
conv::run_reference_convolution_forward<NDim, InDataType, WeiDataType, OutDataType>,
params,
_1,
_2,
_3);
OpInstanceRunEngine<InDataType, WeiDataType, OutDataType> run_engine(*conv_instance,
reference_conv_fwd_fun);
auto best_conf = run_engine.Profile(
conv::ConvolutionFwdInstances<InDataType, WeiDataType, OutDataType>::template Get<NDim>(),
nrepeat,
do_verification,
do_log);
std::cout << "Best configuration parameters:"
<< "\nname: " << best_conf.best_op_name << "\navg_time: " << best_conf.best_avg_time
<< "\ntflops: " << best_conf.best_tflops << "\nGB/s: " << best_conf.best_gb_per_sec
<< std::endl;
}
template <int NDim>
void profile_convnd_instances(ConvDataType data_type,
ConvDataLayout data_layout,
const ck::utils::conv::ConvParams& params,
bool do_verification,
bool do_log,
int nrepeat,
int init_method)
{
switch(data_layout)
{
case ConvDataLayout::NHWC: {
switch(data_type)
{
case ConvDataType::F32_F32_F32:
profile_convnd_instances_impl<NDim, float, float, float>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
case ConvDataType::F16_F16_F16:
profile_convnd_instances_impl<NDim, ck::half_t, ck::half_t, ck::half_t>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
case ConvDataType::BF16_BF16_BF16:
profile_convnd_instances_impl<NDim, ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
case ConvDataType::INT8_INT8_INT8:
profile_convnd_instances_impl<NDim, int8_t, int8_t, int8_t>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
}
break;
}
case ConvDataLayout::NCHW: {
switch(data_type)
{
case ConvDataType::F32_F32_F32:
profile_convnd_instances_impl<NDim, float, float, float>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
case ConvDataType::F16_F16_F16:
profile_convnd_instances_impl<NDim, ck::half_t, ck::half_t, ck::half_t>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
case ConvDataType::BF16_BF16_BF16:
profile_convnd_instances_impl<NDim, ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
case ConvDataType::INT8_INT8_INT8:
profile_convnd_instances_impl<NDim, int8_t, int8_t, int8_t>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
}
break;
}
}
}
} // namespace
int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
{
using namespace ck::utils::conv;
ConvDataType data_type{ConvDataType::F32_F32_F32};
ConvDataLayout data_layout{ConvDataLayout::NHWC};
bool do_verification{true};
int init_method{2};
bool do_log{false};
int nrepeat{100};
int num_dim_spatial{2};
ConvParams params;
if(argc >= 4)
{
data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
data_layout = static_cast<ConvDataLayout>(std::stoi(argv[3]));
}
if(argc >= 9)
{
do_verification = std::stoi(argv[4]);
init_method = std::stoi(argv[5]);
do_log = std::stoi(argv[6]);
nrepeat = std::stoi(argv[7]);
num_dim_spatial = std::stoi(argv[8]);
}
if(argc >= 10)
{
params = parse_params(num_dim_spatial, argc, argv);
}
// TODO: Print a nice message describing what is being profiled.
switch(num_dim_spatial)
{
case 1:
profile_convnd_instances<1>(
data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
break;
case 2:
profile_convnd_instances<2>(
data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
break;
case 3:
profile_convnd_instances<3>(
data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
break;
default:
throw std::runtime_error("profile_conv_fwd: unsupported num_dim_spatial value: " +
std::to_string(num_dim_spatial));
}
return 1;
}
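For orientation, a hypothetical 2D invocation of this entry point (24 arguments, matching the parse_params arithmetic above; the trailing values follow the order given in print_use_msg):

// ckProfiler conv_fwd 1 1 1 2 0 100 2 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
// i.e. fp16, NHWC, verification on, decimal init, no tensor printing,
// 100 repeats, 2 spatial dims, N=128 K=256 C=192, 3x3 filter, 71x71 input,
// stride 2, dilation 1, left/right padding 1.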
@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <cstring>
+#include "profile_convnd_fwd.hpp"
+
 int profile_gemm(int, char*[]);
 int profile_gemm_bias_2d(int, char*[]);
 int profile_gemm_bias_relu(int, char*[]);
@@ -11,7 +13,6 @@ int profile_gemm_bias_relu_add(int, char*[]);
 int profile_gemm_reduce(int, char*[]);
 int profile_batched_gemm(int, char*[]);
 int profile_grouped_gemm(int, char*[]);
-int profile_conv_fwd(int, char*[]);
 int profile_conv_fwd_bias_relu(int, char*[]);
 int profile_conv_fwd_bias_relu_add(int, char*[]);
 int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
@@ -56,7 +57,7 @@ int main(int argc, char* argv[])
     }
     else if(strcmp(argv[1], "conv_fwd") == 0)
     {
-        return profile_conv_fwd(argc, argv);
+        return ck::profiler::profile_convnd_fwd(argc, argv);
     }
     else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0)
     {
......
-find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
+#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
+git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
#!/usr/bin/env python3
import os, io
import argparse

def print_to_string(*args, **kwargs):
    output = io.StringIO()
    print(*args, file=output, **kwargs)
    contents = output.getvalue()
    output.close()
    return contents

def parse_args():
    parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs')
    parser.add_argument('filename', type=str,
                        help='Log file to parse or directory containing log files')
    args = parser.parse_args()
    files = []
    if os.path.isdir(args.filename):
        all_files = os.listdir(args.filename)
        for name in all_files:
            if 'log' not in name:
                continue
            files.append(os.path.join(args.filename, name))
    else:
        files = [args.filename]
    args.files = files
    return args

def main():
    args = parse_args()
    results = []
    # parse results
    glue = ""
    for filename in args.files:
        for line in open(filename):
            if 'Best Perf' in line:
                lst = line.split()
                results.append(print_to_string(glue.join(lst[8:]), lst[4]))
    # sort results
    # read baseline results for the latest develop branch
    # write new results to the db
    # compare the results to the baseline
    # return 0 if performance criteria met, otherwise return 1
    print(results)
    return 0

if __name__ == '__main__':
    main()
\ No newline at end of file
 #!/bin/bash
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
-make -j ckProfiler
-DRIVER="./profiler/ckProfiler"
+#make -j ckProfiler
+DRIVER="../build/bin/ckProfiler"
+echo $DRIVER
 OP=$1
 DATATYPE=$2
 LAYOUT=$3
@@ -43,3 +41,13 @@ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1
 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112
 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160
 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 6656 8192 8192 -1 -1 -1
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3328 4096 4096 -1 -1 -1
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1664 2048 2048 -1 -1 -1
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 832 1024 1024 -1 -1 -1
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7040 8192 8192 -1 -1 -1
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 5120 5632 4096 -1 -1 -1
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2560 2816 2048 -1 -1 -1
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1280 1408 1024 -1 -1 -1
@@ -24,6 +24,7 @@ include_directories(BEFORE
 add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
 add_custom_target(tests)
 function(add_test_executable TEST_NAME)
     message("adding test ${TEST_NAME}")
     add_executable(${TEST_NAME} ${ARGN})
@@ -32,6 +33,20 @@ function(add_test_executable TEST_NAME)
     add_dependencies(check ${TEST_NAME})
 endfunction(add_test_executable TEST_NAME)
+include(GoogleTest)
+
+function(add_gtest_executable TEST_NAME)
+    message("adding gtest ${TEST_NAME}")
+    add_executable(${TEST_NAME} ${ARGN})
+    add_dependencies(tests ${TEST_NAME})
+    add_dependencies(check ${TEST_NAME})
+    # suppress gtest warnings
+    target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors)
+    target_link_libraries(${TEST_NAME} PRIVATE gtest_main)
+    gtest_discover_tests(${TEST_NAME})
+endfunction(add_gtest_executable TEST_NAME)
+
 add_subdirectory(magic_number_division)
 add_subdirectory(space_filling_curve)
 add_subdirectory(conv_util)
......
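# Note on the helper above: gtest_discover_tests() (from CMake's GoogleTest
# module) runs the compiled binary to enumerate its TEST()/TEST_F() cases and
# registers each one with CTest individually. Hypothetical usage sketch
# (target and source names invented for illustration):
#   add_gtest_executable(test_foo foo_test.cpp)
#   target_link_libraries(test_foo PRIVATE host_tensor)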
@@ -4,5 +4,4 @@ include_directories(BEFORE
 )
 add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
-target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor)
-target_link_libraries(test_conv2d_bwd_weight PRIVATE device_conv2d_bwd_weight_instance)
+target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_fwd_util)
-add_test_executable(test_conv_util conv_util.cpp)
-target_link_libraries(test_conv_util PRIVATE host_tensor)
+add_gtest_executable(test_conv_util conv_util.cpp)
+target_link_libraries(test_conv_util PRIVATE host_tensor conv_fwd_util)
 #include <iostream>
 #include <string>
 #include <vector>
+#include "gtest/gtest.h"
 #include "config.hpp"
 #include "conv_fwd_util.hpp"
@@ -9,196 +10,194 @@
 namespace {
-bool test_conv_params_get_output_spatial_lengths()
+class TestConvUtil : public ::testing::Test
 {
-    bool res{true};
-    // -------------------------- default 2D ------------------------------------
+    public:
+    void SetNDParams(std::size_t ndims)
+    {
+        conv_params.num_dim_spatial        = ndims;
+        conv_params.filter_spatial_lengths = std::vector<ck::index_t>(ndims, 3);
+        conv_params.input_spatial_lengths  = std::vector<ck::index_t>(ndims, 71);
+        conv_params.conv_filter_strides    = std::vector<ck::index_t>(ndims, 2);
+        conv_params.conv_filter_dilations  = std::vector<ck::index_t>(ndims, 1);
+        conv_params.input_left_pads        = std::vector<ck::index_t>(ndims, 1);
+        conv_params.input_right_pads       = std::vector<ck::index_t>(ndims, 1);
+    }
+
+    protected:
+    // ------- default 2D -------
     // input NCHW {128,192,71,71},
     // weights KCYX {256,192,3,3},
     // stride {2,2},
     // dilations {1,1},
     // padding {{1,1}, {1,1}}
     ck::utils::conv::ConvParams conv_params;
+};
+} // namespace
+
+TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths2D)
+{
+    ck::utils::conv::ConvParams conv_params;
     std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D default constructor.");
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D default constructor."));
     conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}.");
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}."));
     conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2};
     conv_params.input_left_pads = std::vector<ck::index_t>{2, 2};
     conv_params.input_right_pads = std::vector<ck::index_t>{2, 2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{37, 37}, "Error: ConvParams 2D padding left/right {2,2}.");
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{37, 37}, "Error: ConvParams 2D padding left/right {2,2}."));
     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}.");
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}."));
     conv_params.conv_filter_strides = std::vector<ck::index_t>{3, 3};
     conv_params.input_left_pads = std::vector<ck::index_t>{1, 1};
     conv_params.input_right_pads = std::vector<ck::index_t>{1, 1};
     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{23, 23}, "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{23, 23}, "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}."));
+}

-    // -------------------------- 1D ------------------------------------
-    conv_params.num_dim_spatial = 1;
-    conv_params.filter_spatial_lengths = std::vector<ck::index_t>{3};
-    conv_params.input_spatial_lengths = std::vector<ck::index_t>{71};
-    conv_params.conv_filter_strides = std::vector<ck::index_t>{2};
-    conv_params.conv_filter_dilations = std::vector<ck::index_t>{1};
-    conv_params.input_left_pads = std::vector<ck::index_t>{1};
-    conv_params.input_right_pads = std::vector<ck::index_t>{1};
-    out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D.");
+TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths1D)
+{
+    SetNDParams(1);
+    std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D."));
     conv_params.conv_filter_strides = std::vector<ck::index_t>{1};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}.");
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}."));
     conv_params.conv_filter_strides = std::vector<ck::index_t>{2};
     conv_params.input_left_pads = std::vector<ck::index_t>{2};
     conv_params.input_right_pads = std::vector<ck::index_t>{2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{37}, "Error: ConvParams 1D padding left/right {2}.");
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{37}, "Error: ConvParams 1D padding left/right {2}."));
     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}.");
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}."));
     conv_params.conv_filter_strides = std::vector<ck::index_t>{3};
     conv_params.input_left_pads = std::vector<ck::index_t>{1};
     conv_params.input_right_pads = std::vector<ck::index_t>{1};
     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{23}, "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{23}, "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}."));
+}

-    // -------------------------- 3D ------------------------------------
-    conv_params.num_dim_spatial = 3;
-    conv_params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
-    conv_params.input_spatial_lengths = std::vector<ck::index_t>{71, 71, 71};
-    conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2, 2};
-    conv_params.conv_filter_dilations = std::vector<ck::index_t>{1, 1, 1};
-    conv_params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
-    conv_params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};
-    out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D.");
+TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths3D)
+{
+    SetNDParams(3);
+    std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D."));
     conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1, 1};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{71, 71, 71}, "Error: ConvParams 3D stride {1, 1, 1}.");
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{71, 71, 71}, "Error: ConvParams 3D stride {1, 1, 1}."));
     conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2, 2};
     conv_params.input_left_pads = std::vector<ck::index_t>{2, 2, 2};
     conv_params.input_right_pads = std::vector<ck::index_t>{2, 2, 2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{37, 37, 37}, "Error: ConvParams 3D padding left/right {2, 2, 2}.");
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{37, 37, 37}, "Error: ConvParams 3D padding left/right {2, 2, 2}."));
     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D dilation {2, 2, 2}.");
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D dilation {2, 2, 2}."));
     conv_params.conv_filter_strides = std::vector<ck::index_t>{3, 3, 3};
     conv_params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
     conv_params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};
     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{23, 23, 23}, "Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}.");
-    return res;
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{23, 23, 23}, "Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}."));
 }

-bool test_get_host_tensor_descriptor()
+TEST(ConvUtil, GetHostTensorDescriptor)
 {
-    bool res{true};
     namespace tl = ck::tensor_layout::convolution;
     std::vector<std::size_t> dims{2, 3, 4, 5};
     HostTensorDescriptor h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});
-    res = ck::utils::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
-    res = ck::utils::check_err(h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!");
+    EXPECT_TRUE(ck::utils::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!"));
+    EXPECT_TRUE(ck::utils::check_err(h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!"));
     h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCHW{});
-    res = ck::utils::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
-    res = ck::utils::check_err(h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!");
+    EXPECT_TRUE(ck::utils::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!"));
+    EXPECT_TRUE(ck::utils::check_err(h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!"));
     dims = std::vector<std::size_t>{2, 3, 4};
     h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{});
-    res = ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
-    res = ck::utils::check_err(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");
+    EXPECT_TRUE(ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!"));
+    EXPECT_TRUE(ck::utils::check_err(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!"));
     h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCW{});
-    res = ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
-    res = ck::utils::check_err(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");
+    EXPECT_TRUE(ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!"));
+    EXPECT_TRUE(ck::utils::check_err(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!"));
     dims = std::vector<std::size_t>{2, 3, 4, 5, 6};
     h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{});
-    res = ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NDHWC dimensions lengths!");
-    res = ck::utils::check_err(h.GetStrides(), {3 * 4 * 5 * 6 /*N*/, 1 /*C*/, 3 * 5 * 6 /*D*/, 3 * 6 /*H*/, 3 /*W*/}, "Error: wrong NDHWC dimensions strides!");
+    EXPECT_TRUE(ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NDHWC dimensions lengths!"));
+    EXPECT_TRUE(ck::utils::check_err(h.GetStrides(), {3 * 4 * 5 * 6 /*N*/, 1 /*C*/, 3 * 5 * 6 /*D*/, 3 * 6 /*H*/, 3 /*W*/}, "Error: wrong NDHWC dimensions strides!"));
     h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCDHW{});
-    res = ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NCDHW dimensions lengths!");
-    res = ck::utils::check_err(h.GetStrides(), {3 * 4 * 5 * 6 /*N*/, 4 * 5 * 6 /*C*/, 5 * 6 /*D*/, 6 /*H*/, 1 /*W*/}, "Error: wrong NCDHW dimensions strides!");
-    return res;
-}
-} // namespace
-
-int main(void)
-{
-    bool res = test_conv_params_get_output_spatial_lengths();
-    std::cout << "test_conv_params_get_output_spatial_lengths ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_get_host_tensor_descriptor();
-    std::cout << "test_get_host_tensor_descriptor ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    return res ? 0 : 1;
+    EXPECT_TRUE(ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NCDHW dimensions lengths!"));
+    EXPECT_TRUE(ck::utils::check_err(h.GetStrides(), {3 * 4 * 5 * 6 /*N*/, 4 * 5 * 6 /*C*/, 5 * 6 /*D*/, 6 /*H*/, 1 /*W*/}, "Error: wrong NCDHW dimensions strides!"));
 }
@@ -4,5 +4,4 @@ include_directories(BEFORE
 )
 add_test_executable(test_convnd_bwd_data convnd_bwd_data.cpp)
-target_link_libraries(test_convnd_bwd_data PRIVATE host_tensor)
-target_link_libraries(test_convnd_bwd_data PRIVATE device_convnd_bwd_data_instance)
+target_link_libraries(test_convnd_bwd_data PRIVATE host_tensor device_convnd_bwd_data_instance conv_fwd_util)
 add_custom_target(test_convnd_fwd)
-add_test_executable(test_conv1d_fwd conv1d_fwd.cpp)
-target_link_libraries(test_conv1d_fwd PRIVATE host_tensor)
-target_link_libraries(test_conv1d_fwd PRIVATE device_conv1d_fwd_instance)
+add_gtest_executable(test_conv1d_fwd conv1d_fwd.cpp)
+target_link_libraries(test_conv1d_fwd PRIVATE host_tensor device_conv1d_fwd_instance conv_fwd_util)
 add_dependencies(test_convnd_fwd test_conv1d_fwd)
-add_test_executable(test_conv2d_fwd conv2d_fwd.cpp)
-target_link_libraries(test_conv2d_fwd PRIVATE host_tensor)
-target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance)
+add_gtest_executable(test_conv2d_fwd conv2d_fwd.cpp)
+target_link_libraries(test_conv2d_fwd PRIVATE host_tensor device_conv2d_fwd_instance conv_fwd_util)
 add_dependencies(test_convnd_fwd test_conv2d_fwd)
-add_test_executable(test_conv3d_fwd conv3d_fwd.cpp)
-target_link_libraries(test_conv3d_fwd PRIVATE host_tensor)
-target_link_libraries(test_conv3d_fwd PRIVATE device_conv3d_fwd_instance)
+add_gtest_executable(test_conv3d_fwd conv3d_fwd.cpp)
+target_link_libraries(test_conv3d_fwd PRIVATE host_tensor device_conv3d_fwd_instance conv_fwd_util)
 add_dependencies(test_convnd_fwd test_conv3d_fwd)
@@ -2,155 +2,92 @@
 #include <stdexcept>
 #include <tuple>
 #include <vector>
+#include "gtest/gtest.h"
 #include "data_type.hpp"
 #include "element_wise_operation.hpp"
 #include "conv_fwd_util.hpp"
 #include "conv_util.hpp"
-#include "host_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "check_err.hpp"
-
-// Forward declarations for conv instances.
-using DeviceConvFwdNoOpPtr =
-    ck::tensor_operation::device::DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough>;
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_conv1d_fwd_instance {
-void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-} // namespace device_conv1d_fwd_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-
 namespace {
-bool test_conv1D_nwc()
-{
-    bool res{true};
-    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial = 1;
-    params.N = 2;
-    params.K = 16;
-    params.C = 4;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths = std::vector<ck::index_t>{16};
-    params.conv_filter_strides = std::vector<ck::index_t>{1};
-    params.conv_filter_dilations = std::vector<ck::index_t>{1};
-    params.input_left_pads = std::vector<ck::index_t>{1};
-    params.input_right_pads = std::vector<ck::index_t>{1};
-    auto host_tensors =
-        ck::utils::conv::get_host_tensors<float,
-                                          float,
-                                          float,
-                                          ck::tensor_layout::convolution::NWC,
-                                          ck::tensor_layout::convolution::KXC,
-                                          ck::tensor_layout::convolution::NWK>(params);
-    const Tensor<float>& input = std::get<0>(host_tensors);
-    const Tensor<float>& weights = std::get<1>(host_tensors);
-    Tensor<float>& host_output = std::get<2>(host_tensors);
-    Tensor<float>& device_output = std::get<3>(host_tensors);
-    ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output);
-    test::conv::RunConv<1>(params, input, weights, device_output);
-    res = res &&
-          ck::utils::check_err(
-              device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
-    return res;
-}
-
 template <typename T>
-bool test_conv1d_nwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
+bool test_conv1d_nwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPtr>& conv_ptrs)
 {
+    using namespace std::placeholders;
+    using namespace ck::utils;
+    namespace ctl = ck::tensor_layout::convolution;
     ck::utils::conv::ConvParams params;
     params.num_dim_spatial = 1;
     params.filter_spatial_lengths = std::vector<ck::index_t>{3};
     params.input_spatial_lengths = std::vector<ck::index_t>{71};
     params.conv_filter_strides = std::vector<ck::index_t>{2};
     params.conv_filter_dilations = std::vector<ck::index_t>{1};
     params.input_left_pads = std::vector<ck::index_t>{1};
     params.input_right_pads = std::vector<ck::index_t>{1};
-    auto host_tensors =
-        ck::utils::conv::get_host_tensors<T,
-                                          T,
-                                          T,
-                                          ck::tensor_layout::convolution::NWC,
-                                          ck::tensor_layout::convolution::KXC,
-                                          ck::tensor_layout::convolution::NWK>(params);
-    const Tensor<T>& input = std::get<0>(host_tensors);
-    const Tensor<T>& weights = std::get<1>(host_tensors);
-    Tensor<T>& host_output = std::get<2>(host_tensors);
-    Tensor<T>& device_output = std::get<3>(host_tensors);
-    ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output);
-    return ck::utils::conv::run_convolution_forward_instances<1>(
-        params, conv_ptrs, input, weights, device_output, host_output);
+    conv::ConvFwdOpInstance<T, T, T, ctl::NWC, ctl::KCX, ctl::NWK> conv_instance(params);
+    auto reference_conv_fwd_fun =
+        std::bind(conv::run_reference_convolution_forward<1, T, T, T>, params, _1, _2, _3);
+    OpInstanceRunEngine<T, T, T> run_engine(conv_instance, reference_conv_fwd_fun);
+    return run_engine.Test(conv_ptrs);
 }
+} // anonymous namespace

-bool test_conv1d_nwc_bf16_instances()
+TEST(Conv1DFwdNWC, TestConv1D)
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv1d_fwd_instance::
-        add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
-    return test_conv1d_nwc_instances<ck::bhalf_t>(conv_ptrs);
+    using namespace std::placeholders;
+    using namespace ck::utils;
+    namespace ctl = ck::tensor_layout::convolution;
+    ck::utils::conv::ConvParams params;
+    params.num_dim_spatial = 1;
+    params.N = 2;
+    params.K = 16;
+    params.C = 4;
+    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
+    params.input_spatial_lengths = std::vector<ck::index_t>{16};
+    params.conv_filter_strides = std::vector<ck::index_t>{1};
+    params.conv_filter_dilations = std::vector<ck::index_t>{1};
+    params.input_left_pads = std::vector<ck::index_t>{1};
+    params.input_right_pads = std::vector<ck::index_t>{1};
+    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
+    test::conv::get_test_convolution_fwd_instance<1>(conv_ptrs);
+    conv::ConvFwdOpInstance<float, float, float, ctl::NWC, ctl::KCX, ctl::NWK> conv_instance(
+        params);
+    auto reference_conv_fwd_fun = std::bind(
+        conv::run_reference_convolution_forward<1, float, float, float>, params, _1, _2, _3);
+    OpInstanceRunEngine<float, float, float> run_engine(conv_instance, reference_conv_fwd_fun);
+    run_engine.SetAtol(1e-5);
+    run_engine.SetRtol(1e-4);
+    EXPECT_TRUE(run_engine.Test(conv_ptrs));
 }

-bool test_conv1d_nwc_f16_instances()
+TEST(Conv1DFwdNWC, Bf16Iinstances)
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv1d_fwd_instance::
-        add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
-    return test_conv1d_nwc_instances<ck::half_t>(conv_ptrs);
+    EXPECT_TRUE(test_conv1d_nwc_instances<ck::bhalf_t>(
+        ck::utils::conv::ConvolutionFwdInstances<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>::Get<1>()));
 }

-bool test_conv1d_nwc_f32_instances()
+TEST(Conv1DFwdNWC, F16Instances)
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv1d_fwd_instance::
-        add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
-    return test_conv1d_nwc_instances<float>(conv_ptrs);
+    EXPECT_TRUE(test_conv1d_nwc_instances<ck::half_t>(
+        ck::utils::conv::ConvolutionFwdInstances<ck::half_t, ck::half_t, ck::half_t>::Get<1>()));
 }

-bool test_conv1d_nwc_int8_instances()
+TEST(Conv1DFwdNWC, F32Instances)
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv1d_fwd_instance::
-        add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs);
-    return test_conv1d_nwc_instances<int8_t>(conv_ptrs);
+    EXPECT_TRUE(test_conv1d_nwc_instances<float>(
+        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<1>()));
 }
-} // anonymous namespace

-int main()
+TEST(Conv1DFwdNWC, Int8Instances)
 {
-    bool res{true};
-    res = test_conv1D_nwc();
-    std::cout << "test_conv1D_nwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_conv1d_nwc_bf16_instances();
-    std::cout << "\nTestConv1DNWCBF16Instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_conv1d_nwc_f16_instances();
-    std::cout << "\ntest_conv1d_nwc_f16_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_conv1d_nwc_f32_instances();
-    std::cout << "\ntest_conv1d_nwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_conv1d_nwc_int8_instances();
-    std::cout << "\ntes_tconv1_dnw_cint_8instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    return res ? 0 : 1;
+    EXPECT_TRUE(test_conv1d_nwc_instances<int8_t>(
+        ck::utils::conv::ConvolutionFwdInstances<int8_t, int8_t, int8_t>::Get<1>()));
 }
 #include <half.hpp>
 #include <iostream>
-#include <stdexcept>
 #include <tuple>
 #include <vector>
+#include "gtest/gtest.h"
 #include "data_type.hpp"
 #include "element_wise_operation.hpp"
 #include "conv_fwd_util.hpp"
 #include "conv_util.hpp"
-#include "host_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "check_err.hpp"
-
-// Forward declarations for conv instances.
-using DeviceConvFwdNoOpPtr =
-    ck::tensor_operation::device::DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough>;
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_conv2d_fwd_instance {
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
-    std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-} // namespace device_conv2d_fwd_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-
 namespace {
-bool test_conv2d_nhwc()
-{
-    bool res{true};
-    ck::utils::conv::ConvParams params;
-    params.N = 2;
-    params.K = 16;
-    params.C = 4;
-    params.input_spatial_lengths = std::vector<ck::index_t>{16, 16};
-    params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
-    auto host_tensors = ck::utils::conv::get_host_tensors(params);
-    const Tensor<float>& input = std::get<0>(host_tensors);
-    const Tensor<float>& weights = std::get<1>(host_tensors);
-    Tensor<float>& host_output = std::get<2>(host_tensors);
-    Tensor<float>& device_output = std::get<3>(host_tensors);
-    ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output);
-    test::conv::RunConv<2>(params, input, weights, device_output);
-    res = res &&
-          ck::utils::check_err(
-              device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
-    return res;
-}
-
 template <typename T>
-bool test_conv2d_nhwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
+bool test_conv2d_nhwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPtr>& conv_ptrs)
 {
-    ck::utils::conv::ConvParams params;
+    using namespace std::placeholders;
+    using namespace ck::utils;
+    conv::ConvParams params;
     params.num_dim_spatial = 2;
     params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
     params.input_spatial_lengths = std::vector<ck::index_t>{71, 71};
@@ -74,77 +26,66 @@ bool test_conv2d_nhwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_pt
     params.input_left_pads = std::vector<ck::index_t>{1, 1};
     params.input_right_pads = std::vector<ck::index_t>{1, 1};
-    auto host_tensors =
-        ck::utils::conv::get_host_tensors<T,
-                                          T,
-                                          T,
-                                          ck::tensor_layout::convolution::NHWC,
-                                          ck::tensor_layout::convolution::KYXC,
-                                          ck::tensor_layout::convolution::NHWK>(params);
-    const Tensor<T>& input = std::get<0>(host_tensors);
-    const Tensor<T>& weights = std::get<1>(host_tensors);
-    Tensor<T>& host_output = std::get<2>(host_tensors);
-    Tensor<T>& device_output = std::get<3>(host_tensors);
-    ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output);
-    return ck::utils::conv::run_convolution_forward_instances<2>(
-        params, conv_ptrs, input, weights, device_output, host_output);
+    conv::ConvFwdOpInstance<T, T, T> conv_instance(params);
+    auto reference_conv_fwd_fun =
+        std::bind(conv::run_reference_convolution_forward<2, T, T, T>, params, _1, _2, _3);
+    OpInstanceRunEngine<T, T, T> run_engine(conv_instance, reference_conv_fwd_fun);
+    return run_engine.Test(conv_ptrs);
 }
+} // anonymous namespace

-bool test_conv2d_nhwc_bf16_instances()
+TEST(Conv2DFwdNHWC, TestConv2D)
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv2d_fwd_instance::
-        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
-    return test_conv2d_nhwc_instances<ck::bhalf_t>(conv_ptrs);
+    using namespace std::placeholders;
+    using namespace ck::utils;
+    ck::utils::conv::ConvParams params;
+    params.N = 2;
+    params.K = 16;
+    params.C = 4;
+    params.input_spatial_lengths = std::vector<ck::index_t>{16, 16};
+    params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
+    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
+    test::conv::get_test_convolution_fwd_instance<2>(conv_ptrs);
+    conv::ConvFwdOpInstance<float, float, float> conv_instance(params);
+    auto reference_conv_fwd_fun = std::bind(
+        conv::run_reference_convolution_forward<2, float, float, float>, params, _1, _2, _3);
+    OpInstanceRunEngine<float, float, float> run_engine(conv_instance, reference_conv_fwd_fun);
+    run_engine.SetAtol(1e-5);
+    run_engine.SetRtol(1e-4);
+    EXPECT_TRUE(run_engine.Test(conv_ptrs));
 }

-bool test_conv2d_nhwc_f16_instances()
+TEST(Conv2DFwdNHWC, Bf16Instances)
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv2d_fwd_instance::
-        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-    ck::tensor_operation::device::device_conv2d_fwd_instance::
-        add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-    return test_conv2d_nhwc_instances<ck::half_t>(conv_ptrs);
+    EXPECT_TRUE(test_conv2d_nhwc_instances<ck::bhalf_t>(
+        ck::utils::conv::ConvolutionFwdInstances<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>::Get<2>()));
 }

-bool test_conv2d_nhwc_f32_instances()
+TEST(Conv2DFwdNHWC, F16Instances)
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv2d_fwd_instance::
-        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
-    return test_conv2d_nhwc_instances<float>(conv_ptrs);
+    EXPECT_TRUE(test_conv2d_nhwc_instances<ck::half_t>(
+        ck::utils::conv::ConvolutionFwdInstances<ck::half_t, ck::half_t, ck::half_t>::Get<2>()));
 }

-bool test_conv2d_nhwc_int8_instances()
+TEST(Conv2DFwdNHWC, BF32Instances)
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv2d_fwd_instance::
-        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
-    return test_conv2d_nhwc_instances<int8_t>(conv_ptrs);
+    EXPECT_TRUE(test_conv2d_nhwc_instances<float>(
+        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<2>()));
 }
-} // anonymous namespace

-int main()
+TEST(Conv2DFwdNHWC, F32Instances)
+{
+    EXPECT_TRUE(test_conv2d_nhwc_instances<float>(
+        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<2>()));
+}
+
+TEST(Conv2DFwdNHWC, Int8Instances)
 {
-    bool res{true};
-    res = test_conv2d_nhwc();
-    std::cout << "test_conv2d_nhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
+    EXPECT_TRUE(test_conv2d_nhwc_instances<int8_t>(
+        ck::utils::conv::ConvolutionFwdInstances<int8_t, int8_t, int8_t>::Get<2>()));
res = test_conv2d_nhwc_bf16_instances();
std::cout << "\ntest_conv2d_nhwc_bf16_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = test_conv2d_nhwc_f16_instances();
std::cout << "\ntest_conv2d_nhwc_f16_instances ....." << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = test_conv2d_nhwc_f32_instances();
std::cout << "\ntest_conv2d_nhwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = test_conv2d_nhwc_int8_instances();
std::cout << "\ntest_conv2d_nhwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
return res ? 0 : 1;
} }
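Note: these TEST cases assume a GoogleTest runner. If the test target is not linked against gtest_main (the build setup is not part of this diff), a minimal driver along these lines would be needed, using only standard GoogleTest entry points:

#include "gtest/gtest.h"

// Minimal GoogleTest driver; linking against gtest_main provides the same thing.
int main(int argc, char** argv)
{
    ::testing::InitGoogleTest(&argc, argv); // consumes --gtest_* command-line flags
    return RUN_ALL_TESTS();                 // returns 0 iff every TEST passes
}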
@@ -3,42 +3,49 @@

#include <stdexcept>
#include <tuple>
#include <vector>

#include "gtest/gtest.h"
#include "data_type.hpp"
#include "element_wise_operation.hpp"
#include "conv_fwd_util.hpp"
#include "conv_util.hpp"

namespace {

template <typename T>
bool test_conv3d_ndhwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPtr>& conv_ptrs)
{
    using namespace std::placeholders;
    using namespace ck::utils;
    namespace ctl = ck::tensor_layout::convolution;

    conv::ConvParams params;
    params.N = 64;
    params.num_dim_spatial = 3;
    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 2};
    params.input_spatial_lengths = std::vector<ck::index_t>{32, 32, 2};
    params.conv_filter_strides = std::vector<ck::index_t>{2, 2, 2};
    params.conv_filter_dilations = std::vector<ck::index_t>{1, 1, 1};
    params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
    params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};

    conv::ConvFwdOpInstance<T, T, T, ctl::NDHWC, ctl::KZYXC, ctl::NDHWK> conv_instance(params);

    auto reference_conv_fwd_fun =
        std::bind(conv::run_reference_convolution_forward<3, T, T, T>, params, _1, _2, _3);
    OpInstanceRunEngine<T, T, T> run_engine(conv_instance, reference_conv_fwd_fun);
    return run_engine.Test(conv_ptrs);
}

} // anonymous namespace

TEST(Conv3DFwdNDHWC, TestConv3D)
{
    using namespace std::placeholders;
    using namespace ck::utils;
    namespace ctl = ck::tensor_layout::convolution;

    conv::ConvParams params;
    params.num_dim_spatial = 3;
    params.N = 2;
    params.K = 16;

@@ -50,31 +57,26 @@ bool test_conv3d_ndhwc()

    params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
    params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};

    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
    test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);
    conv::ConvFwdOpInstance<float, float, float, ctl::NDHWC, ctl::KZYXC, ctl::NDHWK> conv_instance(
        params);

    auto reference_conv_fwd_fun = std::bind(
        conv::run_reference_convolution_forward<3, float, float, float>, params, _1, _2, _3);
    OpInstanceRunEngine<float, float, float> run_engine(conv_instance, reference_conv_fwd_fun);
    run_engine.SetAtol(1e-5);
    run_engine.SetRtol(1e-4);
    EXPECT_TRUE(run_engine.Test(conv_ptrs));
}

TEST(Conv3DFwdNDHWC, InputOver2GB)
{
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using namespace ck::utils;

    // >2GB Input
    conv::ConvParams params;
    params.num_dim_spatial = 3;
    params.N = 2;
    params.K = 16;

@@ -86,39 +88,35 @@ bool test_conv3d_ndhwc_2gb_input()

    params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
    params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};

    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
    test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);

    auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr,
                                                     nullptr,
                                                     nullptr,
                                                     params.N,
                                                     params.K,
                                                     params.C,
                                                     params.input_spatial_lengths,
                                                     params.filter_spatial_lengths,
                                                     params.GetOutputSpatialLengths(),
                                                     params.conv_filter_strides,
                                                     params.conv_filter_dilations,
                                                     params.input_left_pads,
                                                     params.input_right_pads,
                                                     PassThrough{},
                                                     PassThrough{},
                                                     PassThrough{});
    EXPECT_FALSE(conv_ptrs.back()->IsSupportedArgument(arg.get()));
}

TEST(Conv3DFwdNDHWC, FiltersOver2GB)
{
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using namespace ck::utils;

    // >2GB Filters
    conv::ConvParams params;
    params.num_dim_spatial = 3;
    params.N = 2;
    params.K = 16;

@@ -130,39 +128,35 @@ bool test_conv3d_ndhwc_2gb_filters()

    params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
    params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};

    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
    test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);

    auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr,
                                                     nullptr,
                                                     nullptr,
                                                     params.N,
                                                     params.K,
                                                     params.C,
                                                     params.input_spatial_lengths,
                                                     params.filter_spatial_lengths,
                                                     params.GetOutputSpatialLengths(),
                                                     params.conv_filter_strides,
                                                     params.conv_filter_dilations,
                                                     params.input_left_pads,
                                                     params.input_right_pads,
                                                     PassThrough{},
                                                     PassThrough{},
                                                     PassThrough{});
    EXPECT_FALSE(conv_ptrs.back()->IsSupportedArgument(arg.get()));
}

TEST(Conv3DFwdNDHWC, OutputOver2GB)
{
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using namespace ck::utils;

    // >2GB Output
    conv::ConvParams params;
    params.num_dim_spatial = 3;
    params.N = 2;
    params.K = 16;

@@ -174,127 +168,47 @@ bool test_conv3d_ndhwc_2gb_output()

    params.input_left_pads = std::vector<ck::index_t>{2, 2, 2};
    params.input_right_pads = std::vector<ck::index_t>{2, 2, 2};

    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
    test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);
    auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr,
                                                     nullptr,
                                                     nullptr,
                                                     params.N,
                                                     params.K,
                                                     params.C,
                                                     params.input_spatial_lengths,
                                                     params.filter_spatial_lengths,
                                                     params.GetOutputSpatialLengths(),
                                                     params.conv_filter_strides,
                                                     params.conv_filter_dilations,
                                                     params.input_left_pads,
                                                     params.input_right_pads,
                                                     PassThrough{},
                                                     PassThrough{},
                                                     PassThrough{});
    EXPECT_FALSE(conv_ptrs.back()->IsSupportedArgument(arg.get()));
}

TEST(Conv3DFwdNDHWC, Bf16Instances)
{
    EXPECT_TRUE(test_conv3d_ndhwc_instances<ck::bhalf_t>(
        ck::utils::conv::ConvolutionFwdInstances<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>::Get<3>()));
}

TEST(Conv3DFwdNDHWC, F16Instances)
{
    EXPECT_TRUE(test_conv3d_ndhwc_instances<ck::half_t>(
        ck::utils::conv::ConvolutionFwdInstances<ck::half_t, ck::half_t, ck::half_t>::Get<3>()));
}

TEST(Conv3DFwdNDHWC, F32Instances)
{
    EXPECT_TRUE(test_conv3d_ndhwc_instances<float>(
        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<3>()));
}

TEST(Conv3DFwdNDHWC, Int8Instances)
{
    EXPECT_TRUE(test_conv3d_ndhwc_instances<int8_t>(
        ck::utils::conv::ConvolutionFwdInstances<int8_t, int8_t, int8_t>::Get<3>()));
}
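The three Over2GB cases above rely on IsSupportedArgument rejecting tensors whose flat byte size exceeds what 32-bit index arithmetic can address; the exact test shapes are hidden in the elided hunks. A back-of-the-envelope check with illustrative (not the actual) dimensions:

#include <cstdint>
#include <iostream>

int main()
{
    // Illustrative NDHWC input shape only; the real shapes sit in the elided hunks above.
    const std::int64_t N = 2, D = 32, H = 1024, W = 1024, C = 16;
    const std::int64_t bytes = N * D * H * W * C * static_cast<std::int64_t>(sizeof(float));
    // 2^31 bytes = 2 GiB, the largest offset a signed 32-bit index can address.
    std::cout << bytes << " bytes; exceeds 2 GiB: " << (bytes > (std::int64_t{1} << 31))
              << std::endl; // prints "4294967296 bytes; exceeds 2 GiB: 1"
}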
@@ -10,7 +10,8 @@

#include "host_tensor.hpp"
#include "sequence.hpp"

namespace test {
namespace conv {

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

@@ -19,6 +20,9 @@

using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

using DeviceConvFwdNoOpPtr =
    ck::tensor_operation::device::DeviceConvFwdPtr<InElementOp, WeiElementOp, OutElementOp>;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

@@ -62,26 +66,14 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::

        1>; // CThreadTransferDstScalarPerVector
// clang-format on

template <ck::index_t NDim,
          typename InDataType = float,
          typename WeiDataType = float,
          typename OutDataType = float>
void get_test_convolution_fwd_instance(std::vector<DeviceConvFwdNoOpPtr>& instances)
{
    using ConvInstanceT = DeviceConvNDFwdInstance<NDim, InDataType, WeiDataType, OutDataType>;
    instances.emplace_back(std::make_unique<ConvInstanceT>());
}

} // namespace conv
...
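For reference, a minimal sketch of how this helper is consumed (it mirrors the calls in the conv tests above; the 2-D instantiation is just an illustrative choice):

// The helper appends an owning smart pointer to a device op instance, so a test
// can collect one or more instances and hand the vector to OpInstanceRunEngine::Test.
std::vector<test::conv::DeviceConvFwdNoOpPtr> instances;
test::conv::get_test_convolution_fwd_instance<2>(instances); // data types default to float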
#ifndef GEMM_UTILS_HPP
#define GEMM_UTILS_HPP

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "reference_gemm.hpp"
#include "tensor_layout.hpp"

namespace ck {
namespace gemm_util {

struct GemmParams
{
    GemmParams()
        : M(1024), N(1024), K(1024), StrideA(1024), StrideB(1024), StrideC(1024), alpha(1), beta(0)
    {
    }

    ck::index_t M;
    ck::index_t N;
    ck::index_t K;

    ck::index_t StrideA;
    ck::index_t StrideB;
    ck::index_t StrideC;

    float alpha;
    float beta;
};

template <typename GemmInstance,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
void RunHostGEMM(const Tensor<ADataType>& A,
                 const Tensor<BDataType>& B,
                 Tensor<CDataType>& C,
                 AElementwiseOperation a_element_op,
                 BElementwiseOperation b_element_op,
                 CElementwiseOperation c_element_op)
{
    auto ref_gemm     = GemmInstance{};
    auto ref_invoker  = ref_gemm.MakeInvoker();
    auto ref_argument = ref_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);

    ref_invoker.Run(ref_argument);
}

template <typename DeviceGemmPtr_,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
                   const ck::gemm_util::GemmParams& params,
                   const Tensor<ADataType>& A,
                   const Tensor<BDataType>& B,
                   Tensor<CDataType>& C,
                   AElementwiseOperation a_element_op,
                   BElementwiseOperation b_element_op,
                   CElementwiseOperation c_element_op)
{
    DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpace());
    DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace());
    DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace());

    a_m_k_device_buf.ToDevice(A.mData.data());
    b_k_n_device_buf.ToDevice(B.mData.data());

    auto invoker_ptr = gemmPtr->MakeInvokerPointer();
    auto argument_ptr =
        gemmPtr->MakeArgumentPointer(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
                                     static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
                                     static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
                                     params.M,
                                     params.N,
                                     params.K,
                                     params.StrideA,
                                     params.StrideB,
                                     params.StrideC,
                                     a_element_op,
                                     b_element_op,
                                     c_element_op);

    if(!gemmPtr->IsSupportedArgument(argument_ptr.get()))
    {
        throw std::runtime_error(
            "wrong! device_gemm with the specified compilation parameters does "
            "not support this GEMM problem");
    }

    invoker_ptr->Run(argument_ptr.get());
    c_m_n_device_buf.FromDevice(C.mData.data());
}

template <typename DeviceGemmPtr_,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
struct TestGemm
{
    auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
    {
        auto f_host_tensor_descriptor =
            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({stride, 1}));
                }
                else
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({1, stride}));
                }
            };

        Tensor<ADataType> a_m_k(
            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
        Tensor<BDataType> b_k_n(
            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
        Tensor<CDataType> c_m_n_host_result(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
        Tensor<CDataType> c_m_n_device_result(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));

        auto f_generate_tensor_value = [](auto& tensor, auto type) {
            using dataType = decltype(type);
            tensor.GenerateTensorValue(GeneratorTensor_2<dataType>{-5, 5});
        };

        f_generate_tensor_value(a_m_k, ADataType{});
        f_generate_tensor_value(b_k_n, BDataType{});

        return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result);
    }

    auto operator()(DeviceGemmPtr_& gemmPtr)
    {
        std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name
                  << ", CLayout = " << CLayout{}.name << std::endl;
        std::cout << gemmPtr->GetTypeString() << std::endl;

        // Arrange
        ck::gemm_util::GemmParams params;
        params.M       = 1024;
        params.N       = 1024;
        params.K       = 1024;
        params.StrideA = 1024;
        params.StrideB = 1024;
        params.StrideC = 1024;

        auto host_tensors = PrepareGemmTensor(params);

        const Tensor<ADataType>& a  = std::get<0>(host_tensors);
        const Tensor<BDataType>& b  = std::get<1>(host_tensors);
        Tensor<CDataType>& c_host   = std::get<2>(host_tensors);
        Tensor<CDataType>& c_device = std::get<3>(host_tensors);

        auto a_element_op = AElementwiseOperation{};
        auto b_element_op = BElementwiseOperation{};
        auto c_element_op = CElementwiseOperation{};

        using ReferenceGemmInstance =
            ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                      BDataType,
                                                      CDataType,
                                                      AElementwiseOperation,
                                                      BElementwiseOperation,
                                                      CElementwiseOperation>;
        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
            a, b, c_host, a_element_op, b_element_op, c_element_op);

        // Act
        ck::gemm_util::RunDeviceGEMM(
            gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);

        // Assert
        bool res = false;
        if(std::is_same<CDataType, float>::value)
        {
            res = ck::utils::check_err(c_device.mData, c_host.mData);
            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
        }
        else if(std::is_same<CDataType, ck::half_t>::value)
        {
            res = ck::utils::check_err(c_device.mData, c_host.mData);
            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
        }
        else if(std::is_same<CDataType, int8_t>::value)
        {
            res = ck::utils::check_err(c_device.mData, c_host.mData);
            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
        }

        return res;
    }
};

template <typename DeviceGemmPtr_,
          typename ALayout,
          typename BLayout,
          typename CLayout,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
struct TestGemmBF16
{
    using BF16 = ck::bhalf_t;

    auto PrepareGemmTensorBF16(const ck::gemm_util::GemmParams& params)
    {
        auto f_host_tensor_descriptor =
            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({stride, 1}));
                }
                else
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({1, stride}));
                }
            };

        // use fp32 host kernel to verify bf16 device kernel
        Tensor<BF16> a_m_k_bf16(
            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
        Tensor<BF16> b_k_n_bf16(
            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
        Tensor<BF16> c_m_n_device_bf16(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));

        Tensor<float> a_m_k_fp32(
            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
        Tensor<float> b_k_n_fp32(
            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
        Tensor<float> c_m_n_host_fp32(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
        Tensor<float> c_m_n_device_fp32(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));

        a_m_k_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});
        b_k_n_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});

        bf16_to_f32_(a_m_k_bf16, a_m_k_fp32);
        bf16_to_f32_(b_k_n_bf16, b_k_n_fp32);

        return std::make_tuple(a_m_k_bf16,
                               b_k_n_bf16,
                               c_m_n_device_bf16,
                               a_m_k_fp32,
                               b_k_n_fp32,
                               c_m_n_host_fp32,
                               c_m_n_device_fp32);
    }

    auto operator()(DeviceGemmPtr_& gemmPtr)
    {
        // Arrange
        ck::gemm_util::GemmParams params;
        params.M       = 1024;
        params.N       = 1024;
        params.K       = 1024;
        params.StrideA = 1024;
        params.StrideB = 1024;
        params.StrideC = 1024;

        auto host_tensors = PrepareGemmTensorBF16(params);
        const Tensor<BF16>& a_bf16   = std::get<0>(host_tensors);
        const Tensor<BF16>& b_bf16   = std::get<1>(host_tensors);
        Tensor<BF16>& c_device_bf16  = std::get<2>(host_tensors);
        Tensor<float>& a_fp32        = std::get<3>(host_tensors);
        Tensor<float>& b_fp32        = std::get<4>(host_tensors);
        Tensor<float>& c_host_fp32   = std::get<5>(host_tensors);
        Tensor<float>& c_device_fp32 = std::get<6>(host_tensors);

        auto a_element_op = AElementwiseOperation{};
        auto b_element_op = BElementwiseOperation{};
        auto c_element_op = CElementwiseOperation{};

        // use fp32 host kernel to verify bf16 device kernel
        using ReferenceGemmInstance =
            ck::tensor_operation::host::ReferenceGemm<float,
                                                      float,
                                                      float,
                                                      AElementwiseOperation,
                                                      BElementwiseOperation,
                                                      CElementwiseOperation>;
        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
            a_fp32, b_fp32, c_host_fp32, a_element_op, b_element_op, c_element_op);

        // Act
        ck::gemm_util::RunDeviceGEMM(gemmPtr,
                                     params,
                                     a_bf16,
                                     b_bf16,
                                     c_device_bf16,
                                     a_element_op,
                                     b_element_op,
                                     c_element_op);

        bf16_to_f32_(c_device_bf16, c_device_fp32);

        // Assert
        bool res = ck::utils::check_err(
            c_device_fp32.mData, c_host_fp32.mData, "Error: incorrect results!", 1e-2f, 1e-3f);
        std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;

        return res;
    }
};

} // namespace gemm_util
} // namespace ck
#endif
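A sketch of how TestGemm is typically driven from a test body (the source of gemm_ptrs is outside this header, and the names below are illustrative only):

using Row         = ck::tensor_layout::gemm::RowMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// 'gemm_ptrs' would be filled by a device-instance registration function such as
// the add_device_gemm_* helpers; that step is not part of this header.
bool ok = true;
for(auto& gemm_ptr : gemm_ptrs)
{
    ok = ck::gemm_util::TestGemm<decltype(gemm_ptr),
                                 float,       // ADataType
                                 float,       // BDataType
                                 float,       // CDataType
                                 Row,         // ALayout
                                 Row,         // BLayout
                                 Row,         // CLayout
                                 PassThrough,
                                 PassThrough,
                                 PassThrough>{}(gemm_ptr) &&
         ok;
}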