Commit 5e6cca6f authored by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents afc7d431 3956085d
@@ -29,10 +29,10 @@ set(PROFILER_SOURCE
     src/profile_gemm_bias_relu_add.cpp
     src/profile_gemm_reduce.cpp
     src/profile_batched_gemm.cpp
-    src/profile_conv_fwd.cpp
     src/profile_conv_fwd_bias_relu.cpp
     src/profile_conv_fwd_bias_relu_add.cpp
     src/profile_conv_fwd_bias_relu_atomic_add.cpp
+    src/profile_convnd_fwd.cpp
     src/profile_conv_fwd_cpu.cpp
     src/profile_convnd_bwd_data.cpp
     src/profile_reduce.cpp
@@ -44,20 +44,22 @@ set(PROFILER_SOURCE
 add_executable(ckProfiler ${PROFILER_SOURCE})

 target_link_libraries(ckProfiler PRIVATE host_tensor)
+target_link_libraries(ckProfiler PRIVATE conv_fwd_util)
 target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance)
+target_link_libraries(ckProfiler PRIVATE device_conv1d_fwd_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
+target_link_libraries(ckProfiler PRIVATE device_conv3d_fwd_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_cpu_instance)
 target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
 target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
+target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
 target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance)
 target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance)
...
#pragma once
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_conv_fwd.hpp"
#include "element_wise_operation.hpp"
#include "reference_conv_fwd.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_fwd_instance {
using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
} // namespace device_conv2d_fwd_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
template <int NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
void profile_conv_fwd_impl(int do_verification,
int init_method,
bool do_log,
int nrepeat,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
const ck::index_t Y = filter_spatial_lengths[0];
const ck::index_t X = filter_spatial_lengths[1];
const ck::index_t Hi = input_spatial_lengths[0];
const ck::index_t Wi = input_spatial_lengths[1];
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
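    // Host tensor descriptors always use logical (N, C, H, W) dimension order;
    // the physical layout (NCHW vs. NHWC and friends) is expressed purely
    // through the strides.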
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_host_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_device_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
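    // init_method 0 leaves the tensors uninitialized (useful when only timing),
    // 1 fills them with small integers so verification is bit-exact, and any
    // other value fills them with reals drawn from the ranges above.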
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(do_verification)
{
using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
auto ref_conv = ReferenceConvFwdInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
wei_k_c_y_x,
out_n_k_ho_wo_host_result,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
ref_invoker.Run(ref_argument);
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceConvFwdNoOpPtr =
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;
// add device Conv instances
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
{
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
}
else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
{
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
}
else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, bhalf_t> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, bhalf_t> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, bhalf_t>)
{
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
}
else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, int8_t> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, int8_t> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, int8_t>)
{
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
}
    if(conv_ptrs.empty())
{
throw std::runtime_error("wrong! no device Conv instance found");
}
std::string best_conv_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device Conv instances
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
auto invoker_ptr = conv_ptr->MakeInvokerPointer();
if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
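            // A direct convolution performs N * K * Ho * Wo * C * Y * X multiply-adds,
            // i.e. twice that many FLOPs. ave_time is in ms, so flop / 1e9 / ms
            // yields TFLOPS and bytes / 1e6 / ms yields GB/s below.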
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << conv_name << std::endl;
if(tflops > best_tflops)
{
best_conv_name = conv_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
out_n_k_ho_wo_host_result.mData);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
<< std::endl;
}
}
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
}
} // namespace profiler
} // namespace ck
#pragma once
namespace ck {
namespace profiler {
int profile_convnd_fwd(int argc, char* argv[]);
} // namespace profiler
} // namespace ck
@@ -380,13 +380,9 @@ void profile_reduce_impl_impl(bool do_verification,
         if(do_verification)
         {
-            using HostInDataType  = typename type_mapping<InDataType>::OutType;
-            using HostOutDataType = typename type_mapping<OutDataType>::OutType;
-            using HostAccDataType = typename type_mapping<AccDataType>::OutType;
-
-            ReductionHost<HostInDataType,
-                          HostAccDataType,
-                          HostOutDataType,
+            ReductionHost<InDataType,
+                          AccDataType,
+                          OutDataType,
                           ReduceOpId,
                           Rank,
                           NumReduceDim,
@@ -394,11 +390,8 @@ void profile_reduce_impl_impl(bool do_verification,
                           NeedIndices>
                 hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

-            hostReduce.Run(alpha,
-                           reinterpret_cast<const HostInDataType*>(in.mData.data()),
-                           beta,
-                           reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
-                           out_indices_ref.mData.data());
+            hostReduce.Run(
+                alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
         };

     const auto i_inLengths  = to_int_vector(inLengths);
...
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <half.hpp>
#include "profile_conv_fwd_impl.hpp"
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
enum struct ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum struct ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum struct ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
int profile_conv_fwd(int argc, char* argv[])
{
if(argc != 25)
{
printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
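    // Example invocation (hypothetical sizes): fp16, NHWC/KYXC/NHWK, verified,
    // timed over 5 repetitions:
    //   ./ckProfiler conv_fwd 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1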
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const auto wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const auto out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
const ck::index_t C = std::stoi(argv[12]);
const ck::index_t Y = std::stoi(argv[13]);
const ck::index_t X = std::stoi(argv[14]);
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]);
const ck::index_t conv_stride_h = std::stoi(argv[17]);
const ck::index_t conv_stride_w = std::stoi(argv[18]);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
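    // Standard dilated-convolution output size: e.g. with Hi = 71, pads 1/1,
    // Y = 3, dilation 1 (so YEff = 3) and stride 2, Ho = (71 + 1 + 1 - 3) / 2 + 1 = 36.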
if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_fwd_impl<2,
float,
float,
float,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_fwd_impl<2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else if(data_type == ConvDataType::BF16_BF16_BF16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_fwd_impl<2,
uint16_t,
uint16_t,
uint16_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else if(data_type == ConvDataType::INT8_INT8_INT8 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_fwd_impl<2,
int8_t,
int8_t,
int8_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else
{
throw std::runtime_error("wrong! this Conv data_type & layout is not implemented");
}
return 1;
}
@@ -7,6 +7,8 @@
 #include "profile_convnd_bwd_data_impl.hpp"

+namespace {
+
 enum struct ConvDataType
 {
     F32_F32_F32, // 0
@@ -76,6 +78,8 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[],
     return params;
 }

+} // namespace
+
 int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
 {
     const int preParams = 10;
...
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <half.hpp>
#include "conv_fwd_util.hpp"
#include "element_wise_operation.hpp"
#include "fill.hpp"
#include "profile_convnd_fwd.hpp"
#include "tensor_layout.hpp"
namespace {
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
enum struct ConvDataLayout
{
NCHW, // 0
NHWC, // 1
};
namespace ctl = ck::tensor_layout::convolution;
template <int NDim, ConvDataLayout DataLayout>
struct ConvolutionLayouts;
template <>
struct ConvolutionLayouts<1, ConvDataLayout::NHWC>
{
typedef ctl::NWC Input;
typedef ctl::KXC Weight;
typedef ctl::NWK Output;
};
template <>
struct ConvolutionLayouts<2, ConvDataLayout::NHWC>
{
typedef ctl::NHWC Input;
typedef ctl::KYXC Weight;
typedef ctl::NHWK Output;
};
template <>
struct ConvolutionLayouts<3, ConvDataLayout::NHWC>
{
typedef ctl::NDHWC Input;
typedef ctl::KZYXC Weight;
typedef ctl::NDHWK Output;
};
template <>
struct ConvolutionLayouts<1, ConvDataLayout::NCHW>
{
typedef ctl::NCW Input;
typedef ctl::KCX Weight;
typedef ctl::NKW Output;
};
template <>
struct ConvolutionLayouts<2, ConvDataLayout::NCHW>
{
typedef ctl::NCHW Input;
typedef ctl::KCYX Weight;
typedef ctl::NKHW Output;
};
template <>
struct ConvolutionLayouts<3, ConvDataLayout::NCHW>
{
typedef ctl::NCDHW Input;
typedef ctl::KCZYX Weight;
typedef ctl::NKDHW Output;
};
void print_use_msg()
{
std::cout << "arg1: tensor operation (conv_fwd: ForwardConvolution)\n"
<< "arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n"
<< "arg3: data layout (0: NCHW; 1: NHWC)\n"
<< "arg4: verification (0=no, 1=yes)\n"
<< "arg5: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: run kernel # of times (>1)\n"
<< "arg8: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
<< " <right padding>, (ie RightPy, RightPx for 2D)\n"
<< std::endl;
}
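// Example invocation matching the usage above (hypothetical sizes): a 2D fp16
// NHWC problem, verified and timed over 100 repetitions:
//   ./ckProfiler conv_fwd 1 1 1 2 0 100 2 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1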
ck::utils::conv::ConvParams parse_params(int num_dim_spatial, int argc, char* argv[])
{
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + 9;
if(cmdline_nargs != argc)
{
print_use_msg();
exit(1);
}
int arg_idx = 9;
return ck::utils::conv::parse_conv_params(num_dim_spatial, arg_idx, argv);
}
template <int NDim,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename ConvLayouts>
void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
bool do_verification,
bool do_log,
int nrepeat,
int init_method,
ConvLayouts)
{
using namespace std::placeholders;
using namespace ck::utils;
std::unique_ptr<OpInstance<OutDataType, InDataType, WeiDataType>> conv_instance;
switch(init_method)
{
case 0:
conv_instance =
std::make_unique<conv::ConvFwdOpInstance<InDataType,
WeiDataType,
OutDataType,
typename ConvLayouts::Input,
typename ConvLayouts::Weight,
typename ConvLayouts::Output>>(params, false);
break;
case 1:
conv_instance = std::make_unique<
conv::ConvFwdOpInstance<InDataType,
WeiDataType,
OutDataType,
typename ConvLayouts::Input,
typename ConvLayouts::Weight,
typename ConvLayouts::Output,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::utils::FillUniform<int>,
ck::utils::FillUniform<int>>>(
params, true, ck::utils::FillUniform<int>{}, ck::utils::FillUniform<int>{});
break;
case 2:
conv_instance = std::make_unique<
conv::ConvFwdOpInstance<InDataType,
WeiDataType,
OutDataType,
typename ConvLayouts::Input,
typename ConvLayouts::Weight,
typename ConvLayouts::Output,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::utils::FillUniform<InDataType>,
ck::utils::FillUniform<WeiDataType>>>(
params,
true,
ck::utils::FillUniform<InDataType>{},
ck::utils::FillUniform<WeiDataType>{});
break;
default: throw std::runtime_error("Unsupported init method!");
}
auto reference_conv_fwd_fun = std::bind(
conv::run_reference_convolution_forward<NDim, InDataType, WeiDataType, OutDataType>,
params,
_1,
_2,
_3);
OpInstanceRunEngine<InDataType, WeiDataType, OutDataType> run_engine(*conv_instance,
reference_conv_fwd_fun);
auto best_conf = run_engine.Profile(
conv::ConvolutionFwdInstances<InDataType, WeiDataType, OutDataType>::template Get<NDim>(),
nrepeat,
do_verification,
do_log);
std::cout << "Best configuration parameters:"
<< "\nname: " << best_conf.best_op_name << "\navg_time: " << best_conf.best_avg_time
<< "\ntflops: " << best_conf.best_tflops << "\nGB/s: " << best_conf.best_gb_per_sec
<< std::endl;
}
template <int NDim>
void profile_convnd_instances(ConvDataType data_type,
ConvDataLayout data_layout,
const ck::utils::conv::ConvParams& params,
bool do_verification,
bool do_log,
int nrepeat,
int init_method)
{
switch(data_layout)
{
case ConvDataLayout::NHWC: {
switch(data_type)
{
case ConvDataType::F32_F32_F32:
profile_convnd_instances_impl<NDim, float, float, float>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
case ConvDataType::F16_F16_F16:
profile_convnd_instances_impl<NDim, ck::half_t, ck::half_t, ck::half_t>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
case ConvDataType::BF16_BF16_BF16:
profile_convnd_instances_impl<NDim, ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
case ConvDataType::INT8_INT8_INT8:
profile_convnd_instances_impl<NDim, int8_t, int8_t, int8_t>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
}
break;
}
case ConvDataLayout::NCHW: {
switch(data_type)
{
case ConvDataType::F32_F32_F32:
profile_convnd_instances_impl<NDim, float, float, float>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
case ConvDataType::F16_F16_F16:
profile_convnd_instances_impl<NDim, ck::half_t, ck::half_t, ck::half_t>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
case ConvDataType::BF16_BF16_BF16:
profile_convnd_instances_impl<NDim, ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
case ConvDataType::INT8_INT8_INT8:
profile_convnd_instances_impl<NDim, int8_t, int8_t, int8_t>(
params,
do_verification,
do_log,
nrepeat,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
}
break;
}
}
}
} // namespace
int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
{
using namespace ck::utils::conv;
ConvDataType data_type{ConvDataType::F32_F32_F32};
ConvDataLayout data_layout{ConvDataLayout::NHWC};
bool do_verification{true};
int init_method{2};
bool do_log{false};
int nrepeat{100};
int num_dim_spatial{2};
ConvParams params;
if(argc >= 4)
{
data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
data_layout = static_cast<ConvDataLayout>(std::stoi(argv[3]));
}
if(argc >= 9)
{
do_verification = std::stoi(argv[4]);
init_method = std::stoi(argv[5]);
do_log = std::stoi(argv[6]);
nrepeat = std::stoi(argv[7]);
num_dim_spatial = std::stoi(argv[8]);
}
if(argc >= 10)
{
params = parse_params(num_dim_spatial, argc, argv);
}
    // TODO: Print a nice message describing what is being profiled.
switch(num_dim_spatial)
{
case 1:
profile_convnd_instances<1>(
data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
break;
case 2:
profile_convnd_instances<2>(
data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
break;
case 3:
profile_convnd_instances<3>(
data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
break;
default:
throw std::runtime_error("profile_conv_fwd: unsupported num_dim_spatial value: " +
std::to_string(num_dim_spatial));
}
return 1;
}
@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <cstring>

+#include "profile_convnd_fwd.hpp"
+
 int profile_gemm(int, char*[]);
 int profile_gemm_bias_2d(int, char*[]);
 int profile_gemm_bias_relu(int, char*[]);
@@ -11,7 +13,6 @@ int profile_gemm_bias_relu_add(int, char*[]);
 int profile_gemm_reduce(int, char*[]);
 int profile_batched_gemm(int, char*[]);
 int profile_grouped_gemm(int, char*[]);
-int profile_conv_fwd(int, char*[]);
 int profile_conv_fwd_bias_relu(int, char*[]);
 int profile_conv_fwd_bias_relu_add(int, char*[]);
 int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
@@ -56,7 +57,7 @@ int main(int argc, char* argv[])
     }
     else if(strcmp(argv[1], "conv_fwd") == 0)
     {
-        return profile_conv_fwd(argc, argv);
+        return ck::profiler::profile_convnd_fwd(argc, argv);
     }
     else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0)
     {
...
-find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
+#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
+git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
@@ -10,7 +10,7 @@ cmake
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
 -D BUILD_DEV=OFF \
 -D CMAKE_BUILD_TYPE=Release \
--D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \
+-D CMAKE_CXX_FLAGS=" -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
 -D CMAKE_PREFIX_PATH=/opt/rocm \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
...
@@ -4,5 +4,4 @@ include_directories(BEFORE
 )

 add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
-target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor)
-target_link_libraries(test_conv2d_bwd_weight PRIVATE device_conv2d_bwd_weight_instance)
+target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_fwd_util)

 add_test_executable(test_conv_util conv_util.cpp)
-target_link_libraries(test_conv_util PRIVATE host_tensor)
+target_link_libraries(test_conv_util PRIVATE host_tensor conv_fwd_util)
@@ -4,5 +4,4 @@ include_directories(BEFORE
 )

 add_test_executable(test_convnd_bwd_data convnd_bwd_data.cpp)
-target_link_libraries(test_convnd_bwd_data PRIVATE host_tensor)
-target_link_libraries(test_convnd_bwd_data PRIVATE device_convnd_bwd_data_instance)
+target_link_libraries(test_convnd_bwd_data PRIVATE host_tensor device_convnd_bwd_data_instance conv_fwd_util)

 add_custom_target(test_convnd_fwd)

 add_test_executable(test_conv1d_fwd conv1d_fwd.cpp)
-target_link_libraries(test_conv1d_fwd PRIVATE host_tensor)
-target_link_libraries(test_conv1d_fwd PRIVATE device_conv1d_fwd_instance)
+target_link_libraries(test_conv1d_fwd PRIVATE host_tensor device_conv1d_fwd_instance conv_fwd_util)
+target_link_libraries(test_conv1d_fwd PRIVATE )
 add_dependencies(test_convnd_fwd test_conv1d_fwd)

 add_test_executable(test_conv2d_fwd conv2d_fwd.cpp)
-target_link_libraries(test_conv2d_fwd PRIVATE host_tensor)
-target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance)
+target_link_libraries(test_conv2d_fwd PRIVATE host_tensor device_conv2d_fwd_instance conv_fwd_util)
 add_dependencies(test_convnd_fwd test_conv2d_fwd)

 add_test_executable(test_conv3d_fwd conv3d_fwd.cpp)
-target_link_libraries(test_conv3d_fwd PRIVATE host_tensor)
-target_link_libraries(test_conv3d_fwd PRIVATE device_conv3d_fwd_instance)
+target_link_libraries(test_conv3d_fwd PRIVATE host_tensor device_conv3d_fwd_instance conv_fwd_util)
 add_dependencies(test_convnd_fwd test_conv3d_fwd)
@@ -7,37 +7,15 @@
 #include "element_wise_operation.hpp"
 #include "conv_fwd_util.hpp"
 #include "conv_util.hpp"
-#include "host_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "check_err.hpp"
-
-// Forward declarations for conv instances.
-using DeviceConvFwdNoOpPtr =
-    ck::tensor_operation::device::DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough>;
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_conv1d_fwd_instance {
-
-void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-
-} // namespace device_conv1d_fwd_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-
 namespace {

 bool test_conv1D_nwc()
 {
-    bool res{true};
+    using namespace std::placeholders;
+    using namespace ck::utils;
+    namespace ctl = ck::tensor_layout::convolution;
+
     ck::utils::conv::ConvParams params;
     params.num_dim_spatial = 1;
     params.N               = 2;
@@ -50,30 +28,26 @@ bool test_conv1D_nwc()
     params.input_left_pads  = std::vector<ck::index_t>{1};
     params.input_right_pads = std::vector<ck::index_t>{1};

-    auto host_tensors =
-        ck::utils::conv::get_host_tensors<float,
-                                          float,
-                                          float,
-                                          ck::tensor_layout::convolution::NWC,
-                                          ck::tensor_layout::convolution::KXC,
-                                          ck::tensor_layout::convolution::NWK>(params);
-    const Tensor<float>& input   = std::get<0>(host_tensors);
-    const Tensor<float>& weights = std::get<1>(host_tensors);
-    Tensor<float>& host_output   = std::get<2>(host_tensors);
-    Tensor<float>& device_output = std::get<3>(host_tensors);
-
-    ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output);
-    test::conv::RunConv<1>(params, input, weights, device_output);
-    res = res &&
-          ck::utils::check_err(
-              device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
-    return res;
+    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
+    test::conv::get_test_convolution_fwd_instance<1>(conv_ptrs);
+    conv::ConvFwdOpInstance<float, float, float, ctl::NWC, ctl::KCX, ctl::NWK> conv_instance(
+        params);
+
+    auto reference_conv_fwd_fun = std::bind(
+        conv::run_reference_convolution_forward<1, float, float, float>, params, _1, _2, _3);
+    OpInstanceRunEngine<float, float, float> run_engine(conv_instance, reference_conv_fwd_fun);
+    run_engine.SetAtol(1e-5);
+    run_engine.SetRtol(1e-4);
+    return run_engine.Test(conv_ptrs);
 }

 template <typename T>
-bool test_conv1d_nwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
+bool test_conv1d_nwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPtr>& conv_ptrs)
 {
+    using namespace std::placeholders;
+    using namespace ck::utils;
+    namespace ctl = ck::tensor_layout::convolution;
+
     ck::utils::conv::ConvParams params;
     params.num_dim_spatial        = 1;
     params.filter_spatial_lengths = std::vector<ck::index_t>{3};
@@ -83,52 +57,36 @@ bool test_conv1d_nwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptr
     params.input_left_pads  = std::vector<ck::index_t>{1};
     params.input_right_pads = std::vector<ck::index_t>{1};

-    auto host_tensors =
-        ck::utils::conv::get_host_tensors<T,
-                                          T,
-                                          T,
-                                          ck::tensor_layout::convolution::NWC,
-                                          ck::tensor_layout::convolution::KXC,
-                                          ck::tensor_layout::convolution::NWK>(params);
-    const Tensor<T>& input   = std::get<0>(host_tensors);
-    const Tensor<T>& weights = std::get<1>(host_tensors);
-    Tensor<T>& host_output   = std::get<2>(host_tensors);
-    Tensor<T>& device_output = std::get<3>(host_tensors);
-
-    ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output);
-    return ck::utils::conv::run_convolution_forward_instances<1>(
-        params, conv_ptrs, input, weights, device_output, host_output);
+    conv::ConvFwdOpInstance<T, T, T, ctl::NWC, ctl::KCX, ctl::NWK> conv_instance(params);
+
+    auto reference_conv_fwd_fun =
+        std::bind(conv::run_reference_convolution_forward<1, T, T, T>, params, _1, _2, _3);
+    OpInstanceRunEngine<T, T, T> run_engine(conv_instance, reference_conv_fwd_fun);
+    return run_engine.Test(conv_ptrs);
 }

 bool test_conv1d_nwc_bf16_instances()
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv1d_fwd_instance::
-        add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
-    return test_conv1d_nwc_instances<ck::bhalf_t>(conv_ptrs);
+    return test_conv1d_nwc_instances<ck::bhalf_t>(
+        ck::utils::conv::ConvolutionFwdInstances<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>::Get<1>());
 }

 bool test_conv1d_nwc_f16_instances()
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv1d_fwd_instance::
-        add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
-    return test_conv1d_nwc_instances<ck::half_t>(conv_ptrs);
+    return test_conv1d_nwc_instances<ck::half_t>(
+        ck::utils::conv::ConvolutionFwdInstances<ck::half_t, ck::half_t, ck::half_t>::Get<1>());
 }

 bool test_conv1d_nwc_f32_instances()
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv1d_fwd_instance::
-        add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
-    return test_conv1d_nwc_instances<float>(conv_ptrs);
+    return test_conv1d_nwc_instances<float>(
+        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<1>());
 }

 bool test_conv1d_nwc_int8_instances()
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv1d_fwd_instance::
-        add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs);
-    return test_conv1d_nwc_instances<int8_t>(conv_ptrs);
+    return test_conv1d_nwc_instances<int8_t>(
+        ck::utils::conv::ConvolutionFwdInstances<int8_t, int8_t, int8_t>::Get<1>());
 }

 } // anonymous namespace
@@ -149,7 +107,7 @@ int main()
     std::cout << "\ntest_conv1d_nwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE")
               << std::endl;
     res = test_conv1d_nwc_int8_instances();
-    std::cout << "\ntes_tconv1_dnw_cint_8instances ..... " << (res ? "SUCCESS" : "FAILURE")
+    std::cout << "\ntest_conv1d_nwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE")
               << std::endl;

     return res ? 0 : 1;
...
 #include <half.hpp>
 #include <iostream>
-#include <stdexcept>
 #include <tuple>
 #include <vector>
@@ -8,38 +7,14 @@
 #include "element_wise_operation.hpp"
 #include "conv_fwd_util.hpp"
 #include "conv_util.hpp"
-#include "host_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "check_err.hpp"
-
-// Forward declarations for conv instances.
-using DeviceConvFwdNoOpPtr =
-    ck::tensor_operation::device::DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough>;
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_conv2d_fwd_instance {
-
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
-    std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-
-} // namespace device_conv2d_fwd_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-
 namespace {

 bool test_conv2d_nhwc()
 {
-    bool res{true};
+    using namespace std::placeholders;
+    using namespace ck::utils;
+
     ck::utils::conv::ConvParams params;
     params.N = 2;
     params.K = 16;
@@ -47,25 +22,25 @@ bool test_conv2d_nhwc()
     params.input_spatial_lengths = std::vector<ck::index_t>{16, 16};
     params.conv_filter_strides   = std::vector<ck::index_t>{1, 1};

-    auto host_tensors            = ck::utils::conv::get_host_tensors(params);
-    const Tensor<float>& input   = std::get<0>(host_tensors);
-    const Tensor<float>& weights = std::get<1>(host_tensors);
-    Tensor<float>& host_output   = std::get<2>(host_tensors);
-    Tensor<float>& device_output = std::get<3>(host_tensors);
-
-    ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output);
-    test::conv::RunConv<2>(params, input, weights, device_output);
-    res = res &&
-          ck::utils::check_err(
-              device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
-
-    return res;
+    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
+    test::conv::get_test_convolution_fwd_instance<2>(conv_ptrs);
+    conv::ConvFwdOpInstance<float, float, float> conv_instance(params);
+
+    auto reference_conv_fwd_fun = std::bind(
+        conv::run_reference_convolution_forward<2, float, float, float>, params, _1, _2, _3);
+    OpInstanceRunEngine<float, float, float> run_engine(conv_instance, reference_conv_fwd_fun);
+    run_engine.SetAtol(1e-5);
+    run_engine.SetRtol(1e-4);
+    return run_engine.Test(conv_ptrs);
 }

 template <typename T>
-bool test_conv2d_nhwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
+bool test_conv2d_nhwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPtr>& conv_ptrs)
 {
-    ck::utils::conv::ConvParams params;
+    using namespace std::placeholders;
+    using namespace ck::utils;
+
+    conv::ConvParams params;
     params.num_dim_spatial        = 2;
     params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
     params.input_spatial_lengths  = std::vector<ck::index_t>{71, 71};
@@ -74,55 +49,36 @@ bool test_conv2d_nhwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_pt
     params.input_left_pads  = std::vector<ck::index_t>{1, 1};
     params.input_right_pads = std::vector<ck::index_t>{1, 1};

-    auto host_tensors =
-        ck::utils::conv::get_host_tensors<T,
-                                          T,
-                                          T,
-                                          ck::tensor_layout::convolution::NHWC,
-                                          ck::tensor_layout::convolution::KYXC,
-                                          ck::tensor_layout::convolution::NHWK>(params);
-    const Tensor<T>& input   = std::get<0>(host_tensors);
-    const Tensor<T>& weights = std::get<1>(host_tensors);
-    Tensor<T>& host_output   = std::get<2>(host_tensors);
-    Tensor<T>& device_output = std::get<3>(host_tensors);
-
-    ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output);
-    return ck::utils::conv::run_convolution_forward_instances<2>(
-        params, conv_ptrs, input, weights, device_output, host_output);
+    conv::ConvFwdOpInstance<T, T, T> conv_instance(params);
+
+    auto reference_conv_fwd_fun =
+        std::bind(conv::run_reference_convolution_forward<2, T, T, T>, params, _1, _2, _3);
+    OpInstanceRunEngine<T, T, T> run_engine(conv_instance, reference_conv_fwd_fun);
+    return run_engine.Test(conv_ptrs);
 }

 bool test_conv2d_nhwc_bf16_instances()
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv2d_fwd_instance::
-        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
-    return test_conv2d_nhwc_instances<ck::bhalf_t>(conv_ptrs);
+    return test_conv2d_nhwc_instances<ck::bhalf_t>(
+        ck::utils::conv::ConvolutionFwdInstances<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>::Get<2>());
 }

 bool test_conv2d_nhwc_f16_instances()
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv2d_fwd_instance::
-        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-    ck::tensor_operation::device::device_conv2d_fwd_instance::
-        add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-    return test_conv2d_nhwc_instances<ck::half_t>(conv_ptrs);
+    return test_conv2d_nhwc_instances<ck::half_t>(
+        ck::utils::conv::ConvolutionFwdInstances<ck::half_t, ck::half_t, ck::half_t>::Get<2>());
 }

 bool test_conv2d_nhwc_f32_instances()
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv2d_fwd_instance::
-        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
-    return test_conv2d_nhwc_instances<float>(conv_ptrs);
+    return test_conv2d_nhwc_instances<float>(
+        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<2>());
 }

 bool test_conv2d_nhwc_int8_instances()
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv2d_fwd_instance::
-        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
-    return test_conv2d_nhwc_instances<int8_t>(conv_ptrs);
+    return test_conv2d_nhwc_instances<int8_t>(
+        ck::utils::conv::ConvolutionFwdInstances<int8_t, int8_t, int8_t>::Get<2>());
 }

 } // anonymous namespace
...
...@@ -8,37 +8,16 @@ ...@@ -8,37 +8,16 @@
#include "element_wise_operation.hpp" #include "element_wise_operation.hpp"
#include "conv_fwd_util.hpp" #include "conv_fwd_util.hpp"
#include "conv_util.hpp" #include "conv_util.hpp"
#include "host_tensor.hpp"
#include "tensor_layout.hpp"
#include "check_err.hpp"
// Forward declarations for conv instances.
using DeviceConvFwdNoOpPtr =
ck::tensor_operation::device::DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv3d_fwd_instance {
void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
} // namespace device_conv3d_fwd_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace { namespace {
bool test_conv3d_ndhwc() bool test_conv3d_ndhwc()
{ {
bool res{true}; using namespace std::placeholders;
ck::utils::conv::ConvParams params; using namespace ck::utils;
namespace ctl = ck::tensor_layout::convolution;
conv::ConvParams params;
params.num_dim_spatial = 3; params.num_dim_spatial = 3;
params.N = 2; params.N = 2;
params.K = 16; params.K = 16;
...@@ -50,31 +29,26 @@ bool test_conv3d_ndhwc() ...@@ -50,31 +29,26 @@ bool test_conv3d_ndhwc()
params.input_left_pads = std::vector<ck::index_t>{1, 1, 1}; params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
params.input_right_pads = std::vector<ck::index_t>{1, 1, 1}; params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};
auto host_tensors = std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
ck::utils::conv::get_host_tensors<float, test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);
float, conv::ConvFwdOpInstance<float, float, float, ctl::NDHWC, ctl::KZYXC, ctl::NDHWK> conv_instance(
float, params);
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC, auto reference_conv_fwd_fun = std::bind(
ck::tensor_layout::convolution::NDHWK>(params); conv::run_reference_convolution_forward<3, float, float, float>, params, _1, _2, _3);
const Tensor<float>& input = std::get<0>(host_tensors); OpInstanceRunEngine<float, float, float> run_engine(conv_instance, reference_conv_fwd_fun);
const Tensor<float>& weights = std::get<1>(host_tensors); run_engine.SetAtol(1e-5);
Tensor<float>& host_output = std::get<2>(host_tensors); run_engine.SetRtol(1e-4);
Tensor<float>& device_output = std::get<3>(host_tensors); return run_engine.Test(conv_ptrs);
ck::utils::conv::run_reference_convolution_forward<3>(params, input, weights, host_output);
test::conv::RunConv<3>(params, input, weights, device_output);
res = res &&
ck::utils::check_err(
device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
return res;
} }
bool test_conv3d_ndhwc_2gb_input() bool test_conv3d_ndhwc_2gb_input()
{ {
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using namespace ck::utils;
// >2GB Input // >2GB Input
ck::utils::conv::ConvParams params; conv::ConvParams params;
params.num_dim_spatial = 3; params.num_dim_spatial = 3;
params.N = 2; params.N = 2;
params.K = 16; params.K = 16;
...@@ -86,39 +60,35 @@ bool test_conv3d_ndhwc_2gb_input() ...@@ -86,39 +60,35 @@ bool test_conv3d_ndhwc_2gb_input()
params.input_left_pads = std::vector<ck::index_t>{1, 1, 1}; params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
params.input_right_pads = std::vector<ck::index_t>{1, 1, 1}; params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};
auto host_tensors = std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
ck::utils::conv::get_host_tensors<float, test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);
float,
float, auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr,
ck::tensor_layout::convolution::NDHWC, nullptr,
ck::tensor_layout::convolution::KZYXC, nullptr,
ck::tensor_layout::convolution::NDHWK>(params, false); params.N,
const Tensor<float>& input = std::get<0>(host_tensors); params.K,
const Tensor<float>& weights = std::get<1>(host_tensors); params.C,
Tensor<float>& device_output = std::get<3>(host_tensors); params.input_spatial_lengths,
params.filter_spatial_lengths,
try params.GetOutputSpatialLengths(),
{ params.conv_filter_strides,
test::conv::RunConv<3>(params, input, weights, device_output); params.conv_filter_dilations,
} params.input_left_pads,
catch(const std::runtime_error& err) params.input_right_pads,
{ PassThrough{},
std::string err_msg{"Error! device_conv with the specified compilation parameters does " PassThrough{},
"not support this Conv problem"}; PassThrough{});
if(err.what() != err_msg) return !(conv_ptrs.back()->IsSupportedArgument(arg.get()));
{
return false;
}
return true;
}
std::cout << "Error: Failure checking oversized tensor!" << std::endl;
return false;
} }
bool test_conv3d_ndhwc_2gb_filters() bool test_conv3d_ndhwc_2gb_filters()
{ {
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using namespace ck::utils;
// >2GB Filters // >2GB Filters
ck::utils::conv::ConvParams params; conv::ConvParams params;
params.num_dim_spatial = 3; params.num_dim_spatial = 3;
params.N = 2; params.N = 2;
params.K = 16; params.K = 16;
...@@ -130,39 +100,35 @@ bool test_conv3d_ndhwc_2gb_filters() ...@@ -130,39 +100,35 @@ bool test_conv3d_ndhwc_2gb_filters()
params.input_left_pads = std::vector<ck::index_t>{1, 1, 1}; params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
params.input_right_pads = std::vector<ck::index_t>{1, 1, 1}; params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};
auto host_tensors = std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
ck::utils::conv::get_host_tensors<float, test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);
float,
float, auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr,
ck::tensor_layout::convolution::NDHWC, nullptr,
ck::tensor_layout::convolution::KZYXC, nullptr,
ck::tensor_layout::convolution::NDHWK>(params, false); params.N,
const Tensor<float>& input = std::get<0>(host_tensors); params.K,
const Tensor<float>& weights = std::get<1>(host_tensors); params.C,
Tensor<float>& device_output = std::get<3>(host_tensors); params.input_spatial_lengths,
params.filter_spatial_lengths,
try params.GetOutputSpatialLengths(),
{ params.conv_filter_strides,
test::conv::RunConv<3>(params, input, weights, device_output); params.conv_filter_dilations,
} params.input_left_pads,
catch(const std::runtime_error& err) params.input_right_pads,
{ PassThrough{},
std::string err_msg{"Error! device_conv with the specified compilation parameters does " PassThrough{},
"not support this Conv problem"}; PassThrough{});
if(err.what() != err_msg) return !(conv_ptrs.back()->IsSupportedArgument(arg.get()));
{
return false;
}
return true;
}
std::cout << "Error: Failure checking oversized tensor!" << std::endl;
return false;
} }
 bool test_conv3d_ndhwc_2gb_output()
 {
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    using namespace ck::utils;
+
     // >2GB Output
-    ck::utils::conv::ConvParams params;
+    conv::ConvParams params;
     params.num_dim_spatial = 3;
     params.N               = 2;
     params.K               = 16;
@@ -174,39 +140,35 @@ bool test_conv3d_ndhwc_2gb_output()
     params.input_left_pads  = std::vector<ck::index_t>{2, 2, 2};
     params.input_right_pads = std::vector<ck::index_t>{2, 2, 2};
 
-    auto host_tensors =
-        ck::utils::conv::get_host_tensors<float,
-                                          float,
-                                          float,
-                                          ck::tensor_layout::convolution::NDHWC,
-                                          ck::tensor_layout::convolution::KZYXC,
-                                          ck::tensor_layout::convolution::NDHWK>(params, false);
-    const Tensor<float>& input   = std::get<0>(host_tensors);
-    const Tensor<float>& weights = std::get<1>(host_tensors);
-    Tensor<float>& device_output = std::get<3>(host_tensors);
-
-    try
-    {
-        test::conv::RunConv<3>(params, input, weights, device_output);
-    }
-    catch(const std::runtime_error& err)
-    {
-        std::string err_msg{"Error! device_conv with the specified compilation parameters does "
-                            "not support this Conv problem"};
-        if(err.what() != err_msg)
-        {
-            return false;
-        }
-        return true;
-    }
-    std::cout << "Error: Failure checking oversized tensor!" << std::endl;
-    return false;
+    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
+    test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);
+
+    auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr,
+                                                     nullptr,
+                                                     nullptr,
+                                                     params.N,
+                                                     params.K,
+                                                     params.C,
+                                                     params.input_spatial_lengths,
+                                                     params.filter_spatial_lengths,
+                                                     params.GetOutputSpatialLengths(),
+                                                     params.conv_filter_strides,
+                                                     params.conv_filter_dilations,
+                                                     params.input_left_pads,
+                                                     params.input_right_pads,
+                                                     PassThrough{},
+                                                     PassThrough{},
+                                                     PassThrough{});
+    return !(conv_ptrs.back()->IsSupportedArgument(arg.get()));
 }
 template <typename T>
-bool test_conv3d_ndhwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
+bool test_conv3d_ndhwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPtr>& conv_ptrs)
 {
-    ck::utils::conv::ConvParams params;
+    using namespace std::placeholders;
+    using namespace ck::utils;
+    namespace ctl = ck::tensor_layout::convolution;
+
+    conv::ConvParams params;
     params.N               = 64;
     params.num_dim_spatial = 3;
     params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 2};
@@ -216,53 +178,36 @@ bool test_conv3d_ndhwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_p
     params.input_left_pads  = std::vector<ck::index_t>{1, 1, 1};
     params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};
 
-    auto host_tensors =
-        ck::utils::conv::get_host_tensors<T,
-                                          T,
-                                          T,
-                                          ck::tensor_layout::convolution::NDHWC,
-                                          ck::tensor_layout::convolution::KZYXC,
-                                          ck::tensor_layout::convolution::NDHWK>(params);
-    const Tensor<T>& input   = std::get<0>(host_tensors);
-    const Tensor<T>& weights = std::get<1>(host_tensors);
-    Tensor<T>& host_output   = std::get<2>(host_tensors);
-    Tensor<T>& device_output = std::get<3>(host_tensors);
-
-    ck::utils::conv::run_reference_convolution_forward<3>(params, input, weights, host_output);
-    return ck::utils::conv::run_convolution_forward_instances<3>(
-        params, conv_ptrs, input, weights, device_output, host_output);
+    conv::ConvFwdOpInstance<T, T, T, ctl::NDHWC, ctl::KZYXC, ctl::NDHWK> conv_instance(params);
+
+    auto reference_conv_fwd_fun =
+        std::bind(conv::run_reference_convolution_forward<3, T, T, T>, params, _1, _2, _3);
+    OpInstanceRunEngine<T, T, T> run_engine(conv_instance, reference_conv_fwd_fun);
+    return run_engine.Test(conv_ptrs);
 }
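
Reviewer note: the std::bind above fixes `params` and leaves three placeholders for the tensors the run engine supplies, i.e. the engine invokes the reference as `ref(input, weights, host_output)`. If preferred, an equivalent and arguably clearer lambda (a sketch; same helpers as in this diff, written inside the same function template so `T` is in scope):

// Equivalent to the std::bind expression: params is captured by value,
// and _1.._3 become the generic lambda parameters.
auto reference_conv_fwd_fun = [params](const auto& input,
                                       const auto& weights,
                                       auto& host_output) {
    ck::utils::conv::run_reference_convolution_forward<3, T, T, T>(
        params, input, weights, host_output);
};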
 bool test_conv3d_ndhwc_bf16_instances()
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv3d_fwd_instance::
-        add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
-    return test_conv3d_ndhwc_instances<ck::bhalf_t>(conv_ptrs);
+    return test_conv3d_ndhwc_instances<ck::bhalf_t>(
+        ck::utils::conv::ConvolutionFwdInstances<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>::Get<3>());
 }
 
 bool test_conv3d_ndhwc_f16_instances()
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv3d_fwd_instance::
-        add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
-    return test_conv3d_ndhwc_instances<ck::half_t>(conv_ptrs);
+    return test_conv3d_ndhwc_instances<ck::half_t>(
+        ck::utils::conv::ConvolutionFwdInstances<ck::half_t, ck::half_t, ck::half_t>::Get<3>());
 }
 
 bool test_conv3d_ndhwc_f32_instances()
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv3d_fwd_instance::
-        add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
-    return test_conv3d_ndhwc_instances<float>(conv_ptrs);
+    return test_conv3d_ndhwc_instances<float>(
+        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<3>());
 }
 
 bool test_conv3d_ndhwc_int8_instances()
 {
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-    ck::tensor_operation::device::device_conv3d_fwd_instance::
-        add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs);
-    return test_conv3d_ndhwc_instances<int8_t>(conv_ptrs);
+    return test_conv3d_ndhwc_instances<int8_t>(
+        ck::utils::conv::ConvolutionFwdInstances<int8_t, int8_t, int8_t>::Get<3>());
 }
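
Reviewer note: after this change the four wrappers differ only in the element type, so they could optionally collapse into one helper. A sketch under that assumption (the helper name is hypothetical):

// Hypothetical consolidation of the four per-type tests above.
template <typename T>
bool test_conv3d_ndhwc_type_instances()
{
    return test_conv3d_ndhwc_instances<T>(
        ck::utils::conv::ConvolutionFwdInstances<T, T, T>::Get<3>());
}
// Usage, e.g.: test_conv3d_ndhwc_type_instances<ck::half_t>();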
 } // anonymous namespace
 
@@ -293,7 +238,7 @@ int main()
     std::cout << "\ntest_conv3d_ndhwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE")
               << std::endl;
     res = test_conv3d_ndhwc_int8_instances();
-    std::cout << "\ntest_conv3d_ndhw_cint_8instances ..... " << (res ? "SUCCESS" : "FAILURE")
+    std::cout << "\ntest_conv3d_ndhwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE")
               << std::endl;
 
     return res ? 0 : 1;
...
@@ -10,7 +10,8 @@
 #include "host_tensor.hpp"
 #include "sequence.hpp"
 
-namespace {
+namespace test {
+namespace conv {
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -19,6 +20,9 @@
 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
 
+using DeviceConvFwdNoOpPtr =
+    ck::tensor_operation::device::DeviceConvFwdPtr<InElementOp, WeiElementOp, OutElementOp>;
+
 static constexpr auto ConvFwdDefault =
     ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
@@ -62,26 +66,14 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::
         1>; // CThreadTransferDstScalarPerVector
 // clang-format on
 
-} // namespace
-
-namespace test {
-namespace conv {
-
 template <ck::index_t NDim,
           typename InDataType  = float,
           typename WeiDataType = float,
           typename OutDataType = float>
-void RunConv(const ck::utils::conv::ConvParams& params,
-             const Tensor<InDataType>& input,
-             const Tensor<WeiDataType>& weights,
-             Tensor<OutDataType>& output)
+void get_test_convolution_fwd_instance(std::vector<DeviceConvFwdNoOpPtr>& instances)
 {
-    ck::utils::conv::run_convolution_forward<NDim,
-                                             InDataType,
-                                             WeiDataType,
-                                             OutDataType,
-                                             DeviceConvNDFwdInstance>(
-        params, input, weights, output);
+    using ConvInstanceT = DeviceConvNDFwdInstance<NDim, InDataType, WeiDataType, OutDataType>;
+    instances.emplace_back(std::make_unique<ConvInstanceT>());
 }
 
 } // namespace conv
...
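
Reviewer note: DeviceConvNDFwdInstance was previously hard-wired inside RunConv; the factory above instead hands instances back to the caller. Typical use, exactly as the tests in this commit do it:

// Collect the single test instance for 3-D fp32 forward convolution.
std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);
// conv_ptrs.back() now owns a DeviceConvNDFwdInstance<3, float, float, float>.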
@@ -37,19 +37,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
     return invariantDims;
 };
 
-// map the data type used by the GPU kernels to the corresponding type used by the host codes
-template <typename InType>
-struct type_mapping
-{
-    using OutType = InType;
-};
-
-template <>
-struct type_mapping<ck::half_t>
-{
-    using OutType = half_float::half;
-};
-
 constexpr int Rank = 4;
 constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG;
@@ -226,13 +213,9 @@ bool test_reduce_no_index_impl(int init_method,
     bool result = true;
 
-    using HostInDataType  = typename type_mapping<InDataType>::OutType;
-    using HostOutDataType = typename type_mapping<OutDataType>::OutType;
-    using HostAccDataType = typename type_mapping<AccDataType>::OutType;
-
-    ReductionHost<HostInDataType,
-                  HostAccDataType,
-                  HostOutDataType,
+    ReductionHost<InDataType,
+                  AccDataType,
+                  OutDataType,
                   ReduceOpId,
                   Rank,
                   NumReduceDim,
@@ -240,11 +223,7 @@ bool test_reduce_no_index_impl(int init_method,
                   NeedIndices>
         hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
 
-    hostReduce.Run(alpha,
-                   reinterpret_cast<const HostInDataType*>(in.mData.data()),
-                   beta,
-                   reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
-                   nullptr);
+    hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr);
 
     const auto i_inLengths = to_int_vector(inLengths);
     const auto i_inStrides = to_int_vector(inStrides);
...
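
Reviewer note: dropping type_mapping means ReductionHost is instantiated directly on the kernel-side types, which is what makes the reinterpret_casts unnecessary. The resulting call shape, as a sketch (the template flag between NumReduceDim and NeedIndices is hidden by the hunk boundary above; PropagateNan is an assumed name for it):

// Sketch only: InDataType/AccDataType/OutDataType etc. are the test's
// template parameters; PropagateNan is assumed, not shown in this hunk.
ReductionHost<InDataType, AccDataType, OutDataType,
              ReduceOpId, Rank, NumReduceDim, PropagateNan, NeedIndices>
    hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

// No-index variant: the trailing indices pointer stays nullptr, and
// in.mData.data() now matches the expected InDataType* without any cast.
hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr);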
@@ -36,19 +36,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
     return invariantDims;
 };
 
-// map the data type used by the GPU kernels to the corresponding type used by the host codes
-template <typename InType>
-struct type_mapping
-{
-    using OutType = InType;
-};
-
-template <>
-struct type_mapping<ck::half_t>
-{
-    using OutType = half_float::half;
-};
-
 constexpr int Rank = 4;
 constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AMAX;
@@ -209,13 +196,9 @@ bool test_reduce_with_index_impl(int init_method,
     bool result = true;
 
-    using HostInDataType  = typename type_mapping<InDataType>::OutType;
-    using HostOutDataType = typename type_mapping<OutDataType>::OutType;
-    using HostAccDataType = typename type_mapping<AccDataType>::OutType;
-
-    ReductionHost<HostInDataType,
-                  HostAccDataType,
-                  HostOutDataType,
+    ReductionHost<InDataType,
+                  AccDataType,
+                  OutDataType,
                   ReduceOpId,
                   Rank,
                   NumReduceDim,
@@ -223,11 +206,8 @@ bool test_reduce_with_index_impl(int init_method,
                   NeedIndices>
         hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
 
-    hostReduce.Run(alpha,
-                   reinterpret_cast<const HostInDataType*>(in.mData.data()),
-                   beta,
-                   reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
-                   out_indices_ref.mData.data());
+    hostReduce.Run(
+        alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
 
     const auto i_inLengths = to_int_vector(inLengths);
     const auto i_inStrides = to_int_vector(inStrides);
...
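
Reviewer note: the with-index variant (AMAX with NeedIndices) differs from the no-index one only in the final argument of Run, which now receives a real buffer for the winning positions (call shape as in this diff):

// With-index variant: out_indices_ref collects the arg-indices computed by
// the host reference, so the trailing parameter is a buffer, not nullptr.
hostReduce.Run(
    alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());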