Commit 96d1a7a9 authored by Chao Liu's avatar Chao Liu
Browse files

added profiler for conv+bias+relu+add

parent cd929111
......@@ -12,17 +12,9 @@
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "device_conv2d_fwd_xdl_bias_activation_add_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
struct PassThrough
{
template <typename T>
__host__ __device__ constexpr T operator()(T v) const
{
return v;
}
};
struct BiasLeakyReluAdd
struct AddLeakyReluAdd
{
template <typename T1, typename T2>
__host__ constexpr float operator()(float v0, T1 v1, T2 v2) const
......@@ -96,61 +88,6 @@ struct BiasLeakyReluAdd
}
};
struct BiasReluAdd
{
template <typename T1, typename T2>
__host__ constexpr float operator()(float v0, T1 v1, T2 v2) const
{
float b = v0 + v1;
float c = b > 0 ? b : 0;
float d = c + v2;
return d;
}
template <typename T1, typename T2>
__device__ constexpr float operator()(float v0, T1 v1, T2 v2) const
{
#if 0
float a = v1 + v0;
float b = max(a, float(0));
float c = b + v2;
return c;
#else
float b = v1 + v2;
float c = (v0 > -v1) ? b + v0 : v2;
return c;
#endif
}
};
struct BiasLeakyRelu
{
template <typename T1, typename T2>
__host__ constexpr float operator()(float v0, T1 v1, T2) const
{
float a = v0 + v1;
float b = 0.1 * a;
float c = b > 0 ? b : 0;
return c;
}
template <typename T1, typename T2>
__device__ constexpr float operator()(float v0, T1 v1, T2) const
{
constexpr float alpha = 0.1;
float b = v1 + v0;
float c = max(b, float(0));
float d = alpha * c;
return d;
}
};
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
......@@ -163,13 +100,13 @@ using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = BiasReluAdd;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
// clang-format off
using DeviceConvFwdInstance = ck::tensor_operation::device::
DeviceConv2dFwdXdl_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
// clang-format off
// | InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
// | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
......@@ -191,7 +128,7 @@ void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
const std::vector<ck::index_t>& conv_strides,
const std::vector<ck::index_t>& conv_dilations,
const std::vector<ck::index_t>& in_left_pads,
const std::vector<ck::index_t>&,
const std::vector<ck::index_t>& /* in_right_pads */,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
......@@ -356,8 +293,8 @@ int main(int argc, char* argv[])
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
......@@ -397,8 +334,8 @@ int main(int argc, char* argv[])
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem");
"wrong! device operator with the specified compilation parameters does "
"not support this problem");
}
float ave_time = invoker.Run(argument, nrepeat);
......
......@@ -6,21 +6,23 @@
#include "host_conv.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_conv_fwd.hpp"
#include "device_conv_fwd_bias_activation_add.hpp"
#include "element_wise_operation.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_fwd_instance {
namespace device_conv2d_fwd_bias_activation_add_instance {
using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
using DeviceConvFwdBiasReluAddPtr =
DeviceConvFwdBiasActivationAddPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
ck::tensor_operation::element_wise::AddReluAdd>;
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_bias_relu_add_xdl_nhwc_kyxc_nhwk_fp16_instances(
std::vector<DeviceConvFwdBiasReluAddPtr>&);
} // namespace device_conv2d_fwd_instance
} // namespace device_conv2d_fwd_bias_activation_add_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
......@@ -28,6 +30,56 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp16_instances(std::vector<DeviceC
namespace ck {
namespace profiler {
template <typename TIn,
typename TWei,
typename TOut,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp>
void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
const Tensor<TWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const Tensor<TOut>& bias_k,
const Tensor<TOut>& resi_n_k_ho_wo,
const std::vector<ck::index_t>& conv_strides,
const std::vector<ck::index_t>& conv_dilations,
const std::vector<ck::index_t>& in_left_pads,
const std::vector<ck::index_t>& /* in_right_pads */,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
{
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
double v = 0;
for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c)
{
for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in_n_c_hi_wi.mDesc.GetLengths()[3])
{
v += in_element_op(static_cast<const double>(in_n_c_hi_wi(n, c, hi, wi))) *
wei_element_op(static_cast<const double>(wei_k_c_y_x(k, c, y, x)));
}
}
}
}
out_n_k_ho_wo(n, k, ho, wo) = out_element_op(v, bias_k(k), resi_n_k_ho_wo(n, k, ho, wo));
};
make_ParallelTensorFunctor(f_nchw,
out_n_k_ho_wo.mDesc.GetLengths()[0],
out_n_k_ho_wo.mDesc.GetLengths()[1],
out_n_k_ho_wo.mDesc.GetLengths()[2],
out_n_k_ho_wo.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
}
template <int NDimSpatial,
typename InDataType,
typename WeiDataType,
......@@ -35,7 +87,7 @@ template <int NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout>
void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
void profile_conv_fwd_bias_relu_add_impl(int do_verification,
int init_method,
bool do_log,
int nrepeat,
......@@ -84,9 +136,18 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
Tensor<OutDataType> out_n_k_ho_wo_device_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
// bias: assume contiguous 1d vector
Tensor<OutDataType> bias_k(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
// residual: assume same layout as output tensor
Tensor<OutDataType> resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
std::cout << "bias_k: " << bias_k.mDesc << std::endl;
std::cout << "resi_n_k_ho_wo: " << resi_n_k_ho_wo.mDesc << std::endl;
switch(init_method)
{
......@@ -94,61 +155,63 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
}
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
if(do_verification)
{
host_conv_nchw_kcyx_nkhw(in_n_c_hi_wi,
host_reference_calculation(in_n_c_hi_wi,
wei_k_c_y_x,
out_n_k_ho_wo_host_result,
bias_k,
resi_n_k_ho_wo,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
DeviceMem resi_device_buf(sizeof(OutDataType) * resi_n_k_ho_wo.mDesc.GetElementSpace());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
bias_device_buf.ToDevice(bias_k.mData.data());
resi_device_buf.ToDevice(resi_n_k_ho_wo.mData.data());
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceConvFwdBiasReluAddPtr = ck::tensor_operation::device::
DeviceConvFwdBiasActivationAddPtr<InElementOp, WeiElementOp, OutElementOp>;
using DeviceConvFwdNoOpPtr =
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;
// add device operator instances
std::vector<DeviceConvFwdBiasReluAddPtr> op_ptrs;
// add device Conv instances
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
{
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp32_instances(conv_ptrs);
}
else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
{
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp16_instances(conv_ptrs);
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_fp16_instances(conv_ptrs);
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_fp16_instances(conv_ptrs);
ck::tensor_operation::device::device_conv2d_fwd_bias_activation_add_instance::
add_device_conv2d_fwd_bias_relu_add_xdl_nhwc_kyxc_nhwk_fp16_instances(op_ptrs);
}
if(conv_ptrs.size() <= 0)
if(op_ptrs.size() <= 0)
{
throw std::runtime_error("wrong! no device Conv instance found");
}
......@@ -159,12 +222,14 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
float best_gb_per_sec = 0;
// profile device Conv instances
for(auto& conv_ptr : conv_ptrs)
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
static_cast<const OutDataType*>(resi_device_buf.GetDeviceBuffer()),
N,
K,
C,
......@@ -175,22 +240,23 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
conv_filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
PassThrough{});
InElementOp{},
WeiElementOp{},
OutElementOp{});
auto invoker_ptr = conv_ptr->MakeInvokerPointer();
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr->GetTypeString();
std::string conv_name = op_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
std::size_t num_btype =
sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K) +
sizeof(OutDataType) * (N * K * Ho * Wo);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment