Unverified commit f4980310 authored by Po Yen Chen, committed by GitHub

Add client example of grouped conv2d forward (data type: fp16) (#488)

* Rename example folder for GroupedConvFwdMultipleD

* Unify example codes

* Change target names

* Add fp16 example for multiple d instance

* Re-format common.hpp

* Add interface 'DeviceGroupedConvFwd'

* Use simpler interface

* Move common conv params out

* Rename conv fwd client example folder

* Add missing include directive

* Update grouped conv instance implementations

* Simplify ckProfiler (grouped conv forward)

* Use GroupedConvFwd to implement client example

* Use greater group count in example

* Add custom target to group examples

* Add extra tag param to instance factory function

* Use tag to differentiate factory functions

* Add missing tag argument for factory function

* Remove inheritance relationship

* Remove no-longer used include directive

* Add license header at top of file
parent 38470e04
add_executable(client_conv2d_fwd conv2d_fwd.cpp)
target_link_libraries(client_conv2d_fwd PRIVATE composable_kernel::device_operations)
add_executable(client_grouped_conv2d_fwd grouped_conv2d_fwd.cpp)
target_link_libraries(client_grouped_conv2d_fwd PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InLayout = ck::tensor_layout::convolution::GNHWC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using OutLayout = ck::tensor_layout::convolution::GNHWK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t N = 16;
static constexpr ck::index_t K = 32;
static constexpr ck::index_t C = 3;
static constexpr ck::index_t G = 32;
static constexpr ck::index_t N = 256;
static constexpr ck::index_t K = 192;
static constexpr ck::index_t C = 192;
static constexpr ck::index_t Y = 3;
static constexpr ck::index_t X = 3;
static constexpr ck::index_t Hi = 224;
static constexpr ck::index_t Wi = 224;
static constexpr ck::index_t Ho = 113;
static constexpr ck::index_t Wo = 113;
static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 28;
struct SimpleDeviceMem
{
@@ -47,30 +50,66 @@ struct SimpleDeviceMem
void* p_mem_;
};
int main(int argc, char* argv[])
int main()
{
std::vector<ck::index_t> in_spatial_lengths{Hi, Wi};
std::vector<ck::index_t> filter_spatial_lengths{Y, X};
std::vector<ck::index_t> out_spatial_lengths{Ho, Wo};
std::vector<ck::index_t> filter_strides{2, 2};
std::vector<ck::index_t> filter_dilations{1, 1};
std::vector<ck::index_t> input_left_pads{2, 2};
std::vector<ck::index_t> input_right_pads{2, 2};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NumDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
PassThrough,
PassThrough,
PassThrough>;
std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};
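// Compute packed (contiguous) strides: each dimension's stride is the
// product of all faster-varying lengths, built as a right-to-left running
// product via reverse iterators; the innermost dimension keeps stride 1.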
std::partial_sum(rbegin(in_lengths),
std::prev(rend(in_lengths)),
std::next(rbegin(in_strides)),
std::multiplies<>{});
std::partial_sum(rbegin(wei_lengths),
std::prev(rend(wei_lengths)),
std::next(rbegin(wei_strides)),
std::multiplies<>{});
std::partial_sum(rbegin(out_lengths),
std::prev(rend(out_lengths)),
std::next(rbegin(out_strides)),
std::multiplies<>{});
// transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
std::rotate(
rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
std::rotate(
rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
std::rotate(
rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
std::rotate(
rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
std::rotate(
rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
std::rotate(
rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
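// After the rotation the logical dimension order is channels-second
// (in_lengths = {G, N, C, Hi, Wi}, etc.) while the strides still address
// the original channels-last memory.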
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
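// The empty ck::Tuple<> type arguments (and the empty {} arguments passed to
// MakeArgumentPointer below) mean no extra D tensors are fused; this is a
// plain grouped convolution forward.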
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
PassThrough,
PassThrough,
PassThrough>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
@@ -91,13 +130,16 @@ int main(int argc, char* argv[])
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
out.GetDeviceBuffer(),
N,
K,
C,
in_spatial_lengths,
filter_spatial_lengths,
out_spatial_lengths,
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{},
{},
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
@@ -112,10 +154,10 @@ int main(int argc, char* argv[])
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = 2 * N * K * C * Ho * Wo * Y * X;
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * C +
sizeof(WeiDataType) * K * Y * X * C +
sizeof(OutDataType) * N * Ho * Wo * K;
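// 2 ops (one multiply + one add) per MAC; each of the G * N * K * Ho * Wo
// output elements accumulates C * Y * X MACs.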
std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
sizeof(WeiDataType) * G * K * Y * X * C +
sizeof(OutDataType) * G * N * Ho * Wo * K;
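// avg_time is in ms: flop / 1e9 / ms == TFLOP/s, bytes / 1e6 / ms == GB/s.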
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
@@ -134,10 +176,16 @@ int main(int argc, char* argv[])
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
@@ -148,13 +196,16 @@ int main(int argc, char* argv[])
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
out.GetDeviceBuffer(),
N,
K,
C,
in_spatial_lengths,
filter_spatial_lengths,
out_spatial_lengths,
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{},
{},
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
@@ -172,6 +223,4 @@ int main(int argc, char* argv[])
std::cout << "Done" << std::endl;
}
return 0;
}
\ No newline at end of file
}
add_custom_target(example_grouped_conv_fwd_multiple_d)
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
endif() # USE_BITINT_EXTENSION_INT4
add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
Command
```bash
# arg1: verification (0=no, 1=yes)
# arg2: initialization (0=no init, 1=integer value, 2=decimal value)
# arg3: time kernel (0=no, 1=yes)
# Following arguments (depending on number of spatial dims):
#  Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
#  G, N, K, C,
#  <filter spatial dimensions>, (ie Y, X for 2D)
#  <input image spatial dimensions>, (ie Hi, Wi for 2D)
#  <strides>, (ie Sy, Sx for 2D)
#  <dilations>, (ie Dy, Dx for 2D)
#  <left padding>, (ie LeftPy, LeftPx for 2D)
#  <right padding>, (ie RightPy, RightPx for 2D)
./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1
```
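The problem size can also be passed explicitly, following the argument order above. An illustrative invocation matching the G=1 result below:

```bash
# 2 spatial dims, G=1, N=128, K=256, C=192, 3x3 filter, 71x71 input,
# stride 2x2, dilation 1x1, padding 1x1 on both sides
./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1 2 1 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
```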
Result (MI100)
```
in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192}
wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 16, Default>
```
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
#pragma once
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
// kernel data types
using InKernelDataType = int8_t;
using WeiKernelDataType = int8_t;
using AccDataType = int32_t;
using CShuffleDataType = int8_t;
using BiasKernelDataType = int8_t;
using ResidualKernelDataType = int8_t;
using OutKernelDataType = int8_t;
#include <algorithm>
#include <array>
#include <iostream>
#include <string>
#include <type_traits>
// tensor data types
using InUserDataType = ck::int4_t;
using WeiUserDataType = ck::int4_t;
using OutUserDataType = ck::int4_t;
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using BF16 = ck::bhalf_t;
using FP16 = ck::half_t;
using FP32 = float;
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
using I4 = ck::int4_t;
#endif
using I8 = std::int8_t;
using I32 = std::int32_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename ResidualLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InKernelDataType,
WeiKernelDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
64, // KPerBlock
16, // AK1
16, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
16, // ABlockTransferSrcScalarPerVector
16, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
16, // BBlockTransferSrcScalarPerVector
16, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,
1,
S<1, 64, 1, 4>,
16>;
int main(int argc, char* argv[])
template <typename InputLay, typename WeightLay, typename OutputLay>
struct CommonLayoutSetting
{
using InputLayout = InputLay;
using WeightLayout = WeightLay;
using OutputLayout = OutputLay;
};
template <ck::index_t NDimSpatial>
struct CommonLayoutSettingSelector;
namespace ctl = ck::tensor_layout::convolution;
template <>
struct CommonLayoutSettingSelector<1> final
: CommonLayoutSetting<ctl::G_NW_C, ctl::G_K_X_C, ctl::G_NW_K>
{
namespace ctc = ck::tensor_layout::convolution;
};
print_helper_msg();
template <>
struct CommonLayoutSettingSelector<2> final
: CommonLayoutSetting<ctl::G_NHW_C, ctl::G_K_YX_C, ctl::G_NHW_K>
{
};
template <>
struct CommonLayoutSettingSelector<3> final
: CommonLayoutSetting<ctl::G_NDHW_C, ctl::G_K_ZYX_C, ctl::G_NDHW_K>
{
};
template <ck::index_t NDimSpatial>
using InputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::InputLayout;
template <ck::index_t NDimSpatial>
using WeightLayout = typename CommonLayoutSettingSelector<NDimSpatial>::WeightLayout;
template <ck::index_t NDimSpatial>
using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLayout;
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// conventional group conv definition
// G = 2
// [N, C, Hi, Wi] = [128, 384, 71, 71]
// [K, C, Y, X] = [512, 192, 3, 3]
// [N, K, Ho, Wo] = [128, 512, 36, 36]
// CK group conv definition
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
bool time_kernel = true;
};
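// ConvParam field order: num_dim_spatial, G, N, K, C, filter lengths {Y, X},
// input lengths {Hi, Wi}, strides, dilations, left pads, right pads.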
#define DefaultConvParam \
ck::utils::conv::ConvParam \
{ \
2, 32, 2, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, { 1, 1 } \
}
inline void print_help_msg()
{
std::cerr << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
inline bool parse_cmd_args(int argc,
char* argv[],
ExecutionConfig& config,
ck::utils::conv::ConvParam& conv_param)
{
constexpr int num_execution_config_args =
3; // arguments for do_verification, init_method, time_kernel
constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
constexpr int threshold_to_catch_all_args =
threshold_to_catch_partial_args + num_conv_param_leading_args;
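// e.g. a fully specified 2D problem: 1 (program name) + 3 (ExecutionConfig)
// + 5 (num_dim_spatial, G, N, K, C) + 6 * 2 per-dim values = 21 == argc,
// so (argc - threshold_to_catch_all_args) % 3 == 0 holds.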
if(argc == 1)
{
// use default
}
else if(argc == 4)
// catch only ExecutionConfig arguments
else if(argc == threshold_to_catch_partial_args)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
else
// catch both ExecutionConfig & ConvParam arguments
else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0))
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(
num_dim_spatial, threshold_to_catch_partial_args, argv);
}
else
{
print_help_msg();
return false;
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
return true;
}
if(conv_param.num_dim_spatial_ == 1)
inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvParam& conv_param)
{
switch(conv_param.num_dim_spatial_)
{
using InLayout = ctc::G_NW_C;
using WeiLayout = ctc::G_K_X_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NW_K;
using OutLayout = ctc::G_NW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
case 1:
return HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
{
conv_param.C_, // g
@@ -150,81 +161,8 @@ int main(int argc, char* argv[])
conv_param.G_ * conv_param.C_ // wi
});
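// The lengths are logical [G, N, C, W] while the strides (C fastest, then G,
// then W, then N) describe a physical NWGC memory layout.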
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // k
1, // c
0 // x
});
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // k
1, // c
0 // x
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<1,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
using InLayout = ctc::G_NHW_C;
using WeiLayout = ctc::G_K_YX_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NHW_K;
using OutLayout = ctc::G_NHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
case 2:
return HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
@@ -239,104 +177,8 @@ int main(int argc, char* argv[])
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<2,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
using InLayout = ctc::G_NDHW_C;
using WeiLayout = ctc::G_K_ZYX_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NDHW_K;
using OutLayout = ctc::G_NDHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
case 3:
return HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
@@ -353,8 +195,42 @@ int main(int argc, char* argv[])
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
}
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
throw std::runtime_error("unsuppored # dim spatial");
}
inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvParam& conv_param)
{
switch(conv_param.num_dim_spatial_)
{
case 1:
return HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
case 2:
return HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
case 3:
return HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
@@ -373,40 +249,89 @@ int main(int argc, char* argv[])
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
}
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // z
0, // y
0 // x
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // z
0, // y
0 // x
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
throw std::runtime_error("unsuppored # dim spatial");
}
inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvParam& conv_param)
{
switch(conv_param.num_dim_spatial_)
{
case 1:
return HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // k
1, // c
0 // x
});
case 2:
return HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
case 3:
return HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // z
0, // y
0 // x
});
}
throw std::runtime_error("unsuppored # dim spatial");
}
inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvParam& conv_param)
{
switch(conv_param.num_dim_spatial_)
{
case 1:
return HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
case 2:
return HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
case 3:
return HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
@@ -423,37 +348,7 @@ int main(int argc, char* argv[])
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<3,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
throw std::runtime_error("unsuppored # dim spatial");
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// kernel data types
using InKernelDataType = BF16;
using WeiKernelDataType = BF16;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using BiasKernelDataType = BF16;
using ResidualKernelDataType = BF16;
using OutKernelDataType = BF16;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// kernel data types
using InKernelDataType = FP16;
using WeiKernelDataType = FP16;
using AccDataType = FP32;
using CShuffleDataType = FP16;
using BiasKernelDataType = FP16;
using ResidualKernelDataType = FP16;
using OutKernelDataType = FP16;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// kernel data types
using InKernelDataType = FP32;
using WeiKernelDataType = FP32;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using BiasKernelDataType = FP32;
using ResidualKernelDataType = FP32;
using OutKernelDataType = FP32;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#error Should compile this file with ck::int4_t support
#endif
#include "common.hpp"
// kernel data types
using InKernelDataType = I8;
using WeiKernelDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I8;
using BiasKernelDataType = I8;
using ResidualKernelDataType = I8;
using OutKernelDataType = I8;
// tensor data types
using InUserDataType = I4;
using WeiUserDataType = I4;
using OutUserDataType = I4;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
#define BUILD_INT4_EXAMPLE
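// enables the host-side int4 <-> int8 tensor conversion path in the shared
// run_grouped_conv_fwd_bias_relu_add_example.inc included below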
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// kernel data types
using InKernelDataType = I8;
using WeiKernelDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I8;
using BiasKernelDataType = I8;
using ResidualKernelDataType = I8;
using OutKernelDataType = I8;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// kernel data types
using InKernelDataType = FP16;
using WeiKernelDataType = FP16;
using AccDataType = FP32;
using CShuffleDataType = FP16;
using OutKernelDataType = FP16;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = PassThrough;
#include "run_grouped_conv_fwd_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
void print_helper_msg()
template <typename BiasLay, typename ResidualLay>
struct LayoutSetting
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
using BiasLayout = BiasLay;
using ResidualLayout = ResidualLay;
};
template <ck::index_t NDimSpatial>
struct LayoutSettingSelector;
template <>
struct LayoutSettingSelector<1> final : LayoutSetting<ctl::G_K, ctl::G_NW_K>
{
};
template <>
struct LayoutSettingSelector<2> final : LayoutSetting<ctl::G_K, ctl::G_NHW_K>
{
};
template <>
struct LayoutSettingSelector<3> final : LayoutSetting<ctl::G_K, ctl::G_NDHW_K>
{
};
template <ck::index_t NDimSpatial>
using BiasLayout = typename LayoutSettingSelector<NDimSpatial>::BiasLayout;
template <ck::index_t NDimSpatial>
using ResidualLayout = typename LayoutSettingSelector<NDimSpatial>::ResidualLayout;
template <ck::index_t NDimSpatial>
using DeviceConvFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>,
WeightLayout<NDimSpatial>,
ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>,
OutputLayout<NDimSpatial>,
InKernelDataType,
WeiKernelDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
16, // KPerBlock
4, // AK1
4, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector
4, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,
1,
S<1, 16, 1, 16>,
4>;
template <ck::index_t NDimSpatial,
typename InKernelDataType,
typename WeiKernelDataType,
typename CShuffleDataType,
typename OutKernelDataType,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp,
typename InUserDataType,
typename WeiUserDataType,
typename OutUserDataType,
typename DeviceConvNDFwdInstance>
int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
int init_method,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const HostTensorDescriptor& in_g_n_c_wis_desc,
const HostTensorDescriptor& wei_g_k_c_xs_desc,
const HostTensorDescriptor& bias_g_n_k_wos_desc,
const HostTensorDescriptor& residual_g_n_k_wos_desc,
const HostTensorDescriptor& out_g_n_k_wos_desc,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
template <ck::index_t NDimSpatial>
using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InUserDataType,
WeiUserDataType,
CShuffleDataType,
InElementOp,
WeiElementOp,
PassThrough>;
template <ck::index_t NDimSpatial>
bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config,
const ck::utils::conv::ConvParam& conv_param)
{
static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial");
const auto in_g_n_c_wis_desc = make_input_descriptor(conv_param);
const auto wei_g_k_c_xs_desc = make_weight_descriptor(conv_param);
const auto bias_g_n_k_wos_desc = make_bias_descriptor(conv_param);
const auto out_g_n_k_wos_desc = make_output_descriptor(conv_param);
Tensor<InUserDataType> in(in_g_n_c_wis_desc);
Tensor<WeiUserDataType> wei(wei_g_k_c_xs_desc);
Tensor<OutUserDataType> bias(bias_g_n_k_wos_desc);
Tensor<OutUserDataType> residual(residual_g_n_k_wos_desc);
Tensor<OutUserDataType> residual(bias_g_n_k_wos_desc);
Tensor<OutUserDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutKernelDataType> out_device(out_g_n_k_wos_desc);
@@ -63,7 +114,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
std::cout << "residual: " << residual.mDesc << std::endl;
std::cout << "out: " << out_host.mDesc << std::endl;
switch(init_method)
switch(config.init_method)
{
case 0: break;
case 1:
@@ -83,7 +134,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
DeviceMem residual_device_buf(sizeof(OutKernelDataType) * residual.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize());
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#ifdef BUILD_INT4_EXAMPLE
const Tensor<InKernelDataType> in_converted(in);
const Tensor<WeiKernelDataType> wei_converted(wei);
const Tensor<OutKernelDataType> bias_converted(bias);
@@ -93,12 +144,12 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
wei_device_buf.ToDevice(wei_converted.mData.data());
bias_device_buf.ToDevice(bias_converted.mData.data());
residual_device_buf.ToDevice(residual_converted.mData.data());
#else // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#else
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
bias_device_buf.ToDevice(bias.mData.data());
residual_device_buf.ToDevice(residual.mData.data());
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#endif
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
@@ -123,8 +174,8 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(bias_g_n_k_wos_desc.GetLengths(), d0_g_n_k_wos_lengths);
copy(bias_g_n_k_wos_desc.GetStrides(), d0_g_n_k_wos_strides);
copy(residual_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths);
copy(residual_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides);
copy(bias_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths);
copy(bias_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides);
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
@@ -133,7 +184,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
copy(conv_param.input_right_pads_, input_right_pads);
// do Conv
auto conv = DeviceConvNDFwdInstance{};
auto conv = DeviceConvFwdInstance<NDimSpatial>{};
auto invoker = conv.MakeInvoker();
auto argument =
conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
@@ -155,9 +206,9 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv.IsSupportedArgument(argument))
{
@@ -166,7 +217,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
"not support this Conv problem");
}
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InUserDataType, WeiUserDataType, OutUserDataType>();
@@ -176,20 +227,11 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl;
if(do_verification)
if(config.do_verification)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InUserDataType,
WeiUserDataType,
CShuffleDataType,
InElementOp,
WeiElementOp,
PassThrough>();
auto ref_conv = HostConvFwdInstance<NDimSpatial>{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in,
wei,
@@ -198,36 +240,49 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
in_element_op,
wei_element_op,
InElementOp{},
WeiElementOp{},
PassThrough{});
ref_invoker.Run(ref_argument);
// TODO: implement elementwise operation for host
out_host.ForEach([&](auto&, auto idx) {
out_element_op(out_host(idx), c_host(idx), bias(idx), residual(idx));
OutElementOp{}(out_host(idx), c_host(idx), bias(idx), residual(idx));
});
out_device_buf.FromDevice(out_device.mData.data());
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#ifdef BUILD_INT4_EXAMPLE
const Tensor<OutUserDataType> out_device_converted(out_device);
return ck::utils::check_err(out_device_converted.mData,
out_host.mData,
"Error: incorrect results!",
1e-5f,
1e-4f)
? 0
: 1;
#else // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
return ck::utils::check_err(
out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f)
? 0
: 1;
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
out_device_converted.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
#else
return ck::utils::check_err(
out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
#endif
}
return true;
}
bool run_grouped_conv_fwd_bias_relu_add_example(int argc, char* argv[])
{
ExecutionConfig config;
ck::utils::conv::ConvParam conv_param = DefaultConvParam;
if(!parse_cmd_args(argc, argv, config, conv_param))
{
return false;
}
switch(conv_param.num_dim_spatial_)
{
case 1: return run_grouped_conv_fwd_bias_relu_add<1>(config, conv_param);
case 2: return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param);
case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param);
}
return 0;
return false;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
template <ck::index_t NDimSpatial>
using DeviceConvFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>,
WeightLayout<NDimSpatial>,
ck::Tuple<>,
OutputLayout<NDimSpatial>,
InKernelDataType,
WeiKernelDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<>,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
16, // KPerBlock
4, // AK1
4, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector
4, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,
1,
S<1, 16, 1, 16>,
4>;
template <ck::index_t NDimSpatial>
using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InUserDataType,
WeiUserDataType,
CShuffleDataType,
InElementOp,
WeiElementOp,
PassThrough>;
template <ck::index_t NDimSpatial>
bool run_grouped_conv_fwd(const ExecutionConfig& config,
const ck::utils::conv::ConvParam& conv_param)
{
static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial");
const auto in_g_n_c_wis_desc = make_input_descriptor(conv_param);
const auto wei_g_k_c_xs_desc = make_weight_descriptor(conv_param);
const auto out_g_n_k_wos_desc = make_output_descriptor(conv_param);
Tensor<InUserDataType> in(in_g_n_c_wis_desc);
Tensor<WeiUserDataType> wei(wei_g_k_c_xs_desc);
Tensor<OutUserDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutKernelDataType> out_device(out_g_n_k_wos_desc);
std::cout << "in: " << in.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "out: " << out_host.mDesc << std::endl;
switch(config.init_method)
{
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_2<InUserDataType>{-5, 5});
wei.GenerateTensorValue(GeneratorTensor_2<WeiUserDataType>{-5, 5});
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InUserDataType>{0.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiUserDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InKernelDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiKernelDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize());
#ifdef BUILD_INT4_EXAMPLE
const Tensor<InKernelDataType> in_converted(in);
const Tensor<WeiKernelDataType> wei_converted(wei);
in_device_buf.ToDevice(in_converted.mData.data());
wei_device_buf.ToDevice(wei_converted.mData.data());
#else
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
#endif
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
// do Conv
auto conv = DeviceConvFwdInstance<NDimSpatial>{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem");
}
float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InUserDataType, WeiUserDataType, OutUserDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_conv = HostConvFwdInstance<NDimSpatial>{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in,
wei,
out_host,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(out_device.mData.data());
#ifdef BUILD_INT4_EXAMPLE
const Tensor<OutUserDataType> out_device_converted(out_device);
return ck::utils::check_err(
out_device_converted.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
#else
return ck::utils::check_err(
out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
#endif
}
return true;
}
bool run_grouped_conv_fwd_example(int argc, char* argv[])
{
ExecutionConfig config;
ck::utils::conv::ConvParam conv_param = DefaultConvParam;
if(!parse_cmd_args(argc, argv, config, conv_param))
{
return false;
}
switch(conv_param.num_dim_spatial_)
{
case 1: return run_grouped_conv_fwd<1>(config, conv_param);
case 2: return run_grouped_conv_fwd<2>(config, conv_param);
case 3: return run_grouped_conv_fwd<3>(config, conv_param);
}
return false;
}
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int4 grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp)
endif() # USE_BITINT_EXTENSION_INT4
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: time kernel (0=no, 1=yes)
#Following arguments (depending on number of spatial dims):
# N spatial dimensions
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
# <strides>, (ie Sy, Sx for 2D)
# <dilations>, (ie Dy, Dx for 2D)
# <left padding>, (ie LeftPy, LeftPx for 2D)
# <right padding>, (ie RightPy, RightPx for 2D)
bin/example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 1 1 1
```
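An explicit problem specification uses the same argument order; for example, the default 2D problem spelled out (illustrative):

```bash
# 2 spatial dims, G=2, N=128, K=256, C=192, 3x3 filter, 71x71 input,
# stride 2x2, dilation 1x1, padding 1x1 on both sides
bin/example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 1 1 1 2 2 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
```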
Result (MI100)
```
in: dim 5, lengths {2, 128, 192, 71, 71}, strides {192, 1935744, 1, 27264, 384}
wei: dim 5, lengths {2, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
bias: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
residual: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
out: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 663552, 1, 18432, 512}
A[M, K]: {165888, 1728}
B[N, K]: {256, 1728}
Ds[M, N]: {165888, 256}
Ds[M, N]: {165888, 256}
E[M, N]: {165888, 256}
launch_and_time_kernel: grid_dim {2592, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 2.48075 ms, 118.325 TFlops, 268.946 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default>
```
\ No newline at end of file
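For reference, the implicit-GEMM shapes in this log follow directly from the conv problem: M = N·Ho·Wo = 128·36·36 = 165888, K = C·Y·X = 192·3·3 = 1728 per group, and the GEMM N dimension is the per-group output channel count, 256.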
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
// kernel data types
using InKernelDataType = ck::bhalf_t;
using WeiKernelDataType = ck::bhalf_t;
using AccDataType = float;
using CShuffleDataType = float;
using BiasKernelDataType = ck::bhalf_t;
using ResidualKernelDataType = ck::bhalf_t;
using OutKernelDataType = ck::bhalf_t;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename ResidualLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InKernelDataType,
WeiKernelDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,
1,
S<1, 32, 1, 8>,
8>;
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// conventional group conv definition
// G = 2
// [N, C, Hi, Wi] = [128, 384, 71, 71]
// [K, C, Y, X] = [512, 192, 3, 3]
// [N, K, Ho, Wo] = [128, 512, 36, 36]
// CK group conv definition
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 1)
{
using InLayout = ctc::G_NW_C;
using WeiLayout = ctc::G_K_X_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NW_K;
using OutLayout = ctc::G_NW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
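// Bias and residual are physically (G, K) tensors here; the zero strides
// along n and the spatial dimension broadcast each per-(g, k) value across
// the batch and all output pixels.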
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<1,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
using InLayout = ctc::G_NHW_C;
using WeiLayout = ctc::G_K_YX_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NHW_K;
using OutLayout = ctc::G_NHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<2,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
using InLayout = ctc::G_NDHW_C;
using WeiLayout = ctc::G_K_ZYX_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NDHW_K;
using OutLayout = ctc::G_NDHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1],
conv_param.input_spatial_lengths_[2]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1],
conv_param.filter_spatial_lengths_[2]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<3,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
// kernel data types
using InKernelDataType = ck::half_t;
using WeiKernelDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using BiasKernelDataType = ck::half_t;
using ResidualKernelDataType = ck::half_t;
using OutKernelDataType = ck::half_t;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename ResidualLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InKernelDataType,
WeiKernelDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8>; // CDEBlockTransferScalarPerVector_NPerBlock
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// conventional group conv definition
// G = 2
// [N, C, Hi, Wi] = [128, 384, 71, 71]
// [K, C, Y, X] = [512, 192, 3, 3]
// [N, K, Ho, Wo] = [128, 512, 36, 36]
// CK group conv definition
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
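// Beyond four arguments, argv[4] picks the number of spatial dimensions and
// parse_conv_param consumes the rest; they are assumed here to follow the
// same order as the ConvParam initializer above (G, N, K, C, filter lengths,
// input lengths, strides, dilations, left pads, right pads).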
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 1)
{
using InLayout = ctc::G_NW_C;
using WeiLayout = ctc::G_K_X_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NW_K;
using OutLayout = ctc::G_NW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<1,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
using InLayout = ctc::G_NHW_C;
using WeiLayout = ctc::G_K_YX_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NHW_K;
using OutLayout = ctc::G_NHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<2,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
using InLayout = ctc::G_NDHW_C;
using WeiLayout = ctc::G_K_ZYX_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NDHW_K;
using OutLayout = ctc::G_NDHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1],
conv_param.input_spatial_lengths_[2]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1],
conv_param.filter_spatial_lengths_[2]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<3,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
// kernel data types
using InKernelDataType = float;
using WeiKernelDataType = float;
using AccDataType = float;
using CShuffleDataType = float;
using BiasKernelDataType = float;
using ResidualKernelDataType = float;
using OutKernelDataType = float;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename ResidualLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InKernelDataType,
WeiKernelDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
16, // KPerBlock
4, // AK1
4, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector
4, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 16, 1, 16>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
4>; // CDEBlockTransferScalarPerVector_NPerBlock
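// Compared with the fp16 instance (KPerBlock = 32, 8-wide vectors) and the
// int8 instance (KPerBlock = 64, 16-wide vectors), this fp32 instance uses
// KPerBlock = 16 with 4-wide vectors, so each vectorized access still moves
// 16 bytes (4 x 4 B = 8 x 2 B = 16 x 1 B).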
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// conventional group conv definition
// G = 2
// [N, C, Hi, Wi] = [128, 384, 71, 71]
// [K, C, Y, X] = [512, 192, 3, 3]
// [N, K, Ho, Wo] = [128, 512, 36, 36]
// CK group conv definition
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 1)
{
using InLayout = ctc::G_NW_C;
using WeiLayout = ctc::G_K_X_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NW_K;
using OutLayout = ctc::G_NW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<1,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
using InLayout = ctc::G_NHW_C;
using WeiLayout = ctc::G_K_YX_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NHW_K;
using OutLayout = ctc::G_NHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<2,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
using InLayout = ctc::G_NDHW_C;
using WeiLayout = ctc::G_K_ZYX_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NDHW_K;
using OutLayout = ctc::G_NDHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1],
conv_param.input_spatial_lengths_[2]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1],
conv_param.filter_spatial_lengths_[2]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<3,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
// kernel data types
using InKernelDataType = int8_t;
using WeiKernelDataType = int8_t;
using AccDataType = int32_t;
using CShuffleDataType = int8_t;
using BiasKernelDataType = int8_t;
using ResidualKernelDataType = int8_t;
using OutKernelDataType = int8_t;
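// int8 operands accumulate in int32 (AccDataType above) so the K * Y * X
// reduction cannot overflow the 8-bit range before the final conversion.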
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename ResidualLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InKernelDataType,
WeiKernelDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
64, // KPerBlock
16, // AK1
16, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
16, // ABlockTransferSrcScalarPerVector
16, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
16, // BBlockTransferSrcScalarPerVector
16, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 64, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
16>; // CDEBlockTransferScalarPerVector_NPerBlock
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// conventional group conv definition
// G = 2
// [N, C, Hi, Wi] = [128, 384, 71, 71]
// [K, C, Y, X] = [512, 192, 3, 3]
// [N, K, Ho, Wo] = [128, 512, 36, 36]
// CK group conv definition
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 1)
{
using InLayout = ctc::G_NW_C;
using WeiLayout = ctc::G_K_X_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NW_K;
using OutLayout = ctc::G_NW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<1,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
using InLayout = ctc::G_NHW_C;
using WeiLayout = ctc::G_K_YX_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NHW_K;
using OutLayout = ctc::G_NHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<2,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
using InLayout = ctc::G_NDHW_C;
using WeiLayout = ctc::G_K_ZYX_C;
using BiasLayout = ctc::G_K;
using ResidualLayout = ctc::G_NDHW_K;
using OutLayout = ctc::G_NDHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1],
conv_param.input_spatial_lengths_[2]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1],
conv_param.filter_spatial_lengths_[2]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<3,
InKernelDataType,
WeiKernelDataType,
CShuffleDataType,
OutKernelDataType,
InElementOp,
WeiElementOp,
OutElementOp,
InUserDataType,
WeiUserDataType,
OutUserDataType,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
}