Unverified Commit 5d37d7bf authored by Chao Liu's avatar Chao Liu Committed by GitHub
Browse files

Reorganize files, Part 1 (#119)

* delete obselete files

* move files

* build

* update cmake

* update cmake

* fix build

* reorg examples

* update cmake for example and test
parent 245f7414
add_example_executable(example_conv2d_bwd_wgt_xdl conv2d_bwd_wgt_xdl.cpp)
add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "device_reduce_blockwise.hpp" #include "device_reduce_blockwise.hpp"
#include "host_reduce_util.hpp" #include "host_reduce_util.hpp"
#include "host_generic_reduction.hpp" #include "host_generic_reduction.hpp"
#include "reduction_enums.hpp" #include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp" #include "reduction_operator_mapping.hpp"
......
add_example_executable(example_pool2d_fwd pool2d_fwd.cpp)
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "tensor_layout.hpp" #include "tensor_layout.hpp"
#include "reduction_operator.hpp" #include "reduction_operator.hpp"
#include "device_operation/include/device_pool2d_fwd_nhwc_nhwc.hpp" #include "device_pool2d_fwd_nhwc_nhwc.hpp"
using InDataType = ck::half_t; using InDataType = ck::half_t;
using OutDataType = ck::half_t; using OutDataType = ck::half_t;
......
# Instructions for ```conv_xdl_bias_relu_add``` Example
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```conv_xdl_bias_relu_add```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j conv_xdl_bias_relu_add
```
## Run ```conv_xdl_bias_relu_add```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
./example/conv_xdl_bias_relu_add 0 1 5
```
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
bias_k: dim 1, lengths {256}, strides {1}
resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
arg.b_grid_desc_k0_n_k1_{216, 256, 8}
arg.c_grid_desc_m_n_{ 165888, 256}
arg.c0_grid_desc_m_n_{ 165888, 256}
arg.c1_grid_desc_m_n_{ 165888, 256}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 5 times...
Perf: 1.71779 ms, 85.4396 TFlops, 194.2 GB/s
```
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "convolution_utility.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
static constexpr auto MemoryAtomicAdd = ck::InMemoryDataOperationEnum_t::AtomicAdd;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
// clang-format off
using DeviceConvFwdInstance = ck::tensor_operation::device::
DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
// clang-format off
// | InData| WeiData| OutData| AccData| In| Wei| Out| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| GlobalMemory| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
// | | | | | Operation| Operation| Operation| DataOperation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, MemoryAtomicAdd, ConvFwdDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 8, 1, 1,32>, 2>;
// clang-format on
template <typename TIn,
typename TWei,
typename TOut,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp>
void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
const Tensor<TWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const Tensor<TOut>& bias_k,
const std::vector<ck::index_t>& conv_strides,
const std::vector<ck::index_t>& conv_dilations,
const std::vector<ck::index_t>& in_left_pads,
const std::vector<ck::index_t>& /* in_right_pads */,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
{
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
float v_acc = 0;
for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c)
{
for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in_n_c_hi_wi.mDesc.GetLengths()[3])
{
float v_in;
float v_wei;
in_element_op(v_in, static_cast<const float>(in_n_c_hi_wi(n, c, hi, wi)));
wei_element_op(v_wei, static_cast<const float>(wei_k_c_y_x(k, c, y, x)));
v_acc += v_in * v_wei;
}
}
}
}
float v_out;
out_element_op(v_out, v_acc, static_cast<float>(bias_k(k)));
out_n_k_ho_wo(n, k, ho, wo) += v_out;
};
make_ParallelTensorFunctor(f_nchw,
out_n_k_ho_wo.mDesc.GetLengths()[0],
out_n_k_ho_wo.mDesc.GetLengths()[1],
out_n_k_ho_wo.mDesc.GetLengths()[2],
out_n_k_ho_wo.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
bool do_verification = 0;
int init_method = 0;
int nrepeat = 5;
// Conv shape
ck::index_t N = 128;
ck::index_t K = 256;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t conv_stride_h = 2;
ck::index_t conv_stride_w = 2;
ck::index_t conv_dilation_h = 1;
ck::index_t conv_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
}
else if(argc == 19)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
N = std::stoi(argv[4]);
K = std::stoi(argv[5]);
C = std::stoi(argv[6]);
Y = std::stoi(argv[7]);
X = std::stoi(argv[8]);
Hi = std::stoi(argv[9]);
Wi = std::stoi(argv[10]);
conv_stride_h = std::stoi(argv[11]);
conv_stride_w = std::stoi(argv[12]);
conv_dilation_h = std::stoi(argv[13]);
conv_dilation_w = std::stoi(argv[14]);
in_left_pad_h = std::stoi(argv[15]);
in_left_pad_w = std::stoi(argv[16]);
in_right_pad_h = std::stoi(argv[17]);
in_right_pad_w = std::stoi(argv[18]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
const auto output_spatial_lengths =
ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi},
{Y, X},
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
// tensor layout
auto f_host_tensor_descriptor = [](std::size_t N_,
std::size_t C_,
std::size_t H,
std::size_t W,
auto layout) {
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWC>::value ||
ck::is_same<decltype(layout),
ck::tensor_layout::convolution::KYXC>::value ||
ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_host_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_device_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
// bias: assume contiguous 1d vector
Tensor<OutDataType> bias_k(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
std::cout << "bias_k: " << bias_k.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
out_device_buf.ToDevice(out_n_k_ho_wo_host_result.mData.data());
bias_device_buf.ToDevice(bias_k.mData.data());
auto conv = DeviceConvFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument =
conv.MakeArgument(static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device operator with the specified compilation parameters does "
"not support this problem");
}
float ave_time = invoker.Run(argument, nrepeat);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification)
{
host_reference_calculation(in_n_c_hi_wi,
wei_k_c_y_x,
out_n_k_ho_wo_host_result,
bias_k,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
}
}
# Instructions for ```conv2d_fwd_xdl``` Example
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```conv2d_fwd_xdl```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j conv2d_fwd_xdl
```
## Run ```conv2d_fwd_xdl_int8```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
./example/conv2d_fwd_xdl_int8 0 1 5
```
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
arg.b_grid_desc_k0_n_k1_{216, 256, 8}
arg.c_grid_desc_m_n_{ 165888, 256}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 5 times...
Perf: 1.43206 ms, 102.486 TFlops, 232.947 GB/s
```
include_directories(BEFORE include_directories(BEFORE
${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/host/host_tensor/include ${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/host/device/include ${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/device_operation/include ${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/reference_operation/include ${PROJECT_SOURCE_DIR}/include/ck/problem_transform
${PROJECT_SOURCE_DIR}/composable_kernel/include ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/grid
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/block
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
${PROJECT_SOURCE_DIR}/external/rocm/include ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
${PROJECT_SOURCE_DIR}/device_operation_reference/include ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
${PROJECT_SOURCE_DIR}/external/include/half
) )
set(GEMM_XDL_SOURCE 1_gemm_xdl/gemm_xdl.cpp) add_custom_target(examples)
set(GEMM_XDL_INT8_SOURCE 1_gemm_xdl/gemm_xdl_int8.cpp)
set(GEMM_XDL_BF16_SOURCE 1_gemm_xdl/gemm_xdl_bf16.cpp)
set(GEMM_XDL_BIAS_RELU_SOURCE 2_gemm_xdl_bias_relu/gemm_xdl_bias_relu.cpp)
set(GEMM_XDL_BIAS_RELU_ADD_SOURCE 3_gemm_xdl_bias_relu_add/gemm_xdl_bias_relu_add.cpp)
set(CONV2D_FWD_XDL_SOURCE 4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp)
set(CONV2D_FWD_XDL_BIAS_RELU_SOURCE 5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp)
set(CONV2D_FWD_XDL_BIAS_RELU_ADD_SOURCE 6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp)
set(CONV2D_FWD_XDL_BIAS_RELU_ATOMIC_ADD_SOURCE 7_conv2d_fwd_xdl_bias_relu_atomic_add/conv2d_fwd_xdl_bias_relu_atomic_add.cpp)
set(GEMM_XDL_ALPHA_BETA_SOURCE 8_gemm_xdl_alpha_beta/gemm_xdl_alpha_beta.cpp)
set(CONV2D_FWD_XDL_INT8_SOURCE 9_conv2d_fwd_xdl_int8/conv2d_fwd_xdl_int8.cpp)
set(CONV2D_WRW_XDL_SOURCE 13_conv2d_backward_weight_xdl/main.cpp)
set(CONV3D_FWD_XDL_SOURCE 10_conv3d_fwd_xdl/conv3d_fwd_xdl.cpp)
set(CONVND_FWD_XDL_SOURCE 11_convnd_fwd_xdl/convnd_fwd_xdl.cpp)
set(CONV2D_BWD_DATA_XDL_SOURCE 12_conv2d_bwd_data_xdl/conv2d_bwd_data_xdl.cpp)
set(POOL2D_FWD_SOURCE 12_pool2d_fwd/pool2d_fwd.cpp)
set(REDUCE_BLOCKWISE_SOURCE 13_reduce_blockwise/reduce_blockwise.cpp)
add_executable(gemm_xdl ${GEMM_XDL_SOURCE}) function(add_example_executable EXAMPLE_NAME)
add_executable(gemm_xdl_int8 ${GEMM_XDL_INT8_SOURCE}) message("adding example ${EXAMPLE_NAME}")
add_executable(gemm_xdl_bf16 ${GEMM_XDL_BF16_SOURCE}) add_executable(${EXAMPLE_NAME} ${ARGN})
add_executable(gemm_xdl_bias_relu ${GEMM_XDL_BIAS_RELU_SOURCE}) target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
add_executable(gemm_xdl_bias_relu_add ${GEMM_XDL_BIAS_RELU_ADD_SOURCE}) add_dependencies(examples ${EXAMPLE_NAME})
add_executable(conv2d_fwd_xdl ${CONV2D_FWD_XDL_SOURCE}) endfunction(add_example_executable EXAMPLE_NAME)
add_executable(conv2d_fwd_xdl_bias_relu ${CONV2D_FWD_XDL_BIAS_RELU_SOURCE})
add_executable(conv2d_fwd_xdl_bias_relu_add ${CONV2D_FWD_XDL_BIAS_RELU_ADD_SOURCE})
add_executable(conv2d_fwd_xdl_bias_relu_atomic_add ${CONV2D_FWD_XDL_BIAS_RELU_ATOMIC_ADD_SOURCE})
add_executable(gemm_xdl_alpha_beta ${GEMM_XDL_ALPHA_BETA_SOURCE})
add_executable(conv2d_fwd_xdl_int8 ${CONV2D_FWD_XDL_INT8_SOURCE})
add_executable(conv2d_wrw_xdl ${CONV2D_WRW_XDL_SOURCE})
add_executable(conv3d_fwd_xdl ${CONV3D_FWD_XDL_SOURCE})
add_executable(convnd_fwd_xdl ${CONVND_FWD_XDL_SOURCE})
add_executable(conv2d_bwd_data_xdl ${CONV2D_BWD_DATA_XDL_SOURCE})
add_executable(pool2d_fwd ${POOL2D_FWD_SOURCE})
add_executable(reduce_blockwise ${REDUCE_BLOCKWISE_SOURCE})
target_link_libraries(gemm_xdl PRIVATE host_tensor)
target_link_libraries(gemm_xdl_int8 PRIVATE host_tensor)
target_link_libraries(gemm_xdl_bf16 PRIVATE host_tensor)
target_link_libraries(gemm_xdl_bias_relu PRIVATE host_tensor)
target_link_libraries(gemm_xdl_bias_relu_add PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl_bias_relu PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl_bias_relu_add PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl_bias_relu_atomic_add PRIVATE host_tensor)
target_link_libraries(gemm_xdl_alpha_beta PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl_int8 PRIVATE host_tensor)
target_link_libraries(conv2d_wrw_xdl PRIVATE host_tensor)
target_link_libraries(conv3d_fwd_xdl PRIVATE host_tensor)
target_link_libraries(convnd_fwd_xdl PRIVATE host_tensor)
target_link_libraries(conv2d_bwd_data_xdl PRIVATE host_tensor)
target_link_libraries(pool2d_fwd PRIVATE host_tensor)
target_link_libraries(reduce_blockwise PRIVATE host_tensor)
add_subdirectory(01_gemm)
add_subdirectory(02_gemm_alpha_beta)
add_subdirectory(03_gemm_bias_relu)
add_subdirectory(04_gemm_bias_relu_add)
add_subdirectory(05_conv2d_fwd)
add_subdirectory(06_conv2d_fwd_bias_relu)
add_subdirectory(07_conv2d_fwd_bias_relu_add)
add_subdirectory(08_conv3d_fwd)
add_subdirectory(09_convnd_fwd)
add_subdirectory(10_conv2d_bwd_data)
add_subdirectory(11_conv2d_bwd_wgt)
add_subdirectory(12_reduce)
add_subdirectory(13_pool2d_fwd)
add_subdirectory(host_tensor)
#ifndef CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP
#include <numeric>
#include <sstream>
namespace ck {
namespace driver {
struct CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
auto GetCompileParameterString() const
{
auto param = std::stringstream();
// clang-format off
param <<
" -DCK_PARAM_ABDataTypeEnum=" <<
ABDataTypeEnum <<
" -DCK_PARAM_AccDataTypeEnum=" <<
AccDataTypeEnum <<
" -DCK_PARAM_CDataTypeEnum=" <<
CDataTypeEnum <<
" -DCK_PARAM_BlockSize=" <<
BlockSize <<
" -DCK_PARAM_GN0=" <<
GN0 <<
" -DCK_PARAM_GK1=" <<
GK1 <<
" -DCK_PARAM_GM1PerBlockGM11="
<< GM1PerBlockGM11 <<
" -DCK_PARAM_GN1PerBlockGN11=" <<
GN1PerBlockGN11 <<
" -DCK_PARAM_GK0PerBlock=" <<
GK0PerBlock <<
" -DCK_PARAM_BM1PerThreadBM11=" <<
BM1PerThreadBM11 <<
" -DCK_PARAM_BN1PerThreadBN11=" <<
BN1PerThreadBN11 <<
" -DCK_PARAM_BK0PerThread=" <<
BK0PerThread <<
" -DCK_PARAM_BM10BN10ThreadClusterBM10Xs=" <<
BM10BN10ThreadClusterBM10Xs[0] << "," <<
BM10BN10ThreadClusterBM10Xs[1] <<
" -DCK_PARAM_BM10BN10ThreadClusterBN10Xs=" <<
BM10BN10ThreadClusterBN10Xs[0] << "," <<
BM10BN10ThreadClusterBN10Xs[1] <<
" -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=" <<
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0] << "," <<
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1] << "," <<
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2] << "," <<
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3] << "," <<
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4] <<
" -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1=" <<
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0] << "," <<
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1] << "," <<
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2] << "," <<
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3] << "," <<
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4] <<
" -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" <<
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0] << "," <<
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1] << "," <<
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2] << "," <<
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3] << "," <<
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4] <<
" -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" <<
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0] << "," <<
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1] << "," <<
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2] << "," <<
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3] << "," <<
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4] <<
" -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1=" <<
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0] << "," <<
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1] << "," <<
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2] << "," <<
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3] << "," <<
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4] <<
" -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1=" <<
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0] << "," <<
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1] << "," <<
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2] << "," <<
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3] << "," <<
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4] <<
" -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" <<
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0] << "," <<
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1] << "," <<
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2] << "," <<
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3] << "," <<
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4] <<
" -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" <<
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0] << "," <<
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1] << "," <<
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2] << "," <<
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3] << "," <<
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4] <<
" -DCK_PARAM_CThreadTransferDstScalarPerVector=" <<
CThreadTransferDstScalarPerVector <<
" -DCK_PARAM_HasMainKBlockLoop=" <<
static_cast<int>(HasMainKBlockLoop) <<
" -DCK_PARAM_HasDoubleTailKBlockLoop=" <<
static_cast<int>(HasDoubleTailKBlockLoop);
// clang-format on
return param.str();
}
ck::DataTypeEnum_t ABDataTypeEnum = ck::DataTypeEnum_t::Unknown;
ck::DataTypeEnum_t AccDataTypeEnum = ck::DataTypeEnum_t::Unknown;
ck::DataTypeEnum_t CDataTypeEnum = ck::DataTypeEnum_t::Unknown;
int BlockSize = -1;
int GN0 = -1;
int GK1 = -1;
int GM1PerBlockGM11 = -1;
int GN1PerBlockGN11 = -1;
int GK0PerBlock = -1;
int BM1PerThreadBM11 = -1;
int BN1PerThreadBN11 = -1;
int BK0PerThread = -1;
std::array<int, 2> BM10BN10ThreadClusterBM10Xs = {-1, -1};
std::array<int, 2> BM10BN10ThreadClusterBN10Xs = {-1, -1};
std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = {
-1, -1, -1, -1, -1};
std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = {
-1, -1, -1, -1, -1};
std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
-1, -1, -1, -1, -1};
std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
-1, -1, -1, -1, -1};
std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = {
-1, -1, -1, -1, -1};
std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = {
-1, -1, -1, -1, -1};
std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
-1, -1, -1, -1, -1};
std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
-1, -1, -1, -1, -1};
int CThreadTransferDstScalarPerVector = -1;
bool HasMainKBlockLoop = false;
bool HasDoubleTailKBlockLoop = false;
};
struct TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
ck::DataTypeEnum_t ABDataTypeEnum;
ck::DataTypeEnum_t CDataTypeEnum;
int BlockSize;
int GN0;
int GK1;
int GM1PerBlockGM11;
int GN1PerBlockGN11;
int GK0PerBlock;
int BM1PerThreadBM11;
int BN1PerThreadBN11;
int BK0PerThread;
std::array<int, 2> BM10BN10ThreadClusterBM10Xs;
std::array<int, 2> BM10BN10ThreadClusterBN10Xs;
std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
};
inline static auto generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw()
{
constexpr auto f32 = ck::DataTypeEnum_t::Float;
constexpr auto f16 = ck::DataTypeEnum_t::Half;
constexpr auto i8 = ck::DataTypeEnum_t::Int8;
return std::vector<TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw>{
// clang-format off
// fp32
{f32, f32, 256, 1, 1, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 1}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 1}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 256, 2, 1, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 1}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 256, 4, 1, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 256, 8, 1, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 1}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 128, 1, 1, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 1}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
// fp16
{f16, f16, 256, 1, 2, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 2}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 2}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 256, 2, 2, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 2}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 256, 4, 2, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 256, 8, 2, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 2}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 128, 1, 2, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 2}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
// i8
{ i8, i8, 256, 1, 4, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 4}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 4}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 256, 2, 4, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 4}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 256, 4, 4, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 256, 8, 4, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 4}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 128, 1, 4, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 4}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}
// clang-format on
};
}
// TODO make this common interface and write specs for it
struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
static auto
CalculateCompileParameterBasedOnTunable(const ConvolutionProblemDescriptor& conv_problem_desc,
const TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw& tunable)
{
const int C = conv_problem_desc.C;
const int Y = conv_problem_desc.Y;
const int X = conv_problem_desc.X;
const int Ho = conv_problem_desc.Ho;
const int Wo = conv_problem_desc.Wo;
if(!(conv_problem_desc.InDataTypeEnum == tunable.ABDataTypeEnum &&
conv_problem_desc.WeiDataTypeEnum == tunable.ABDataTypeEnum &&
conv_problem_desc.OutDataTypeEnum == tunable.CDataTypeEnum))
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
const auto ABDataTypeEnum = conv_problem_desc.InDataTypeEnum;
const auto CDataTypeEnum = conv_problem_desc.OutDataTypeEnum;
DataTypeEnum_t AccDataTypeEnum;
if(ABDataTypeEnum == DataTypeEnum_t::Float || ABDataTypeEnum == DataTypeEnum_t::Half)
{
AccDataTypeEnum = DataTypeEnum_t::Float;
}
else if(ABDataTypeEnum == DataTypeEnum_t::Int8)
{
AccDataTypeEnum = DataTypeEnum_t::Int32;
}
else
{
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
}
const int BlockSize = tunable.BlockSize;
const int GN0 = tunable.GN0;
const int GK1 = tunable.GK1;
const int GM11 = tunable.GM1PerBlockGM11;
const int GN11 = tunable.GN1PerBlockGN11;
const int GK0PerBlock = tunable.GK0PerBlock;
const int BM11 = tunable.BM1PerThreadBM11;
const int BN11 = tunable.BN1PerThreadBN11;
const int BK0PerThread = tunable.BK0PerThread;
const auto BM10BN10ThreadClusterBM10Xs = tunable.BM10BN10ThreadClusterBM10Xs;
const auto BM10BN10ThreadClusterBN10Xs = tunable.BM10BN10ThreadClusterBN10Xs;
const auto ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 =
tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
const auto ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 =
tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
const auto ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
const auto ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
const auto BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 =
tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
const auto BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 =
tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
const auto BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
const auto BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
// C threadwise copy: {BN11} or {BN} or {BN1} or {GN11} is Dst vector dim
const int CThreadTransferDstScalarPerVector = gcd(4, GN11, BN11, Ho * Wo);
const int C0 = GK1;
if(!(C % C0 == 0))
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
const int C1 = C / C0;
const int GK0 = C1 * Y * X;
if(!(GK0 % GK0PerBlock == 0))
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
const bool HasMainKBlockLoop = ((GK0 + GK0PerBlock) / (2 * GK0PerBlock) > 1);
const bool HasDoubleTailKBlockLoop = ((GK0 / GK0PerBlock) % 2 == 0);
return std::make_tuple(
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{
ABDataTypeEnum,
AccDataTypeEnum,
CDataTypeEnum,
BlockSize,
GN0,
GK1,
GM11,
GN11,
GK0PerBlock,
BM11,
BN11,
BK0PerThread,
BM10BN10ThreadClusterBM10Xs,
BM10BN10ThreadClusterBN10Xs,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
CThreadTransferDstScalarPerVector,
HasMainKBlockLoop,
HasDoubleTailKBlockLoop},
true);
}
static auto GetDefaultCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc)
{
for(const auto& tunable : generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw())
{
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param{};
bool found = false;
std::tie(compile_param, found) =
CalculateCompileParameterBasedOnTunable(conv_problem_desc, tunable);
if(found && IsValidCompileParameter(conv_problem_desc, compile_param))
return std::make_tuple(compile_param, true);
}
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
}
static bool IsApplicable(const ConvolutionProblemDescriptor& conv_problem_desc)
{
bool found = false;
std::tie(std::ignore, found) = GetDefaultCompileParameter(conv_problem_desc);
return found;
}
static bool
IsValidCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc,
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
{
const int N = conv_problem_desc.N;
const int K = conv_problem_desc.K;
const int C = conv_problem_desc.C;
const int Y = conv_problem_desc.Y;
const int X = conv_problem_desc.X;
const int Ho = conv_problem_desc.Ho;
const int Wo = conv_problem_desc.Wo;
const int GK1 = compile_param.GK1;
const int GN0 = compile_param.GN0;
const int GM11 = compile_param.GM1PerBlockGM11;
const int GN11 = compile_param.GN1PerBlockGN11;
const int BM11 = compile_param.BM1PerThreadBM11;
const int BN11 = compile_param.BN1PerThreadBN11;
const int C0 = GK1;
const int N0 = GN0;
if(!(C % C0 == 0))
return false;
const int C1 = C / C0;
if(!(N % N0 == 0))
return false;
const int N1 = N / N0;
const int GM0 = 1;
const int GM1 = K;
const int GN1 = N1 * Ho * Wo;
const int GK0 = C1 * Y * X;
// check data type
{
if(!(conv_problem_desc.InDataTypeEnum == conv_problem_desc.WeiDataTypeEnum &&
conv_problem_desc.InDataTypeEnum == compile_param.ABDataTypeEnum))
return false;
if(compile_param.ABDataTypeEnum == DataTypeEnum_t::Float ||
compile_param.ABDataTypeEnum == DataTypeEnum_t::Half)
{
if(!(compile_param.AccDataTypeEnum == DataTypeEnum_t::Float))
return false;
}
else if(compile_param.ABDataTypeEnum == DataTypeEnum_t::Int8)
{
if(!(compile_param.AccDataTypeEnum == DataTypeEnum_t::Int32))
return false;
}
}
// check gridwise contraction
{
if(!(GM1 % GM11 == 0 && GN1 % GN11 == 0 && GK0 % compile_param.GK0PerBlock == 0))
return false;
const bool has_main_k_block_loop =
((GK0 + compile_param.GK0PerBlock) / (2 * compile_param.GK0PerBlock) > 1);
const bool has_double_tail_k_block_loop = ((GK0 / compile_param.GK0PerBlock) % 2 == 0);
if(!(has_main_k_block_loop == compile_param.HasMainKBlockLoop &&
has_double_tail_k_block_loop == compile_param.HasDoubleTailKBlockLoop))
return false;
}
// check A blockwise copy
{
const auto block_slice_lengths =
std::array<int, 5>{compile_param.GK0PerBlock, GM0, 1, GM11, GK1};
const auto& cluster_lengths =
compile_param.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
const auto& thread_slice_lengths =
compile_param.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
const auto& src_vector_lengths =
compile_param.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
const auto& dst_vector_lengths =
compile_param.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
// check number of working thread
const int num_work_thread = std::accumulate(
cluster_lengths.begin(), cluster_lengths.end(), 1, std::multiplies<int>{});
if(!(compile_param.BlockSize >= num_work_thread))
return false;
// check block slice lengths vs thread slice lengths vs cluster lengths
for(int i = 0; i < 5; ++i)
{
if(!(cluster_lengths[i] * thread_slice_lengths[i] == block_slice_lengths[i]))
return false;
}
// check thread slice lengths vs vector lengths
for(int i = 0; i < 5; ++i)
{
if(!(thread_slice_lengths[i] % src_vector_lengths[i] == 0))
return false;
if(!(thread_slice_lengths[i] % dst_vector_lengths[i] == 0))
return false;
}
// check Src vectorization, GK0 is global mem vector dim
if(!(src_vector_lengths[1] == 1 && src_vector_lengths[2] == 1 &&
src_vector_lengths[3] == 1 && src_vector_lengths[4] == 1))
return false;
// check Dst vectorization, {GM11, GK1} are LDS vector dims
if(dst_vector_lengths[4] == GK1)
{ // vectorize on {GM11, GK1}
if(!(GM11 % dst_vector_lengths[3] == 0))
return false;
}
else
{ // vectorize on {GK1} only
if(!(GK1 % dst_vector_lengths[4] == 0))
return false;
if(!(dst_vector_lengths[3] == 1))
return false;
}
}
// check B blockwise copy
{
const auto block_slice_lengths =
std::array<int, 5>{compile_param.GK0PerBlock, GN0, 1, GN11, GK1};
const auto& cluster_lengths =
compile_param.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
const auto& thread_slice_lengths =
compile_param.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
const auto& src_vector_lengths =
compile_param.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
const auto& dst_vector_lengths =
compile_param.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
// check number of working thread
const int num_work_thread = std::accumulate(
cluster_lengths.begin(), cluster_lengths.end(), 1, std::multiplies<int>{});
if(!(compile_param.BlockSize >= num_work_thread))
return false;
// check block slice lengths vs thread slice lengths vs cluster lengths
for(int i = 0; i < 5; ++i)
{
if(!(cluster_lengths[i] * thread_slice_lengths[i] == block_slice_lengths[i]))
return false;
}
// check thread slice lengths vs vector lengths
for(int i = 0; i < 5; ++i)
{
if(!(thread_slice_lengths[i] % src_vector_lengths[i] == 0 &&
thread_slice_lengths[i] % dst_vector_lengths[i] == 0))
return false;
}
// check Src vectorization: {GN11} is global mem vector dim
if(!(src_vector_lengths[0] == 1 && src_vector_lengths[1] == 1 &&
src_vector_lengths[2] == 1 && src_vector_lengths[4] == 1))
return false;
// check Src tensor layout related vectorization
if(Y == 1 && X == 1 && conv_problem_desc.ConvStrideH == 1 &&
conv_problem_desc.ConvStrideW == 1 && conv_problem_desc.InLeftPadH == 0 &&
conv_problem_desc.InLeftPadW == 0 && conv_problem_desc.InRightPadH == 0 &&
conv_problem_desc.InRightPadW == 0)
{
if(!((Ho * Wo) % src_vector_lengths[3] == 0))
return false;
}
else if(conv_problem_desc.ConvStrideW == 1 && conv_problem_desc.InLeftPadW == 0 &&
conv_problem_desc.InRightPadW == 0)
{
if(!(Wo % src_vector_lengths[3] == 0))
return false;
}
else
{
if(!(src_vector_lengths[3] == 1))
return false;
}
// check Dst vectorization: {GN11, GK1} are LDS vector dims
if(dst_vector_lengths[4] == GK1)
{ // vectorize on {GN11, GK1}
if(!(GN11 % dst_vector_lengths[3] == 0))
return false;
}
else
{ // vectorize on {GK1} only
if(!(dst_vector_lengths[3] == 1))
return false;
if(!(GK1 % dst_vector_lengths[4] == 0))
return false;
}
}
// check blockwise GEMM
{
const int BM10 = std::accumulate(compile_param.BM10BN10ThreadClusterBM10Xs.begin(),
compile_param.BM10BN10ThreadClusterBM10Xs.end(),
1,
std::multiplies<int>{});
const int BN10 = std::accumulate(compile_param.BM10BN10ThreadClusterBN10Xs.begin(),
compile_param.BM10BN10ThreadClusterBN10Xs.end(),
1,
std::multiplies<int>{});
if(!(compile_param.BlockSize == BM10 * BN10))
return false;
const int BM = GM0 * GM11;
const int BN = GN0 * GN11;
const int BM1 = BM10 * BM11;
const int BN1 = BN10 * BN11;
if(!(BM % BM1 == 0 && BN % BN1 == 0))
return false;
const int BM0 = BM / BM1;
const int BN0 = BN / BN1;
// blockwise GEMM currently only support BM0 == 2 && BN0 == 2
if(!(BM0 == 2 && BN0 == 2))
return false;
if(!(compile_param.GK0PerBlock % compile_param.BK0PerThread == 0))
return false;
}
// check C threadwise copy
{
// {BN11} or {BN} or {BN1} or {GN11} is Dst vector dim
const int dst_vector_len_gn11 = compile_param.CThreadTransferDstScalarPerVector;
// check slice length vs Dst vector length:
if(!(BN11 % dst_vector_len_gn11 == 0 && GN11 % dst_vector_len_gn11 == 0))
return false;
// check Dst memory layout related vectorization:
if(!((Ho * Wo) % compile_param.CThreadTransferDstScalarPerVector == 0))
return false;
}
return true;
};
static int GetBlockSize(const ConvolutionProblemDescriptor&,
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
{
return compile_param.BlockSize;
}
static int GetGridSize(const ConvolutionProblemDescriptor& conv_problem_desc,
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
{
const int N = conv_problem_desc.N;
const int K = conv_problem_desc.K;
const int Ho = conv_problem_desc.Ho;
const int Wo = conv_problem_desc.Wo;
const int N0 = compile_param.GN0;
const int N1 = N / N0;
const int GM1 = K;
const int GN1 = N1 * Ho * Wo;
const int GM11 = compile_param.GM1PerBlockGM11;
const int GN11 = compile_param.GN1PerBlockGN11;
const int GM10 = GM1 / GM11;
const int GN10 = GN1 / GN11;
return GM10 * GN10;
}
static std::size_t GetWorkSpaceSize(const ConvolutionProblemDescriptor&,
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw&)
{
// workspace is used for save transformed tensor descritpors created by prepare kernel
return 4096L;
}
static std::size_t GetMaxWorkSpaceSize(const ConvolutionProblemDescriptor&) { return 4096L; }
static auto GetTunableList()
{
return generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw();
}
};
} // namespace driver
} // namespace ck
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP
struct tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
{
int BlockSize;
int MPerBlock;
int NPerBlock;
int KPerBlock;
int M1PerThread;
int N1PerThread;
int KPerThread;
int M1N1ThreadClusterM10;
int M1N1ThreadClusterN10;
int M1N1ThreadClusterM11;
int M1N1ThreadClusterN11;
std::array<int, 3> ABlockTransferThreadSliceLengths_K_M0_M1;
std::array<int, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int, 3> ABlockTransferSrcAccessOrder;
int ABlockTransferSrcVectorDim;
int ABlockTransferSrcScalarPerVector;
int ABlockTransferDstScalarPerVector_M1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 3> BBlockTransferThreadSliceLengths_K_N0_N1;
std::array<int, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int, 3> BBlockTransferSrcAccessOrder;
int BBlockTransferSrcVectorDim;
int BBlockTransferSrcScalarPerVector;
int BBlockTransferDstScalarPerVector_N1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 6> CThreadTransferSrcDstAccessOrder;
int CThreadTransferSrcDstVectorDim;
int CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw = {
256, 128, 128, 8, 4, 4, 1,
8, 8, 2, 2, {4, 1, 1}, {2, 1, 128}, {2, 1, 0},
{2, 1, 0}, 0, 4, 1, false, {4, 1, 1}, {2, 1, 128},
{0, 1, 2}, {0, 1, 2}, 2, 1, 1, false, {3, 4, 5, 0, 1, 2},
5, 1};
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
struct tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
{
int BlockSize;
int MPerBlock;
int NPerBlock;
int KPerBlock;
int MPerXDL;
int NPerXDL;
int K1;
int MRepeat;
int NRepeat;
std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int, 3> ABlockTransferSrcAccessOrder;
int ABlockTransferSrcVectorDim;
int ABlockTransferSrcScalarPerVector;
int ABlockTransferDstScalarPerVector_K1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int, 3> BBlockTransferSrcAccessOrder;
int BBlockTransferSrcVectorDim;
int BBlockTransferSrcScalarPerVector;
int BBlockTransferDstScalarPerVector_K1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 8> CThreadTransferSrcDstAccessOrder;
int CThreadTransferSrcDstVectorDim;
int CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw = {
256, // BlockSize
128, // MPerBlock,
128, // NPerBlock,
4, // KPerBlock,
32, // MPerXDL,
32, // NPerXDL,
4, // K1,
2, // MRepeat,
2, // NRepeat,
{1, 2, 4}, // ABlockTransferThreadSliceLengths_K0_M_K1,
{4, 64, 1}, // ABlockTransferThreadClusterLengths_K0_M_K1,
{1, 0, 2}, // ABlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // ABlockTransferSrcAccessOrder,
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector,
4, // ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
{1, 2, 4}, // BBlockTransferThreadSliceLengths_K0_N_K1,
{4, 64, 1}, // BBlockTransferThreadClusterLengths_K0_N_K1,
{0, 2, 1}, // BBlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // BBlockTransferSrcAccessOrder,
1, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_K1
false, // BThreadTransferSrcResetCoordinateAfterRun
{3, 0, 1, 2, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
7, // CThreadTransferSrcDstVectorDim,
1 // CThreadTransferDstScalarPerVector
};
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
struct tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
{
int BlockSize;
int MPerBlock;
int NPerBlock;
int KPerBlock;
int MPerWave;
int NPerWave;
int K1;
int MRepeat;
int NRepeat;
std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int, 3> ABlockTransferSrcAccessOrder;
int ABlockTransferSrcVectorDim;
int ABlockTransferSrcScalarPerVector;
int ABlockTransferDstScalarPerVector_K1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int, 3> BBlockTransferSrcAccessOrder;
int BBlockTransferSrcVectorDim;
int BBlockTransferSrcScalarPerVector;
int BBlockTransferDstScalarPerVector_K1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 8> CThreadTransferSrcDstAccessOrder;
int CThreadTransferSrcDstVectorDim;
int CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk = {
256, // BlockSize
128, // MPerBlock,
128, // NPerBlock,
4, // KPerBlock,
32, // MPerWave,
32, // NPerWave,
4, // K1,
2, // MRepeat,
2, // NRepeat,
{1, 2, 4}, // ABlockTransferThreadSliceLengths_K0_M_K1,
{4, 64, 1}, // ABlockTransferThreadClusterLengths_K0_M_K1,
{1, 0, 2}, // ABlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // ABlockTransferSrcAccessOrder,
2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector,
4, // ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
{1, 2, 4}, // BBlockTransferThreadSliceLengths_K0_N_K1,
{4, 64, 1}, // BBlockTransferThreadClusterLengths_K0_N_K1,
{1, 0, 2}, // BBlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // BBlockTransferSrcAccessOrder,
2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_K1
false, // BThreadTransferSrcResetCoordinateAfterRun
{2, 3, 0, 1, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
7, // CThreadTransferSrcDstVectorDim,
1 // CThreadTransferDstScalarPerVector
};
#endif
#ifndef CONVOLUTION_PROBLEM_DESCRIPTOR
#define CONVOLUTION_PROBLEM_DESCRIPTOR
namespace ck {
namespace driver {
struct ConvolutionProblemDescriptor
{
ConvolutionProblemDescriptor() = default;
ConvolutionProblemDescriptor(int N_,
int K_,
int C_,
int Y_,
int X_,
int Hi_,
int Wi_,
int Ho_,
int Wo_,
int ConvStrideH_,
int ConvStrideW_,
int ConvDilationH_,
int ConvDilationW_,
int InLeftPadH_,
int InLeftPadW_,
int InRightPadH_,
int InRightPadW_,
ck::DataTypeEnum_t InDataTypeEnum_,
ck::DataTypeEnum_t WeiDataTypeEnum_,
ck::DataTypeEnum_t OutDataTypeEnum_)
: N{N_},
K{K_},
C{C_},
Y{Y_},
X{X_},
Hi{Hi_},
Wi{Wi_},
Ho{Ho_},
Wo{Wo_},
ConvStrideH{ConvStrideH_},
ConvStrideW{ConvStrideW_},
ConvDilationH{ConvDilationH_},
ConvDilationW{ConvDilationW_},
InLeftPadH{InLeftPadH_},
InLeftPadW{InLeftPadW_},
InRightPadH{InRightPadH_},
InRightPadW{InRightPadW_},
InDataTypeEnum{InDataTypeEnum_},
WeiDataTypeEnum{WeiDataTypeEnum_},
OutDataTypeEnum{OutDataTypeEnum_}
{
}
int N;
int K;
int C;
int Y;
int X;
int Hi;
int Wi;
int Ho;
int Wo;
int ConvStrideH;
int ConvStrideW;
int ConvDilationH;
int ConvDilationW;
int InLeftPadH;
int InLeftPadW;
int InRightPadH;
int InRightPadW;
ck::DataTypeEnum_t InDataTypeEnum;
ck::DataTypeEnum_t WeiDataTypeEnum;
ck::DataTypeEnum_t OutDataTypeEnum;
std::size_t CalculateFlop() const { return 2L * N * K * C * Y * X * Ho * Wo; }
};
} // namespace driver
} // namespace ck
#endif
#ifndef CK_SOLVER_COMMON_HPP
#define CK_SOLVER_COMMON_HPP
namespace ck {
namespace driver {
// greatest common divisor, aka highest common factor
inline int gcd(int x, int y)
{
if(x < 0)
{
return gcd(-x, y);
}
else if(y < 0)
{
return gcd(x, -y);
}
else if(x == y || x == 0)
{
return y;
}
else if(y == 0)
{
return x;
}
else if(x > y)
{
return gcd(x % y, y);
}
else
{
return gcd(x, y % x);
}
}
template <typename X,
typename... Ys,
typename std::enable_if<sizeof...(Ys) >= 2, bool>::type = false>
auto gcd(X x, Ys... ys)
{
return gcd(x, gcd(ys...));
}
} // namespace driver
} // namespace ck
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment