Reorganize files, Part 1 (#119)

* delete obsolete files * move files * build * update cmake * update cmake * fix build * reorg examples * update cmake for example and test

Reorganize files, Part 1 (#119)
* delete obsolete files * move files * build * update cmake * update cmake * fix build * reorg examples * update cmake for example and test
5d37d7bf · Chao Liu · GitHub · 245f7414 · 5d37d7bf · 5d37d7bf
Unverified Commit 5d37d7bf authored Mar 08, 2022 by Chao Liu Committed by GitHub Mar 08, 2022
20 changed files
--- a/example/11_conv2d_bwd_wgt/CMakeLists.txt
+++ b/example/11_conv2d_bwd_wgt/CMakeLists.txt
+add_example_executable(example_conv2d_bwd_wgt_xdl conv2d_bwd_wgt_xdl.cpp)
--- a/example/13_conv2d_backward_weight_xdl/README.md
+++ b/example/13_conv2d_backward_weight_xdl/README.md
--- a/example/13_conv2d_backward_weight_xdl/main.cpp
+++ b/example/13_conv2d_backward_weight_xdl/main.cpp
--- a/example/12_reduce/CMakeLists.txt
+++ b/example/12_reduce/CMakeLists.txt
+add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
--- a/example/13_reduce_blockwise/reduce_blockwise.cpp
+++ b/example/13_reduce_blockwise/reduce_blockwise.cpp
@@ -14,7 +14,6 @@
 #include "device_reduce_blockwise.hpp"
 #include "host_reduce_util.hpp"
 #include "host_generic_reduction.hpp"
-
 #include "reduction_enums.hpp"
 #include "reduction_operator_mapping.hpp"


--- a/example/13_pool2d_fwd/CMakeLists.txt
+++ b/example/13_pool2d_fwd/CMakeLists.txt
+add_example_executable(example_pool2d_fwd pool2d_fwd.cpp)
--- a/example/12_pool2d_fwd/pool2d_fwd.cpp
+++ b/example/12_pool2d_fwd/pool2d_fwd.cpp
@@ -12,7 +12,7 @@
 #include "device_tensor.hpp"
 #include "tensor_layout.hpp"
 #include "reduction_operator.hpp"
-#include "device_operation/include/device_pool2d_fwd_nhwc_nhwc.hpp"
+#include "device_pool2d_fwd_nhwc_nhwc.hpp"

 using InDataType  = ck::half_t;
 using OutDataType = ck::half_t;

--- a/example/7_conv2d_fwd_xdl_bias_relu_atomic_add/README.md
+++ b/example/7_conv2d_fwd_xdl_bias_relu_atomic_add/README.md
-# Instructions for ```conv_xdl_bias_relu_add``` Example
-
-## Docker script
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-
-## Build ```conv_xdl_bias_relu_add```
-```bash
-mkdir build && cd build
-```
-
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-
-```bash
- make -j conv_xdl_bias_relu_add
-```
-
-## Run ```conv_xdl_bias_relu_add```
-```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-./example/conv_xdl_bias_relu_add 0 1 5
-```
-
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
-wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
-out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-bias_k: dim 1, lengths {256}, strides {1}
-resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
-arg.b_grid_desc_k0_n_k1_{216, 256, 8}
-arg.c_grid_desc_m_n_{ 165888, 256}
-arg.c0_grid_desc_m_n_{ 165888, 256}
-arg.c1_grid_desc_m_n_{ 165888, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 1.71779 ms, 85.4396 TFlops, 194.2 GB/s
-```
--- a/example/7_conv2d_fwd_xdl_bias_relu_atomic_add/conv2d_fwd_xdl_bias_relu_atomic_add.cpp
+++ b/example/7_conv2d_fwd_xdl_bias_relu_atomic_add/conv2d_fwd_xdl_bias_relu_atomic_add.cpp
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
-#include "element_wise_operation.hpp"
-#include "convolution_utility.hpp"
-
-using InDataType  = ck::half_t;
-using WeiDataType = ck::half_t;
-using OutDataType = ck::half_t;
-using AccDataType = float;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using InLayout  = ck::tensor_layout::convolution::NHWC;
-using WeiLayout = ck::tensor_layout::convolution::KYXC;
-using OutLayout = ck::tensor_layout::convolution::NHWK;
-
-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
-
-static constexpr auto MemoryAtomicAdd = ck::InMemoryDataOperationEnum_t::AtomicAdd;
-
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
-
-// clang-format off
-using DeviceConvFwdInstance = ck::tensor_operation::device::
-    DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-    // clang-format off
-//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out|             Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise|    GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-//      |          |            |            |            |   Operation|   Operation|     Operation|   DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-//      |          |            |            |            |            |            |              |                |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, MemoryAtomicAdd, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1,  8, 1, 1,32>,               2>;
-// clang-format on
-
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename InElementOp,
-          typename WeiElementOp,
-          typename OutElementOp>
-void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
-                                const Tensor<TWei>& wei_k_c_y_x,
-                                Tensor<TOut>& out_n_k_ho_wo,
-                                const Tensor<TOut>& bias_k,
-                                const std::vector<ck::index_t>& conv_strides,
-                                const std::vector<ck::index_t>& conv_dilations,
-                                const std::vector<ck::index_t>& in_left_pads,
-                                const std::vector<ck::index_t>& /* in_right_pads */,
-                                const InElementOp& in_element_op,
-                                const WeiElementOp& wei_element_op,
-                                const OutElementOp& out_element_op)
-{
-    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-        float v_acc = 0;
-
-        for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c)
-        {
-            for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
-                for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
-                    if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in_n_c_hi_wi.mDesc.GetLengths()[3])
-                    {
-                        float v_in;
-                        float v_wei;
-
-                        in_element_op(v_in, static_cast<const float>(in_n_c_hi_wi(n, c, hi, wi)));
-                        wei_element_op(v_wei, static_cast<const float>(wei_k_c_y_x(k, c, y, x)));
-
-                        v_acc += v_in * v_wei;
-                    }
-                }
-            }
-        }
-
-        float v_out;
-
-        out_element_op(v_out, v_acc, static_cast<float>(bias_k(k)));
-
-        out_n_k_ho_wo(n, k, ho, wo) += v_out;
-    };
-
-    make_ParallelTensorFunctor(f_nchw,
-                               out_n_k_ho_wo.mDesc.GetLengths()[0],
-                               out_n_k_ho_wo.mDesc.GetLengths()[1],
-                               out_n_k_ho_wo.mDesc.GetLengths()[2],
-                               out_n_k_ho_wo.mDesc.GetLengths()[3])(
-        std::thread::hardware_concurrency());
-}
-
-int main(int argc, char* argv[])
-{
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
-
-    // Conv shape
-    ck::index_t N               = 128;
-    ck::index_t K               = 256;
-    ck::index_t C               = 192;
-    ck::index_t Y               = 3;
-    ck::index_t X               = 3;
-    ck::index_t Hi              = 71;
-    ck::index_t Wi              = 71;
-    ck::index_t conv_stride_h   = 2;
-    ck::index_t conv_stride_w   = 2;
-    ck::index_t conv_dilation_h = 1;
-    ck::index_t conv_dilation_w = 1;
-    ck::index_t in_left_pad_h   = 1;
-    ck::index_t in_left_pad_w   = 1;
-    ck::index_t in_right_pad_h  = 1;
-    ck::index_t in_right_pad_w  = 1;
-
-    if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
-    }
-    else if(argc == 19)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
-
-        N               = std::stoi(argv[4]);
-        K               = std::stoi(argv[5]);
-        C               = std::stoi(argv[6]);
-        Y               = std::stoi(argv[7]);
-        X               = std::stoi(argv[8]);
-        Hi              = std::stoi(argv[9]);
-        Wi              = std::stoi(argv[10]);
-        conv_stride_h   = std::stoi(argv[11]);
-        conv_stride_w   = std::stoi(argv[12]);
-        conv_dilation_h = std::stoi(argv[13]);
-        conv_dilation_w = std::stoi(argv[14]);
-        in_left_pad_h   = std::stoi(argv[15]);
-        in_left_pad_w   = std::stoi(argv[16]);
-        in_right_pad_h  = std::stoi(argv[17]);
-        in_right_pad_w  = std::stoi(argv[18]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
-        printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
-               "RightPx\n");
-        exit(0);
-    }
-
-    const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
-    const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
-    const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
-    const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
-    const auto output_spatial_lengths =
-        ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi},
-                                                                              {Y, X},
-                                                                              conv_filter_strides,
-                                                                              conv_filter_dilations,
-                                                                              input_left_pads,
-                                                                              input_right_pads);
-
-    const ck::index_t Ho = output_spatial_lengths[0];
-    const ck::index_t Wo = output_spatial_lengths[1];
-
-    // tensor layout
-    auto f_host_tensor_descriptor = [](std::size_t N_,
-                                       std::size_t C_,
-                                       std::size_t H,
-                                       std::size_t W,
-                                       auto layout) {
-        if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
-                     ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
-                     ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
-        }
-        else if constexpr(ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::NHWC>::value ||
-                          ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::KYXC>::value ||
-                          ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::NHWK>::value)
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
-        }
-    };
-
-    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
-    Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
-    Tensor<OutDataType> out_n_k_ho_wo_host_result(
-        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
-    Tensor<OutDataType> out_n_k_ho_wo_device_result(
-        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
-
-    // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias_k(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
-
-    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
-    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
-    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
-    std::cout << "bias_k: " << bias_k.mDesc << std::endl;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-        out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-        bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-        break;
-    default:
-        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
-        out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
-        bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
-    }
-
-    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) *
-                             out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
-    DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
-
-    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
-    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
-    out_device_buf.ToDevice(out_n_k_ho_wo_host_result.mData.data());
-    bias_device_buf.ToDevice(bias_k.mData.data());
-
-    auto conv    = DeviceConvFwdInstance{};
-    auto invoker = conv.MakeInvoker();
-    auto argument =
-        conv.MakeArgument(static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
-                          static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                          static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                          static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
-                          N,
-                          K,
-                          C,
-                          std::vector<ck::index_t>{Hi, Wi},
-                          std::vector<ck::index_t>{Y, X},
-                          std::vector<ck::index_t>{Ho, Wo},
-                          conv_filter_strides,
-                          conv_filter_dilations,
-                          input_left_pads,
-                          input_right_pads,
-                          InElementOp{},
-                          WeiElementOp{},
-                          OutElementOp{});
-
-    if(!conv.IsSupportedArgument(argument))
-    {
-        throw std::runtime_error(
-            "wrong! device operator with the specified compilation parameters does "
-            "not support this problem");
-    }
-
-    float ave_time = invoker.Run(argument, nrepeat);
-
-    std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
-
-    std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
-                            sizeof(WeiDataType) * (K * C * Y * X) +
-                            sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K);
-
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
-
-    if(do_verification)
-    {
-        host_reference_calculation(in_n_c_hi_wi,
-                                   wei_k_c_y_x,
-                                   out_n_k_ho_wo_host_result,
-                                   bias_k,
-                                   conv_filter_strides,
-                                   conv_filter_dilations,
-                                   input_left_pads,
-                                   input_right_pads,
-                                   InElementOp{},
-                                   WeiElementOp{},
-                                   OutElementOp{});
-
-        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-
-        check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
-    }
-}
--- a/example/9_conv2d_fwd_xdl_int8/README.md
+++ b/example/9_conv2d_fwd_xdl_int8/README.md
-# Instructions for ```conv2d_fwd_xdl``` Example
-
-## Docker script
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-
-## Build ```conv2d_fwd_xdl```
-```bash
-mkdir build && cd build
-```
-
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-
-```bash
- make -j conv2d_fwd_xdl
-```
-
-## Run ```conv2d_fwd_xdl_int8```
-```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-./example/conv2d_fwd_xdl_int8 0 1 5
-```
-
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
-wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
-out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
-arg.b_grid_desc_k0_n_k1_{216, 256, 8}
-arg.c_grid_desc_m_n_{ 165888, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 1.43206 ms, 102.486 TFlops, 232.947 GB/s
-```
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
 include_directories(BEFORE
-    ${PROJECT_SOURCE_DIR}
-    ${PROJECT_SOURCE_DIR}/host/host_tensor/include
-    ${PROJECT_SOURCE_DIR}/host/device/include
-    ${PROJECT_SOURCE_DIR}/device_operation/include
-    ${PROJECT_SOURCE_DIR}/reference_operation/include
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
-    ${PROJECT_SOURCE_DIR}/external/rocm/include
-    ${PROJECT_SOURCE_DIR}/device_operation_reference/include
+    ${PROJECT_SOURCE_DIR}/include/ck
+    ${PROJECT_SOURCE_DIR}/include/ck/utility
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_description
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor
+    ${PROJECT_SOURCE_DIR}/include/ck/problem_transform
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/grid
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/block
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
+    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
+    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
+    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
+    ${PROJECT_SOURCE_DIR}/external/include/half
 )

-set(GEMM_XDL_SOURCE 1_gemm_xdl/gemm_xdl.cpp)
-set(GEMM_XDL_INT8_SOURCE 1_gemm_xdl/gemm_xdl_int8.cpp)
-set(GEMM_XDL_BF16_SOURCE 1_gemm_xdl/gemm_xdl_bf16.cpp)
-set(GEMM_XDL_BIAS_RELU_SOURCE 2_gemm_xdl_bias_relu/gemm_xdl_bias_relu.cpp)
-set(GEMM_XDL_BIAS_RELU_ADD_SOURCE 3_gemm_xdl_bias_relu_add/gemm_xdl_bias_relu_add.cpp)
-set(CONV2D_FWD_XDL_SOURCE 4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp)
-set(CONV2D_FWD_XDL_BIAS_RELU_SOURCE 5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp)
-set(CONV2D_FWD_XDL_BIAS_RELU_ADD_SOURCE 6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp)
-set(CONV2D_FWD_XDL_BIAS_RELU_ATOMIC_ADD_SOURCE 7_conv2d_fwd_xdl_bias_relu_atomic_add/conv2d_fwd_xdl_bias_relu_atomic_add.cpp)
-set(GEMM_XDL_ALPHA_BETA_SOURCE 8_gemm_xdl_alpha_beta/gemm_xdl_alpha_beta.cpp)
-set(CONV2D_FWD_XDL_INT8_SOURCE 9_conv2d_fwd_xdl_int8/conv2d_fwd_xdl_int8.cpp)
-set(CONV2D_WRW_XDL_SOURCE 13_conv2d_backward_weight_xdl/main.cpp)
-set(CONV3D_FWD_XDL_SOURCE 10_conv3d_fwd_xdl/conv3d_fwd_xdl.cpp)
-set(CONVND_FWD_XDL_SOURCE 11_convnd_fwd_xdl/convnd_fwd_xdl.cpp)
-set(CONV2D_BWD_DATA_XDL_SOURCE 12_conv2d_bwd_data_xdl/conv2d_bwd_data_xdl.cpp)
-set(POOL2D_FWD_SOURCE 12_pool2d_fwd/pool2d_fwd.cpp)
-set(REDUCE_BLOCKWISE_SOURCE 13_reduce_blockwise/reduce_blockwise.cpp)
+add_custom_target(examples)

-add_executable(gemm_xdl ${GEMM_XDL_SOURCE})
-add_executable(gemm_xdl_int8 ${GEMM_XDL_INT8_SOURCE})
-add_executable(gemm_xdl_bf16 ${GEMM_XDL_BF16_SOURCE})
-add_executable(gemm_xdl_bias_relu ${GEMM_XDL_BIAS_RELU_SOURCE})
-add_executable(gemm_xdl_bias_relu_add ${GEMM_XDL_BIAS_RELU_ADD_SOURCE})
-add_executable(conv2d_fwd_xdl ${CONV2D_FWD_XDL_SOURCE})
-add_executable(conv2d_fwd_xdl_bias_relu ${CONV2D_FWD_XDL_BIAS_RELU_SOURCE})
-add_executable(conv2d_fwd_xdl_bias_relu_add ${CONV2D_FWD_XDL_BIAS_RELU_ADD_SOURCE})
-add_executable(conv2d_fwd_xdl_bias_relu_atomic_add ${CONV2D_FWD_XDL_BIAS_RELU_ATOMIC_ADD_SOURCE})
-add_executable(gemm_xdl_alpha_beta ${GEMM_XDL_ALPHA_BETA_SOURCE})
-add_executable(conv2d_fwd_xdl_int8 ${CONV2D_FWD_XDL_INT8_SOURCE})
-add_executable(conv2d_wrw_xdl ${CONV2D_WRW_XDL_SOURCE})
-add_executable(conv3d_fwd_xdl ${CONV3D_FWD_XDL_SOURCE})
-add_executable(convnd_fwd_xdl ${CONVND_FWD_XDL_SOURCE})
-add_executable(conv2d_bwd_data_xdl ${CONV2D_BWD_DATA_XDL_SOURCE})
-add_executable(pool2d_fwd ${POOL2D_FWD_SOURCE})
-add_executable(reduce_blockwise ${REDUCE_BLOCKWISE_SOURCE})
-
-target_link_libraries(gemm_xdl PRIVATE host_tensor)
-target_link_libraries(gemm_xdl_int8 PRIVATE host_tensor)
-target_link_libraries(gemm_xdl_bf16 PRIVATE host_tensor)
-target_link_libraries(gemm_xdl_bias_relu PRIVATE host_tensor)
-target_link_libraries(gemm_xdl_bias_relu_add PRIVATE host_tensor)
-target_link_libraries(conv2d_fwd_xdl PRIVATE host_tensor)
-target_link_libraries(conv2d_fwd_xdl_bias_relu PRIVATE host_tensor)
-target_link_libraries(conv2d_fwd_xdl_bias_relu_add PRIVATE host_tensor)
-target_link_libraries(conv2d_fwd_xdl_bias_relu_atomic_add PRIVATE host_tensor)
-target_link_libraries(gemm_xdl_alpha_beta PRIVATE host_tensor)
-target_link_libraries(conv2d_fwd_xdl_int8 PRIVATE host_tensor)
-target_link_libraries(conv2d_wrw_xdl PRIVATE host_tensor)
-target_link_libraries(conv3d_fwd_xdl PRIVATE host_tensor)
-target_link_libraries(convnd_fwd_xdl PRIVATE host_tensor)
-target_link_libraries(conv2d_bwd_data_xdl PRIVATE host_tensor)
-target_link_libraries(pool2d_fwd PRIVATE host_tensor)
-target_link_libraries(reduce_blockwise PRIVATE host_tensor)
+function(add_example_executable EXAMPLE_NAME)
+    message("adding example ${EXAMPLE_NAME}")
+    add_executable(${EXAMPLE_NAME} ${ARGN})
+    target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
+    add_dependencies(examples ${EXAMPLE_NAME})
+endfunction(add_example_executable EXAMPLE_NAME)

+add_subdirectory(01_gemm)
+add_subdirectory(02_gemm_alpha_beta)
+add_subdirectory(03_gemm_bias_relu)
+add_subdirectory(04_gemm_bias_relu_add)
+add_subdirectory(05_conv2d_fwd)
+add_subdirectory(06_conv2d_fwd_bias_relu)
+add_subdirectory(07_conv2d_fwd_bias_relu_add)
+add_subdirectory(08_conv3d_fwd)
+add_subdirectory(09_convnd_fwd)
+add_subdirectory(10_conv2d_bwd_data)
+add_subdirectory(11_conv2d_bwd_wgt)
+add_subdirectory(12_reduce)
+add_subdirectory(13_pool2d_fwd)
--- a/external/half/include/half.hpp
+++ b/external/half/include/half.hpp
--- a/host/CMakeLists.txt
+++ b/host/CMakeLists.txt
-add_subdirectory(host_tensor)
--- a/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp
+++ b/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp
--- a/host/solver/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp
+++ b/host/solver/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp
-#ifndef CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP
-#define CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP
-
-struct tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
-{
-    int BlockSize;
-
-    int MPerBlock;
-    int NPerBlock;
-    int KPerBlock;
-
-    int M1PerThread;
-    int N1PerThread;
-    int KPerThread;
-
-    int M1N1ThreadClusterM10;
-    int M1N1ThreadClusterN10;
-    int M1N1ThreadClusterM11;
-    int M1N1ThreadClusterN11;
-
-    std::array<int, 3> ABlockTransferThreadSliceLengths_K_M0_M1;
-    std::array<int, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
-    std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
-    std::array<int, 3> ABlockTransferSrcAccessOrder;
-    int ABlockTransferSrcVectorDim;
-    int ABlockTransferSrcScalarPerVector;
-    int ABlockTransferDstScalarPerVector_M1;
-    bool AThreadTransferSrcResetCoordinateAfterRun;
-
-    std::array<int, 3> BBlockTransferThreadSliceLengths_K_N0_N1;
-    std::array<int, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
-    std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
-    std::array<int, 3> BBlockTransferSrcAccessOrder;
-    int BBlockTransferSrcVectorDim;
-    int BBlockTransferSrcScalarPerVector;
-    int BBlockTransferDstScalarPerVector_N1;
-    bool BThreadTransferSrcResetCoordinateAfterRun;
-
-    std::array<int, 6> CThreadTransferSrcDstAccessOrder;
-    int CThreadTransferSrcDstVectorDim;
-    int CThreadTransferDstScalarPerVector;
-};
-
-static tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
-    default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw = {
-        256,       128,       128, 8, 4,         4,           1,
-        8,         8,         2,   2, {4, 1, 1}, {2, 1, 128}, {2, 1, 0},
-        {2, 1, 0}, 0,         4,   1, false,     {4, 1, 1},   {2, 1, 128},
-        {0, 1, 2}, {0, 1, 2}, 2,   1, 1,         false,       {3, 4, 5, 0, 1, 2},
-        5,         1};
-#endif
--- a/host/solver/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/solver/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp
-#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
-#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
-
-struct tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
-{
-    int BlockSize;
-
-    int MPerBlock;
-    int NPerBlock;
-    int KPerBlock;
-
-    int MPerXDL;
-    int NPerXDL;
-    int K1;
-
-    int MRepeat;
-    int NRepeat;
-
-    std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
-    std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
-    std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
-    std::array<int, 3> ABlockTransferSrcAccessOrder;
-    int ABlockTransferSrcVectorDim;
-    int ABlockTransferSrcScalarPerVector;
-    int ABlockTransferDstScalarPerVector_K1;
-    bool AThreadTransferSrcResetCoordinateAfterRun;
-
-    std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
-    std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
-    std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
-    std::array<int, 3> BBlockTransferSrcAccessOrder;
-    int BBlockTransferSrcVectorDim;
-    int BBlockTransferSrcScalarPerVector;
-    int BBlockTransferDstScalarPerVector_K1;
-    bool BThreadTransferSrcResetCoordinateAfterRun;
-
-    std::array<int, 8> CThreadTransferSrcDstAccessOrder;
-    int CThreadTransferSrcDstVectorDim;
-    int CThreadTransferDstScalarPerVector;
-};
-
-static tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
-    default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw = {
-        256,                      // BlockSize
-        128,                      // MPerBlock,
-        128,                      // NPerBlock,
-        4,                        // KPerBlock,
-        32,                       // MPerXDL,
-        32,                       // NPerXDL,
-        4,                        // K1,
-        2,                        // MRepeat,
-        2,                        // NRepeat,
-        {1, 2, 4},                // ABlockTransferThreadSliceLengths_K0_M_K1,
-        {4, 64, 1},               // ABlockTransferThreadClusterLengths_K0_M_K1,
-        {1, 0, 2},                // ABlockTransferThreadClusterArrangeOrder,
-        {1, 0, 2},                // ABlockTransferSrcAccessOrder,
-        2,                        // ABlockTransferSrcVectorDim
-        1,                        // ABlockTransferSrcScalarPerVector,
-        4,                        // ABlockTransferDstScalarPerVector_K1,
-        false,                    // AThreadTransferSrcResetCoordinateAfterRun,
-        {1, 2, 4},                // BBlockTransferThreadSliceLengths_K0_N_K1,
-        {4, 64, 1},               // BBlockTransferThreadClusterLengths_K0_N_K1,
-        {0, 2, 1},                // BBlockTransferThreadClusterArrangeOrder,
-        {1, 0, 2},                // BBlockTransferSrcAccessOrder,
-        1,                        // BBlockTransferSrcVectorDim
-        1,                        // BBlockTransferSrcScalarPerVector
-        4,                        // BBlockTransferDstScalarPerVector_K1
-        false,                    // BThreadTransferSrcResetCoordinateAfterRun
-        {3, 0, 1, 2, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
-        7,                        // CThreadTransferSrcDstVectorDim,
-        1                         // CThreadTransferDstScalarPerVector
-};
-#endif
--- a/host/solver/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp
+++ b/host/solver/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp
-#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
-#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
-
-struct tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
-{
-    int BlockSize;
-
-    int MPerBlock;
-    int NPerBlock;
-    int KPerBlock;
-
-    int MPerWave;
-    int NPerWave;
-    int K1;
-
-    int MRepeat;
-    int NRepeat;
-
-    std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
-    std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
-    std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
-    std::array<int, 3> ABlockTransferSrcAccessOrder;
-    int ABlockTransferSrcVectorDim;
-    int ABlockTransferSrcScalarPerVector;
-    int ABlockTransferDstScalarPerVector_K1;
-    bool AThreadTransferSrcResetCoordinateAfterRun;
-
-    std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
-    std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
-    std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
-    std::array<int, 3> BBlockTransferSrcAccessOrder;
-    int BBlockTransferSrcVectorDim;
-    int BBlockTransferSrcScalarPerVector;
-    int BBlockTransferDstScalarPerVector_K1;
-    bool BThreadTransferSrcResetCoordinateAfterRun;
-
-    std::array<int, 8> CThreadTransferSrcDstAccessOrder;
-    int CThreadTransferSrcDstVectorDim;
-    int CThreadTransferDstScalarPerVector;
-};
-
-static tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
-    default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk = {
-        256,                      // BlockSize
-        128,                      // MPerBlock,
-        128,                      // NPerBlock,
-        4,                        // KPerBlock,
-        32,                       // MPerWave,
-        32,                       // NPerWave,
-        4,                        // K1,
-        2,                        // MRepeat,
-        2,                        // NRepeat,
-        {1, 2, 4},                // ABlockTransferThreadSliceLengths_K0_M_K1,
-        {4, 64, 1},               // ABlockTransferThreadClusterLengths_K0_M_K1,
-        {1, 0, 2},                // ABlockTransferThreadClusterArrangeOrder,
-        {1, 0, 2},                // ABlockTransferSrcAccessOrder,
-        2,                        // ABlockTransferSrcVectorDim
-        4,                        // ABlockTransferSrcScalarPerVector,
-        4,                        // ABlockTransferDstScalarPerVector_K1,
-        false,                    // AThreadTransferSrcResetCoordinateAfterRun,
-        {1, 2, 4},                // BBlockTransferThreadSliceLengths_K0_N_K1,
-        {4, 64, 1},               // BBlockTransferThreadClusterLengths_K0_N_K1,
-        {1, 0, 2},                // BBlockTransferThreadClusterArrangeOrder,
-        {1, 0, 2},                // BBlockTransferSrcAccessOrder,
-        2,                        // BBlockTransferSrcVectorDim
-        4,                        // BBlockTransferSrcScalarPerVector
-        4,                        // BBlockTransferDstScalarPerVector_K1
-        false,                    // BThreadTransferSrcResetCoordinateAfterRun
-        {2, 3, 0, 1, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
-        7,                        // CThreadTransferSrcDstVectorDim,
-        1                         // CThreadTransferDstScalarPerVector
-};
-#endif
--- a/host/solver/include/convolution_problem_descriptor.hpp
+++ b/host/solver/include/convolution_problem_descriptor.hpp
-#ifndef CONVOLUTION_PROBLEM_DESCRIPTOR
-#define CONVOLUTION_PROBLEM_DESCRIPTOR
-
-namespace ck {
-namespace driver {
-
-struct ConvolutionProblemDescriptor
-{
-    ConvolutionProblemDescriptor() = default;
-
-    ConvolutionProblemDescriptor(int N_,
-                                 int K_,
-                                 int C_,
-                                 int Y_,
-                                 int X_,
-                                 int Hi_,
-                                 int Wi_,
-                                 int Ho_,
-                                 int Wo_,
-                                 int ConvStrideH_,
-                                 int ConvStrideW_,
-                                 int ConvDilationH_,
-                                 int ConvDilationW_,
-                                 int InLeftPadH_,
-                                 int InLeftPadW_,
-                                 int InRightPadH_,
-                                 int InRightPadW_,
-                                 ck::DataTypeEnum_t InDataTypeEnum_,
-                                 ck::DataTypeEnum_t WeiDataTypeEnum_,
-                                 ck::DataTypeEnum_t OutDataTypeEnum_)
-        : N{N_},
-          K{K_},
-          C{C_},
-          Y{Y_},
-          X{X_},
-          Hi{Hi_},
-          Wi{Wi_},
-          Ho{Ho_},
-          Wo{Wo_},
-          ConvStrideH{ConvStrideH_},
-          ConvStrideW{ConvStrideW_},
-          ConvDilationH{ConvDilationH_},
-          ConvDilationW{ConvDilationW_},
-          InLeftPadH{InLeftPadH_},
-          InLeftPadW{InLeftPadW_},
-          InRightPadH{InRightPadH_},
-          InRightPadW{InRightPadW_},
-          InDataTypeEnum{InDataTypeEnum_},
-          WeiDataTypeEnum{WeiDataTypeEnum_},
-          OutDataTypeEnum{OutDataTypeEnum_}
-    {
-    }
-
-    int N;
-    int K;
-    int C;
-    int Y;
-    int X;
-    int Hi;
-    int Wi;
-    int Ho;
-    int Wo;
-    int ConvStrideH;
-    int ConvStrideW;
-    int ConvDilationH;
-    int ConvDilationW;
-    int InLeftPadH;
-    int InLeftPadW;
-    int InRightPadH;
-    int InRightPadW;
-
-    ck::DataTypeEnum_t InDataTypeEnum;
-    ck::DataTypeEnum_t WeiDataTypeEnum;
-    ck::DataTypeEnum_t OutDataTypeEnum;
-
-    std::size_t CalculateFlop() const { return 2L * N * K * C * Y * X * Ho * Wo; }
-};
-
-} // namespace driver
-} // namespace ck
-#endif
--- a/host/solver/include/solver_common.hpp
+++ b/host/solver/include/solver_common.hpp
-#ifndef CK_SOLVER_COMMON_HPP
-#define CK_SOLVER_COMMON_HPP
-
-namespace ck {
-namespace driver {
-
-// greatest common divisor, aka highest common factor
-inline int gcd(int x, int y)
-{
-    if(x < 0)
-    {
-        return gcd(-x, y);
-    }
-    else if(y < 0)
-    {
-        return gcd(x, -y);
-    }
-    else if(x == y || x == 0)
-    {
-        return y;
-    }
-    else if(y == 0)
-    {
-        return x;
-    }
-    else if(x > y)
-    {
-        return gcd(x % y, y);
-    }
-    else
-    {
-        return gcd(x, y % x);
-    }
-}
-
-template <typename X,
-          typename... Ys,
-          typename std::enable_if<sizeof...(Ys) >= 2, bool>::type = false>
-auto gcd(X x, Ys... ys)
-{
-    return gcd(x, gcd(ys...));
-}
-
-} // namespace driver
-} // namespace ck
-#endif
--- a/composable_kernel/include/config.hpp
+++ b/composable_kernel/include/config.hpp