"tests/git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "dc86bd421ed98777112c64f61940321631c11806"
Commit 88b77181 authored by Chao Liu's avatar Chao Liu
Browse files

rename files, add header guards, add namespace ck

parent 05e04665
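Every header in this diff gets the same mechanical treatment. As a minimal sketch of the new convention (the header name "example.hpp" and the guard CK_EXAMPLE_HPP are illustrative placeholders, not taken from this commit):

// Pattern this commit applies to each public header: an explicit include
// guard replaces #pragma once, and all declarations are wrapped in namespace ck.
#ifndef CK_EXAMPLE_HPP
#define CK_EXAMPLE_HPP

#include "common.hpp"

namespace ck {

// ... declarations that previously sat at global scope ...

} // namespace ck
#endif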
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
                                                  const Tensor<T>& in,
@@ -79,7 +81,7 @@ void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
                            WoPerThread,
                            InBlockCopyDataPerRead,
                            WeiBlockCopyDataPerRead>;
-    float time = launch_kernel(run_gridwise_convolution<gridwise_conv, T>,
+    float time = launch_kernel(run_gridwise_convolution_kernel<gridwise_conv, T>,
                                dim3(GridSize),
                                dim3(BlockSize),
                                0,
...
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
@@ -478,7 +480,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
 #elif 0
         GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #elif 1
-        GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
+        GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
 #endif
             <GridSize,
              BlockSize,
@@ -509,7 +511,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
              WeiBlockCopyDataPerRead_K,
              OutThreadCopyDataPerWrite_N>{};
-    float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+    float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                dim3(GridSize),
                                dim3(BlockSize),
                                0,
...
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
};
make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
std::thread::hardware_concurrency());
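    // The CYXK descriptor above is packed, so after this reorder K is the
    // fastest-varying (stride-1) dimension; presumably this is what lets the
    // kernel read weights in vectors along K (see WeiBlockCopyDataPerRead_K below).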
// output
auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_khwn_device_buf.ToDevice(out_khwn.mData.data());
#if 1
// for 3x3, 34x34, v1r3, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 16;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 2;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 1
// for 3x3, 34x34, v1r3, Vega 20
constexpr index_t BlockSize = 256;
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 16;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 2, 16>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 2;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 0
// for 3x3, 28x28, v1r2, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 2>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 2;
constexpr index_t InBlockReorderDataPerWrite_N = 4;
using WeiBlockCopyClusterLengths = Sequence<4, 1, 32>;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 0
// for 3x3, 28x28, v1r3, Pascal, bad
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 1;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#endif
constexpr index_t GridSize =
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
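    // Worked example of the GridSize formula (sizes are illustrative:
    // N = 64, K = 128, Ho = Wo = 32, with the Pascal v1r3 tile above):
    //   GridSize = ceil(64/2) * ceil(128/128) * ceil(32/2) * ceil(32/16)
    //            = 32 * 1 * 16 * 2 = 1024 workgroups.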
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if 0
GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
#elif 0
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
#elif 1
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
#endif
<GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_cyxk_desc),
decltype(out_khwn_desc),
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
HoPerThread,
WoPerThread,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockReorderSrcSubLengths_NCHW,
InBlockReorderSrcClusterLengths_NCHW,
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
InBlockReorderDataPerRead_W,
InBlockReorderDataPerWrite_N,
WeiBlockCopyClusterLengths,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_N>{};
float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
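        // Units check on the printout above: FLOPs / 1e9 / milliseconds equals
        // FLOPs / (1e12 * seconds), i.e. TFlop/s, so the printed label is consistent.
        // usleep takes microseconds, so this sleeps for the kernel's elapsed
        // time (ms * 1000), capped at 10 ms, between repeats.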
}
out_khwn_device_buf.FromDevice(out_khwn.mData.data());
// reorder output
auto f_reorder_khwn2nkhw = [&](auto k, auto ho, auto wo, auto n) {
out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n);
};
make_ParallelTensorFunctor(f_reorder_khwn2nkhw, K, Ho, Wo, N)(
std::thread::hardware_concurrency());
}
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
@@ -313,10 +315,10 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
         constexpr auto gridwise_conv =
-#if 1
+#if 0
             GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
 #else
-            GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
+            GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer
 #endif
                 <GridSize,
                  BlockSize,
@@ -351,7 +353,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
                  WeiBlockCopyDataPerRead_K,
                  OutThreadCopyDataPerWrite_W>{};
-    float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+    float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                dim3(GridSize),
                                dim3(BlockSize),
                                0,
...
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 #include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
                                                         const Tensor<T>& in_nchw,
@@ -303,7 +305,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
                  WeiBlockCopyDataPerRead,
                  OutThreadCopyDataPerWrite>{};
-    float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+    float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                dim3(GridSize),
                                dim3(BlockSize),
                                0,
...
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
@@ -102,7 +104,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
 #if 0
         GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
 #else
-        GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
+        GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
 #endif
             <GridSize,
              BlockSize,
@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
              WeiBlockCopyDataPerAccess_K>{};
 #if 1
-    float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+    float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                dim3(GridSize),
                                dim3(BlockSize),
                                0,
...
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
@@ -96,7 +98,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
 #if 0
         GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
 #else
-        GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
+        GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
 #endif
             <GridSize,
              BlockSize,
@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
              WeiBlockCopySrcDataPerRead_E,
              WeiBlockCopyDstDataPerWrite_K>{};
-    float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+    float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                dim3(GridSize),
                                dim3(BlockSize),
                                0,
...
@@ -3,6 +3,8 @@
 #include "device.hpp"
 #include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
+using namespace ck;
 template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
 void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
                                                            const Tensor<TInWei>& in_nchw,
...
@@ -3,6 +3,8 @@
 #include "device.hpp"
 #include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
 void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
                                                               const Tensor<T>& in_nchw,
...
@@ -3,19 +3,19 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <stdlib.h>
-#include "config.h"
+#include "config.hpp"
 #include "tensor.hpp"
 #include "ConstantTensorDescriptor.hpp"
 #include "conv_common.hpp"
 #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
-//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
-#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 #include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
+using namespace ck;
 struct GeneratorTensor_1
 {
     template <class... Is>
@@ -419,7 +419,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 3x3, 34x34
     constexpr index_t N = 64;
     constexpr index_t C = 256;
@@ -633,15 +633,9 @@ int main(int argc, char* argv[])
 #if 1
 #if 0
-    device_direct_convolution_1
-#elif 0
     device_convolution_direct_v2_nchw_kcyx_nkhw
-#elif 0
-    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
 #elif 0
     device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
-#elif 0
-    device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
 #elif 0
     device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
 #elif 0
...
configure_file("${PROJECT_SOURCE_DIR}/src/include/config.h.in" "${PROJECT_BINARY_DIR}/src/include/config.h") configure_file("${PROJECT_SOURCE_DIR}/src/include/config.hpp.in" "${PROJECT_BINARY_DIR}/src/include/config.hpp")
set(TENSOR_SOURCE set(TENSOR_SOURCE
tensor.cpp; tensor.cpp;
......
#include "config.h" #include "config.hpp"
#include "device.hpp" #include "device.hpp"
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
......
-#pragma once
+#ifndef CK_ARRAY_HPP
+#define CK_ARRAY_HPP
 #include "Sequence.hpp"
 #include "functional2.hpp"
+namespace ck {
 template <class TData, index_t NSize>
 struct Array
 {
@@ -96,7 +100,7 @@ __host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData
     static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");
-    return Array<TData, NSize>{old_array.mSize[IRs]...};
+    return Array<TData, NSize>{old_array[IRs]...};
 }
 template <class TData, index_t NSize, class MapOld2New>
@@ -180,7 +184,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData,
 {
     Array<TData, NSize> result;
-    auto f = mod_conv::plus<index_t>{};
+    auto f = math::plus<index_t>{};
     static_for<0, NSize, 1>{}(
         lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -195,7 +199,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData,
 {
     Array<TData, NSize> result;
-    auto f = mod_conv::minus<index_t>{};
+    auto f = math::minus<index_t>{};
     static_for<0, NSize, 1>{}(
         lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -212,7 +216,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is.
     Array<TData, NSize> result;
-    auto f = mod_conv::plus<index_t>{};
+    auto f = math::plus<index_t>{};
     static_for<0, NSize, 1>{}(
         lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -229,7 +233,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is.
     Array<TData, NSize> result;
-    auto f = mod_conv::minus<index_t>{};
+    auto f = math::minus<index_t>{};
     static_for<0, NSize, 1>{}(
         lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -246,7 +250,7 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
     Array<TData, NSize> result;
-    auto f = mod_conv::multiplies<index_t>{};
+    auto f = math::multiplies<index_t>{};
     static_for<0, NSize, 1>{}(
         lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -263,7 +267,7 @@ __host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSi
     Array<TData, NSize> result;
-    auto f = mod_conv::minus<index_t>{};
+    auto f = math::minus<index_t>{};
     static_for<0, NSize, 1>{}(
         lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -368,3 +372,6 @@ __host__ __device__ void print_Array(const char* s, Array<T, NSize> a)
             a[9]);
     });
 }
+} // namespace ck
+#endif
-#pragma once
+#ifndef CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
+#define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
 #include "common.hpp"
+namespace ck {
 template <index_t NRow_, index_t NCol_, index_t RowStride_>
 struct ConstantMatrixDescriptor
 {
@@ -57,3 +61,7 @@ __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s)
     printf("%s NRow %u NCol %u RowStride %u\n", s, desc.NRow(), desc.NCol(), desc.RowStride());
 }
+} // namespace ck
+#endif
-#pragma once
+#ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP
+#define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP
 #include "common.hpp"
 #include "ConstantTensorDescriptor.hpp"
+namespace ck {
 // OriginalTensorDesc : ConstantTensorDescriptor<...>
 //     it's the tensor whose dimensions are to be merged
 // OriginalDimMergeSeqs : Sequence<...>...
@@ -184,3 +188,6 @@ __host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDe
 {
     print_ConstantTensorDescriptor(s, TDesc::GetOriginalTensorDescriptor());
 }
+} // namespace ck
+#endif
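To make the OriginalTensorDesc / OriginalDimMergeSeqs comments above concrete, a hedged usage sketch; the factory name make_ConstantMergedTensorDescriptor and its call shape are assumptions for illustration, not taken from this diff:

// Hypothetical illustration (assumed API): view a packed C-Y-X-K weight
// tensor as a three-dimensional C, (Y*X), K tensor by merging Y and X.
constexpr auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
constexpr auto wei_c_yx_k_desc = make_ConstantMergedTensorDescriptor(
    wei_cyxk_desc, Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{});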
-#pragma once
+#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_HPP
+#define CK_CONSTANT_TENSOR_DESCRIPTOR_HPP
 #include "common.hpp"
+namespace ck {
 template <class Lengths>
 __host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
 {
     return reverse_inclusive_scan_sequence(
-               Lengths{}.PopFront(), mod_conv::multiplies<index_t>{}, Number<1>{})
+               Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
         .PushBack(Number<1>{});
 }
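A quick sanity check on calculate_tensor_strides_packed (example lengths mine, not from the diff): for Lengths = Sequence<2, 3, 4>, the reverse inclusive scan of {3, 4} under multiplication seeded with Number<1> gives {12, 4}, and PushBack(Number<1>) yields the packed strides {12, 4, 1}.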
@@ -13,7 +17,7 @@ template <class Lengths, index_t Align>
 __host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number<Align>)
 {
     constexpr index_t L_back_align =
-        Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
+        Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
     return calculate_tensor_strides_packed(
         Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
@@ -100,7 +104,7 @@ struct ConstantTensorDescriptor
     __host__ __device__ static constexpr index_t GetElementSize()
     {
-        return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{});
+        return accumulate_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{});
     }
     template <class Align = Number<1>>
@@ -109,7 +113,7 @@ struct ConstantTensorDescriptor
         // This is WRONG! align should be applied to the last memory rank, not the last tensor
         // dimension
         constexpr index_t element_space_unaligned = accumulate_on_sequence(
-            (GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{});
+            (GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
         return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
     }
@@ -161,8 +165,7 @@ struct ConstantTensorDescriptor
         constexpr auto multi_id = Sequence<Is...>{};
-        return accumulate_on_sequence(
-            multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
+        return accumulate_on_sequence(multi_id * GetStrides(), math::plus<index_t>{}, Number<0>{});
     }
     // emulate constexpr lambda
@@ -323,7 +326,7 @@ struct ConstantTensorDescriptor
         constexpr auto fold_intervals = Sequence<FoldIntervals...>{};
         constexpr index_t fold_intervals_product =
-            accumulate_on_sequence(fold_intervals, mod_conv::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});
         constexpr auto unfold_length = GetLength(Number<IDim>{});
         constexpr auto unfold_stride = GetStride(Number<IDim>{});
@@ -341,7 +344,7 @@ struct ConstantTensorDescriptor
         constexpr auto fold_strides =
             Number<unfold_stride>{} *
             reverse_inclusive_scan_sequence(
-                fold_intervals.PushBack(Number<1>{}), mod_conv::multiplies<index_t>{}, Number<1>{});
+                fold_intervals.PushBack(Number<1>{}), math::multiplies<index_t>{}, Number<1>{});
         // left and right
         constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::SeqType{};
@@ -376,7 +379,7 @@ struct ConstantTensorDescriptor
         // unfolded length, stride
         constexpr index_t unfold_length = accumulate_on_sequence(
-            GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});
+            GetLengths().Extract(middle), math::multiplies<index_t>{}, Number<1>{});
         constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
@@ -511,3 +514,6 @@ print_ConstantTensorDescriptor(const char* s,
             Strides...);
     });
 }
+} // namespace ck
+#endif
-#pragma once
+#ifndef CK_SEQUENCE_HPP
+#define CK_SEQUENCE_HPP
 #include "integral_constant.hpp"
 #include "functional.hpp"
+namespace ck {
 template <class Seq>
 struct is_valid_sequence_map;
@@ -547,3 +551,6 @@ __host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
     static_if<nsize == 10>{}(
         [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
 }
+} // namespace ck
+#endif
-#pragma once
+#ifndef CK_AMD_INLINE_ASM_HPP
+#define CK_AMD_INLINE_ASM_HPP
 #include "common.hpp"
 #define NO_VM_WAIT 0
@@ -7,6 +9,8 @@
 #define NO_DS_WRITE 0
 #define NO_GLB_READ 0
+namespace ck {
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
@@ -759,3 +763,6 @@ ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t off
 }
 #endif
 }
+} // namespace ck
+#endif
-#pragma once
+#ifndef CK_BLOCKWISE_2D_TENSOR_OP_HPP
+#define CK_BLOCKWISE_2D_TENSOR_OP_HPP
 #include "common.hpp"
 #include "ConstantTensorDescriptor.hpp"
+namespace ck {
 template <index_t BlockSize, class Float, class DstDesc, class F>
 __device__ void
 blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
@@ -192,7 +196,7 @@ struct Blockwise2dTensorCopy1
         // but we need to make sure dst stride0 is big enough,
         // so that the out-of-bound write won't contaminate next line in dst
         constexpr index_t L1 = CopyLengths{}.Get(I1);
-        constexpr index_t read_per_d1 = mod_conv::integer_divide_ceil(L1, DataPerRead);
+        constexpr index_t read_per_d1 = math::integer_divide_ceil(L1, DataPerRead);
         static_assert(read_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0),
                       "wrong! out-of-bound write will contaminate next line!\n");
@@ -209,7 +213,7 @@ struct Blockwise2dTensorCopy1
         constexpr index_t L0 = CopyLengths{}.Get(I0);
         constexpr index_t L1 = CopyLengths{}.Get(I1);
-        constexpr index_t read_per_d1 = mod_conv::integer_divide_ceil(L1, DataPerRead);
+        constexpr index_t read_per_d1 = math::integer_divide_ceil(L1, DataPerRead);
         constexpr auto ref_desc = make_ConstantTensorDescriptor(Sequence<L0, read_per_d1>{});
@@ -676,7 +680,7 @@ struct Blockwise2dTensorCopy3
         }
     }
-#if USE_AMD_INLINE_ASM
+#if CK_USE_AMD_INLINE_ASM
     __device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
                                                  Float* p_clipboard) const
     {
@@ -796,3 +800,7 @@ struct Blockwise2dTensorCopy3
     }
 #endif
 };
+} // namespace ck
+#endif
-#pragma once
+#ifndef CK_BLOCKWISE_3D_TENSOR_OP_HPP
+#define CK_BLOCKWISE_3D_TENSOR_OP_HPP
 #include "common.hpp"
 #include "ConstantTensorDescriptor.hpp"
+namespace ck {
 template <index_t BlockSize,
           class Float,
           class SrcDesc,
@@ -33,7 +37,7 @@ struct Blockwise3dTensorCopy1
         // but we need to make sure dst stride2 is big enough,
         // so that the out-of-bound write won't contaminate next line in dst
         constexpr index_t L2 = CopyLengths{}.Get(I2);
-        constexpr index_t read_per_d2 = mod_conv::integer_divide_ceil(L2, DataPerRead);
+        constexpr index_t read_per_d2 = math::integer_divide_ceil(L2, DataPerRead);
         static_assert(read_per_d2 * DataPerRead <= DstDesc{}.GetStride(I1),
                       "wrong! out-of-bound write will contaminate next line!\n");
@@ -52,7 +56,7 @@ struct Blockwise3dTensorCopy1
         constexpr index_t L1 = CopyLengths{}.Get(I1);
         constexpr index_t L2 = CopyLengths{}.Get(I2);
-        constexpr index_t read_per_d2 = mod_conv::integer_divide_ceil(L2, DataPerRead);
+        constexpr index_t read_per_d2 = math::integer_divide_ceil(L2, DataPerRead);
         constexpr auto ref_desc = make_ConstantTensorDescriptor(Sequence<L0, L1, read_per_d2>{});
@@ -146,7 +150,7 @@ struct Blockwise3dTensorCopy3
         // we allow out-of-bound read from src in D2 dimension,
         // but we need to make sure dst stride is big enough,
         // so that the out-of-bound write won't contaminate next line in dst
-        constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+        constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
         static_assert(nloop_d2 * thread_per_d2 * DataPerRead <= DstDesc{}.GetStride(I1),
                       "wrong! out-of-bound write will contaminate next line!\n");
@@ -158,7 +162,7 @@ struct Blockwise3dTensorCopy3
"wrrong! BlockSize is not big enough for ThreadPerDims!"); "wrrong! BlockSize is not big enough for ThreadPerDims!");
         constexpr index_t num_active_thread =
-            accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{});
         if(BlockSize > num_active_thread)
         {
@@ -205,7 +209,7 @@ struct Blockwise3dTensorCopy3
         constexpr index_t nloop_d0 = L0 / thread_per_d0;
         constexpr index_t nloop_d1 = L1 / thread_per_d1;
-        constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+        constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
 #pragma unroll
         for(index_t iloop_d0 = 0; iloop_d0 < nloop_d0; ++iloop_d0)
@@ -251,7 +255,7 @@ struct Blockwise3dTensorCopy3
         constexpr index_t nloop_d0 = L0 / thread_per_d0;
         constexpr index_t nloop_d1 = L1 / thread_per_d1;
-        constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+        constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
         return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2;
     }
@@ -283,7 +287,7 @@ struct Blockwise3dTensorCopy3
         constexpr index_t nloop_d0 = L0 / thread_per_d0;
         constexpr index_t nloop_d1 = L1 / thread_per_d1;
-        constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+        constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
         constexpr auto clipboard_desc =
             make_ConstantTensorDescriptor(Sequence<nloop_d0, nloop_d1, nloop_d2 * DataPerRead>{});
@@ -339,7 +343,7 @@ struct Blockwise3dTensorCopy3
         constexpr index_t nloop_d0 = L0 / thread_per_d0;
         constexpr index_t nloop_d1 = L1 / thread_per_d1;
-        constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+        constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
         constexpr auto clipboard_desc =
             make_ConstantTensorDescriptor(Sequence<nloop_d0, nloop_d1, nloop_d2 * DataPerRead>{});
@@ -368,3 +372,7 @@ struct Blockwise3dTensorCopy3
     }
 }
 };
+} // namespace ck
+#endif