"vscode:/vscode.git/clone" did not exist on "38086572c83a3882961ef5b14e66e7a01de54ddf"
Unverified Commit e823d518 authored by Chao Liu, committed by GitHub

ckProfiler and device-level XDL GEMM operator (#48)

* add DeviceGemmXdl

* update script

* fix naming issue

* fix comment

* output HostTensorDescriptor

* rename

* padded GEMM for fwd v4r4r4 nhwc

* refactor

* refactor

* refactor

* adding ckProfiler

* adding ckProfiler

* refactor

* fix tuning parameter bug

* add more gemm instances

* add more fp16 GEMM instances

* fix profiler driver

* fix bug in tuning parameter

* add fp32 gemm instances

* small fix

* refactor

* rename

* refactor gemm profiler; adding DeviceConv and conv profiler

* refactor

* fix

* add conv profiler

* refactor

* adding more GEMM and Conv instance

* Create README.md

Add build instruction for ckProfiler

* Create README.md

Add Readme for gemm_xdl example

* Update README.md

Remove build instruction from top most folder

* Update README.md

* clean up
parent 6014185a
@@ -11,7 +11,6 @@
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv_bwd_weight.hpp"
#include "device_tensor.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
@@ -19,6 +18,15 @@
#include "device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r5_xdlops_atomic_nhwc_kyxc_nhwk.hpp"
enum ConvTensorLayout
{
NCHW,
NHWC,
CHWN,
NCHWc,
NHWCc
};
#define USE_DYNAMIC_MODE 1
#define USE_CONV_WRW_V4R4R2_XDL_NCHW 0
#define USE_CONV_WRW_V4R4R4_XDL_NHWC 0
@@ -35,6 +43,92 @@ enum ConvBackwardWeightAlgo
V4R4R5XDLATOMICNHWC, // 4
};
template <typename TOut,
typename TIn,
typename TWei,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_convolution_backward_weight(const Tensor<TOut>& out,
const Tensor<TIn>& in,
Tensor<TWei>& wei,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
double v = 0;
for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
{
for(int ho = 0; ho < out.mDesc.GetLengths()[2]; ++ho)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int wo = 0; wo < out.mDesc.GetLengths()[3]; ++wo)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
v += static_cast<const double>(in(n, c, hi, wi)) *
static_cast<const double>(out(n, k, ho, wo));
}
}
}
}
wei(k, c, y, x) = v;
};
auto f_kyxc = [&](auto k, auto y, auto x, auto c) {
double v = 0;
for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
{
for(int ho = 0; ho < out.mDesc.GetLengths()[1]; ++ho)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int wo = 0; wo < out.mDesc.GetLengths()[2]; ++wo)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
wi < in.mDesc.GetLengths()[2])
{
v += static_cast<const double>(in(n, hi, wi, c)) *
static_cast<const double>(out(n, ho, wo, k));
}
}
}
}
wei(k, y, x, c) = v;
};
if(layout == ConvTensorLayout::NCHW)
{
make_ParallelTensorFunctor(f_kcyx,
wei.mDesc.GetLengths()[0],
wei.mDesc.GetLengths()[1],
wei.mDesc.GetLengths()[2],
wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else if(layout == ConvTensorLayout::NHWC)
{
make_ParallelTensorFunctor(f_kyxc,
wei.mDesc.GetLengths()[0],
wei.mDesc.GetLengths()[1],
wei.mDesc.GetLengths()[2],
wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("wrong! not supported layout");
}
}
int main(int argc, char* argv[])
{
using namespace ck;
@@ -414,7 +508,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
host_direct_convolution_backward_weights(out,
host_convolution_backward_weight(out,
in,
wei_host,
make_tuple(conv_stride_h, conv_stride_w),
@@ -3,15 +3,6 @@
#include "tensor_descriptor.hpp"
enum ConvTensorLayout
{
NCHW,
NHWC,
CHWN,
NCHWc,
NHWCc
};
template <typename... InDesc,
typename... WeiDesc,
typename ConvStrides,
#pragma once
#include "host_tensor.hpp"
#include "conv_common.hpp"
template <typename TIn,
typename TWei,
@@ -8,19 +9,16 @@ template <typename TIn,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution(const Tensor<TIn>& in,
void host_conv_nchw_kcyx_nkhw(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
const InRightPads&)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I0 = ck::Number<0>{};
constexpr auto I1 = ck::Number<1>{};
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
double v = 0;
@@ -44,281 +42,9 @@ void host_direct_convolution(const Tensor<TIn>& in,
out(n, k, ho, wo) = v;
};
auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
double v = 0;
for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
{
for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
wi < in.mDesc.GetLengths()[2])
{
v += static_cast<const double>(in(n, hi, wi, c)) *
static_cast<const double>(wei(k, y, x, c));
}
}
}
}
out(n, ho, wo, k) = v;
};
if(layout == ConvTensorLayout::NCHW)
{
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else if(layout == ConvTensorLayout::NHWC)
{
make_ParallelTensorFunctor(f_nhwc,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("wrong! not supported layout");
}
}
template <typename TIn, typename TWei, typename TOut, typename InLeftPads, typename InRightPads>
void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw,
InLeftPads,
InRightPads)
{
using namespace ck;
constexpr std::size_t HoPerTile = 2;
constexpr std::size_t WoPerTile = 2;
std::size_t N = in_nchw.mDesc.GetLengths()[0];
std::size_t C = in_nchw.mDesc.GetLengths()[1];
std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
std::size_t Ho = out_nkhw.mDesc.GetLengths()[2];
std::size_t Wo = out_nkhw.mDesc.GetLengths()[3];
index_t h_pad_low = InLeftPads{}.Get(Number<0>{});
index_t w_pad_low = InLeftPads{}.Get(Number<1>{});
std::size_t HiPerTile = HoPerTile + Y - 1;
std::size_t WiPerTile = WoPerTile + X - 1;
std::size_t HTile = (Ho + HoPerTile - 1) / HoPerTile;
std::size_t WTile = (Wo + WoPerTile - 1) / WoPerTile;
Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
Tensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
Tensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
Tensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) {
for(int j = 0; j < HiPerTile; ++j)
{
int hi = HoPerTile * htile + j - h_pad_low;
for(int i = 0; i < WiPerTile; ++i)
{
int wi = WoPerTile * wtile + i - w_pad_low;
if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in_nchw.mDesc.GetLengths()[3])
{
in_hold(n, c, htile, wtile, j, i) = in_nchw(n, c, hi, wi);
}
else
{
in_hold(n, c, htile, wtile, j, i) = TIn(0);
}
}
}
};
auto f_in_transform = [&](auto n, auto c, auto htile, auto wtile) {
in_transform(n, c, htile, wtile, 0, 0) =
in_hold(n, c, htile, wtile, 0, 0) - in_hold(n, c, htile, wtile, 0, 2) -
in_hold(n, c, htile, wtile, 2, 0) + in_hold(n, c, htile, wtile, 2, 2);
in_transform(n, c, htile, wtile, 0, 1) =
in_hold(n, c, htile, wtile, 0, 1) + in_hold(n, c, htile, wtile, 0, 2) -
in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 2);
in_transform(n, c, htile, wtile, 0, 2) =
-in_hold(n, c, htile, wtile, 0, 1) + in_hold(n, c, htile, wtile, 0, 2) +
in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 2);
in_transform(n, c, htile, wtile, 0, 3) =
in_hold(n, c, htile, wtile, 0, 1) - in_hold(n, c, htile, wtile, 0, 3) -
in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 3);
in_transform(n, c, htile, wtile, 1, 0) =
in_hold(n, c, htile, wtile, 1, 0) - in_hold(n, c, htile, wtile, 1, 2) +
in_hold(n, c, htile, wtile, 2, 0) - in_hold(n, c, htile, wtile, 2, 2);
in_transform(n, c, htile, wtile, 1, 1) =
in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) +
in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
in_transform(n, c, htile, wtile, 1, 2) =
-in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) -
in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
in_transform(n, c, htile, wtile, 1, 3) =
in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 3) +
in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 3);
in_transform(n, c, htile, wtile, 2, 0) =
-in_hold(n, c, htile, wtile, 1, 0) + in_hold(n, c, htile, wtile, 1, 2) +
in_hold(n, c, htile, wtile, 2, 0) - in_hold(n, c, htile, wtile, 2, 2);
in_transform(n, c, htile, wtile, 2, 1) =
-in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 2) +
in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
in_transform(n, c, htile, wtile, 2, 2) =
in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 2) -
in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
in_transform(n, c, htile, wtile, 2, 3) =
-in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 3) +
in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 3);
in_transform(n, c, htile, wtile, 3, 0) =
in_hold(n, c, htile, wtile, 1, 0) - in_hold(n, c, htile, wtile, 1, 2) -
in_hold(n, c, htile, wtile, 3, 0) + in_hold(n, c, htile, wtile, 3, 2);
in_transform(n, c, htile, wtile, 3, 1) =
in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) -
in_hold(n, c, htile, wtile, 3, 1) - in_hold(n, c, htile, wtile, 3, 2);
in_transform(n, c, htile, wtile, 3, 2) =
-in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) +
in_hold(n, c, htile, wtile, 3, 1) - in_hold(n, c, htile, wtile, 3, 2);
in_transform(n, c, htile, wtile, 3, 3) =
in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 3) -
in_hold(n, c, htile, wtile, 3, 1) + in_hold(n, c, htile, wtile, 3, 3);
};
auto f_wei_transform = [&](auto k, auto c) {
wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0));
wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
0.5 * double(wei_kcyx(k, c, 0, 1)) +
0.5 * double(wei_kcyx(k, c, 0, 2));
wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
0.5 * double(wei_kcyx(k, c, 0, 1)) +
0.5 * double(wei_kcyx(k, c, 0, 2));
wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2));
wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
0.5 * double(wei_kcyx(k, c, 1, 0)) +
0.5 * double(wei_kcyx(k, c, 2, 0));
wei_transform(k, c, 1, 1) =
0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) +
0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
0.25 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 1, 2) =
0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) -
0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
0.25 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) +
0.5 * double(wei_kcyx(k, c, 1, 2)) +
0.5 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
0.5 * double(wei_kcyx(k, c, 1, 0)) +
0.5 * double(wei_kcyx(k, c, 2, 0));
wei_transform(k, c, 2, 1) =
0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) -
0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
0.25 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 2, 2) =
0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) +
0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
0.25 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) -
0.5 * double(wei_kcyx(k, c, 1, 2)) +
0.5 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0));
wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) +
0.5 * double(wei_kcyx(k, c, 2, 1)) +
0.5 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) -
0.5 * double(wei_kcyx(k, c, 2, 1)) +
0.5 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2));
};
auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) {
for(int j = 0; j < HiPerTile; ++j)
{
for(int i = 0; i < WiPerTile; ++i)
{
double v = 0;
for(int c = 0; c < C; ++c)
{
v += in_transform(n, c, htile, wtile, j, i) * wei_transform(k, c, j, i);
}
out_transform(n, k, htile, wtile, j, i) = v;
}
}
};
auto f_out_hold = [&](auto n, auto k, auto htile, auto wtile) {
out_hold(n, k, htile, wtile, 0, 0) =
out_transform(n, k, htile, wtile, 0, 0) + out_transform(n, k, htile, wtile, 0, 1) +
out_transform(n, k, htile, wtile, 0, 2) + out_transform(n, k, htile, wtile, 1, 0) +
out_transform(n, k, htile, wtile, 1, 1) + out_transform(n, k, htile, wtile, 1, 2) +
out_transform(n, k, htile, wtile, 2, 0) + out_transform(n, k, htile, wtile, 2, 1) +
out_transform(n, k, htile, wtile, 2, 2);
out_hold(n, k, htile, wtile, 0, 1) =
out_transform(n, k, htile, wtile, 0, 1) - out_transform(n, k, htile, wtile, 0, 2) -
out_transform(n, k, htile, wtile, 0, 3) + out_transform(n, k, htile, wtile, 1, 1) -
out_transform(n, k, htile, wtile, 1, 2) - out_transform(n, k, htile, wtile, 1, 3) +
out_transform(n, k, htile, wtile, 2, 1) - out_transform(n, k, htile, wtile, 2, 2) -
out_transform(n, k, htile, wtile, 2, 3);
out_hold(n, k, htile, wtile, 1, 0) =
out_transform(n, k, htile, wtile, 1, 0) + out_transform(n, k, htile, wtile, 1, 1) +
out_transform(n, k, htile, wtile, 1, 2) - out_transform(n, k, htile, wtile, 2, 0) -
out_transform(n, k, htile, wtile, 2, 1) - out_transform(n, k, htile, wtile, 2, 2) -
out_transform(n, k, htile, wtile, 3, 0) - out_transform(n, k, htile, wtile, 3, 1) -
out_transform(n, k, htile, wtile, 3, 2);
out_hold(n, k, htile, wtile, 1, 1) =
out_transform(n, k, htile, wtile, 1, 1) - out_transform(n, k, htile, wtile, 1, 2) -
out_transform(n, k, htile, wtile, 1, 3) - out_transform(n, k, htile, wtile, 2, 1) +
out_transform(n, k, htile, wtile, 2, 2) + out_transform(n, k, htile, wtile, 2, 3) -
out_transform(n, k, htile, wtile, 3, 1) + out_transform(n, k, htile, wtile, 3, 2) +
out_transform(n, k, htile, wtile, 3, 3);
};
auto f_out = [&](auto n, auto k, auto htile, auto wtile) {
for(int j = 0; j < HoPerTile; ++j)
{
std::size_t ho = HoPerTile * htile + j;
for(int i = 0; i < WoPerTile; ++i)
{
std::size_t wo = WoPerTile * wtile + i;
out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
}
}
};
std::size_t num_thread = std::thread::hardware_concurrency();
make_ParallelTensorFunctor(f_in_hold, N, C, HTile, WTile)(num_thread);
make_ParallelTensorFunctor(f_in_transform, N, C, HTile, WTile)(num_thread);
make_ParallelTensorFunctor(f_wei_transform, K, C)(num_thread);
make_ParallelTensorFunctor(f_out_transform, N, K, HTile, WTile)(num_thread);
make_ParallelTensorFunctor(f_out_hold, N, K, HTile, WTile)(num_thread);
make_ParallelTensorFunctor(f_out, N, K, HTile, WTile)(num_thread);
}
#pragma once
#include "host_tensor.hpp"
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_backward_data(Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& /* in_right_pads */,
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
std::size_t K = wei.mDesc.GetLengths()[I0];
std::size_t Y = wei.mDesc.GetLengths()[I2];
std::size_t X = wei.mDesc.GetLengths()[I3];
std::size_t Ho = out.mDesc.GetLengths()[I2];
std::size_t Wo = out.mDesc.GetLengths()[I3];
double v = 0;
for(int y = 0; y < Y; ++y)
{
int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
if(h_tmp % conv_strides[I0] == 0)
{
int ho = h_tmp / conv_strides[I0];
if(ho >= 0 && ho < Ho)
{
for(int x = 0; x < X; ++x)
{
int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
if(w_tmp % conv_strides[I1] == 0)
{
int wo = w_tmp / conv_strides[I1];
if(wo >= 0 && wo < Wo)
{
for(int k = 0; k < K; ++k)
{
v += out(n, k, ho, wo) * wei(k, c, y, x);
}
}
}
}
}
}
}
in(n, c, hi, wi) = v;
};
auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
std::size_t K = wei.mDesc.GetLengths()[I0];
std::size_t Y = wei.mDesc.GetLengths()[I1];
std::size_t X = wei.mDesc.GetLengths()[I2];
std::size_t Ho = out.mDesc.GetLengths()[I1];
std::size_t Wo = out.mDesc.GetLengths()[I2];
double v = 0;
for(int y = 0; y < Y; ++y)
{
int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
if(h_tmp % conv_strides[I0] == 0)
{
int ho = h_tmp / conv_strides[I0];
if(ho >= 0 && ho < Ho)
{
for(int x = 0; x < X; ++x)
{
int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
if(w_tmp % conv_strides[I1] == 0)
{
int wo = w_tmp / conv_strides[I1];
if(wo >= 0 && wo < Wo)
{
for(int k = 0; k < K; ++k)
{
v += out(n, ho, wo, k) * wei(k, y, x, c);
}
}
}
}
}
}
}
in(n, hi, wi, c) = v;
};
if(layout == ConvTensorLayout::NCHW)
{
make_ParallelTensorFunctor(f_nchw,
in.mDesc.GetLengths()[0],
in.mDesc.GetLengths()[1],
in.mDesc.GetLengths()[2],
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else if(layout == ConvTensorLayout::NHWC)
{
make_ParallelTensorFunctor(f_nhwc,
in.mDesc.GetLengths()[0],
in.mDesc.GetLengths()[1],
in.mDesc.GetLengths()[2],
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("wrong! not supported layout");
}
}
#pragma once
#include "host_tensor.hpp"
template <typename TOut,
typename TIn,
typename TWei,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_backward_weights(
const Tensor<TOut>& out,
const Tensor<TIn>& in,
Tensor<TWei>& wei,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
double v = 0;
for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
{
for(int ho = 0; ho < out.mDesc.GetLengths()[2]; ++ho)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int wo = 0; wo < out.mDesc.GetLengths()[3]; ++wo)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
v += static_cast<const double>(in(n, c, hi, wi)) *
static_cast<const double>(out(n, k, ho, wo));
}
}
}
}
wei(k, c, y, x) = v;
};
auto f_kyxc = [&](auto k, auto y, auto x, auto c) {
double v = 0;
for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
{
for(int ho = 0; ho < out.mDesc.GetLengths()[1]; ++ho)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int wo = 0; wo < out.mDesc.GetLengths()[2]; ++wo)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
wi < in.mDesc.GetLengths()[2])
{
v += static_cast<const double>(in(n, hi, wi, c)) *
static_cast<const double>(out(n, ho, wo, k));
}
}
}
}
wei(k, y, x, c) = v;
};
if(layout == ConvTensorLayout::NCHW)
{
make_ParallelTensorFunctor(f_kcyx,
wei.mDesc.GetLengths()[0],
wei.mDesc.GetLengths()[1],
wei.mDesc.GetLengths()[2],
wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else if(layout == ConvTensorLayout::NHWC)
{
make_ParallelTensorFunctor(f_kyxc,
wei.mDesc.GetLengths()[0],
wei.mDesc.GetLengths()[1],
wei.mDesc.GetLengths()[2],
wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("wrong! not supported layout");
}
}
@@ -157,3 +157,26 @@ void host_gemm(const Tensor<AType>& a,
throw std::runtime_error("wrong! not supported layout");
}
}
template <typename AType, typename BType, typename CType>
void host_gemm_mk_kn_mn(const Tensor<AType>& a_m_k,
const Tensor<BType>& b_k_n,
Tensor<CType>& c_m_n)
{
auto f_mk_kn_mn = [&](auto m, auto n) {
const int K = a_m_k.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a_m_k(m, k)) * static_cast<const double>(b_k_n(k, n));
}
c_m_n(m, n) = v;
};
make_ParallelTensorFunctor(f_mk_kn_mn,
c_m_n.mDesc.GetLengths()[0],
c_m_n.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
}
@@ -120,6 +120,8 @@ struct HostTensorDescriptor
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc);
private:
std::vector<std::size_t> mLens;
std::vector<std::size_t> mStrides;
@@ -224,7 +226,7 @@ struct Tensor
Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
template <typename G>
void GenerateTensorValue(G g, std::size_t num_thread = 1)
void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency())
{
switch(mDesc.GetNumOfDimension())
{
@@ -34,6 +34,21 @@ const std::vector<std::size_t>& HostTensorDescriptor::GetLengths() const { retur
const std::vector<std::size_t>& HostTensorDescriptor::GetStrides() const { return mStrides; }
std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc)
{
os << "dim " << desc.GetNumOfDimension() << ", ";
os << "lengths {";
LogRange(os, desc.GetLengths(), ", ");
os << "}, ";
os << "strides {";
LogRange(os, desc.GetStrides(), ", ");
os << "}";
return os;
}
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os)
{
os << "dim " << desc.GetNumOfDimension() << ", ";
include_directories(BEFORE
include
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/device/include
${PROJECT_SOURCE_DIR}/device_operation/include
${PROJECT_SOURCE_DIR}/profiler/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/external/rocm/include
)
# device_gemm_instance
set(DEVICE_GEMM_INSTANCE_SOURCE
${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn.cpp;
${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn.cpp;
${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f32_f32_f32_km_kn_mn.cpp;
${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f32_f32_f32_km_nk_mn.cpp;
${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f16_f16_f16_mk_kn_mn.cpp;
${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f16_f16_f16_mk_nk_mn.cpp;
${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f16_f16_f16_km_kn_mn.cpp;
${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f16_f16_f16_km_nk_mn.cpp;
)
add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE})
target_include_directories(device_gemm_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
target_compile_features(device_gemm_instance PUBLIC)
set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_gemm_instance LIBRARY DESTINATION lib)
# device_conv_instance
set(DEVICE_CONV_INSTANCE_SOURCE
${PROJECT_SOURCE_DIR}/device_operation/device_conv_xdl_instance_f32_f32_f32_nhwc_kyxc_nhwk.cpp;
${PROJECT_SOURCE_DIR}/device_operation/device_conv_xdl_instance_f16_f16_f16_nhwc_kyxc_nhwk.cpp;
)
add_library(device_conv_instance SHARED ${DEVICE_CONV_INSTANCE_SOURCE})
target_include_directories(device_conv_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
target_compile_features(device_conv_instance PUBLIC)
set_target_properties(device_conv_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_conv_instance LIBRARY DESTINATION lib)
# ck_profiler
set(PROFILER_SOURCE profiler.cpp gemm_profiler.cpp conv_profiler.cpp)
add_executable(ckProfiler ${PROFILER_SOURCE})
target_link_libraries(ckProfiler PRIVATE host_tensor)
target_link_libraries(ckProfiler PRIVATE device_gemm_instance device_conv_instance)
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build `ckProfiler`
```bash
mkdir build && cd build
```
```bash
# Need to specify the target ID; the example below uses gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j ckProfiler
```
## Profile GEMM kernels
```bash
#arg1: tensor operation (gemm=GEMM)
#arg2: data type (0=fp32, 1=fp16)
#arg3: matrix layout (0=NN, 1=NT, 2=TN, 3=TT)
#arg4: verification (0=no, 1=yes)
#arg5: initialization (0=no init, 1=integer value, 2=decimal value)
#arg6: print matrix value (0=no, 1=yes)
#arg7: run kernel # of times (>1)
#arg8 to 13: M, N, K, StrideA, StrideB, StrideC
##################### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
./profiler/ckProfiler gemm 1 1 1 1 0 5 3840 4096 4096 4096 4096 4096
```
Result (MI100 @ 1087 MHz, 133.5 TFlops peak FP16)
```bash
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
....
Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s
```
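The reported throughput can be sanity-checked from the problem size and the measured time. Below is a minimal sketch (standalone C++) assuming the usual 2·M·N·K flop count for GEMM and one fp16 read of A and B plus one write of C; these accounting conventions are assumptions for illustration, not something read from the ckProfiler source.

```cpp
// Hypothetical cross-check of the "Best Perf" line above.
#include <cstdio>

int main()
{
    const double M = 3840, N = 4096, K = 4096;
    const double ms             = 1.1933; // measured kernel time
    const double bytes_per_elem = 2;      // fp16

    const double flop  = 2.0 * M * N * K; // one multiply-add = 2 flop
    const double bytes = bytes_per_elem * (M * K + K * N + M * N);

    std::printf("TFlops: %.3f\n", flop / (ms * 1e-3) / 1e12);  // ~107.98, matches the output above
    std::printf("GB/s:   %.3f\n", bytes / (ms * 1e-3) / 1e9);  // ~80.8; the profiler reports 79.08,
                                                               // so its byte accounting differs slightly
    return 0;
}
```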
## Profile forward convolution kernels
```bash
#arg1: tensor operation (conv=Convolution)
#arg2: data type (0=fp32, 1=fp16)
#arg3: input tensor layout (0=NCHW, 1=NHWC)
#arg4: weight tensor layout (0=KCYX, 1=KYXC)
#arg5: output tensor layout (0=NKHW, 1=NHWK)
#arg6: verification (0=no, 1=yes)
#arg7: initialization (0=no init, 1=integer value, 2=decimal value)
#arg8: print matrix value (0=no, 1=yes)
#arg9: run kernel # of times (>1)
#arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
##################### op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
./profiler/ckProfiler conv 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
```
Result (MI100 @ 1087 MHz, 133.5 TFlops peak FP16)
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
....
Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s
```
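The convolution result can be cross-checked the same way: Ho = Wo = (71 + 1 + 1 - 3)/2 + 1 = 36, and the standard direct-convolution estimate of 2·N·K·C·Y·X·Ho·Wo flops over the measured 1.42509 ms reproduces the reported TFlops. A small sketch under those assumptions (again not derived from the ckProfiler source):

```cpp
// Hypothetical cross-check of the convolution "Best Perf" line above.
#include <cstdio>

int main()
{
    const double N = 128, K = 256, C = 192, Y = 3, X = 3;
    const double Ho = 36, Wo = 36;   // (71 + 1 + 1 - 3) / 2 + 1
    const double ms = 1.42509;       // measured kernel time

    const double flop = 2.0 * N * K * C * Y * X * Ho * Wo;
    std::printf("TFlops: %.3f\n", flop / (ms * 1e-3) / 1e12); // ~102.99, matches the output above
    return 0;
}
```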
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_conv.hpp"
enum ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
};
enum ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
int conv_profiler(int argc, char* argv[])
{
if(argc != 25)
{
printf("arg1: tensor operation (conv=Convolution)\n");
printf("arg2: data type (0=fp32, 1=fp16)\n");
printf("arg3: input tensor layout (0=NCHW, 1=NHWC)\n");
printf("arg4: weight tensor layout (0=KCYX, 1=KYXC)\n");
printf("arg5: output tensor layout (0=NKHW, 1=NHWK)\n");
printf("arg6: verification (0=no, 1=yes)\n");
printf("arg7: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg8: print matrix value (0=no, 1=yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
const int data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const int in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const int wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const int out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
const ck::index_t C = std::stoi(argv[12]);
const ck::index_t Y = std::stoi(argv[13]);
const ck::index_t X = std::stoi(argv[14]);
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]);
const ck::index_t conv_stride_h = std::stoi(argv[17]);
const ck::index_t conv_stride_w = std::stoi(argv[18]);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv<2,
float,
float,
float,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv<2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else
{
throw std::runtime_error("wrong! this Conv data_type & layout is not implemented");
}
return 1;
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
int gemm_profiler(int, char*[]);
int conv_profiler(int, char*[]);
int main(int argc, char* argv[])
{
if(strcmp(argv[1], "gemm") == 0)
{
return gemm_profiler(argc, argv);
}
else if(strcmp(argv[1], "conv") == 0)
{
return conv_profiler(argc, argv);
}
else
{
printf("arg1: tensor operation (gemm=GEMM, conv=Convolution)\n");
return 0;
}
}
@@ -8,11 +8,11 @@ MY_PROJECT_INSTALL=../install.dir
cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D HALF_INCLUDE_DIR="/root/workspace/external/half/include" \
-D BUILD_DEV=ON \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 -O3 --amdgpu-target=gfx908 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
${MY_PROJECT_SOURCE}
#!/bin/bash
## GPU visibility
export HIP_VISIBLE_DEVICES=1
make -j gemm_xdl
DRIVER="./example/gemm_xdl"
VERIFY=$1
INIT=$2
LOG=$3
REPEAT=$4
######### verify init log repeat M___ N___ K___ StrideA StrideB StrideC
#$DRIVER $VERIFY $INIT $LOG $REPEAT 960 1024 1024 1024 1024 1024
#$DRIVER $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
#$DRIVER $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 2048 2048 2048
$DRIVER $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 4096 4096 4096
#$DRIVER $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 8192 8192 8192
#!/bin/bash
## GPU visibility
export HIP_VISIBLE_DEVICES=0
make -j gemm_driver_offline
DRIVER="./host/driver_offline/gemm_driver_offline"
LAYOUT=$1
ALGO=$2
VERIFY=$3
INIT=$4
LOG=$5
REPEAT=$6
M01=$7
N01=$8
######### layout algo verify init log repeat M___ N___ K___ M01_ N01_
#$DRIVER $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 960 1024 1024 $M01 $N01
#$DRIVER $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 $M01 $N01
#$DRIVER $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 $M01 $N01
$DRIVER $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 $M01 $N01
#$DRIVER $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 $M01 $N01