gaoqiong / composable_kernel · Commit 20fa988f
authored Feb 23, 2021 by Chao Liu

add fwd-v4r4-nhwc, change vector_type

parent 1c62b47b
Showing 6 changed files with 1189 additions and 159 deletions
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp  +683 -0
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp  +45 -0
composable_kernel/include/utility/float_type.amd.hpp.in  +188 -158
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  +3 -0
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp  +256 -0
driver/src/conv_driver.cpp  +14 -1
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp  (new file, mode 100644)
#ifndef CK_DRIVER_DYNAMIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V4R4_NHWC_KYXC_NHWK_HPP
#define CK_DRIVER_DYNAMIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V4R4_NHWC_KYXC_NHWK_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "gridwise_dynamic_gemm.hpp"
#include "gridwise_operation_wrapper.hpp"

namespace ck {

// GemmM = K
// GemmN = N * Ho * Wo
// GemmK = Y * X * C
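//
// The descriptor transforms in Run() realize the implicit-GEMM view of NHWC
// forward convolution,
//
//   out(n, ho, wo, k) = sum_{y, x, c} wei(k, y, x, c)
//                       * in(n, ho*StrideH + y*DilationH - LeftPadH,
//                               wo*StrideW + x*DilationW - LeftPadW, c),
//
// without materializing an im2col buffer: the pad/embed/merge transforms
// below compute the input coordinates on the fly.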
template <index_t BlockSize,
          typename Float,
          typename AccFloat,
          index_t GemmMPerBlock,
          index_t GemmNPerBlock,
          index_t GemmKPerBlock,
          index_t GemmMPerThread,
          index_t GemmNPerThread,
          index_t GemmKPerThread,
          index_t GemmMLevel0Cluster,
          index_t GemmNLevel0Cluster,
          index_t GemmMLevel1Cluster,
          index_t GemmNLevel1Cluster,
          typename GemmABlockTransferThreadSliceLengths_GemmK_GemmM,
          typename GemmABlockTransferThreadClusterLengths_GemmK_GemmM,
          index_t GemmABlockTransferSrcScalarPerVector_GemmK,
          index_t GemmABlockTransferDstScalarPerVector_GemmM,
          typename GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
          typename GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
          index_t GemmBBlockTransferSrcScalarPerVector_GemmK,
          index_t GemmBBlockTransferDstScalarPerVector_GemmN,
          index_t GemmCThreadTransferDstScalarPerVector_GemmM1>
struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
{
    template <typename... Wei,
              typename... In,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
              typename InLeftPads,
              typename InRightPads>
    __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_y_x_c_global_desc,
                      const DynamicTensorDescriptor<In...>& in_n_hi_wi_c_global_desc,
                      const DynamicTensorDescriptor<Out...>& out_n_ho_wo_k_global_desc,
                      const ConvStrides& conv_strides,
                      const ConvDilations& conv_dilations,
                      const InLeftPads& in_left_pads,
                      const InRightPads& in_right_pads,
                      const Float* __restrict__ p_wei_global,
                      const Float* __restrict__ p_in_global,
                      Float* __restrict__ p_out_global) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        const auto N = in_n_hi_wi_c_global_desc.GetLength(I0);
        const auto C = in_n_hi_wi_c_global_desc.GetLength(I3);
        const auto K = out_n_ho_wo_k_global_desc.GetLength(I3);

        const auto Hi = in_n_hi_wi_c_global_desc.GetLength(I1);
        const auto Wi = in_n_hi_wi_c_global_desc.GetLength(I2);

        const auto Ho = out_n_ho_wo_k_global_desc.GetLength(I1);
        const auto Wo = out_n_ho_wo_k_global_desc.GetLength(I2);

        const auto Y = wei_k_y_x_c_global_desc.GetLength(I1);
        const auto X = wei_k_y_x_c_global_desc.GetLength(I2);

        const auto ConvStrideH = conv_strides[I0];
        const auto ConvStrideW = conv_strides[I1];

        const auto ConvDilationH = conv_dilations[I0];
        const auto ConvDilationW = conv_dilations[I1];

        const auto InLeftPadH = in_left_pads[I0];
        const auto InLeftPadW = in_left_pads[I1];

        const auto InRightPadH = in_right_pads[I0];
        const auto InRightPadW = in_right_pads[I1];

        // weight tensor
        const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor(
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, Y * X * C)),
            make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}));
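
        // With the KYXC weight layout, C is innermost in memory, so GemmK
        // (= Y*X*C) is the contiguous dimension; this is what allows
        // GemmABlockTransferSrcScalarPerVector_GemmK-wide vector loads of A.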
        // input tensor
        const auto in_n_hip_wip_c_global_desc = transform_dynamic_tensor_descriptor(
            in_n_hi_wi_c_global_desc,
            make_tuple(make_pass_through_transform(N),
                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
                       make_pad_transform(Wi, InLeftPadW, InRightPadW),
                       make_pass_through_transform(C)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        const auto in_n_y_ho_x_wo_c_global_desc = transform_dynamic_tensor_descriptor(
            in_n_hip_wip_c_global_desc,
            make_tuple(
                make_pass_through_transform(N),
                make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
                make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
                make_pass_through_transform(C)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));

        const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
            in_n_y_ho_x_wo_c_global_desc,
            make_tuple(make_merge_transform(make_tuple(Y, X, C)),
                       make_merge_transform(make_tuple(N, Ho, Wo))),
            make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        // output tensor
        const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor(
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)),
            make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}));

        const auto GemmM = out_gemmm_gemmn_global_desc.GetLength(I0);
        const auto GemmN = out_gemmm_gemmn_global_desc.GetLength(I1);
        const auto GemmK = wei_gemmk_gemmm_global_desc.GetLength(I0);

        if(!(GemmM % GemmMPerBlock == 0 && GemmN % GemmNPerBlock == 0 &&
             GemmK % GemmKPerBlock == 0))
        {
            throw std::runtime_error("wrong! GEMM size not divisible");
        }

        constexpr auto GemmM1 = Number<GemmMPerThread * GemmMLevel0Cluster * GemmMLevel1Cluster>{};
        constexpr auto GemmN1 = Number<GemmNPerThread * GemmNLevel0Cluster * GemmNLevel1Cluster>{};

        const auto GemmM0 = GemmM / GemmM1;
        const auto GemmN0 = GemmN / GemmN1;

        const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
            transform_dynamic_tensor_descriptor(
                out_gemmm_gemmn_global_desc,
                make_tuple(make_unmerge_transform(make_tuple(GemmM0, GemmM1)),
                           make_unmerge_transform(make_tuple(GemmN0, GemmN1))),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));

        // hack to control index calculation when iterating over a_k_m_global tensor
        constexpr auto a_k_m_global_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));

        constexpr auto a_k_m_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};

        // hack to control index calculation when iterating over b_k_n_global tensor
        constexpr auto b_k_n_global_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{}),
                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{}));

        constexpr auto b_k_n_global_move_slice_window_iterator_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{};

        // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
        // hack for NKHW format
        constexpr auto c_m0_m1_n0_n1_global_tensor_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 1, 0, 0>{},
                                  Sequence<0, 0, 1, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 2, 0, 0>{},
                                  Sequence<0, 0, 2, 0, 0>{}));

        // GEMM
        using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v1<
            BlockSize,
            Float,
            AccFloat,
            InMemoryDataOperation::Set,
            decltype(wei_gemmk_gemmm_global_desc),
            decltype(in_gemmk_gemmn_global_desc),
            decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
            GemmMPerBlock,
            GemmNPerBlock,
            GemmKPerBlock,
            GemmMPerThread,
            GemmNPerThread,
            GemmKPerThread,
            GemmMLevel0Cluster,
            GemmNLevel0Cluster,
            GemmMLevel1Cluster,
            GemmNLevel1Cluster,
            GemmABlockTransferThreadSliceLengths_GemmK_GemmM,
            GemmABlockTransferThreadClusterLengths_GemmK_GemmM,
            Sequence<1, 0>,
            Sequence<1, 0>,
            0,
            GemmABlockTransferSrcScalarPerVector_GemmK,
            GemmABlockTransferDstScalarPerVector_GemmM,
            false, // don't move back src coordinate after threadwise copy
            GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
            GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
            Sequence<1, 0>,
            Sequence<1, 0>,
            0,
            GemmBBlockTransferSrcScalarPerVector_GemmK,
            GemmBBlockTransferDstScalarPerVector_GemmN,
            false, // don't move back src coordinate after threadwise copy, which will be fused
                   // with MoveSrcSliceWindow() to save addr computation
            Sequence<2, 3, 0, 1>,
            1,
            GemmCThreadTransferDstScalarPerVector_GemmM1,
            decltype(a_k_m_global_iterator_hacks),
            decltype(b_k_n_global_iterator_hacks),
            decltype(c_m0_m1_n0_n1_global_tensor_iterator_hacks),
            decltype(a_k_m_global_move_slice_window_iterator_hack),
            decltype(b_k_n_global_move_slice_window_iterator_hack)>;

        const auto GridSize = (GemmM / GemmMPerBlock) * (GemmN / GemmNPerBlock);

        const bool has_main_k_block_loop = (GemmK + GemmKPerBlock) / (2 * GemmKPerBlock) > 1;

        const bool has_double_tail_k_block_loop = (GemmK / GemmKPerBlock) % 2 == 0;
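
        // The four kernel instantiations below appear to correspond to a
        // double-buffered K loop that consumes two GemmKPerBlock slices per
        // iteration: has_main_k_block_loop is true when at least one full
        // double-slice iteration exists, and has_double_tail_k_block_loop is
        // true when the number of K slices is even (two tail slices remain).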
#if 1 // pass tensor descriptors by their reference
        index_t nrepeat = 100;

        for(index_t i = 0; i < 5; ++i)
        {
            std::cout << "Start running " << nrepeat << " times..." << std::endl;

            KernelTimer timer;
            timer.Start();

            for(index_t j = 0; j < nrepeat; ++j)
            {
                if(has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<
                        gridwise_gemm,
                        decltype(wei_gemmk_gemmm_global_desc),
                        const Float*,
                        decltype(in_gemmk_gemmn_global_desc),
                        const Float*,
                        decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
                        Float*,
                        integral_constant<bool, true>,
                        integral_constant<bool, true>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  wei_gemmk_gemmm_global_desc,
                                  p_wei_global,
                                  in_gemmk_gemmn_global_desc,
                                  p_in_global,
                                  out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
                                  p_out_global,
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, true>{});
                }
                else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<
                        gridwise_gemm,
                        decltype(wei_gemmk_gemmm_global_desc),
                        const Float*,
                        decltype(in_gemmk_gemmn_global_desc),
                        const Float*,
                        decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
                        Float*,
                        integral_constant<bool, true>,
                        integral_constant<bool, false>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  wei_gemmk_gemmm_global_desc,
                                  p_wei_global,
                                  in_gemmk_gemmn_global_desc,
                                  p_in_global,
                                  out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
                                  p_out_global,
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, false>{});
                }
                else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<
                        gridwise_gemm,
                        decltype(wei_gemmk_gemmm_global_desc),
                        const Float*,
                        decltype(in_gemmk_gemmn_global_desc),
                        const Float*,
                        decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
                        Float*,
                        integral_constant<bool, false>,
                        integral_constant<bool, true>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  wei_gemmk_gemmm_global_desc,
                                  p_wei_global,
                                  in_gemmk_gemmn_global_desc,
                                  p_in_global,
                                  out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
                                  p_out_global,
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, true>{});
                }
                else
                {
                    const auto kernel = run_gridwise_operation<
                        gridwise_gemm,
                        decltype(wei_gemmk_gemmm_global_desc),
                        const Float*,
                        decltype(in_gemmk_gemmn_global_desc),
                        const Float*,
                        decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
                        Float*,
                        integral_constant<bool, false>,
                        integral_constant<bool, false>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  wei_gemmk_gemmm_global_desc,
                                  p_wei_global,
                                  in_gemmk_gemmn_global_desc,
                                  p_in_global,
                                  out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
                                  p_out_global,
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, false>{});
                }
            }

            timer.End();

            float ave_time = timer.GetElapsedTime() / nrepeat;
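
            // 2 flops per MAC (multiply + add) gives 2 * N * K * Ho * Wo * C * Y * X
            // flops in total; dividing by 1e9 and by ave_time in milliseconds
            // yields TFlop/s.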
            float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
#elif 1 // pass tensor descriptors by their pointers
        using ADesc = decltype(wei_gemmk_gemmm_global_desc);
        using BDesc = decltype(in_gemmk_gemmn_global_desc);
        using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);

        DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
        DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
        DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));

        wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
        in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
        out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
            &out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);

        index_t nrepeat = 100;

        for(index_t i = 0; i < 5; ++i)
        {
            std::cout << "Start running " << nrepeat << " times..." << std::endl;

            KernelTimer timer;
            timer.Start();

            for(index_t j = 0; j < nrepeat; ++j)
            {
                if(has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<
                        gridwise_gemm,
                        decltype(wei_gemmk_gemmm_global_desc)*,
                        const Float*,
                        decltype(in_gemmk_gemmn_global_desc)*,
                        const Float*,
                        decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
                        Float*,
                        integral_constant<bool, true>,
                        integral_constant<bool, true>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  reinterpret_cast<const ADesc*>(
                                      wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
                                  p_wei_global,
                                  reinterpret_cast<const BDesc*>(
                                      in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
                                  p_in_global,
                                  reinterpret_cast<const CDesc*>(
                                      out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
                                          .GetDeviceBuffer()),
                                  p_out_global,
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, true>{});
                }
                else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<
                        gridwise_gemm,
                        decltype(wei_gemmk_gemmm_global_desc)*,
                        const Float*,
                        decltype(in_gemmk_gemmn_global_desc)*,
                        const Float*,
                        decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
                        Float*,
                        integral_constant<bool, true>,
                        integral_constant<bool, false>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  reinterpret_cast<const ADesc*>(
                                      wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
                                  p_wei_global,
                                  reinterpret_cast<const BDesc*>(
                                      in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
                                  p_in_global,
                                  reinterpret_cast<const CDesc*>(
                                      out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
                                          .GetDeviceBuffer()),
                                  p_out_global,
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, false>{});
                }
                else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<
                        gridwise_gemm,
                        decltype(wei_gemmk_gemmm_global_desc)*,
                        const Float*,
                        decltype(in_gemmk_gemmn_global_desc)*,
                        const Float*,
                        decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
                        Float*,
                        integral_constant<bool, false>,
                        integral_constant<bool, true>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  reinterpret_cast<const ADesc*>(
                                      wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
                                  p_wei_global,
                                  reinterpret_cast<const BDesc*>(
                                      in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
                                  p_in_global,
                                  reinterpret_cast<const CDesc*>(
                                      out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
                                          .GetDeviceBuffer()),
                                  p_out_global,
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, true>{});
                }
                else
                {
                    const auto kernel = run_gridwise_operation<
                        gridwise_gemm,
                        decltype(wei_gemmk_gemmm_global_desc)*,
                        const Float*,
                        decltype(in_gemmk_gemmn_global_desc)*,
                        const Float*,
                        decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
                        Float*,
                        integral_constant<bool, false>,
                        integral_constant<bool, false>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  reinterpret_cast<const ADesc*>(
                                      wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
                                  p_wei_global,
                                  reinterpret_cast<const BDesc*>(
                                      in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
                                  p_in_global,
                                  reinterpret_cast<const CDesc*>(
                                      out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
                                          .GetDeviceBuffer()),
                                  p_out_global,
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, false>{});
                }
            }

            timer.End();

            float ave_time = timer.GetElapsedTime() / nrepeat;

            float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
#elif 1 // pass tensor descriptor by void*
        using ADesc = decltype(wei_gemmk_gemmm_global_desc);
        using BDesc = decltype(in_gemmk_gemmn_global_desc);
        using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);

        DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
        DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
        DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));

        wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
        in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
        out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
            &out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);

        index_t nrepeat = 100;

        for(index_t i = 0; i < 5; ++i)
        {
            std::cout << "Start running " << nrepeat << " times..." << std::endl;

            KernelTimer timer;
            timer.Start();

            for(index_t j = 0; j < nrepeat; ++j)
            {
                if(has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const void*,
                                                               const Float*,
                                                               const void*,
                                                               const Float*,
                                                               const void*,
                                                               Float*,
                                                               integral_constant<bool, true>,
                                                               integral_constant<bool, true>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
                                  p_wei_global,
                                  in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
                                  p_in_global,
                                  out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
                                      .GetDeviceBuffer(),
                                  p_out_global,
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, true>{});
                }
                else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const void*,
                                                               const Float*,
                                                               const void*,
                                                               const Float*,
                                                               const void*,
                                                               Float*,
                                                               integral_constant<bool, true>,
                                                               integral_constant<bool, false>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
                                  p_wei_global,
                                  in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
                                  p_in_global,
                                  out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
                                      .GetDeviceBuffer(),
                                  p_out_global,
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, false>{});
                }
                else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const void*,
                                                               const Float*,
                                                               const void*,
                                                               const Float*,
                                                               const void*,
                                                               Float*,
                                                               integral_constant<bool, false>,
                                                               integral_constant<bool, true>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
                                  p_wei_global,
                                  in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
                                  p_in_global,
                                  out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
                                      .GetDeviceBuffer(),
                                  p_out_global,
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, true>{});
                }
                else
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const void*,
                                                               const Float*,
                                                               const void*,
                                                               const Float*,
                                                               const void*,
                                                               Float*,
                                                               integral_constant<bool, false>,
                                                               integral_constant<bool, false>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
                                  p_wei_global,
                                  in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
                                  p_in_global,
                                  out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
                                      .GetDeviceBuffer(),
                                  p_out_global,
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, false>{});
                }
            }

            timer.End();

            float ave_time = timer.GetElapsedTime() / nrepeat;

            float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
#endif
    }
};

} // namespace ck
#endif
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
...
...
@@ -168,6 +168,51 @@ __device__ float4_t amd_buffer_load_v2<float, 4>(const float* p_src_wave,
#endif
}
template <>
__device__ float8_t amd_buffer_load_v2<float, 8>(const float* p_src_wave,
                                                 index_t src_thread_data_offset,
                                                 bool src_thread_data_valid,
                                                 index_t src_data_range)
{
    BufferResourceConstant<float> src_wave_buffer_resource;

    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);

#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
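    // When the thread's data is invalid, the 0x7fffffff shift pushes the
    // voffset past the 32-bit range programmed into the buffer resource, so
    // the hardware range check fails and the buffer_load returns 0 -- no
    // branch around the load is needed.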
    vector_type<float, 8> vector;

    vector.Set(Number<4>{}, Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4(
        src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);

    vector.Set(Number<4>{}, Number<1>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4(
        src_wave_buffer_resource.data,
        src_addr_shift + src_thread_addr_offset + 4 * sizeof(float),
        0,
        0);

    return vector.Get(Number<8>{}, Number<0>{});
#else
    vector_type<float, 8> vector;

    vector.Set(Number<4>{}, Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4(
        src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0);

    vector.Set(Number<4>{}, Number<1>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4(
        src_wave_buffer_resource.data, src_thread_addr_offset + 4 * sizeof(float), 0, 0);

    return src_thread_data_valid ? vector.Get(Number<8>{}, Number<0>{}) : float8_t(0);
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data,
                                              float* p_dst_wave,
...
...
composable_kernel/include/utility/float_type.amd.hpp.in
...
...
@@ -4,19 +4,20 @@
 namespace ck {

 // For some reason, HIP compiler need this definition to generate optimal ISA
-// float
+// fp32
 typedef float float2_t __attribute__((ext_vector_type(2)));
 typedef float float4_t __attribute__((ext_vector_type(4)));
 typedef float float8_t __attribute__((ext_vector_type(8)));
 typedef float float16_t __attribute__((ext_vector_type(16)));
 typedef float float32_t __attribute__((ext_vector_type(32)));

-// float16
+// fp16
 typedef _Float16 half_t;
 typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
 typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
 typedef _Float16 half8_t __attribute__((ext_vector_type(8)));

-// bfloat16
+// bfp16
 typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
 typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
 typedef ushort ushort8_t __attribute__((ext_vector_type(8)));
...
...
@@ -168,23 +169,17 @@ struct c_vec4_1_t
     }
 };

-template <class T, index_t N>
-struct vector_type
-{
-    typedef struct
-    {
-        T scalar[N];
-    } MemoryType;
-};
+template <typename T, index_t N>
+struct vector_type;

-template <>
-struct vector_type<float, 1>
+template <typename T>
+struct vector_type<T, 1>
 {
-    using MemoryType = float;
+    using MemoryType = T;

-    float data_;
+    T data_;

-    __host__ __device__ constexpr vector_type() : data_{0} {}
+    __host__ __device__ constexpr vector_type() : data_{T{0}} {}

     __host__ __device__ static constexpr index_t Size() { return 1; }
...
...
@@ -192,6 +187,22 @@ struct vector_type<float, 1>
     __host__ __device__ constexpr auto& Vector() { return data_; }

+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<1>, Number<I>) const
+    {
+        static_assert(I == 0, "wrong!");
+        return data_;
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<1>, Number<I>)
+    {
+        static_assert(I == 0, "wrong!");
+        return data_;
+    }
+
     template <index_t I>
     __host__ __device__ constexpr const auto& operator[](Number<I>) const
     {
...
...
@@ -209,31 +220,66 @@ struct vector_type<float, 1>
     }
 };

-template <>
-struct vector_type<float, 2>
+template <typename T>
+struct vector_type<T, 2>
 {
-    using MemoryType = float2_t;
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+
+    using MemoryType = d2_t;

     union
     {
-        float2_t vector_;
-        StaticallyIndexedArray<float, 2> scalars_;
+        d2_t d2_;
+        StaticallyIndexedArray<d1_t, 2> d1x2_;
     } data_;

-    __host__ __device__ constexpr vector_type() : data_{MemoryType{0}} {}
+    __host__ __device__ constexpr vector_type() : data_{d2_t{0}} {}

     __host__ __device__ static constexpr index_t Size() { return 2; }

-    __host__ __device__ constexpr const auto& Vector() const { return data_.vector_; }
+    __host__ __device__ constexpr const auto& Vector() const { return data_.d2_; }

-    __host__ __device__ constexpr auto& Vector() { return data_.vector_; }
+    __host__ __device__ constexpr auto& Vector() { return data_.d2_; }

+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<1>, Number<I> i) const
+    {
+        static_assert(I >= 0 && I < 2, "wrong!");
+        return data_.d1x2_[i];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<2>, Number<I>) const
+    {
+        static_assert(I == 0, "wrong!");
+        return data_.d2_;
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<1>, Number<I> i)
+    {
+        static_assert(I >= 0 && I < 2, "wrong!");
+        return data_.d1x2_(i);
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<2>, Number<I>)
+    {
+        static_assert(I == 0, "wrong!");
+        return data_.d2_;
+    }
+
     template <index_t I>
     __host__ __device__ constexpr const auto& operator[](Number<I>) const
     {
         static_assert(I >= 0 && I < 2, "wrong!");
-        return data_.scalars_[Number<I>{}];
+        return data_.d1x2_[Number<I>{}];
     }

     template <index_t I>
...
...
@@ -241,219 +287,203 @@ struct vector_type<float, 2>
     {
         static_assert(I >= 0 && I < 2, "wrong!");
-        return data_.scalars_(Number<I>{});
+        return data_.d1x2_(Number<I>{});
     }
 };

-template <>
-struct vector_type<float, 4>
-{
-    using MemoryType = float4_t;
-
-    union
-    {
-        float4_t vector_;
-        StaticallyIndexedArray<float, 4> scalars_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{MemoryType{0}} {}
-
-    __host__ __device__ static constexpr index_t Size() { return 4; }
-
-    __host__ __device__ constexpr const auto& Vector() const { return data_.vector_; }
-
-    __host__ __device__ constexpr auto& Vector() { return data_.vector_; }
-
-    template <index_t I>
-    __host__ __device__ constexpr const auto& operator[](Number<I>) const
-    {
-        static_assert(I >= 0 && I < 4, "wrong!");
-        return data_.scalars_[Number<I>{}];
-    }
-
-    template <index_t I>
-    __host__ __device__ constexpr auto& operator()(Number<I>)
-    {
-        static_assert(I >= 0 && I < 4, "wrong!");
-        return data_.scalars_(Number<I>{});
-    }
-};
-
-template <>
-struct vector_type<half_t, 1>
-{
-    using MemoryType = half_t;
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
-    {
-        static_assert(I < 1, "wrong");
-        *(reinterpret_cast<half_t*>(&v) + I) = s;
-    }
-};
-
-template <>
-struct vector_type<half_t, 2>
-{
-    using MemoryType = half2_t;
-
-    union DataType
-    {
-        MemoryType vector;
-        half_t scalar[2];
-    };
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
-    {
-        static_assert(I < 2, "wrong");
-        *(reinterpret_cast<half_t*>(&v) + I) = s;
-    }
-
-    __host__ __device__ static MemoryType Pack(half_t s0, half_t s1)
-    {
-        DataType data;
-        data.scalar[0] = s0;
-        data.scalar[1] = s1;
-        return data.vector;
-    }
-};
-
-template <>
-struct vector_type<half_t, 4>
-{
-    using MemoryType = half4_t;
-
-    union DataType
-    {
-        MemoryType vector;
-        half_t scalar[4];
-    };
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
-    {
-        static_assert(I < 4, "wrong");
-        *(reinterpret_cast<half_t*>(&v) + I) = s;
-    }
-
-    __host__ __device__ static MemoryType Pack(half_t s0, half_t s1, half_t s2, half_t s3)
-    {
-        DataType data;
-        data.scalar[0] = s0;
-        data.scalar[1] = s1;
-        data.scalar[2] = s2;
-        data.scalar[3] = s3;
-        return data.vector;
-    }
-};
-
-template <>
-struct vector_type<half_t, 8>
-{
-    using MemoryType = half8_t;
-
-    union DataType
-    {
-        MemoryType vector;
-        half_t scalar[8];
-    };
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
-    {
-        static_assert(I < 8, "wrong");
-        *(reinterpret_cast<half_t*>(&v) + I) = s;
-    }
-};
-
-template <>
-struct vector_type<ushort, 1>
-{
-    using MemoryType = ushort;
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
-    {
-        static_assert(I < 1, "wrong");
-        *(reinterpret_cast<ushort*>(&v) + I) = s;
-    }
-};
-
-template <>
-struct vector_type<ushort, 2>
-{
-    using MemoryType = ushort2_t;
-
-    union DataType
-    {
-        MemoryType vector;
-        ushort scalar[2];
-    };
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
-    {
-        static_assert(I < 2, "wrong");
-        *(reinterpret_cast<ushort*>(&v) + I) = s;
-    }
-
-    __host__ __device__ static MemoryType Pack(ushort s0, ushort s1)
-    {
-        DataType data;
-        data.scalar[0] = s0;
-        data.scalar[1] = s1;
-        return data.vector;
-    }
-};
-
-template <>
-struct vector_type<ushort, 4>
-{
-    using MemoryType = ushort4_t;
-
-    union DataType
-    {
-        MemoryType vector;
-        ushort scalar[4];
-    };
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
-    {
-        static_assert(I < 4, "wrong");
-        *(reinterpret_cast<ushort*>(&v) + I) = s;
-    }
-
-    __host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3)
-    {
-        DataType data;
-        data.scalar[0] = s0;
-        data.scalar[1] = s1;
-        data.scalar[2] = s2;
-        data.scalar[3] = s3;
-        return data.vector;
-    }
-};
-
-template <>
-struct vector_type<ushort, 8>
-{
-    using MemoryType = ushort8_t;
-
-    union DataType
-    {
-        MemoryType vector;
-        ushort scalar[8];
-    };
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
-    {
-        static_assert(I < 8, "wrong");
-        *(reinterpret_cast<ushort*>(&v) + I) = s;
-    }
-};
+template <typename T>
+struct vector_type<T, 4>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+
+    using MemoryType = d4_t;
+
+    union
+    {
+        d4_t d4_;
+        StaticallyIndexedArray<d1_t, 4> d1x4_;
+        StaticallyIndexedArray<d2_t, 2> d2x2_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{d4_t{0}} {}
+
+    __host__ __device__ static constexpr index_t Size() { return 4; }
+
+    __host__ __device__ constexpr const auto& Vector() const { return data_.d4_; }
+
+    __host__ __device__ constexpr auto& Vector() { return data_.d4_; }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<1>, Number<I> i) const
+    {
+        static_assert(I >= 0 && I < 4, "wrong!");
+        return data_.d1x4_[i];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<2>, Number<I> i) const
+    {
+        static_assert(I >= 0 && I < 2, "wrong!");
+        return data_.d2x2_[i];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<4>, Number<I>) const
+    {
+        static_assert(I == 0, "wrong!");
+        return data_.d4_;
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<1>, Number<I> i)
+    {
+        static_assert(I >= 0 && I < 4, "wrong!");
+        return data_.d1x4_(i);
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<2>, Number<I> i)
+    {
+        static_assert(I >= 0 && I < 2, "wrong!");
+        return data_.d2x2_(i);
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<4>, Number<I>)
+    {
+        static_assert(I == 0, "wrong!");
+        return data_.d4_;
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& operator[](Number<I>) const
+    {
+        static_assert(I >= 0 && I < 4, "wrong!");
+        return data_.d1x4_[Number<I>{}];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& operator()(Number<I>)
+    {
+        static_assert(I >= 0 && I < 4, "wrong!");
+        return data_.d1x4_(Number<I>{});
+    }
+};
+
+template <typename T>
+struct vector_type<T, 8>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d8_t __attribute__((ext_vector_type(8)));
+
+    using MemoryType = d8_t;
+
+    union
+    {
+        d8_t d8_;
+        StaticallyIndexedArray<d1_t, 8> d1x8_;
+        StaticallyIndexedArray<d2_t, 4> d2x4_;
+        StaticallyIndexedArray<d4_t, 2> d4x2_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{d8_t{0}} {}
+
+    __host__ __device__ static constexpr index_t Size() { return 8; }
+
+    __host__ __device__ constexpr const auto& Vector() const { return data_.d8_; }
+
+    __host__ __device__ constexpr auto& Vector() { return data_.d8_; }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<1>, Number<I> i) const
+    {
+        static_assert(I >= 0 && I < 8, "wrong!");
+        return data_.d1x8_[i];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<2>, Number<I> i) const
+    {
+        static_assert(I >= 0 && I < 4, "wrong!");
+        return data_.d2x4_[i];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<4>, Number<I> i) const
+    {
+        static_assert(I >= 0 && I < 2, "wrong!");
+        return data_.d4x2_[i];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<8>, Number<I>) const
+    {
+        static_assert(I == 0, "wrong!");
+        return data_.d8_;
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<1>, Number<I> i)
+    {
+        static_assert(I >= 0 && I < 8, "wrong!");
+        return data_.d1x8_(i);
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<2>, Number<I> i)
+    {
+        static_assert(I >= 0 && I < 4, "wrong!");
+        return data_.d2x4_(i);
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<4>, Number<I> i)
+    {
+        static_assert(I >= 0 && I < 2, "wrong!");
+        return data_.d4x2_(i);
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<8>, Number<I> i)
+    {
+        static_assert(I == 0, "wrong!");
+        return data_.d8_;
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& operator[](Number<I>) const
+    {
+        static_assert(I >= 0 && I < 8, "wrong!");
+        return data_.d1x8_[Number<I>{}];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& operator()(Number<I>)
+    {
+        static_assert(I >= 0 && I < 8, "wrong!");
+        return data_.d1x8_(Number<I>{});
+    }
+};
...
...
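The reworked vector_type<T, N> overlays the full N-wide ext_vector_type with StaticallyIndexedArray views at every smaller power-of-two granularity, and Get/Set select a view by a Number<SubVectorSize> tag. A minimal usage sketch under those definitions (the demo function name and lane values are illustrative only):

    __host__ __device__ float vector_type_demo()
    {
        using namespace ck;
        vector_type<float, 8> v; // zero-initialized 8-wide register vector
        // write the two 4-wide halves, as amd_buffer_load_v2<float, 8> does
        v.Set(Number<4>{}, Number<0>{}) = float4_t{1, 2, 3, 4}; // lanes 0-3
        v.Set(Number<4>{}, Number<1>{}) = float4_t{5, 6, 7, 8}; // lanes 4-7
        return v.Get(Number<1>{}, Number<5>{}); // scalar view of lane 5 -> 6.0f
    }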
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...
...
@@ -23,6 +23,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
                                                                           InRightPads,
                                                                           ck::index_t nrepeat)
{
    std::cout << "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw"
              << std::endl;

    using namespace ck;

    using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp  (new file, mode 100644)
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp"

template <class T,
          class InDesc,
          class WeiDesc,
          class OutDesc,
          class ConvStrides,
          class ConvDilations,
          class InLeftPads,
          class InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(InDesc,
                                                                          const Tensor<T>& in_nchw,
                                                                          WeiDesc,
                                                                          const Tensor<T>& wei_kcyx,
                                                                          OutDesc,
                                                                          Tensor<T>& out_nkhw,
                                                                          ConvStrides,
                                                                          ConvDilations,
                                                                          InLeftPads,
                                                                          InRightPads,
                                                                          ck::index_t nrepeat)
{
    std::cout << "device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk"
              << std::endl;

    using namespace ck;

    using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto N = OutDesc::GetLengths()[I0];
    constexpr auto K = OutDesc::GetLengths()[I1];
    constexpr auto C = WeiDesc::GetLengths()[I1];

    constexpr auto Hi = InDesc::GetLengths()[I2];
    constexpr auto Wi = InDesc::GetLengths()[I3];

    constexpr auto Ho = OutDesc::GetLengths()[I2];
    constexpr auto Wo = OutDesc::GetLengths()[I3];

    constexpr auto Y = WeiDesc::GetLengths()[I2];
    constexpr auto X = WeiDesc::GetLengths()[I3];

#if 1
    // run-time variables
    constexpr auto in_n_hi_wi_c_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, Hi, Wi, C));
    constexpr auto wei_k_y_x_c_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(K, Y, X, C));
    constexpr auto out_n_ho_wo_k_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, Ho, Wo, K));

    const auto conv_strides   = to_multi_index(ConvStrides{});
    const auto conv_dilations = to_multi_index(ConvDilations{});
    const auto in_left_pads   = to_multi_index(InLeftPads{});
    const auto in_right_pads  = to_multi_index(InRightPads{});
#else
    // compile-time variables
    constexpr auto in_n_hi_wi_c_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, Hi, Wi, C));
    constexpr auto wei_k_y_x_c_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, Y, X, C));
    constexpr auto out_n_ho_wo_k_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, Ho, Wo, K));

    const auto conv_strides   = sequence_to_tuple_of_number(ConvStrides{});
    const auto conv_dilations = sequence_to_tuple_of_number(ConvDilations{});
    const auto in_left_pads   = sequence_to_tuple_of_number(InLeftPads{});
    const auto in_right_pads  = sequence_to_tuple_of_number(InRightPads{});
#endif

    Tensor<float> in_nhwc(
        make_HostTensorDescriptor(make_native_tensor_descriptor_packed(Sequence<N, Hi, Wi, C>{})));
    Tensor<float> wei_kyxc(
        make_HostTensorDescriptor(make_native_tensor_descriptor_packed(Sequence<K, Y, X, C>{})));
    Tensor<float> out_nhwk(
        make_HostTensorDescriptor(make_native_tensor_descriptor_packed(Sequence<N, Ho, Wo, K>{})));

    auto f_nchw2nhwc = [&](auto n, auto hi, auto wi, auto c) {
        in_nhwc(n, hi, wi, c) = in_nchw(n, c, hi, wi);
    };
    auto f_kcyx2kyxc = [&](auto k, auto y, auto x, auto c) {
        wei_kyxc(k, y, x, c) = wei_kcyx(k, c, y, x);
    };
    auto f_nkhw2nhwk = [&](auto n, auto ho, auto wo, auto k) {
        out_nhwk(n, ho, wo, k) = out_nkhw(n, k, ho, wo);
    };

    make_ParallelTensorFunctor(f_nchw2nhwc, N, Hi, Wi, C)(std::thread::hardware_concurrency());
    make_ParallelTensorFunctor(f_kcyx2kyxc, K, Y, X, C)(std::thread::hardware_concurrency());
    make_ParallelTensorFunctor(f_nkhw2nhwk, N, Ho, Wo, K)(std::thread::hardware_concurrency());

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nhwc_device_buf(data_sz * in_nhwc.mDesc.GetElementSpace());
    DeviceMem wei_kyxc_device_buf(data_sz * wei_kyxc.mDesc.GetElementSpace());
    DeviceMem out_nhwk_device_buf(data_sz * out_nhwk.mDesc.GetElementSpace());

    in_nhwc_device_buf.ToDevice(in_nhwc.mData.data());
    wei_kyxc_device_buf.ToDevice(wei_kyxc.mData.data());
    out_nhwk_device_buf.ToDevice(out_nhwk.mData.data());

#if 0
    // cdata = 64, BlockSize = 128, 32x256x8
    constexpr index_t BlockSize = 128;

    constexpr index_t GemmMPerBlock = 32;
    constexpr index_t GemmNPerBlock = 256;
    constexpr index_t GemmKPerBlock = 8;

    constexpr index_t GemmMPerThread = 4;
    constexpr index_t GemmNPerThread = 4;
    constexpr index_t GemmKPerThread = 1;

    constexpr index_t GemmMLevel0Cluster = 2;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 2;
    constexpr index_t GemmNLevel1Cluster = 16;

    constexpr index_t ThreadGemmDataPerReadM = 4;
    constexpr index_t ThreadGemmDataPerReadN = 4;

    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<2, 1>;
    using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 32>;

    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;

    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN   = Sequence<8, 2>;
    using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 128>;

    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;

    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 0
    // cdata = 64, BlockSize = 256, 128x128x8
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmMPerBlock = 128;
    constexpr index_t GemmNPerBlock = 128;
    constexpr index_t GemmKPerBlock = 8;

    constexpr index_t GemmMPerThread = 4;
    constexpr index_t GemmNPerThread = 4;
    constexpr index_t GemmKPerThread = 1;

    constexpr index_t GemmMLevel0Cluster = 2;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 8;
    constexpr index_t GemmNLevel1Cluster = 8;

    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<4, 1>;
    using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;

    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;

    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN   = Sequence<4, 1>;
    using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<2, 128>;

    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK = 4;
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;

    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmM1 = 4;
#elif 1
    // cdata = 64, BlockSize = 256, 128x128x16
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmMPerBlock = 128;
    constexpr index_t GemmNPerBlock = 128;
    constexpr index_t GemmKPerBlock = 16;

    constexpr index_t GemmMPerThread = 4;
    constexpr index_t GemmNPerThread = 4;
    constexpr index_t GemmKPerThread = 1;

    constexpr index_t GemmMLevel0Cluster = 2;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 8;
    constexpr index_t GemmNLevel1Cluster = 8;

    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<4, 2>;
    using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 64>;

    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 2;

    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN   = Sequence<8, 1>;
    using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<2, 128>;

    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK = 8;
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;

    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmM1 = 4;
#endif
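
    // Reading of the enabled 128x128x16 config: the block has (2*8) * (2*8) =
    // 256 threads, matching BlockSize. One level-0/level-1 cluster repeat
    // covers 4*2*8 = 64 rows and 64 columns of C, so each thread loops
    // 128/64 = 2 times in M and in N and accumulates (2*4) * (2*4) = 64
    // outputs -- consistent with the "cdata = 64" annotations above.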
    constexpr auto conv_driver =
#if 1
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
#elif 1
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_no_pad
#elif 1
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
#endif
        <BlockSize,
         TDevice,
         TDevice,
         GemmMPerBlock,
         GemmNPerBlock,
         GemmKPerBlock,
         GemmMPerThread,
         GemmNPerThread,
         GemmKPerThread,
         GemmMLevel0Cluster,
         GemmNLevel0Cluster,
         GemmMLevel1Cluster,
         GemmNLevel1Cluster,
         GemmABlockTransferThreadSliceLengths_GemmK_GemmM,
         GemmABlockTransferThreadClusterLengths_GemmK_GemmM,
         GemmABlockTransferSrcScalarPerVector_GemmK,
         GemmABlockTransferDstScalarPerVector_GemmM,
         GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
         GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
         GemmBBlockTransferSrcScalarPerVector_GemmK,
         GemmBBlockTransferDstScalarPerVector_GemmN,
         GemmCThreadTransferDstScalarPerVector_GemmM1>{};

    conv_driver.Run(wei_k_y_x_c_desc,
                    in_n_hi_wi_c_desc,
                    out_n_ho_wo_k_desc,
                    conv_strides,
                    conv_dilations,
                    in_left_pads,
                    in_right_pads,
                    static_cast<TDevice*>(wei_kyxc_device_buf.GetDeviceBuffer()),
                    static_cast<TDevice*>(in_nhwc_device_buf.GetDeviceBuffer()),
                    static_cast<TDevice*>(out_nhwk_device_buf.GetDeviceBuffer()));

    out_nhwk_device_buf.FromDevice(out_nhwk.mData.data());

    auto f_nhwk2nkhw = [&](auto n, auto k, auto ho, auto wo) {
        out_nkhw(n, k, ho, wo) = out_nhwk(n, ho, wo, k);
    };

    make_ParallelTensorFunctor(f_nhwk2nkhw, N, K, Ho, Wo)(std::thread::hardware_concurrency());
}
driver/src/conv_driver.cpp
...
...
@@ -14,6 +14,7 @@
 #include "device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp"

 int main(int argc, char* argv[])
 {
...
...
@@ -615,7 +616,7 @@ int main(int argc, char* argv[])
         LeftPads{},
         RightPads{},
         nrepeat);
-#elif 1
+#elif 0
     device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                                          in_nchw,
                                                                          wei_kcyx_desc,
...
...
@@ -627,6 +628,18 @@ int main(int argc, char* argv[])
         LeftPads{},
         RightPads{},
         nrepeat);
+#elif 1
+    device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(in_nchw_desc,
+                                                                         in_nchw,
+                                                                         wei_kcyx_desc,
+                                                                         wei_kcyx,
+                                                                         out_nkhw_desc,
+                                                                         out_nkhw_device,
+                                                                         ConvStrides{},
+                                                                         ConvDilations{},
+                                                                         LeftPads{},
+                                                                         RightPads{},
+                                                                         nrepeat);
 #endif

     if(do_verification)
...
...