Commit 766b0a9e authored by Chao Liu's avatar Chao Liu

experimenting

parent f35c64eb
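Note on the change: the commit replaces raw `unsigned` index variables and non-type template parameters with the project's `index_t` alias, so index width and signedness are controlled in one place. A minimal sketch of such an alias (the real definition lives in the project's config header; `int32_t` is an assumption here, not the repository's confirmed choice):

#include <cstdint>
// Hypothetical illustration only: one central alias controls the width and
// signedness of every kernel index used in the diffs below.
using index_t = int32_t;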
#pragma once
#include "constant_integral.hip.hpp"
-template <unsigned NLoop>
+template <index_t NLoop>
struct static_loop_n
{
template <class F>
@@ -24,7 +24,7 @@ struct static_loop_n<1>
}
};
-template <unsigned NLoop>
+template <index_t NLoop>
struct static_const_reduce_n
{
template <class F, class Reduce>
......
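A hedged sketch of the compile-time loop this header appears to implement: recursion on a non-type template parameter unrolls the body at compile time, which is why the `index_t` change must reach the template parameter itself. Signatures are guessed from the visible fragment, not copied from the repo:

#include <cstdint>
using index_t = int32_t; // assumption, as noted above

template <index_t NLoop>
struct static_loop_n
{
    template <class F>
    void operator()(F f) const
    {
        static_loop_n<NLoop - 1>{}(f); // unroll the first NLoop - 1 iterations
        f(NLoop - 1);                  // then run the last one
    }
};

template <>
struct static_loop_n<1>
{
    template <class F>
    void operator()(F f) const
    {
        f(0);
    }
};

// usage: static_loop_n<4>{}([](index_t i) { /* body sees i = 0..3 */ });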
@@ -8,18 +8,18 @@ template <class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
-unsigned NPerBlock,
-unsigned KPerBlock,
-unsigned CPerBlock,
-unsigned HoPerBlock,
-unsigned WoPerBlock,
-unsigned NPerThread,
-unsigned KPerThread,
-unsigned CPerThread,
-unsigned HoPerThread,
-unsigned WoPerThread,
-unsigned BlockSize,
-unsigned GridSize>
+index_t NPerBlock,
+index_t KPerBlock,
+index_t CPerBlock,
+index_t HoPerBlock,
+index_t WoPerBlock,
+index_t NPerThread,
+index_t KPerThread,
+index_t CPerThread,
+index_t HoPerThread,
+index_t WoPerThread,
+index_t BlockSize,
+index_t GridSize>
__global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global)
@@ -33,16 +33,16 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
constexpr auto wei_global_desc = WeiGlobalDesc{};
constexpr auto out_global_desc = OutGlobalDesc{};
-constexpr unsigned Y = wei_global_desc.GetLength(I2);
-constexpr unsigned X = wei_global_desc.GetLength(I3);
+constexpr index_t Y = wei_global_desc.GetLength(I2);
+constexpr index_t X = wei_global_desc.GetLength(I3);
-constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
-constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
+constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
+constexpr index_t WiPerBlock = WoPerBlock + X - 1;
-constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
-constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
+constexpr index_t NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
+constexpr index_t KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
+constexpr index_t HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
+constexpr index_t WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
constexpr auto in_block_global_desc = make_ConstantTensorDescriptor(
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, in_global_desc.GetStrides());
@@ -59,31 +59,31 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
constexpr auto out_block_desc =
make_ConstantTensorDescriptor(out_block_global_desc.GetLengths());
-constexpr unsigned in_block_size = in_block_desc.GetElementSpace();
-constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
-constexpr unsigned out_block_size = out_block_desc.GetElementSpace();
+constexpr index_t in_block_size = in_block_desc.GetElementSpace();
+constexpr index_t wei_block_size = wei_block_desc.GetElementSpace();
+constexpr index_t out_block_size = out_block_desc.GetElementSpace();
__shared__ Float p_in_block[in_block_size];
__shared__ Float p_wei_block[wei_block_size];
__shared__ Float p_out_block[out_block_size];
-const unsigned block_id = blockIdx.x;
+const index_t block_id = blockIdx.x;
-unsigned itmp = block_id;
-unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
+index_t itmp = block_id;
+index_t n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
-unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
+index_t k_block_work_id = itmp / (HBlockWork * WBlockWork);
itmp -= k_block_work_id * (HBlockWork * WBlockWork);
-unsigned h_block_work_id = itmp / WBlockWork;
-unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
+index_t h_block_work_id = itmp / WBlockWork;
+index_t w_block_work_id = itmp - h_block_work_id * WBlockWork;
-unsigned n_block_work_begin = n_block_work_id * NPerBlock;
-unsigned k_block_work_begin = k_block_work_id * KPerBlock;
-unsigned ho_block_work_begin = h_block_work_id * HoPerBlock;
-unsigned wo_block_work_begin = w_block_work_id * WoPerBlock;
+index_t n_block_work_begin = n_block_work_id * NPerBlock;
+index_t k_block_work_begin = k_block_work_id * KPerBlock;
+index_t ho_block_work_begin = h_block_work_id * HoPerBlock;
+index_t wo_block_work_begin = w_block_work_id * WoPerBlock;
-unsigned hi_block_work_begin = ho_block_work_begin; // minus padding
-unsigned wi_block_work_begin = wo_block_work_begin; // minus padding
+index_t hi_block_work_begin = ho_block_work_begin; // minus padding
+index_t wi_block_work_begin = wo_block_work_begin; // minus padding
constexpr auto blockwise_in_copy =
Blockwise4dTensorCopy1<BlockSize,
@@ -109,7 +109,7 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
// set output tensor in LDS to 0
blockwise_4d_tensor_set_zero<BlockSize>(out_block_desc, p_out_block);
-for(unsigned c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1);
+for(index_t c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1);
c_block_work_begin += CPerBlock)
{
// copy input tensor to LDS
......
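The kernel above peels a 1-D block id into four work-group coordinates by repeated division and remainder. A standalone restatement of that arithmetic (names mirror the diff; the helper itself is hypothetical):

#include <cstdint>
using index_t = int32_t; // assumption

struct BlockWorkId { index_t n, k, h, w; };

inline BlockWorkId decompose_block_id(index_t block_id,
                                      index_t KBlockWork,
                                      index_t HBlockWork,
                                      index_t WBlockWork)
{
    index_t itmp = block_id;
    const index_t n = itmp / (KBlockWork * HBlockWork * WBlockWork); // slowest
    itmp -= n * (KBlockWork * HBlockWork * WBlockWork);
    const index_t k = itmp / (HBlockWork * WBlockWork);
    itmp -= k * (HBlockWork * WBlockWork);
    const index_t h = itmp / WBlockWork;
    const index_t w = itmp - h * WBlockWork;                         // fastest
    return {n, k, h, w};
}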
@@ -11,20 +11,20 @@ template <class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
-unsigned NPerBlock,
-unsigned KPerBlock,
-unsigned CPerBlock,
-unsigned HoPerBlock,
-unsigned WoPerBlock,
-unsigned NPerThread,
-unsigned KPerThread,
-unsigned CPerThread,
-unsigned HoPerThread,
-unsigned WoPerThread,
-unsigned InBlockCopyDataPerRead,
-unsigned WeiBlockCopyDataPerRead,
-unsigned BlockSize,
-unsigned GridSize>
+index_t NPerBlock,
+index_t KPerBlock,
+index_t CPerBlock,
+index_t HoPerBlock,
+index_t WoPerBlock,
+index_t NPerThread,
+index_t KPerThread,
+index_t CPerThread,
+index_t HoPerThread,
+index_t WoPerThread,
+index_t InBlockCopyDataPerRead,
+index_t WeiBlockCopyDataPerRead,
+index_t BlockSize,
+index_t GridSize>
__global__ void
gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
@@ -39,17 +39,17 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{};
constexpr auto out_nkhw_global_desc = OutGlobalDesc{};
-constexpr unsigned N = in_nchw_global_desc.GetLength(I0);
-constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0);
-constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1);
-constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2);
-constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3);
+constexpr index_t N = in_nchw_global_desc.GetLength(I0);
+constexpr index_t K = wei_kcyx_global_desc.GetLength(I0);
+constexpr index_t C = wei_kcyx_global_desc.GetLength(I1);
+constexpr index_t Y = wei_kcyx_global_desc.GetLength(I2);
+constexpr index_t X = wei_kcyx_global_desc.GetLength(I3);
constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor(
Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy
-constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
-constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
+constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
+constexpr index_t WiPerBlock = WoPerBlock + X - 1;
constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});
@@ -63,21 +63,21 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});
// shared mem
-constexpr unsigned in_block_size =
+constexpr index_t in_block_size =
in_nchw_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-constexpr unsigned wei_block_size =
+constexpr index_t wei_block_size =
wei_kcyx_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
-constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
-? InBlockCopyDataPerRead
-: WeiBlockCopyDataPerRead;
+constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
+? InBlockCopyDataPerRead
+: WeiBlockCopyDataPerRead;
__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
// threadwise tensors
-constexpr unsigned HiPerThread = HoPerThread + Y - 1;
-constexpr unsigned WiPerThread = WoPerThread + X - 1;
+constexpr index_t HiPerThread = HoPerThread + Y - 1;
+constexpr index_t WiPerThread = WoPerThread + X - 1;
constexpr auto in_nchw_thread_block_desc =
make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
@@ -93,56 +93,54 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
// divide block work
-constexpr unsigned NBlockWork =
-(out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
-constexpr unsigned KBlockWork =
-(out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-constexpr unsigned HBlockWork =
+constexpr index_t NBlockWork = (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
+constexpr index_t KBlockWork = (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
+constexpr index_t HBlockWork =
(out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-constexpr unsigned WBlockWork =
+constexpr index_t WBlockWork =
(out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
-const unsigned block_id = blockIdx.x;
+const index_t block_id = blockIdx.x;
-unsigned itmp = block_id;
-const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
+index_t itmp = block_id;
+const index_t n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
-const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
+const index_t k_block_work_id = itmp / (HBlockWork * WBlockWork);
itmp -= k_block_work_id * (HBlockWork * WBlockWork);
-const unsigned h_block_work_id = itmp / WBlockWork;
-const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
+const index_t h_block_work_id = itmp / WBlockWork;
+const index_t w_block_work_id = itmp - h_block_work_id * WBlockWork;
-const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
-const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
-const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
-const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
+const index_t n_block_data_begin = n_block_work_id * NPerBlock;
+const index_t k_block_data_begin = k_block_work_id * KPerBlock;
+const index_t ho_block_data_begin = h_block_work_id * HoPerBlock;
+const index_t wo_block_data_begin = w_block_work_id * WoPerBlock;
-const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
-const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
+const index_t hi_block_data_begin = ho_block_data_begin; // minus padding
+const index_t wi_block_data_begin = wo_block_data_begin; // minus padding
// divide thread work
-constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
-constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
-constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
-constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
+constexpr index_t NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
+constexpr index_t KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
+constexpr index_t HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
+constexpr index_t WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
-const unsigned thread_id = threadIdx.x;
+const index_t thread_id = threadIdx.x;
itmp = thread_id;
-const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
+const index_t n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork);
-const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork);
+const index_t k_thread_work_id = itmp / (HThreadWork * WThreadWork);
itmp -= k_thread_work_id * (HThreadWork * WThreadWork);
-const unsigned h_thread_work_id = itmp / WThreadWork;
-const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
+const index_t h_thread_work_id = itmp / WThreadWork;
+const index_t w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
-const unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
-const unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
-const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread;
-const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread;
+const index_t n_thread_data_begin = n_thread_work_id * NPerThread;
+const index_t k_thread_data_begin = k_thread_work_id * KPerThread;
+const index_t ho_thread_data_begin = h_thread_work_id * HoPerThread;
+const index_t wo_thread_data_begin = w_thread_work_id * WoPerThread;
-const unsigned hi_thread_data_begin = ho_thread_data_begin;
-const unsigned wi_thread_data_begin = wo_thread_data_begin;
+const index_t hi_thread_data_begin = ho_thread_data_begin;
+const index_t wi_thread_data_begin = wo_thread_data_begin;
constexpr auto blockwise_in_copy =
Blockwise4dTensorCopy1<BlockSize,
@@ -172,7 +170,7 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);
-for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
+for(index_t c_block_data_begin = 0; c_block_data_begin < C;
c_block_data_begin += CPerBlock, __syncthreads())
{
// copy input tensor to LDS
@@ -191,7 +189,7 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
__syncthreads();
-for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
+for(index_t c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
{
// threadwise convolution
#if 1
......
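The shared-memory arrays above are padded to a multiple of the larger vector-read width so both tensors keep their alignment. The expression max_align * ((size + max_align - 1) / max_align) is plain round-up; an equivalent helper (hypothetical, for illustration):

#include <cstdint>
using index_t = int32_t; // assumption

constexpr index_t round_up(index_t size, index_t align)
{
    return align * ((size + align - 1) / align);
}

static_assert(round_up(100, 8) == 104, "rounds to the next multiple of 8");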
@@ -13,21 +13,21 @@ template <class TInWei,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
-unsigned ScalarPerVector,
-unsigned NPerBlock,
-unsigned KPerBlock,
-unsigned CPerBlock,
-unsigned HoPerBlock,
-unsigned WoPerBlock,
-unsigned NPerThread,
-unsigned KPerThread,
-unsigned CPerThread,
-unsigned HoPerThread,
-unsigned WoPerThread,
-unsigned InBlockCopyDataPerRead,
-unsigned WeiBlockCopyDataPerRead,
-unsigned BlockSize,
-unsigned GridSize>
+index_t ScalarPerVector,
+index_t NPerBlock,
+index_t KPerBlock,
+index_t CPerBlock,
+index_t HoPerBlock,
+index_t WoPerBlock,
+index_t NPerThread,
+index_t KPerThread,
+index_t CPerThread,
+index_t HoPerThread,
+index_t WoPerThread,
+index_t InBlockCopyDataPerRead,
+index_t WeiBlockCopyDataPerRead,
+index_t BlockSize,
+index_t GridSize>
__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
const typename vector_type<TInWei,
ScalarPerVector>::MemoryType* const __restrict__ p_in_vec_global,
@@ -49,17 +49,17 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
constexpr auto wei_kcyx_vec_global_desc = WeiGlobalDesc{};
constexpr auto out_nkhw_global_desc = OutGlobalDesc{};
-constexpr unsigned N = in_nchw_vec_global_desc.GetLength(I0);
-constexpr unsigned K = wei_kcyx_vec_global_desc.GetLength(I0);
-constexpr unsigned C = wei_kcyx_vec_global_desc.GetLength(I1);
-constexpr unsigned Y = wei_kcyx_vec_global_desc.GetLength(I2);
-constexpr unsigned X = wei_kcyx_vec_global_desc.GetLength(I3);
+constexpr index_t N = in_nchw_vec_global_desc.GetLength(I0);
+constexpr index_t K = wei_kcyx_vec_global_desc.GetLength(I0);
+constexpr index_t C = wei_kcyx_vec_global_desc.GetLength(I1);
+constexpr index_t Y = wei_kcyx_vec_global_desc.GetLength(I2);
+constexpr index_t X = wei_kcyx_vec_global_desc.GetLength(I3);
constexpr auto wei_ke_vec_global_desc = make_ConstantTensorDescriptor(
Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy
-constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
-constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
+constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
+constexpr index_t WiPerBlock = WoPerBlock + X - 1;
constexpr auto in_nchw_vec_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});
@@ -73,15 +73,15 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
Sequence<wei_ke_vec_block_desc.GetStride(I0), Y * X, X, 1>{});
// shared mem
-constexpr unsigned in_block_size =
+constexpr index_t in_block_size =
in_nchw_vec_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-constexpr unsigned wei_block_size =
+constexpr index_t wei_block_size =
wei_kcyx_vec_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
-constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
-? InBlockCopyDataPerRead
-: WeiBlockCopyDataPerRead;
+constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
+? InBlockCopyDataPerRead
+: WeiBlockCopyDataPerRead;
__shared__ in_vector_mem_t
p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)];
@@ -89,8 +89,8 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
// threadwise tensors
-constexpr unsigned HiPerThread = HoPerThread + Y - 1;
-constexpr unsigned WiPerThread = WoPerThread + X - 1;
+constexpr index_t HiPerThread = HoPerThread + Y - 1;
+constexpr index_t WiPerThread = WoPerThread + X - 1;
constexpr auto in_nchw_vec_thread_block_desc =
make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
@@ -106,56 +106,54 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
out_scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
// divide block work
-constexpr unsigned NBlockWork =
-(out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
-constexpr unsigned KBlockWork =
-(out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-constexpr unsigned HBlockWork =
+constexpr index_t NBlockWork = (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
+constexpr index_t KBlockWork = (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
+constexpr index_t HBlockWork =
(out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-constexpr unsigned WBlockWork =
+constexpr index_t WBlockWork =
(out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
-const unsigned block_id = blockIdx.x;
+const index_t block_id = blockIdx.x;
-unsigned itmp = block_id;
-const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
+index_t itmp = block_id;
+const index_t n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
-const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
+const index_t k_block_work_id = itmp / (HBlockWork * WBlockWork);
itmp -= k_block_work_id * (HBlockWork * WBlockWork);
-const unsigned h_block_work_id = itmp / WBlockWork;
-const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
+const index_t h_block_work_id = itmp / WBlockWork;
+const index_t w_block_work_id = itmp - h_block_work_id * WBlockWork;
-const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
-const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
-const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
-const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
+const index_t n_block_data_begin = n_block_work_id * NPerBlock;
+const index_t k_block_data_begin = k_block_work_id * KPerBlock;
+const index_t ho_block_data_begin = h_block_work_id * HoPerBlock;
+const index_t wo_block_data_begin = w_block_work_id * WoPerBlock;
-const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
-const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
+const index_t hi_block_data_begin = ho_block_data_begin; // minus padding
+const index_t wi_block_data_begin = wo_block_data_begin; // minus padding
// divide thread work
-constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
-constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
-constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
-constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
+constexpr index_t NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
+constexpr index_t KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
+constexpr index_t HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
+constexpr index_t WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
-const unsigned thread_id = threadIdx.x;
+const index_t thread_id = threadIdx.x;
itmp = thread_id;
-const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
+const index_t n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork);
-const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork);
+const index_t k_thread_work_id = itmp / (HThreadWork * WThreadWork);
itmp -= k_thread_work_id * (HThreadWork * WThreadWork);
-const unsigned h_thread_work_id = itmp / WThreadWork;
-const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
+const index_t h_thread_work_id = itmp / WThreadWork;
+const index_t w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
-const unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
-const unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
-const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread;
-const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread;
+const index_t n_thread_data_begin = n_thread_work_id * NPerThread;
+const index_t k_thread_data_begin = k_thread_work_id * KPerThread;
+const index_t ho_thread_data_begin = h_thread_work_id * HoPerThread;
+const index_t wo_thread_data_begin = w_thread_work_id * WoPerThread;
-const unsigned hi_thread_data_begin = ho_thread_data_begin;
-const unsigned wi_thread_data_begin = wo_thread_data_begin;
+const index_t hi_thread_data_begin = ho_thread_data_begin;
+const index_t wi_thread_data_begin = wo_thread_data_begin;
constexpr auto blockwise_in_copy =
Blockwise4dTensorCopy1<BlockSize,
@@ -188,7 +186,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);
#endif
-for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
+for(index_t c_block_data_begin = 0; c_block_data_begin < C;
c_block_data_begin += CPerBlock, __syncthreads())
{
// copy input tensor to LDS
@@ -207,7 +205,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
__syncthreads();
-for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
+for(index_t c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
{
// threadwise convolution
#if 1
......
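All the *BlockWork and *ThreadWork counts in these kernels are ceiling divisions, so a partial tile at the edge still gets a block or thread of work. Sketch (hypothetical helper):

#include <cstdint>
using index_t = int32_t; // assumption

constexpr index_t ceil_div(index_t x, index_t d) { return (x + d - 1) / d; }

static_assert(ceil_div(30, 8) == 4, "30 outputs at 8 per block still need a 4th block");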
@@ -8,32 +8,32 @@
#include "threadwise_4d_tensor_op.hip.hpp"
#include "blockwise_batched_gemm.hip.hpp"
-template <unsigned GridSize,
-unsigned BlockSize,
+template <index_t GridSize,
+index_t BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
-unsigned NPerBlock,
-unsigned KPerBlock,
-unsigned CPerBlock,
-unsigned HoPerBlock,
-unsigned WoPerBlock,
-unsigned NPerThread,
-unsigned KPerThread,
-unsigned HoPerThread,
-unsigned WoPerThread,
+index_t NPerBlock,
+index_t KPerBlock,
+index_t CPerBlock,
+index_t HoPerBlock,
+index_t WoPerBlock,
+index_t NPerThread,
+index_t KPerThread,
+index_t HoPerThread,
+index_t WoPerThread,
class InBlockCopyThreadPerDims,
-unsigned InBlockCopyDataPerRead,
-unsigned WeiBlockCopyDataPerRead,
-unsigned GemmMPerThreadSubC,
-unsigned GemmNPerThreadSubC,
-unsigned GemmMLevel0Cluster,
-unsigned GemmNLevel0Cluster,
-unsigned GemmMLevel1Cluster,
-unsigned GemmNLevel1Cluster,
-unsigned GemmKPerThreadLoop,
-unsigned OutThreadCopyDataPerWrite>
+index_t InBlockCopyDataPerRead,
+index_t WeiBlockCopyDataPerRead,
+index_t GemmMPerThreadSubC,
+index_t GemmNPerThreadSubC,
+index_t GemmMLevel0Cluster,
+index_t GemmNLevel0Cluster,
+index_t GemmMLevel1Cluster,
+index_t GemmNLevel1Cluster,
+index_t GemmKPerThreadLoop,
+index_t OutThreadCopyDataPerWrite>
__global__ void
gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
@@ -55,39 +55,39 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
constexpr auto out_khwn_global_desc = OutGlobalDesc{};
-constexpr unsigned C = in_chwn_global_desc.GetLength(I0);
+constexpr index_t C = in_chwn_global_desc.GetLength(I0);
-constexpr unsigned K = out_khwn_global_desc.GetLength(I0);
-constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
-constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
-constexpr unsigned N = out_khwn_global_desc.GetLength(I3);
+constexpr index_t K = out_khwn_global_desc.GetLength(I0);
+constexpr index_t Ho = out_khwn_global_desc.GetLength(I1);
+constexpr index_t Wo = out_khwn_global_desc.GetLength(I2);
+constexpr index_t N = out_khwn_global_desc.GetLength(I3);
-constexpr unsigned Y = wei_cyxk_global_desc.GetLength(I1);
-constexpr unsigned X = wei_cyxk_global_desc.GetLength(I2);
+constexpr index_t Y = wei_cyxk_global_desc.GetLength(I1);
+constexpr index_t X = wei_cyxk_global_desc.GetLength(I2);
-constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
-constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
+constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
+constexpr index_t WiPerBlock = WoPerBlock + X - 1;
// divide block work: [K, Ho, Wo, N]
-constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
-constexpr unsigned HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock;
-constexpr unsigned WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock;
-constexpr unsigned NBlockWork = (N + NPerBlock - 1) / NPerBlock;
-const unsigned k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork);
-unsigned itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork);
-const unsigned h_block_work_id = itmp / (WBlockWork * NBlockWork);
+constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock;
+constexpr index_t HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock;
+constexpr index_t WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock;
+constexpr index_t NBlockWork = (N + NPerBlock - 1) / NPerBlock;
+const index_t k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork);
+index_t itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork);
+const index_t h_block_work_id = itmp / (WBlockWork * NBlockWork);
itmp -= h_block_work_id * (WBlockWork * NBlockWork);
-const unsigned w_block_work_id = itmp / NBlockWork;
-const unsigned n_block_work_id = itmp - w_block_work_id * NBlockWork;
+const index_t w_block_work_id = itmp / NBlockWork;
+const index_t n_block_work_id = itmp - w_block_work_id * NBlockWork;
-const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
-const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
-const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
-const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
+const index_t k_block_data_begin = k_block_work_id * KPerBlock;
+const index_t ho_block_data_begin = h_block_work_id * HoPerBlock;
+const index_t wo_block_data_begin = w_block_work_id * WoPerBlock;
+const index_t n_block_data_begin = n_block_work_id * NPerBlock;
-const unsigned hi_block_data_begin = ho_block_data_begin;
-const unsigned wi_block_data_begin = wo_block_data_begin;
+const index_t hi_block_data_begin = ho_block_data_begin;
+const index_t wi_block_data_begin = wo_block_data_begin;
// flattend (2d) tensor view of gridwise weight
constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * Y * X, K>{});
@@ -164,15 +164,15 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
HoPerThread>{};
// LDS: be careful of alignment
-constexpr unsigned in_block_size =
+constexpr index_t in_block_size =
in_chwn_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-constexpr unsigned wei_block_size =
+constexpr index_t wei_block_size =
wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
-constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
-? InBlockCopyDataPerRead
-: WeiBlockCopyDataPerRead;
+constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
+? InBlockCopyDataPerRead
+: WeiBlockCopyDataPerRead;
__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
@@ -191,10 +191,10 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
const Float* p_wei_global_block_begin =
p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
-for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
-p_in_global_block_begin += CPerBlock * in_chwn_global_desc.GetStride(I0),
-p_wei_global_block_begin += CPerBlock * wei_cyxk_global_desc.GetStride(I0),
-__syncthreads())
+for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
+p_in_global_block_begin += CPerBlock * in_chwn_global_desc.GetStride(I0),
+p_wei_global_block_begin += CPerBlock * wei_cyxk_global_desc.GetStride(I0),
+__syncthreads())
{
// input: global mem to LDS
blockwise_in_copy.Run(p_in_global_block_begin, p_in_block);
@@ -205,9 +205,9 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
__syncthreads();
// a series of batched GEMM
-for(unsigned y = 0; y < Y; ++y)
+for(index_t y = 0; y < Y; ++y)
{
-for(unsigned x = 0; x < X; ++x)
+for(index_t x = 0; x < X; ++x)
{
#if 0
blockwise_batch_gemm.Run
@@ -227,26 +227,26 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
const auto c_thread_mtx_begin =
blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
-for(unsigned k = 0; k < out_khwn_thread_desc.GetLength(I0); ++k)
+for(index_t k = 0; k < out_khwn_thread_desc.GetLength(I0); ++k)
{
-for(unsigned ho = 0; ho < out_khwn_thread_desc.GetLength(I1); ++ho)
+for(index_t ho = 0; ho < out_khwn_thread_desc.GetLength(I1); ++ho)
{
-for(unsigned wo = 0; wo < out_khwn_thread_desc.GetLength(I2); ++wo)
+for(index_t wo = 0; wo < out_khwn_thread_desc.GetLength(I2); ++wo)
{
-for(unsigned n = 0; n < out_khwn_thread_desc.GetLength(I3); ++n)
+for(index_t n = 0; n < out_khwn_thread_desc.GetLength(I3); ++n)
{
-const unsigned b = out_khwn_thread_desc.Get1dIndex(0, 0, wo, n);
+const index_t b = out_khwn_thread_desc.Get1dIndex(0, 0, wo, n);
const auto c_thread_mtx_distance =
blockwise_batch_gemm.GetDistanceFromBeginOfThreadMatrixC(ho, k, b);
-const unsigned ho_thread =
+const index_t ho_thread =
c_thread_mtx_begin.batch + c_thread_mtx_distance.batch;
-const unsigned k_thread = c_thread_mtx_begin.row + c_thread_mtx_distance.row;
-const unsigned b_thread = c_thread_mtx_begin.col + c_thread_mtx_distance.col;
+const index_t k_thread = c_thread_mtx_begin.row + c_thread_mtx_distance.row;
+const index_t b_thread = c_thread_mtx_begin.col + c_thread_mtx_distance.col;
-const unsigned wo_thread = b_thread / NPerBlock;
-const unsigned n_thread = b_thread % NPerBlock;
+const index_t wo_thread = b_thread / NPerBlock;
+const index_t n_thread = b_thread % NPerBlock;
p_out_global[out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread,
ho_block_data_begin + ho_thread,
@@ -261,19 +261,19 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
const auto c_thread_mtx_begin =
blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
-const unsigned k_thread_data_begin = c_thread_mtx_begin.row;
-const unsigned ho_thread_data_begin = c_thread_mtx_begin.batch;
-const unsigned wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock;
-const unsigned n_thread_data_begin = c_thread_mtx_begin.col - NPerBlock * wo_thread_data_begin;
+const index_t k_thread_data_begin = c_thread_mtx_begin.row;
+const index_t ho_thread_data_begin = c_thread_mtx_begin.batch;
+const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock;
+const index_t n_thread_data_begin = c_thread_mtx_begin.col - NPerBlock * wo_thread_data_begin;
// this is for v2 GEMM
// output is a 8d tensor
if(NPerThread < NPerBlock && WoPerThread == 1)
{
-constexpr unsigned N1_ = GemmNPerThreadSubC;
-constexpr unsigned W1_ = WoPerBlock / ((WoPerThread * NPerThread) / GemmNPerThreadSubC);
-constexpr unsigned K2_ = GemmMPerThreadSubC;
-constexpr unsigned K1_ = KPerBlock / KPerThread;
+constexpr index_t N1_ = GemmNPerThreadSubC;
+constexpr index_t W1_ = WoPerBlock / ((WoPerThread * NPerThread) / GemmNPerThreadSubC);
+constexpr index_t K2_ = GemmMPerThreadSubC;
+constexpr index_t K1_ = KPerBlock / KPerThread;
constexpr auto out_8d_global_desc = make_ConstantTensorDescriptor(
Sequence<K / (K1_ * K2_), K1_, K2_, Ho, Wo / W1_, W1_, N / N1_, N1_>{});
......
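In the epilogue above, a thread's C-matrix column encodes both the output x-position and the batch index, because the GEMM's N dimension was built as Wo-major over NPerBlock. A hedged sketch of the inverse mapping:

#include <cstdint>
using index_t = int32_t; // assumption

struct WoN { index_t wo, n; };

// One GEMM column covers NPerBlock consecutive batch entries per output x.
inline WoN split_col(index_t col, index_t NPerBlock)
{
    return {col / NPerBlock, col % NPerBlock};
}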
@@ -7,26 +7,26 @@
#include "threadwise_4d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"
-template <unsigned GridSize,
-unsigned BlockSize,
+template <index_t GridSize,
+index_t BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
class LowerPads,
class UpperPads,
-unsigned NPerBlock,
-unsigned KPerBlock,
-unsigned CPerBlock,
-unsigned HoPerBlock,
-unsigned WoPerBlock,
-unsigned NPerThread,
-unsigned KPerThread,
-unsigned CPerThread,
-unsigned HoPerThread,
-unsigned WoPerThread,
-unsigned WeiBlockCopyThreadPerDim0,
-unsigned WeiBlockCopyThreadPerDim1>
+index_t NPerBlock,
+index_t KPerBlock,
+index_t CPerBlock,
+index_t HoPerBlock,
+index_t WoPerBlock,
+index_t NPerThread,
+index_t KPerThread,
+index_t CPerThread,
+index_t HoPerThread,
+index_t WoPerThread,
+index_t WeiBlockCopyThreadPerDim0,
+index_t WeiBlockCopyThreadPerDim1>
__global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
@@ -48,42 +48,42 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
constexpr auto out_khwn_global_desc = OutGlobalDesc{};
-constexpr unsigned C = in_chwn_global_desc.GetLength(I0);
+constexpr index_t C = in_chwn_global_desc.GetLength(I0);
-constexpr unsigned K = out_khwn_global_desc.GetLength(I0);
-constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
-constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
-constexpr unsigned N = out_khwn_global_desc.GetLength(I3);
+constexpr index_t K = out_khwn_global_desc.GetLength(I0);
+constexpr index_t Ho = out_khwn_global_desc.GetLength(I1);
+constexpr index_t Wo = out_khwn_global_desc.GetLength(I2);
+constexpr index_t N = out_khwn_global_desc.GetLength(I3);
-constexpr unsigned Y = wei_cyxk_global_desc.GetLength(I1);
-constexpr unsigned X = wei_cyxk_global_desc.GetLength(I2);
+constexpr index_t Y = wei_cyxk_global_desc.GetLength(I1);
+constexpr index_t X = wei_cyxk_global_desc.GetLength(I2);
-constexpr unsigned HPadLow = LowerPads{}.Get(I0);
-constexpr unsigned WPadLow = LowerPads{}.Get(I1);
+constexpr index_t HPadLow = LowerPads{}.Get(I0);
+constexpr index_t WPadLow = LowerPads{}.Get(I1);
-constexpr unsigned HPadUp = UpperPads{}.Get(I0);
-constexpr unsigned WPadUp = UpperPads{}.Get(I1);
+constexpr index_t HPadUp = UpperPads{}.Get(I0);
+constexpr index_t WPadUp = UpperPads{}.Get(I1);
-constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
-constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
+constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
+constexpr index_t WiPerBlock = WoPerBlock + X - 1;
// divide block work: [K, Ho, Wo, N]
-constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
-constexpr unsigned HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock;
-constexpr unsigned WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock;
-constexpr unsigned NBlockWork = (N + NPerBlock - 1) / NPerBlock;
-const unsigned k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork);
-unsigned itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork);
-const unsigned h_block_work_id = itmp / (WBlockWork * NBlockWork);
+constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock;
+constexpr index_t HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock;
+constexpr index_t WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock;
+constexpr index_t NBlockWork = (N + NPerBlock - 1) / NPerBlock;
+const index_t k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork);
+index_t itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork);
+const index_t h_block_work_id = itmp / (WBlockWork * NBlockWork);
itmp -= h_block_work_id * (WBlockWork * NBlockWork);
-const unsigned w_block_work_id = itmp / NBlockWork;
-const unsigned n_block_work_id = itmp - w_block_work_id * NBlockWork;
+const index_t w_block_work_id = itmp / NBlockWork;
+const index_t n_block_work_id = itmp - w_block_work_id * NBlockWork;
-const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
-const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
-const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
-const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
+const index_t k_block_data_begin = k_block_work_id * KPerBlock;
+const index_t ho_block_data_begin = h_block_work_id * HoPerBlock;
+const index_t wo_block_data_begin = w_block_work_id * WoPerBlock;
+const index_t n_block_data_begin = n_block_work_id * NPerBlock;
// flattened (2d) tensor view of wei in global mem
constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * Y * X, K>{});
@@ -114,11 +114,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
// blockwise copy
// input: format is [C, Hi, Wi, N]
-const unsigned h_block_pad_low = h_block_work_id == 0 ? HPadLow : 0;
-const unsigned w_block_pad_low = w_block_work_id == 0 ? WPadLow : 0;
+const index_t h_block_pad_low = h_block_work_id == 0 ? HPadLow : 0;
+const index_t w_block_pad_low = w_block_work_id == 0 ? WPadLow : 0;
-const unsigned h_block_pad_up = h_block_work_id == HBlockWork - 1 ? HPadUp : 0;
-const unsigned w_block_pad_up = w_block_work_id == WBlockWork - 1 ? WPadUp : 0;
+const index_t h_block_pad_up = h_block_work_id == HBlockWork - 1 ? HPadUp : 0;
+const index_t w_block_pad_up = w_block_work_id == WBlockWork - 1 ? WPadUp : 0;
#if 0
if(get_thread_local_1d_id() == 0)
@@ -204,8 +204,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
true>{};
// LDS
-constexpr unsigned in_block_size = in_chwn_block_desc.GetElementSpace();
-constexpr unsigned wei_block_size = wei_cyxk_block_desc.GetElementSpace();
+constexpr index_t in_block_size = in_chwn_block_desc.GetElementSpace();
+constexpr index_t wei_block_size = wei_cyxk_block_desc.GetElementSpace();
__shared__ Float p_in_block[in_block_size];
__shared__ Float p_wei_block[wei_block_size];
@@ -219,9 +219,9 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
const Float* p_wei_global_block_begin =
p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin);
-for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
-p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0),
-__syncthreads())
+for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
+p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0),
+__syncthreads())
{
#if 1
// input: global mem to LDS,
@@ -245,9 +245,9 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
__syncthreads();
// a series of batched GEMM
-for(unsigned y = 0; y < Y; ++y)
+for(index_t y = 0; y < Y; ++y)
{
-for(unsigned x = 0; x < X; ++x)
+for(index_t x = 0; x < X; ++x)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
@@ -262,10 +262,10 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
const auto matrix_c_index =
blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
-const unsigned ho_thread_data_begin = matrix_c_index.batch;
-const unsigned k_thread_data_begin = matrix_c_index.row;
-const unsigned wo_thread_data_begin = matrix_c_index.col / NPerBlock;
-const unsigned n_thread_data_begin = matrix_c_index.col - wo_thread_data_begin * NPerBlock;
+const index_t ho_thread_data_begin = matrix_c_index.batch;
+const index_t k_thread_data_begin = matrix_c_index.row;
+const index_t wo_thread_data_begin = matrix_c_index.col / NPerBlock;
+const index_t n_thread_data_begin = matrix_c_index.col - wo_thread_data_begin * NPerBlock;
#if 0
printf("block %u %u, %u %u %u %u, %u %u %u %u, %f \n",
......
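The padded variant charges padding only to blocks that actually touch the image border; interior blocks copy with zero pad. A minimal restatement of that rule (hypothetical helpers; names mirror the diff):

#include <cstdint>
using index_t = int32_t; // assumption

inline index_t block_pad_low(index_t work_id, index_t pad_low)
{
    return work_id == 0 ? pad_low : 0; // only the first block sees the low pad
}

inline index_t block_pad_up(index_t work_id, index_t n_blocks, index_t pad_up)
{
    return work_id == n_blocks - 1 ? pad_up : 0; // only the last block sees the high pad
}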
@@ -8,32 +8,32 @@
#include "blockwise_gemm.hip.hpp"
// define B = flatten(N, Hi, Wi)
-template <unsigned GridSize,
-unsigned BlockSize,
+template <index_t GridSize,
+index_t BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
-unsigned BPerBlock,
-unsigned KPerBlock,
-unsigned CPerBlock,
-unsigned BPerThread,
-unsigned KPerThread,
-unsigned GemmThreadPerColumnPerCluster,
-unsigned GemmThreadPerRowPerCluster,
-unsigned GemmMPerThreadSubC,
-unsigned GemmNPerThreadSubC,
-unsigned GemmMLevel0Cluster,
-unsigned GemmNLevel0Cluster,
-unsigned GemmMLevel1Cluster,
-unsigned GemmNLevel1Cluster,
-unsigned GemmKPerThreadLoop,
-unsigned InBlockCopyThreadPerDim0,
-unsigned InBlockCopyThreadPerDim1,
-unsigned WeiBlockCopyThreadPerDim0,
-unsigned WeiBlockCopyThreadPerDim1,
-unsigned InBlockCopyDataPerRead,
-unsigned WeiBlockCopyDataPerRead>
+index_t BPerBlock,
+index_t KPerBlock,
+index_t CPerBlock,
+index_t BPerThread,
+index_t KPerThread,
+index_t GemmThreadPerColumnPerCluster,
+index_t GemmThreadPerRowPerCluster,
+index_t GemmMPerThreadSubC,
+index_t GemmNPerThreadSubC,
+index_t GemmMLevel0Cluster,
+index_t GemmNLevel0Cluster,
+index_t GemmMLevel1Cluster,
+index_t GemmNLevel1Cluster,
+index_t GemmKPerThreadLoop,
+index_t InBlockCopyThreadPerDim0,
+index_t InBlockCopyThreadPerDim1,
+index_t WeiBlockCopyThreadPerDim0,
+index_t WeiBlockCopyThreadPerDim1,
+index_t InBlockCopyDataPerRead,
+index_t WeiBlockCopyDataPerRead>
__global__ void
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
@@ -48,30 +48,30 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
constexpr auto out_khwn_global_desc = OutGlobalDesc{};
-constexpr unsigned C = in_chwn_global_desc.GetLength(I0);
-constexpr unsigned Hi = in_chwn_global_desc.GetLength(I1);
-constexpr unsigned Wi = in_chwn_global_desc.GetLength(I2);
-constexpr unsigned N = in_chwn_global_desc.GetLength(I3);
+constexpr index_t C = in_chwn_global_desc.GetLength(I0);
+constexpr index_t Hi = in_chwn_global_desc.GetLength(I1);
+constexpr index_t Wi = in_chwn_global_desc.GetLength(I2);
+constexpr index_t N = in_chwn_global_desc.GetLength(I3);
-constexpr unsigned K = out_khwn_global_desc.GetLength(I0);
-constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
-constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
+constexpr index_t K = out_khwn_global_desc.GetLength(I0);
+constexpr index_t Ho = out_khwn_global_desc.GetLength(I1);
+constexpr index_t Wo = out_khwn_global_desc.GetLength(I2);
-constexpr unsigned Y = wei_cyxk_global_desc.GetLength(I1);
-constexpr unsigned X = wei_cyxk_global_desc.GetLength(I2);
+constexpr index_t Y = wei_cyxk_global_desc.GetLength(I1);
+constexpr index_t X = wei_cyxk_global_desc.GetLength(I2);
-constexpr unsigned B = N * Hi * Wi;
-constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1);
+constexpr index_t B = N * Hi * Wi;
+constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1);
// divide block work by 2d: [K, B]
-constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
-constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock;
+constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock;
+constexpr index_t BBlockWork = (B + BPerBlock - 1) / BPerBlock;
-const unsigned k_block_work_id = get_block_1d_id() / BBlockWork;
-const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;
+const index_t k_block_work_id = get_block_1d_id() / BBlockWork;
+const index_t b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;
-const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
-const unsigned b_block_data_begin = b_block_work_id * BPerBlock;
+const index_t k_block_data_begin = k_block_work_id * KPerBlock;
+const index_t b_block_data_begin = b_block_work_id * BPerBlock;
// flattend (2d) tensor view of gridwise input
constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence<C, B>{});
@@ -192,15 +192,15 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
GemmKPerThreadLoop>{};
// LDS: be careful of alignment
-constexpr unsigned in_block_size =
+constexpr index_t in_block_size =
in_cb_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-constexpr unsigned wei_block_size =
+constexpr index_t wei_block_size =
wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
-constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
-? InBlockCopyDataPerRead
-: WeiBlockCopyDataPerRead;
+constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
+? InBlockCopyDataPerRead
+: WeiBlockCopyDataPerRead;
// LDS
__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
@@ -218,10 +218,10 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
// set threadwise output tensor to 0
threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
-for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
-p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
-p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0),
-__syncthreads())
+for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
+p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
+p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0),
+__syncthreads())
{
// load data
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
@@ -231,18 +231,16 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
// compute on current data
// a series of GEMM
-for(unsigned y = 0; y < Y; ++y)
+for(index_t y = 0; y < Y; ++y)
{
-for(unsigned x = 0; x < X; ++x)
+for(index_t x = 0; x < X; ++x)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 0
blockwise_gemm.Run
-#elif 1
-blockwise_gemm.Run_asm
-#elif 0
-blockwise_gemm.Run_v2
+#elif 0
+blockwise_gemm.Run_asm
+#elif 1
+blockwise_gemm.Run_RegisterDoubleBuffer
#endif
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
@@ -257,23 +255,23 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
const auto c_thread_mtx_begin =
blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
-const unsigned k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row;
-const unsigned b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col;
+const index_t k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row;
+const index_t b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col;
-for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
+for(index_t k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
{
-for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
+for(index_t b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
{
const auto c_thread_mtx_distance =
blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b);
-unsigned k_data = k_thread_data_begin + c_thread_mtx_distance.row;
-unsigned b_data = b_thread_data_begin + c_thread_mtx_distance.col;
+index_t k_data = k_thread_data_begin + c_thread_mtx_distance.row;
+index_t b_data = b_thread_data_begin + c_thread_mtx_distance.col;
-unsigned h_data = b_data / (Wi * N);
-unsigned itmp = b_data - h_data * (Wi * N);
-unsigned w_data = itmp / N;
-unsigned n_data = itmp - w_data * N;
+index_t h_data = b_data / (Wi * N);
+index_t itmp = b_data - h_data * (Wi * N);
+index_t w_data = itmp / N;
+index_t n_data = itmp - w_data * N;
if(n_data < N && h_data < Ho && w_data < Wo)
{
......
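Here B flattens (Hi, Wi, N) with N fastest, and BGhostRead extra elements are read past the tile so every (y, x) tap stays in bounds; the final n/h/w check discards results that fall outside the output. A standalone sketch of the index peel (hypothetical helper):

#include <cstdint>
using index_t = int32_t; // assumption

struct HWN { index_t h, w, n; };

inline HWN split_b(index_t b, index_t Wi, index_t N)
{
    const index_t h = b / (Wi * N);      // slowest: row
    const index_t r = b - h * (Wi * N);
    const index_t w = r / N;             // then column
    return {h, w, r - w * N};            // fastest: batch
}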
@@ -8,32 +8,32 @@
#include "blockwise_gemm.hip.hpp"
// define B = flatten(N, Hi, Wi)
-template <unsigned GridSize,
-unsigned BlockSize,
+template <index_t GridSize,
+index_t BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
-unsigned BPerBlock,
-unsigned KPerBlock,
-unsigned CPerBlock,
-unsigned BPerThread,
-unsigned KPerThread,
-unsigned GemmThreadPerColumnPerCluster,
-unsigned GemmThreadPerRowPerCluster,
-unsigned GemmMPerThreadSubC,
-unsigned GemmNPerThreadSubC,
-unsigned GemmMLevel0Cluster,
-unsigned GemmNLevel0Cluster,
-unsigned GemmMLevel1Cluster,
-unsigned GemmNLevel1Cluster,
-unsigned GemmKPerThreadLoop,
-unsigned InBlockCopyThreadPerDim0,
-unsigned InBlockCopyThreadPerDim1,
-unsigned WeiBlockCopyThreadPerDim0,
-unsigned WeiBlockCopyThreadPerDim1,
-unsigned InBlockCopyDataPerRead,
-unsigned WeiBlockCopyDataPerRead>
+index_t BPerBlock,
+index_t KPerBlock,
+index_t CPerBlock,
+index_t BPerThread,
+index_t KPerThread,
+index_t GemmThreadPerColumnPerCluster,
+index_t GemmThreadPerRowPerCluster,
+index_t GemmMPerThreadSubC,
+index_t GemmNPerThreadSubC,
+index_t GemmMLevel0Cluster,
+index_t GemmNLevel0Cluster,
+index_t GemmMLevel1Cluster,
+index_t GemmNLevel1Cluster,
+index_t GemmKPerThreadLoop,
+index_t InBlockCopyThreadPerDim0,
+index_t InBlockCopyThreadPerDim1,
+index_t WeiBlockCopyThreadPerDim0,
+index_t WeiBlockCopyThreadPerDim1,
+index_t InBlockCopyDataPerRead,
+index_t WeiBlockCopyDataPerRead>
__global__ void
#if 0
__launch_bounds__(256,2)
@@ -52,30 +52,30 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer(
constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
constexpr auto out_khwn_global_desc = OutGlobalDesc{};
-constexpr unsigned C = in_chwn_global_desc.GetLength(I0);
-constexpr unsigned Hi = in_chwn_global_desc.GetLength(I1);
-constexpr unsigned Wi = in_chwn_global_desc.GetLength(I2);
-constexpr unsigned N = in_chwn_global_desc.GetLength(I3);
+constexpr index_t C = in_chwn_global_desc.GetLength(I0);
+constexpr index_t Hi = in_chwn_global_desc.GetLength(I1);
+constexpr index_t Wi = in_chwn_global_desc.GetLength(I2);
+constexpr index_t N = in_chwn_global_desc.GetLength(I3);
-constexpr unsigned K = out_khwn_global_desc.GetLength(I0);
-constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
-constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
+constexpr index_t K = out_khwn_global_desc.GetLength(I0);
+constexpr index_t Ho = out_khwn_global_desc.GetLength(I1);
+constexpr index_t Wo = out_khwn_global_desc.GetLength(I2);
-constexpr unsigned Y = wei_cyxk_global_desc.GetLength(I1);
-constexpr unsigned X = wei_cyxk_global_desc.GetLength(I2);
+constexpr index_t Y = wei_cyxk_global_desc.GetLength(I1);
+constexpr index_t X = wei_cyxk_global_desc.GetLength(I2);
-constexpr unsigned B = N * Hi * Wi;
-constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1);
+constexpr index_t B = N * Hi * Wi;
+constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1);
// divide block work by 2d: [K, B]
-constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
-constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock;
+constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock;
+constexpr index_t BBlockWork = (B + BPerBlock - 1) / BPerBlock;
-const unsigned k_block_work_id = get_block_1d_id() / BBlockWork;
-const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;
+const index_t k_block_work_id = get_block_1d_id() / BBlockWork;
+const index_t b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;
-const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
-const unsigned b_block_data_begin = b_block_work_id * BPerBlock;
+const index_t k_block_data_begin = k_block_work_id * KPerBlock;
+const index_t b_block_data_begin = b_block_work_id * BPerBlock;
// flattend (2d) tensor view of gridwise input
constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence<C, B>{});
@@ -210,15 +210,15 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer(
#endif
// LDS: be careful of alignment
-constexpr unsigned in_block_size =
+constexpr index_t in_block_size =
in_cb_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-constexpr unsigned wei_block_size =
+constexpr index_t wei_block_size =
wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
-constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
-? InBlockCopyDataPerRead
-: WeiBlockCopyDataPerRead;
+constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
+? InBlockCopyDataPerRead
+: WeiBlockCopyDataPerRead;
// LDS double buffer
__shared__ Float p_in_block_0[max_align * ((in_block_size + max_align - 1) / max_align)];
@@ -248,11 +248,11 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer(
bool even_loop = true;
-for(unsigned c_block_data_begin = 0; c_block_data_begin + CPerBlock < C;
+for(index_t c_block_data_begin = 0; c_block_data_begin + CPerBlock < C;
c_block_data_begin += CPerBlock,
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0),
even_loop = !even_loop)
{
Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1;
Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1;
@@ -279,12 +279,12 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer(
// compute on current data
// a series of GEMM
-for(unsigned y = 0; y < Y; ++y)
+for(index_t y = 0; y < Y; ++y)
{
-for(unsigned x = 0; x < X; ++x)
+for(index_t x = 0; x < X; ++x)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
-#if 0
+#if 1
blockwise_gemm.Run
#else
blockwise_gemm.Run_RegisterDoubleBuffer
@@ -309,12 +309,12 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer(
__syncthreads();
-for(unsigned y = 0; y < Y; ++y)
+for(index_t y = 0; y < Y; ++y)
{
-for(unsigned x = 0; x < X; ++x)
+for(index_t x = 0; x < X; ++x)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
-#if 0
+#if 1
blockwise_gemm.Run
#else
blockwise_gemm.Run_RegisterDoubleBuffer
@@ -331,8 +331,8 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer(
const auto c_thread_mtx_begin =
blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
-const unsigned k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row;
-const unsigned b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col;
+const index_t k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row;
+const index_t b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col;
#if 0
if(get_block_1d_id() == 0)
@@ -348,20 +348,20 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer(
}
#endif
-for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
+for(index_t k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
{
-for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
+for(index_t b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
{
const auto c_thread_mtx_distance =
blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b);
-unsigned k_data = k_thread_data_begin + c_thread_mtx_distance.row;
-unsigned b_data = b_thread_data_begin + c_thread_mtx_distance.col;
+index_t k_data = k_thread_data_begin + c_thread_mtx_distance.row;
+index_t b_data = b_thread_data_begin + c_thread_mtx_distance.col;
-unsigned h_data = b_data / (Wi * N);
-unsigned itmp = b_data - h_data * (Wi * N);
-unsigned w_data = itmp / N;
-unsigned n_data = itmp - w_data * N;
+index_t h_data = b_data / (Wi * N);
+index_t itmp = b_data - h_data * (Wi * N);
+index_t w_data = itmp / N;
+index_t n_data = itmp - w_data * N;
if(n_data < N && h_data < Ho && w_data < Wo)
{
......
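For reference, the write-back above decomposes the flattened output column index b into (h, w, n) with strides (Wi*N, N, 1), using a subtraction in place of each modulo. Worked through with hypothetical sizes:

// decompose b laid out as (h, w, n) with strides (Wi*N, N, 1); sizes hypothetical
constexpr int N_ = 8, Wi_ = 16, b_data_ = 1234;
constexpr int h_data_ = b_data_ / (Wi_ * N_);            // 1234 / 128 = 9
constexpr int itmp_   = b_data_ - h_data_ * (Wi_ * N_);  // 1234 - 1152 = 82
constexpr int w_data_ = itmp_ / N_;                      // 82 / 8  = 10
constexpr int n_data_ = itmp_ - w_data_ * N_;            // 82 - 80 = 2
static_assert(h_data_ * (Wi_ * N_) + w_data_ * N_ + n_data_ == b_data_,
              "the decomposition round-trips");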
......@@ -16,11 +16,11 @@ __device__ void threadwise_2d_tensor_pointwise_operation_unary(Desc, Float* __re
}
#endif
for(unsigned did0 = 0; did0 < desc.GetLength(I0); ++did0)
for(index_t did0 = 0; did0 < desc.GetLength(I0); ++did0)
{
for(unsigned did1 = 0; did1 < desc.GetLength(I1); ++did1)
for(index_t did1 = 0; did1 < desc.GetLength(I1); ++did1)
{
const unsigned dindex = desc.Get1dIndex(did0, did1);
const index_t dindex = desc.Get1dIndex(did0, did1);
f(p[dindex]);
}
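This helper just applies an elementwise functor over a 2d thread tile (for example, zero-initialization). A self-contained analogue with a plain row-major layout standing in for the descriptor:

#include <cstdio>

int main()
{
    constexpr int L0 = 2, L1 = 3; // tile lengths, hypothetical
    float p[L0 * L1] = {1, 2, 3, 4, 5, 6};
    auto f = [](float& v) { v = 0; }; // e.g. clear an accumulator tile

    for(int did0 = 0; did0 < L0; ++did0)
        for(int did1 = 0; did1 < L1; ++did1)
            f(p[did0 * L1 + did1]); // row-major Get1dIndex

    printf("%g\n", p[5]); // prints 0
}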
......@@ -47,22 +47,22 @@ __device__ void threadwise_2d_tensor_pointwise_operation_binary_reorder_by_get_d
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr unsigned IR0 = DstFromSrcReorder{}.Get(I0);
constexpr unsigned IR1 = DstFromSrcReorder{}.Get(I1);
constexpr index_t IR0 = DstFromSrcReorder{}.Get(I0);
constexpr index_t IR1 = DstFromSrcReorder{}.Get(I1);
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
for(unsigned did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
{
for(unsigned did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
{
const unsigned aindex = src_desc.Get1dIndex(did0, did1);
const index_t aindex = src_desc.Get1dIndex(did0, did1);
const unsigned did[2] = {did0, did1};
const index_t did[2] = {did0, did1};
const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
f(p_src[aindex], p_dst[bindex]);
}
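DstFromSrcReorder maps each dst dimension to the src dimension it is read from, so iterating in src order while indexing dst through did[IR0], did[IR1] performs the permutation. A self-contained analogue where the reorder {1, 0} transposes a 2x3 tile into a 3x2 one:

#include <cstdio>

int main()
{
    constexpr int L0 = 2, L1 = 3;
    constexpr int IR0 = 1, IR1 = 0;             // DstFromSrcReorder = {1, 0}
    float src[L0 * L1] = {1, 2, 3, 4, 5, 6};    // row-major 2x3
    float dst[L1 * L0] = {};                    // row-major 3x2

    for(int did0 = 0; did0 < L0; ++did0)
        for(int did1 = 0; did1 < L1; ++did1)
        {
            const int did[2] = {did0, did1};
            const int aindex = did0 * L1 + did1;           // src Get1dIndex
            const int bindex = did[IR0] * L0 + did[IR1];   // dst Get1dIndex
            dst[bindex] = src[aindex];                     // f = copy
        }

    printf("%g %g\n", dst[1], dst[2]);          // prints 4 2 (transposed)
}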
......@@ -118,21 +118,21 @@ __device__ void threadwise_2d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
}
#endif
constexpr unsigned nshift = NShift::mValue;
constexpr index_t nshift = NShift::mValue;
constexpr unsigned did0_end =
constexpr index_t did0_end =
is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0);
constexpr unsigned did1_end =
constexpr index_t did1_end =
is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1);
for(unsigned did0 = 0; did0 < did0_end; ++did0)
for(index_t did0 = 0; did0 < did0_end; ++did0)
{
for(unsigned did1 = 0; did1 < did1_end; ++did1)
for(index_t did1 = 0; did1 < did1_end; ++did1)
{
const unsigned dindex = desc.Get1dIndex(did0, did1);
const index_t dindex = desc.Get1dIndex(did0, did1);
const unsigned sindex = dindex + nshift * desc.GetStride(IDim{});
const index_t sindex = dindex + nshift * desc.GetStride(IDim{});
p[dindex] = p[sindex];
}
......
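shift_down slides the tile contents toward lower indices along one dimension by NShift, which is how the input window is advanced without re-reading everything; the loop bound along the shifted dimension shrinks by nshift so no read goes out of bounds. A self-contained analogue shifting a 2x4 tile by 1 along its inner dimension:

#include <cstdio>

int main()
{
    // after the shift, element (i, j) holds the old (i, j + 1)
    constexpr int L0 = 2, L1 = 4, nshift = 1;
    float p[L0 * L1] = {0, 1, 2, 3, 4, 5, 6, 7};

    for(int did0 = 0; did0 < L0; ++did0)
        for(int did1 = 0; did1 < L1 - nshift; ++did1)   // did1_end shrinks by nshift
        {
            const int dindex = did0 * L1 + did1;
            const int sindex = dindex + nshift * 1;     // stride of dim 1 is 1
            p[dindex] = p[sindex];
        }

    printf("%g %g\n", p[0], p[1]);  // prints 1 2 (tail elements stay stale)
}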
......@@ -18,15 +18,15 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re
}
#endif
for(unsigned did0 = 0; did0 < desc.GetLength(I0); ++did0)
for(index_t did0 = 0; did0 < desc.GetLength(I0); ++did0)
{
for(unsigned did1 = 0; did1 < desc.GetLength(I1); ++did1)
for(index_t did1 = 0; did1 < desc.GetLength(I1); ++did1)
{
for(unsigned did2 = 0; did2 < desc.GetLength(I2); ++did2)
for(index_t did2 = 0; did2 < desc.GetLength(I2); ++did2)
{
for(unsigned did3 = 0; did3 < desc.GetLength(I3); ++did3)
for(index_t did3 = 0; did3 < desc.GetLength(I3); ++did3)
{
const unsigned dindex = desc.Get1dIndex(did0, did1, did2, did3);
const index_t dindex = desc.Get1dIndex(did0, did1, did2, did3);
f(p[dindex]);
}
......@@ -58,28 +58,28 @@ __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_d
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr unsigned IR0 = DstFromSrcReorder{}.Get(I0);
constexpr unsigned IR1 = DstFromSrcReorder{}.Get(I1);
constexpr unsigned IR2 = DstFromSrcReorder{}.Get(I2);
constexpr unsigned IR3 = DstFromSrcReorder{}.Get(I3);
constexpr index_t IR0 = DstFromSrcReorder{}.Get(I0);
constexpr index_t IR1 = DstFromSrcReorder{}.Get(I1);
constexpr index_t IR2 = DstFromSrcReorder{}.Get(I2);
constexpr index_t IR3 = DstFromSrcReorder{}.Get(I3);
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
for(unsigned did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
{
for(unsigned did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
{
for(unsigned did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
{
for(unsigned did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
{
const unsigned aindex = src_desc.Get1dIndex(did0, did1, did2, did3);
const index_t aindex = src_desc.Get1dIndex(did0, did1, did2, did3);
const unsigned did[4] = {did0, did1, did2, did3};
const index_t did[4] = {did0, did1, did2, did3};
const unsigned bindex =
const index_t bindex =
dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
f(p_src[aindex], p_dst[bindex]);
......@@ -129,7 +129,7 @@ __device__ void threadwise_4d_tensor_copy(
}
// need to assume src and dst are aligned
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, unsigned DataPerRead>
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
__device__ void threadwise_4d_tensor_copy_v2(SrcDesc,
const Float* __restrict__ p_src,
DstDesc,
......@@ -163,24 +163,24 @@ __device__ void threadwise_4d_tensor_copy_v2(SrcDesc,
DstDesc{}.GetStride(I2) % DataPerRead == 0,
"wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
constexpr unsigned L3 = SrcOpLengths{}.Get(I3);
constexpr index_t L3 = SrcOpLengths{}.Get(I3);
static_assert(L3 % DataPerRead == 0, "wrong! L3 should be evenly divisible by DataPerRead");
constexpr unsigned nloop_d3 = L3 / DataPerRead;
constexpr index_t nloop_d3 = L3 / DataPerRead;
for(unsigned did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
{
for(unsigned did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
{
for(unsigned did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
{
for(unsigned iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
{
const unsigned src_index =
const index_t src_index =
src_desc.Get1dIndex(did0, did1, did2, iloop_d3 * DataPerRead);
const unsigned dst_index =
const index_t dst_index =
dst_desc.Get1dIndex(did0, did1, did2, iloop_d3 * DataPerRead);
if(DataPerRead == 1)
......@@ -224,31 +224,31 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
}
#endif
constexpr unsigned nshift = NShift::mValue;
constexpr index_t nshift = NShift::mValue;
constexpr unsigned did0_end =
constexpr index_t did0_end =
is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0);
constexpr unsigned did1_end =
constexpr index_t did1_end =
is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1);
constexpr unsigned did2_end =
constexpr index_t did2_end =
is_same<decltype(I2), IDim>::value ? desc.GetLength(I2) - nshift : desc.GetLength(I2);
constexpr unsigned did3_end =
constexpr index_t did3_end =
is_same<decltype(I3), IDim>::value ? desc.GetLength(I3) - nshift : desc.GetLength(I3);
for(unsigned did0 = 0; did0 < did0_end; ++did0)
for(index_t did0 = 0; did0 < did0_end; ++did0)
{
for(unsigned did1 = 0; did1 < did1_end; ++did1)
for(index_t did1 = 0; did1 < did1_end; ++did1)
{
for(unsigned did2 = 0; did2 < did2_end; ++did2)
for(index_t did2 = 0; did2 < did2_end; ++did2)
{
for(unsigned did3 = 0; did3 < did3_end; ++did3)
for(index_t did3 = 0; did3 < did3_end; ++did3)
{
const unsigned dindex = desc.Get1dIndex(did0, did1, did2, did3);
const index_t dindex = desc.Get1dIndex(did0, did1, did2, did3);
const unsigned sindex = dindex + nshift * desc.GetStride(IDim{});
const index_t sindex = dindex + nshift * desc.GetStride(IDim{});
p[dindex] = p[sindex];
}
......
......@@ -28,28 +28,28 @@ __device__ void threadwise_direct_convolution_1(InDesc,
}
#endif
for(unsigned n = 0; n < out_desc.GetLength(I0); ++n)
for(index_t n = 0; n < out_desc.GetLength(I0); ++n)
{
for(unsigned k = 0; k < out_desc.GetLength(I1); ++k)
for(index_t k = 0; k < out_desc.GetLength(I1); ++k)
{
for(unsigned ho = 0; ho < out_desc.GetLength(I2); ++ho)
for(index_t ho = 0; ho < out_desc.GetLength(I2); ++ho)
{
for(unsigned wo = 0; wo < out_desc.GetLength(I3); ++wo)
for(index_t wo = 0; wo < out_desc.GetLength(I3); ++wo)
{
for(unsigned c = 0; c < wei_desc.GetLength(I1); ++c)
for(index_t c = 0; c < wei_desc.GetLength(I1); ++c)
{
for(unsigned y = 0; y < wei_desc.GetLength(I2); ++y)
for(index_t y = 0; y < wei_desc.GetLength(I2); ++y)
{
for(unsigned x = 0; x < wei_desc.GetLength(I3); ++x)
for(index_t x = 0; x < wei_desc.GetLength(I3); ++x)
{
const unsigned hi = ho + y;
const unsigned wi = wo + x;
const index_t hi = ho + y;
const index_t wi = wo + x;
const unsigned in_index = in_desc.Get1dIndex(n, c, hi, wi);
const index_t in_index = in_desc.Get1dIndex(n, c, hi, wi);
const unsigned wei_index = wei_desc.Get1dIndex(k, c, y, x);
const index_t wei_index = wei_desc.Get1dIndex(k, c, y, x);
const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo);
const index_t out_index = out_desc.Get1dIndex(n, k, ho, wo);
fused_multiply_accumulate(
p_out[out_index], p_wei[wei_index], p_in[in_index]);
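With unit stride and no padding, this loop nest computes out(n, k, ho, wo) += sum over (c, y, x) of wei(k, c, y, x) * in(n, c, ho + y, wo + x). A minimal host-side check of that index math at a single output point (hypothetical 3x3 input, 2x2 filter):

#include <cstdio>

int main()
{
    constexpr int C = 1, Y = 2, X = 2;
    float in[C][3][3]  = {{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}};
    float wei[C][Y][X] = {{{1, 0}, {0, 1}}};
    float acc = 0;
    const int ho = 1, wo = 0;                     // one output position
    for(int c = 0; c < C; ++c)
        for(int y = 0; y < Y; ++y)
            for(int x = 0; x < X; ++x)
                acc += wei[c][y][x] * in[c][ho + y][wo + x];
    printf("%g\n", acc);                          // 4*1 + 8*1 = 12
}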
......@@ -125,7 +125,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
Data p_in_reg[in_reg_desc.GetElementSpace()];
Data p_wei_reg[wei_reg_desc.GetElementSpace()];
constexpr unsigned in_w_new_read = 1;
constexpr index_t in_w_new_read = 1;
constexpr auto in_desc_reg_new_read =
make_ConstantTensorDescriptor(Sequence<in_reg_desc.GetLength(I0),
......@@ -136,7 +136,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
#if 0
// this version reuses old input data in registers, and reads new data from LDS
// loop over vertical direction
for(unsigned y = 0; y < wei_desc.GetLength(I2); ++y)
for(index_t y = 0; y < wei_desc.GetLength(I2); ++y)
{
// read first input
threadwise_4d_tensor_copy(in_desc,
......@@ -157,7 +157,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
// loop over horizontal direction
for(unsigned x = 1; x < wei_desc.GetLength(I3); ++x)
for(index_t x = 1; x < wei_desc.GetLength(I3); ++x)
{
// read new weight
threadwise_4d_tensor_copy(wei_desc,
......@@ -186,10 +186,10 @@ __device__ void threadwise_direct_convolution_3(InDesc,
#elif 1
// this version reads all input from LDS when the filter moves
// loop over vertical direction
for(unsigned y = 0; y < wei_desc.GetLength(I2); ++y)
for(index_t y = 0; y < wei_desc.GetLength(I2); ++y)
{
// loop over horizontal direction
for(unsigned x = 0; x < wei_desc.GetLength(I3); ++x)
for(index_t x = 0; x < wei_desc.GetLength(I3); ++x)
{
// read new weight
threadwise_4d_tensor_copy(wei_desc,
......
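Of the two variants above, the disabled one keeps the current input window in registers and fetches only the newly exposed column from LDS when the filter slides; the enabled one re-reads the whole window for every (y, x). A self-contained sketch of the register-reuse idea (widths hypothetical):

#include <cstdio>

int main()
{
    constexpr int W = 6, X = 3;
    float lds_row[W] = {0, 1, 2, 3, 4, 5};   // one input row staged in LDS
    float reg[X];                            // register window of width X

    for(int i = 0; i < X; ++i)               // initial fill: X reads
        reg[i] = lds_row[i];

    for(int x = 1; x + X <= W; ++x)
    {
        for(int i = 0; i < X - 1; ++i)       // slide window left in registers
            reg[i] = reg[i + 1];
        reg[X - 1] = lds_row[x + X - 1];     // read only the new column
        printf("%g %g %g\n", reg[0], reg[1], reg[2]);
    }
}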
#pragma once
template <class Float, class SrcMatrix, class DstMatrix, unsigned NRow, unsigned NCol>
template <class Float, class SrcMatrix, class DstMatrix, index_t NRow, index_t NCol>
__device__ void threadwise_matrix_copy(SrcMatrix,
const Float* __restrict__ p_src,
DstMatrix,
......@@ -10,16 +10,39 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
constexpr auto src_mtx = SrcMatrix{};
constexpr auto dst_mtx = DstMatrix{};
for(unsigned i = 0; i < NRow; ++i)
#if 0
for(index_t i = 0; i < NRow; ++i)
{
for(unsigned j = 0; j < NCol; ++j)
for(index_t j = 0; j < NCol; ++j)
{
const unsigned src_index = src_mtx.Get1dIndex(i, j);
const unsigned dst_index = dst_mtx.Get1dIndex(i, j);
const index_t src_index = src_mtx.Get1dIndex(i, j);
const index_t dst_index = dst_mtx.Get1dIndex(i, j);
p_dst[dst_index] = p_src[src_index];
}
}
#elif 1
static_assert(NCol == 4, "only for NCol == 4");
using vector_t = typename vector_type<Float, 4>::MemoryType;
for(index_t i = 0; i < NRow; ++i)
{
const index_t src_index = src_mtx.Get1dIndex(i, 0);
const index_t dst_index = dst_mtx.Get1dIndex(i, 0);
#if 1
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
*(reinterpret_cast<const vector_t*>(p_src + src_index));
#elif 1
asm volatile("\n \
ds_read_b128 %0, %1, offset:0 \n \
"
: "=v"(*(reinterpret_cast<vector_t*>(p_dst+dst_index)))
: "v"((uint32_t)(p_src + src_index)));
#endif
}
#endif
}
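The new fast path copies each row as one 128-bit transaction by reinterpreting four consecutive floats as a vector type, which is why NCol is pinned to 4 and why the LDS buffers are padded to max_align. A self-contained analogue of the cast pattern (float4_t stands in for vector_type<Float, 4>::MemoryType):

#include <cstdio>

struct float4_t { float x, y, z, w; };   // stand-in for the 128-bit vector type

int main()
{
    // alignas(16) mirrors the alignment the kernel gets from max_align padding
    alignas(16) float src[2][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}};
    alignas(16) float dst[2][4] = {};

    for(int i = 0; i < 2; ++i)           // one 128-bit load/store per row
        *reinterpret_cast<float4_t*>(dst[i]) =
            *reinterpret_cast<const float4_t*>(src[i]);

    printf("%g %g\n", dst[1][0], dst[1][3]);  // prints 5 8
}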
template <class MatrixA,
......@@ -49,21 +72,31 @@ __device__ void threadwise_gemm(MatrixA,
constexpr auto b_mtx = MatrixB{};
constexpr auto c_mtx = MatrixC{};
constexpr unsigned M = c_mtx.NRow();
constexpr unsigned N = c_mtx.NCol();
constexpr unsigned K = a_mtx.NRow(); // A is transposed
constexpr index_t M = c_mtx.NRow();
constexpr index_t N = c_mtx.NCol();
constexpr index_t K = a_mtx.NRow(); // A is transposed
for(unsigned k = 0; k < K; ++k)
for(index_t k = 0; k < K; ++k)
{
for(unsigned i = 0; i < M; ++i)
for(index_t i = 0; i < M; ++i)
{
for(unsigned j = 0; j < N; ++j)
for(index_t j = 0; j < N; ++j)
{
const unsigned aindex = a_mtx.Get1dIndex(k, i); // A is transposed
const unsigned bindex = b_mtx.Get1dIndex(k, j);
const unsigned cindex = c_mtx.Get1dIndex(i, j);
const index_t aindex = a_mtx.Get1dIndex(k, i); // A is transposed
const index_t bindex = b_mtx.Get1dIndex(k, j);
const index_t cindex = c_mtx.Get1dIndex(i, j);
#if 0
f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
#elif 1
asm volatile("\n \
v_mac_f32 %0, %1, %2 \n \
"
: "=v"(p_c_thread[cindex])
: "v"(p_a_thread[aindex]),
"v"(p_b_thread[bindex]),
"0"(p_c_thread[cindex]));
#endif
}
}
}
......
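The inline asm pins the inner update to a single v_mac_f32 (c = a * b + c); the "0" constraint ties the accumulator operand to the destination register so the multiply-accumulate happens in place. Its plain C++ equivalent, with A stored transposed as in the kernel:

#include <cstdio>

int main()
{
    constexpr int M = 2, N = 2, K = 2;
    float a[K][M] = {{1, 2}, {3, 4}};   // A transposed: a[k][i]
    float b[K][N] = {{5, 6}, {7, 8}};
    float c[M][N] = {};

    for(int k = 0; k < K; ++k)
        for(int i = 0; i < M; ++i)
            for(int j = 0; j < N; ++j)
                c[i][j] += a[k][i] * b[k][j];   // the mac: c = a*b + c

    printf("%g\n", c[0][0]);   // 1*5 + 3*7 = 26
}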
......@@ -2,7 +2,7 @@
#include "ConstantTensorDescriptor.hip.hpp"
// need to assume src and dst are aligned
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, unsigned DataPerRead>
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
__device__ void threadwise_6d_tensor_copy(SrcDesc,
const Float* __restrict__ p_src,
DstDesc,
......@@ -37,28 +37,28 @@ __device__ void threadwise_6d_tensor_copy(SrcDesc,
DstDesc{}.GetStride(I4) % DataPerRead == 0,
"wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
constexpr unsigned L5 = SrcOpLengths{}.Get(I5);
constexpr index_t L5 = SrcOpLengths{}.Get(I5);
static_assert(L5 % DataPerRead == 0, "wrong! L5 should be evenly divisible by DataPerRead");
constexpr unsigned nloop_d5 = L5 / DataPerRead;
constexpr index_t nloop_d5 = L5 / DataPerRead;
for(unsigned did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
{
for(unsigned did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
{
for(unsigned did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
{
for(unsigned did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
{
for(unsigned did4 = 0; did4 < ref_desc.GetLength(I4); ++did4)
for(index_t did4 = 0; did4 < ref_desc.GetLength(I4); ++did4)
{
for(unsigned iloop_d5 = 0; iloop_d5 < nloop_d5; ++iloop_d5)
for(index_t iloop_d5 = 0; iloop_d5 < nloop_d5; ++iloop_d5)
{
const unsigned src_index = src_desc.Get1dIndex(
const index_t src_index = src_desc.Get1dIndex(
did0, did1, did2, did3, did4, iloop_d5 * DataPerRead);
const unsigned dst_index = dst_desc.Get1dIndex(
const index_t dst_index = dst_desc.Get1dIndex(
did0, did1, did2, did3, did4, iloop_d5 * DataPerRead);
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
......@@ -72,7 +72,7 @@ __device__ void threadwise_6d_tensor_copy(SrcDesc,
}
// need to assume src and dst are aligned
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, unsigned DataPerRead>
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
__device__ void threadwise_8d_tensor_copy(SrcDesc,
const Float* __restrict__ p_src,
DstDesc,
......@@ -109,29 +109,29 @@ __device__ void threadwise_8d_tensor_copy(SrcDesc,
DstDesc{}.GetStride(I6) % DataPerRead == 0,
"wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
constexpr unsigned L7 = SrcOpLengths{}.Get(I7);
constexpr index_t L7 = SrcOpLengths{}.Get(I7);
static_assert(L7 % DataPerRead == 0, "wrong! L7 should be evenly divisible by DataPerRead");
constexpr unsigned nloop_d7 = L7 / DataPerRead;
constexpr index_t nloop_d7 = L7 / DataPerRead;
for(unsigned did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
{
for(unsigned did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
{
for(unsigned did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
{
for(unsigned did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
{
for(unsigned did4 = 0; did4 < ref_desc.GetLength(I4); ++did4)
for(index_t did4 = 0; did4 < ref_desc.GetLength(I4); ++did4)
{
for(unsigned did5 = 0; did5 < ref_desc.GetLength(I5); ++did5)
for(index_t did5 = 0; did5 < ref_desc.GetLength(I5); ++did5)
{
for(unsigned did6 = 0; did6 < ref_desc.GetLength(I6); ++did6)
for(index_t did6 = 0; did6 < ref_desc.GetLength(I6); ++did6)
{
for(unsigned iloop_d7 = 0; iloop_d7 < nloop_d7; ++iloop_d7)
for(index_t iloop_d7 = 0; iloop_d7 < nloop_d7; ++iloop_d7)
{
const unsigned src_index =
const index_t src_index =
src_desc.Get1dIndex(did0,
did1,
did2,
......@@ -141,7 +141,7 @@ __device__ void threadwise_8d_tensor_copy(SrcDesc,
did6,
iloop_d7 * DataPerRead);
const unsigned dst_index =
const index_t dst_index =
dst_desc.Get1dIndex(did0,
did1,
did2,
......
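All of these copies (4d, 6d, 8d) vectorize only the innermost dimension, so the static_asserts uniformly require the innermost length and every participating stride to be multiples of DataPerRead. The contract in isolation (values hypothetical):

// the alignment contract shared by the 4d/6d/8d copies:
// the innermost length and all strides must be multiples of DataPerRead
constexpr int DataPerRead = 4;
constexpr int L_inner = 12;                  // innermost dimension length
constexpr int stride  = 24;                  // any non-innermost stride
static_assert(L_inner % DataPerRead == 0, "inner length must divide evenly");
static_assert(stride % DataPerRead == 0, "strides must keep vector alignment");
constexpr int nloop = L_inner / DataPerRead; // 3 vector transactions per row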