Commit 766b0a9e authored by Chao Liu's avatar Chao Liu
Browse files

experimenting

parent f35c64eb
#pragma once
#include "constant_integral.hip.hpp"
template <unsigned NLoop>
template <index_t NLoop>
struct static_loop_n
{
template <class F>
......@@ -24,7 +24,7 @@ struct static_loop_n<1>
}
};
template <unsigned NLoop>
template <index_t NLoop>
struct static_const_reduce_n
{
template <class F, class Reduce>
......
......@@ -8,18 +8,18 @@ template <class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned HoPerBlock,
unsigned WoPerBlock,
unsigned NPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned HoPerThread,
unsigned WoPerThread,
unsigned BlockSize,
unsigned GridSize>
index_t NPerBlock,
index_t KPerBlock,
index_t CPerBlock,
index_t HoPerBlock,
index_t WoPerBlock,
index_t NPerThread,
index_t KPerThread,
index_t CPerThread,
index_t HoPerThread,
index_t WoPerThread,
index_t BlockSize,
index_t GridSize>
__global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global)
......@@ -33,16 +33,16 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
constexpr auto wei_global_desc = WeiGlobalDesc{};
constexpr auto out_global_desc = OutGlobalDesc{};
constexpr unsigned Y = wei_global_desc.GetLength(I2);
constexpr unsigned X = wei_global_desc.GetLength(I3);
constexpr index_t Y = wei_global_desc.GetLength(I2);
constexpr index_t X = wei_global_desc.GetLength(I3);
constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
constexpr index_t WiPerBlock = WoPerBlock + X - 1;
constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
constexpr index_t NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr index_t KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr index_t HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
constexpr index_t WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
constexpr auto in_block_global_desc = make_ConstantTensorDescriptor(
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, in_global_desc.GetStrides());
......@@ -59,31 +59,31 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
constexpr auto out_block_desc =
make_ConstantTensorDescriptor(out_block_global_desc.GetLengths());
constexpr unsigned in_block_size = in_block_desc.GetElementSpace();
constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
constexpr unsigned out_block_size = out_block_desc.GetElementSpace();
constexpr index_t in_block_size = in_block_desc.GetElementSpace();
constexpr index_t wei_block_size = wei_block_desc.GetElementSpace();
constexpr index_t out_block_size = out_block_desc.GetElementSpace();
__shared__ Float p_in_block[in_block_size];
__shared__ Float p_wei_block[wei_block_size];
__shared__ Float p_out_block[out_block_size];
const unsigned block_id = blockIdx.x;
const index_t block_id = blockIdx.x;
unsigned itmp = block_id;
unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
index_t itmp = block_id;
index_t n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
index_t k_block_work_id = itmp / (HBlockWork * WBlockWork);
itmp -= k_block_work_id * (HBlockWork * WBlockWork);
unsigned h_block_work_id = itmp / WBlockWork;
unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
index_t h_block_work_id = itmp / WBlockWork;
index_t w_block_work_id = itmp - h_block_work_id * WBlockWork;
unsigned n_block_work_begin = n_block_work_id * NPerBlock;
unsigned k_block_work_begin = k_block_work_id * KPerBlock;
unsigned ho_block_work_begin = h_block_work_id * HoPerBlock;
unsigned wo_block_work_begin = w_block_work_id * WoPerBlock;
index_t n_block_work_begin = n_block_work_id * NPerBlock;
index_t k_block_work_begin = k_block_work_id * KPerBlock;
index_t ho_block_work_begin = h_block_work_id * HoPerBlock;
index_t wo_block_work_begin = w_block_work_id * WoPerBlock;
unsigned hi_block_work_begin = ho_block_work_begin; // minus padding
unsigned wi_block_work_begin = wo_block_work_begin; // minus padding
index_t hi_block_work_begin = ho_block_work_begin; // minus padding
index_t wi_block_work_begin = wo_block_work_begin; // minus padding
constexpr auto blockwise_in_copy =
Blockwise4dTensorCopy1<BlockSize,
......@@ -109,7 +109,7 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
// set output tensor in LDS to 0
blockwise_4d_tensor_set_zero<BlockSize>(out_block_desc, p_out_block);
for(unsigned c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1);
for(index_t c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1);
c_block_work_begin += CPerBlock)
{
// copy input tensor to LDS
......
......@@ -11,20 +11,20 @@ template <class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned HoPerBlock,
unsigned WoPerBlock,
unsigned NPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned HoPerThread,
unsigned WoPerThread,
unsigned InBlockCopyDataPerRead,
unsigned WeiBlockCopyDataPerRead,
unsigned BlockSize,
unsigned GridSize>
index_t NPerBlock,
index_t KPerBlock,
index_t CPerBlock,
index_t HoPerBlock,
index_t WoPerBlock,
index_t NPerThread,
index_t KPerThread,
index_t CPerThread,
index_t HoPerThread,
index_t WoPerThread,
index_t InBlockCopyDataPerRead,
index_t WeiBlockCopyDataPerRead,
index_t BlockSize,
index_t GridSize>
__global__ void
gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
......@@ -39,17 +39,17 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{};
constexpr auto out_nkhw_global_desc = OutGlobalDesc{};
constexpr unsigned N = in_nchw_global_desc.GetLength(I0);
constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0);
constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1);
constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2);
constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3);
constexpr index_t N = in_nchw_global_desc.GetLength(I0);
constexpr index_t K = wei_kcyx_global_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_global_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_global_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_global_desc.GetLength(I3);
constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor(
Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy
constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
constexpr index_t WiPerBlock = WoPerBlock + X - 1;
constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});
......@@ -63,12 +63,12 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});
// shared mem
constexpr unsigned in_block_size =
constexpr index_t in_block_size =
in_nchw_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
constexpr unsigned wei_block_size =
constexpr index_t wei_block_size =
wei_kcyx_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead;
......@@ -76,8 +76,8 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
// threadwise tensors
constexpr unsigned HiPerThread = HoPerThread + Y - 1;
constexpr unsigned WiPerThread = WoPerThread + X - 1;
constexpr index_t HiPerThread = HoPerThread + Y - 1;
constexpr index_t WiPerThread = WoPerThread + X - 1;
constexpr auto in_nchw_thread_block_desc =
make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
......@@ -93,56 +93,54 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
// divide block work
constexpr unsigned NBlockWork =
(out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork =
(out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned HBlockWork =
constexpr index_t NBlockWork = (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr index_t KBlockWork = (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr index_t HBlockWork =
(out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned WBlockWork =
constexpr index_t WBlockWork =
(out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
const unsigned block_id = blockIdx.x;
const index_t block_id = blockIdx.x;
unsigned itmp = block_id;
const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
index_t itmp = block_id;
const index_t n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
const index_t k_block_work_id = itmp / (HBlockWork * WBlockWork);
itmp -= k_block_work_id * (HBlockWork * WBlockWork);
const unsigned h_block_work_id = itmp / WBlockWork;
const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
const index_t h_block_work_id = itmp / WBlockWork;
const index_t w_block_work_id = itmp - h_block_work_id * WBlockWork;
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
const index_t n_block_data_begin = n_block_work_id * NPerBlock;
const index_t k_block_data_begin = k_block_work_id * KPerBlock;
const index_t ho_block_data_begin = h_block_work_id * HoPerBlock;
const index_t wo_block_data_begin = w_block_work_id * WoPerBlock;
const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
const index_t hi_block_data_begin = ho_block_data_begin; // minus padding
const index_t wi_block_data_begin = wo_block_data_begin; // minus padding
// divide thread work
constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
constexpr index_t NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
constexpr index_t KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
constexpr index_t HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
constexpr index_t WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
const unsigned thread_id = threadIdx.x;
const index_t thread_id = threadIdx.x;
itmp = thread_id;
const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
const index_t n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork);
const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork);
const index_t k_thread_work_id = itmp / (HThreadWork * WThreadWork);
itmp -= k_thread_work_id * (HThreadWork * WThreadWork);
const unsigned h_thread_work_id = itmp / WThreadWork;
const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
const index_t h_thread_work_id = itmp / WThreadWork;
const index_t w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
const unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
const unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread;
const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread;
const index_t n_thread_data_begin = n_thread_work_id * NPerThread;
const index_t k_thread_data_begin = k_thread_work_id * KPerThread;
const index_t ho_thread_data_begin = h_thread_work_id * HoPerThread;
const index_t wo_thread_data_begin = w_thread_work_id * WoPerThread;
const unsigned hi_thread_data_begin = ho_thread_data_begin;
const unsigned wi_thread_data_begin = wo_thread_data_begin;
const index_t hi_thread_data_begin = ho_thread_data_begin;
const index_t wi_thread_data_begin = wo_thread_data_begin;
constexpr auto blockwise_in_copy =
Blockwise4dTensorCopy1<BlockSize,
......@@ -172,7 +170,7 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);
for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
for(index_t c_block_data_begin = 0; c_block_data_begin < C;
c_block_data_begin += CPerBlock, __syncthreads())
{
// copy input tensor to LDS
......@@ -191,7 +189,7 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
__syncthreads();
for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
for(index_t c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
{
// threadwise convolution
#if 1
......
......@@ -16,11 +16,11 @@ __device__ void threadwise_2d_tensor_pointwise_operation_unary(Desc, Float* __re
}
#endif
for(unsigned did0 = 0; did0 < desc.GetLength(I0); ++did0)
for(index_t did0 = 0; did0 < desc.GetLength(I0); ++did0)
{
for(unsigned did1 = 0; did1 < desc.GetLength(I1); ++did1)
for(index_t did1 = 0; did1 < desc.GetLength(I1); ++did1)
{
const unsigned dindex = desc.Get1dIndex(did0, did1);
const index_t dindex = desc.Get1dIndex(did0, did1);
f(p[dindex]);
}
......@@ -47,22 +47,22 @@ __device__ void threadwise_2d_tensor_pointwise_operation_binary_reorder_by_get_d
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr unsigned IR0 = DstFromSrcReorder{}.Get(I0);
constexpr unsigned IR1 = DstFromSrcReorder{}.Get(I1);
constexpr index_t IR0 = DstFromSrcReorder{}.Get(I0);
constexpr index_t IR1 = DstFromSrcReorder{}.Get(I1);
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
for(unsigned did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
{
for(unsigned did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
{
const unsigned aindex = src_desc.Get1dIndex(did0, did1);
const index_t aindex = src_desc.Get1dIndex(did0, did1);
const unsigned did[2] = {did0, did1};
const index_t did[2] = {did0, did1};
const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
f(p_src[aindex], p_dst[bindex]);
}
......@@ -118,21 +118,21 @@ __device__ void threadwise_2d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
}
#endif
constexpr unsigned nshift = NShift::mValue;
constexpr index_t nshift = NShift::mValue;
constexpr unsigned did0_end =
constexpr index_t did0_end =
is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0);
constexpr unsigned did1_end =
constexpr index_t did1_end =
is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1);
for(unsigned did0 = 0; did0 < did0_end; ++did0)
for(index_t did0 = 0; did0 < did0_end; ++did0)
{
for(unsigned did1 = 0; did1 < did1_end; ++did1)
for(index_t did1 = 0; did1 < did1_end; ++did1)
{
const unsigned dindex = desc.Get1dIndex(did0, did1);
const index_t dindex = desc.Get1dIndex(did0, did1);
const unsigned sindex = dindex + nshift * desc.GetStride(IDim{});
const index_t sindex = dindex + nshift * desc.GetStride(IDim{});
p[dindex] = p[sindex];
}
......
This diff is collapsed.
......@@ -28,28 +28,28 @@ __device__ void threadwise_direct_convolution_1(InDesc,
}
#endif
for(unsigned n = 0; n < out_desc.GetLength(I0); ++n)
for(index_t n = 0; n < out_desc.GetLength(I0); ++n)
{
for(unsigned k = 0; k < out_desc.GetLength(I1); ++k)
for(index_t k = 0; k < out_desc.GetLength(I1); ++k)
{
for(unsigned ho = 0; ho < out_desc.GetLength(I2); ++ho)
for(index_t ho = 0; ho < out_desc.GetLength(I2); ++ho)
{
for(unsigned wo = 0; wo < out_desc.GetLength(I3); ++wo)
for(index_t wo = 0; wo < out_desc.GetLength(I3); ++wo)
{
for(unsigned c = 0; c < wei_desc.GetLength(I1); ++c)
for(index_t c = 0; c < wei_desc.GetLength(I1); ++c)
{
for(unsigned y = 0; y < wei_desc.GetLength(I2); ++y)
for(index_t y = 0; y < wei_desc.GetLength(I2); ++y)
{
for(unsigned x = 0; x < wei_desc.GetLength(I3); ++x)
for(index_t x = 0; x < wei_desc.GetLength(I3); ++x)
{
const unsigned hi = ho + y;
const unsigned wi = wo + x;
const index_t hi = ho + y;
const index_t wi = wo + x;
const unsigned in_index = in_desc.Get1dIndex(n, c, hi, wi);
const index_t in_index = in_desc.Get1dIndex(n, c, hi, wi);
const unsigned wei_index = wei_desc.Get1dIndex(k, c, y, x);
const index_t wei_index = wei_desc.Get1dIndex(k, c, y, x);
const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo);
const index_t out_index = out_desc.Get1dIndex(n, k, ho, wo);
fused_multiply_accumulate(
p_out[out_index], p_wei[wei_index], p_in[in_index]);
......@@ -125,7 +125,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
Data p_in_reg[in_reg_desc.GetElementSpace()];
Data p_wei_reg[wei_reg_desc.GetElementSpace()];
constexpr unsigned in_w_new_read = 1;
constexpr index_t in_w_new_read = 1;
constexpr auto in_desc_reg_new_read =
make_ConstantTensorDescriptor(Sequence<in_reg_desc.GetLength(I0),
......@@ -136,7 +136,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
#if 0
// this verison reused old input data in register, and read new data from LDS
// loop over vertical direction
for(unsigned y = 0; y < wei_desc.GetLength(I2); ++y)
for(index_t y = 0; y < wei_desc.GetLength(I2); ++y)
{
// read first input
threadwise_4d_tensor_copy(in_desc,
......@@ -157,7 +157,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
// loop over horizontal direction
for(unsigned x = 1; x < wei_desc.GetLength(I3); ++x)
for(index_t x = 1; x < wei_desc.GetLength(I3); ++x)
{
// read new weight
threadwise_4d_tensor_copy(wei_desc,
......@@ -186,10 +186,10 @@ __device__ void threadwise_direct_convolution_3(InDesc,
#elif 1
// this version read all input from LDS when filter moves
// loop over vertical direction
for(unsigned y = 0; y < wei_desc.GetLength(I2); ++y)
for(index_t y = 0; y < wei_desc.GetLength(I2); ++y)
{
// loop over horizontal direction
for(unsigned x = 0; x < wei_desc.GetLength(I3); ++x)
for(index_t x = 0; x < wei_desc.GetLength(I3); ++x)
{
// read new weight
threadwise_4d_tensor_copy(wei_desc,
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment