experimenting

766b0a9e · Chao Liu · f35c64eb
Commit 766b0a9e authored Mar 24, 2019 by Chao Liu
13 changed files
--- a/src/include/functional.hip.hpp
+++ b/src/include/functional.hip.hpp
 #pragma once
 #include "constant_integral.hip.hpp"
-template <unsigned NLoop>
+template <index_t NLoop>
 struct static_loop_n
 {
    template <class F>
@@ -24,7 +24,7 @@ struct static_loop_n<1>
    }
 };
-template <unsigned NLoop>
+template <index_t NLoop>
 struct static_const_reduce_n
 {
    template <class F, class Reduce>

--- a/src/include/gridwise_direct_convolution_1.hip.hpp
+++ b/src/include/gridwise_direct_convolution_1.hip.hpp
@@ -8,18 +8,18 @@ template <class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
-          unsigned NPerBlock,
+          index_t NPerBlock,
-          unsigned KPerBlock,
+          index_t KPerBlock,
-          unsigned CPerBlock,
+          index_t CPerBlock,
-          unsigned HoPerBlock,
+          index_t HoPerBlock,
-          unsigned WoPerBlock,
+          index_t WoPerBlock,
-          unsigned NPerThread,
+          index_t NPerThread,
-          unsigned KPerThread,
+          index_t KPerThread,
-          unsigned CPerThread,
+          index_t CPerThread,
-          unsigned HoPerThread,
+          index_t HoPerThread,
-          unsigned WoPerThread,
+          index_t WoPerThread,
-          unsigned BlockSize,
+          index_t BlockSize,
-          unsigned GridSize>
+          index_t GridSize>
 __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_in_global,
                                              const Float* const __restrict__ p_wei_global,
                                              Float* const __restrict__ p_out_global)
@@ -33,16 +33,16 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
    constexpr auto wei_global_desc = WeiGlobalDesc{};
    constexpr auto out_global_desc = OutGlobalDesc{};
-    constexpr unsigned Y = wei_global_desc.GetLength(I2);
+    constexpr index_t Y = wei_global_desc.GetLength(I2);
-    constexpr unsigned X = wei_global_desc.GetLength(I3);
+    constexpr index_t X = wei_global_desc.GetLength(I3);
-    constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
+    constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
-    constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
+    constexpr index_t WiPerBlock = WoPerBlock + X - 1;
-    constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
+    constexpr index_t NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
-    constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
+    constexpr index_t KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-    constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
+    constexpr index_t HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-    constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
+    constexpr index_t WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
    constexpr auto in_block_global_desc = make_ConstantTensorDescriptor(
        Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, in_global_desc.GetStrides());
@@ -59,31 +59,31 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
    constexpr auto out_block_desc =
        make_ConstantTensorDescriptor(out_block_global_desc.GetLengths());
-    constexpr unsigned in_block_size  = in_block_desc.GetElementSpace();
+    constexpr index_t in_block_size  = in_block_desc.GetElementSpace();
-    constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
+    constexpr index_t wei_block_size = wei_block_desc.GetElementSpace();
-    constexpr unsigned out_block_size = out_block_desc.GetElementSpace();
+    constexpr index_t out_block_size = out_block_desc.GetElementSpace();
    __shared__ Float p_in_block[in_block_size];
    __shared__ Float p_wei_block[wei_block_size];
    __shared__ Float p_out_block[out_block_size];
-    const unsigned block_id = blockIdx.x;
+    const index_t block_id = blockIdx.x;
-    unsigned itmp            = block_id;
+    index_t itmp            = block_id;
-    unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
+    index_t n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
    itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
-    unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
+    index_t k_block_work_id = itmp / (HBlockWork * WBlockWork);
    itmp -= k_block_work_id * (HBlockWork * WBlockWork);
-    unsigned h_block_work_id = itmp / WBlockWork;
+    index_t h_block_work_id = itmp / WBlockWork;
-    unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
+    index_t w_block_work_id = itmp - h_block_work_id * WBlockWork;
-    unsigned n_block_work_begin  = n_block_work_id * NPerBlock;
+    index_t n_block_work_begin  = n_block_work_id * NPerBlock;
-    unsigned k_block_work_begin  = k_block_work_id * KPerBlock;
+    index_t k_block_work_begin  = k_block_work_id * KPerBlock;
-    unsigned ho_block_work_begin = h_block_work_id * HoPerBlock;
+    index_t ho_block_work_begin = h_block_work_id * HoPerBlock;
-    unsigned wo_block_work_begin = w_block_work_id * WoPerBlock;
+    index_t wo_block_work_begin = w_block_work_id * WoPerBlock;
-    unsigned hi_block_work_begin = ho_block_work_begin; // minus padding
+    index_t hi_block_work_begin = ho_block_work_begin; // minus padding
-    unsigned wi_block_work_begin = wo_block_work_begin; // minus padding
+    index_t wi_block_work_begin = wo_block_work_begin; // minus padding
    constexpr auto blockwise_in_copy =
        Blockwise4dTensorCopy1<BlockSize,
@@ -109,7 +109,7 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
    // set output tensor in LDS to 0
    blockwise_4d_tensor_set_zero<BlockSize>(out_block_desc, p_out_block);
-    for(unsigned c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1);
+    for(index_t c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1);
        c_block_work_begin += CPerBlock)
    {
        // copy input tensor to LDS

--- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
@@ -11,20 +11,20 @@ template <class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
-          unsigned NPerBlock,
+          index_t NPerBlock,
-          unsigned KPerBlock,
+          index_t KPerBlock,
-          unsigned CPerBlock,
+          index_t CPerBlock,
-          unsigned HoPerBlock,
+          index_t HoPerBlock,
-          unsigned WoPerBlock,
+          index_t WoPerBlock,
-          unsigned NPerThread,
+          index_t NPerThread,
-          unsigned KPerThread,
+          index_t KPerThread,
-          unsigned CPerThread,
+          index_t CPerThread,
-          unsigned HoPerThread,
+          index_t HoPerThread,
-          unsigned WoPerThread,
+          index_t WoPerThread,
-          unsigned InBlockCopyDataPerRead,
+          index_t InBlockCopyDataPerRead,
-          unsigned WeiBlockCopyDataPerRead,
+          index_t WeiBlockCopyDataPerRead,
-          unsigned BlockSize,
+          index_t BlockSize,
-          unsigned GridSize>
+          index_t GridSize>
 __global__ void
 gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global,
                                             const Float* const __restrict__ p_wei_global,
@@ -39,17 +39,17 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
    constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{};
    constexpr auto out_nkhw_global_desc = OutGlobalDesc{};
-    constexpr unsigned N = in_nchw_global_desc.GetLength(I0);
+    constexpr index_t N = in_nchw_global_desc.GetLength(I0);
-    constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0);
+    constexpr index_t K = wei_kcyx_global_desc.GetLength(I0);
-    constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1);
+    constexpr index_t C = wei_kcyx_global_desc.GetLength(I1);
-    constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2);
+    constexpr index_t Y = wei_kcyx_global_desc.GetLength(I2);
-    constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3);
+    constexpr index_t X = wei_kcyx_global_desc.GetLength(I3);
    constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor(
        Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy
-    constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
+    constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
-    constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
+    constexpr index_t WiPerBlock = WoPerBlock + X - 1;
    constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned(
        Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});
@@ -63,21 +63,21 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
                                      Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});
    // shared mem
-    constexpr unsigned in_block_size =
+    constexpr index_t in_block_size =
        in_nchw_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-    constexpr unsigned wei_block_size =
+    constexpr index_t wei_block_size =
        wei_kcyx_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
-    constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
+    constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
-                                       ? InBlockCopyDataPerRead
+                                      ? InBlockCopyDataPerRead
-                                       : WeiBlockCopyDataPerRead;
+                                      : WeiBlockCopyDataPerRead;
    __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
    __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
    // threadwise tensors
-    constexpr unsigned HiPerThread = HoPerThread + Y - 1;
+    constexpr index_t HiPerThread = HoPerThread + Y - 1;
-    constexpr unsigned WiPerThread = WoPerThread + X - 1;
+    constexpr index_t WiPerThread = WoPerThread + X - 1;
    constexpr auto in_nchw_thread_block_desc =
        make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
@@ -93,56 +93,54 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
    Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
    // divide block work
-    constexpr unsigned NBlockWork =
+    constexpr index_t NBlockWork = (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
-        (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
+    constexpr index_t KBlockWork = (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-    constexpr unsigned KBlockWork =
+    constexpr index_t HBlockWork =
-        (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-    constexpr unsigned HBlockWork =
        (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-    constexpr unsigned WBlockWork =
+    constexpr index_t WBlockWork =
        (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
-    const unsigned block_id = blockIdx.x;
+    const index_t block_id = blockIdx.x;
-    unsigned itmp                  = block_id;
+    index_t itmp                  = block_id;
-    const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
+    const index_t n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
    itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
-    const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
+    const index_t k_block_work_id = itmp / (HBlockWork * WBlockWork);
    itmp -= k_block_work_id * (HBlockWork * WBlockWork);
-    const unsigned h_block_work_id = itmp / WBlockWork;
+    const index_t h_block_work_id = itmp / WBlockWork;
-    const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
+    const index_t w_block_work_id = itmp - h_block_work_id * WBlockWork;
-    const unsigned n_block_data_begin  = n_block_work_id * NPerBlock;
+    const index_t n_block_data_begin  = n_block_work_id * NPerBlock;
-    const unsigned k_block_data_begin  = k_block_work_id * KPerBlock;
+    const index_t k_block_data_begin  = k_block_work_id * KPerBlock;
-    const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
+    const index_t ho_block_data_begin = h_block_work_id * HoPerBlock;
-    const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
+    const index_t wo_block_data_begin = w_block_work_id * WoPerBlock;
-    const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
+    const index_t hi_block_data_begin = ho_block_data_begin; // minus padding
-    const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
+    const index_t wi_block_data_begin = wo_block_data_begin; // minus padding
    // divide thread work
-    constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
+    constexpr index_t NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
-    constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
+    constexpr index_t KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
-    constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
+    constexpr index_t HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
-    constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
+    constexpr index_t WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
-    const unsigned thread_id = threadIdx.x;
+    const index_t thread_id = threadIdx.x;
-    itmp                            = thread_id;
+    itmp                           = thread_id;
-    const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
+    const index_t n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
    itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork);
-    const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork);
+    const index_t k_thread_work_id = itmp / (HThreadWork * WThreadWork);
    itmp -= k_thread_work_id * (HThreadWork * WThreadWork);
-    const unsigned h_thread_work_id = itmp / WThreadWork;
+    const index_t h_thread_work_id = itmp / WThreadWork;
-    const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
+    const index_t w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
-    const unsigned n_thread_data_begin  = n_thread_work_id * NPerThread;
+    const index_t n_thread_data_begin  = n_thread_work_id * NPerThread;
-    const unsigned k_thread_data_begin  = k_thread_work_id * KPerThread;
+    const index_t k_thread_data_begin  = k_thread_work_id * KPerThread;
-    const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread;
+    const index_t ho_thread_data_begin = h_thread_work_id * HoPerThread;
-    const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread;
+    const index_t wo_thread_data_begin = w_thread_work_id * WoPerThread;
-    const unsigned hi_thread_data_begin = ho_thread_data_begin;
+    const index_t hi_thread_data_begin = ho_thread_data_begin;
-    const unsigned wi_thread_data_begin = wo_thread_data_begin;
+    const index_t wi_thread_data_begin = wo_thread_data_begin;
    constexpr auto blockwise_in_copy =
        Blockwise4dTensorCopy1<BlockSize,
@@ -172,7 +170,7 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
    // set threadwise output tensor to 0
    threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);
-    for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
+    for(index_t c_block_data_begin = 0; c_block_data_begin < C;
        c_block_data_begin += CPerBlock, __syncthreads())
    {
        // copy input tensor to LDS
@@ -191,7 +189,7 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
        __syncthreads();
-        for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
+        for(index_t c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
        {
 // threadwise convolution
 #if 1

--- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
--- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp
--- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
--- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
--- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
--- a/src/include/threadwise_2d_tensor_op.hip.hpp
+++ b/src/include/threadwise_2d_tensor_op.hip.hpp
@@ -16,11 +16,11 @@ __device__ void threadwise_2d_tensor_pointwise_operation_unary(Desc, Float* __re
    }
 #endif
-    for(unsigned did0 = 0; did0 < desc.GetLength(I0); ++did0)
+    for(index_t did0 = 0; did0 < desc.GetLength(I0); ++did0)
    {
-        for(unsigned did1 = 0; did1 < desc.GetLength(I1); ++did1)
+        for(index_t did1 = 0; did1 < desc.GetLength(I1); ++did1)
        {
-            const unsigned dindex = desc.Get1dIndex(did0, did1);
+            const index_t dindex = desc.Get1dIndex(did0, did1);
            f(p[dindex]);
        }
@@ -47,22 +47,22 @@ __device__ void threadwise_2d_tensor_pointwise_operation_binary_reorder_by_get_d
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
-    constexpr unsigned IR0 = DstFromSrcReorder{}.Get(I0);
+    constexpr index_t IR0 = DstFromSrcReorder{}.Get(I0);
-    constexpr unsigned IR1 = DstFromSrcReorder{}.Get(I1);
+    constexpr index_t IR1 = DstFromSrcReorder{}.Get(I1);
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
-    for(unsigned did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
+    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
    {
-        for(unsigned did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
+        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
        {
-            const unsigned aindex = src_desc.Get1dIndex(did0, did1);
+            const index_t aindex = src_desc.Get1dIndex(did0, did1);
-            const unsigned did[2] = {did0, did1};
+            const index_t did[2] = {did0, did1};
-            const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
+            const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
            f(p_src[aindex], p_dst[bindex]);
        }
@@ -118,21 +118,21 @@ __device__ void threadwise_2d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
    }
 #endif
-    constexpr unsigned nshift = NShift::mValue;
+    constexpr index_t nshift = NShift::mValue;
-    constexpr unsigned did0_end =
+    constexpr index_t did0_end =
        is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0);
-    constexpr unsigned did1_end =
+    constexpr index_t did1_end =
        is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1);
-    for(unsigned did0 = 0; did0 < did0_end; ++did0)
+    for(index_t did0 = 0; did0 < did0_end; ++did0)
    {
-        for(unsigned did1 = 0; did1 < did1_end; ++did1)
+        for(index_t did1 = 0; did1 < did1_end; ++did1)
        {
-            const unsigned dindex = desc.Get1dIndex(did0, did1);
+            const index_t dindex = desc.Get1dIndex(did0, did1);
-            const unsigned sindex = dindex + nshift * desc.GetStride(IDim{});
+            const index_t sindex = dindex + nshift * desc.GetStride(IDim{});
            p[dindex] = p[sindex];
        }

--- a/src/include/threadwise_4d_tensor_op.hip.hpp
+++ b/src/include/threadwise_4d_tensor_op.hip.hpp
--- a/src/include/threadwise_direct_convolution.hip.hpp
+++ b/src/include/threadwise_direct_convolution.hip.hpp
--- a/src/include/threadwise_gemm.hip.hpp
+++ b/src/include/threadwise_gemm.hip.hpp
--- a/src/include/threadwise_nd_tensor_op.hip.hpp
+++ b/src/include/threadwise_nd_tensor_op.hip.hpp