adding ConstantMergedTensorDescriptor, refactering ConstantTensorDescriptor, Sequence

acd7082f · Chao Liu · cd29b09a · acd7082f · acd7082f · acd7082f
Commit acd7082f authored May 21, 2019 by Chao Liu
18 changed files
--- a/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp
@@ -221,11 +221,11 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
 #if 0
        const Float* p_in_global_block_offset =
-            p_in_global + in_n_c_h_w_global_desc.Get1dIndex(
+            p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
                              n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin);
        const Float* p_wei_global_block_offset =
-            p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
+            p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
        for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
                    p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
@@ -234,20 +234,20 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
            for(index_t y = 0; y < Y; ++y)
            {
                blockwise_in_copy_reorder.Run(p_in_global_block_offset +
-                                                  in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, 0),
+                                                  in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, 0),
                                              p_in_block);
                blockwise_wei_copy.Run(p_wei_global_block_offset +
-                                           wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, 0),
+                                           wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0),
                                       p_wei_block);
                __syncthreads();
                for(index_t x = 0; x < X; ++x)
                {
-                    blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0),
+                    blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0),
                                             p_in_block +
-                                                 in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0),
+                                                 in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0),
                                             p_out_thread);
                }
@@ -259,11 +259,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
        {
            const Float* p_in_global_block_offset =
                p_in_global +
-                in_n_c_h_w_global_desc.Get1dIndex(
+                in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
                    n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin);
            const Float* p_wei_global_block_offset =
-                p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, k_block_data_begin);
+                p_wei_global +
+                wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, k_block_data_begin);
            for(index_t
                    c_block_data_begin = 0;
@@ -287,10 +288,10 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
                for(index_t x = 0; x < X; ++x)
                {
-                    blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0),
+                    blockwise_batch_gemm.Run(
-                                             p_in_block +
+                        p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0),
-                                                 in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0),
+                        p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0),
-                                             p_out_thread);
+                        p_out_thread);
                }
                __syncthreads();
@@ -336,16 +337,16 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
        }
 #endif
-        threadwise_10d_tensor_copy(
+        threadwise_10d_tensor_copy(out_10d_thread_desc,
-            out_10d_thread_desc,
+                                   p_out_thread,
-            p_out_thread,
+                                   out_10d_global_desc,
-            out_10d_global_desc,
+                                   p_out_global +
-            p_out_global +
+                                       out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
-                out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                           k_block_data_begin + k_thread_data_begin,
-                                                   ho_block_data_begin + ho_thread_data_begin,
+                                           ho_block_data_begin + ho_thread_data_begin,
-                                                   wo_block_data_begin + wo_thread_data_begin,
+                                           wo_block_data_begin + wo_thread_data_begin,
-                                                   n_block_data_begin + n_thread_data_begin),
+                                           n_block_data_begin + n_thread_data_begin),
-            out_10d_thread_desc.GetLengths(),
+                                   out_10d_thread_desc.GetLengths(),
-            Number<OutThreadCopyDataPerWrite_N>{});
+                                   Number<OutThreadCopyDataPerWrite_N>{});
    }
 };
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
@@ -82,7 +82,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
        constexpr auto block_work_desc = make_ConstantTensorDescriptor(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
-        const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());
+        const auto block_work_multi_id =
+            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
        const index_t n_block_data_begin  = block_work_multi_id[0] * NPerBlock;
        const index_t k_block_data_begin  = block_work_multi_id[1] * KPerBlock;
@@ -225,11 +226,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #if 1
        const Float* p_in_global_block_offset =
            p_in_global +
-            in_c_h_w_n_global_desc.Get1dIndex(
+            in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
                0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
        const Float* p_wei_global_block_offset =
-            p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
+            p_wei_global +
+            wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
        for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
                    p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0),
@@ -240,13 +242,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #pragma unroll
                for(index_t x = 0; x < X; ++x)
                {
-                    blockwise_in_copy.Run(p_in_global_block_offset +
+                    blockwise_in_copy.Run(
-                                              in_c_h_w_n_global_desc.Get1dIndex(0, y, x, 0),
+                        p_in_global_block_offset +
-                                          p_in_block);
+                            in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
+                        p_in_block);
-                    blockwise_wei_copy.Run(p_wei_global_block_offset +
+                    blockwise_wei_copy.Run(
-                                               wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0),
+                        p_wei_global_block_offset +
-                                           p_wei_block);
+                            wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
+                        p_wei_block);
                    __syncthreads();
@@ -263,11 +267,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
            {
                const Float* p_in_global_block_offset =
                    p_in_global +
-                    in_c_h_w_n_global_desc.Get1dIndex(
+                    in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
                        0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin);
                const Float* p_wei_global_block_offset =
-                    p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin);
+                    p_wei_global +
+                    wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
                for(index_t c_block_data_begin = 0; c_block_data_begin < C;
                    c_block_data_begin += CPerBlock,
@@ -347,17 +352,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                }
 #endif
-            threadwise_tensor_slice_copy(
+            threadwise_tensor_slice_copy(out_10d_thread_desc,
-                out_10d_thread_desc,
+                                         p_out_thread,
-                p_out_thread,
+                                         out_10d_global_desc,
-                out_10d_global_desc,
+                                         p_out_global +
-                p_out_global +
+                                             out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
-                    out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                                 k_block_data_begin + k_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                                                 ho_block_data_begin + ho_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin,
+                                                 wo_block_data_begin + wo_thread_data_begin,
-                                                       n_block_data_begin + n_thread_data_begin),
+                                                 n_block_data_begin + n_thread_data_begin),
-                out_10d_thread_desc.GetLengths(),
+                                         out_10d_thread_desc.GetLengths(),
-                Number<OutThreadCopyDataPerWrite_N>{});
+                                         Number<OutThreadCopyDataPerWrite_N>{});
        }).else_([&](auto f_dummy) {
            static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                              GemmNPerThreadSubC % NPerThread == 0,
@@ -397,17 +402,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                }
 #endif
-            threadwise_tensor_slice_copy(
+            threadwise_tensor_slice_copy(out_10d_thread_desc,
-                out_10d_thread_desc,
+                                         p_out_thread,
-                p_out_thread,
+                                         out_10d_global_desc,
-                out_10d_global_desc,
+                                         p_out_global +
-                p_out_global +
+                                             out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
-                    out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                                 k_block_data_begin + k_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                                                 ho_block_data_begin + ho_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin,
+                                                 wo_block_data_begin + wo_thread_data_begin,
-                                                       n_block_data_begin + n_thread_data_begin),
+                                                 n_block_data_begin + n_thread_data_begin),
-                out_10d_thread_desc.GetLengths(),
+                                         out_10d_thread_desc.GetLengths(),
-                Number<OutThreadCopyDataPerWrite_N>{});
+                                         Number<OutThreadCopyDataPerWrite_N>{});
        });
    }
 };
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
@@ -85,10 +85,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
        constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
-        constexpr auto block_work_desc = make_ConstantTensorDescriptor(
+        constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
            Sequence<KBlockWork, HBlockWork, WBlockWork, NBlockWork>{});
-        const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());
+        const auto block_work_multi_id =
+            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
        const index_t k_block_data_begin  = block_work_multi_id[0] * KPerBlock;
        const index_t ho_block_data_begin = block_work_multi_id[1] * HoPerBlock;
@@ -108,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);
-        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
+        constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockCopyDataPerRead_N>{});
@@ -117,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");
-        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
+        constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
 // blockwise copy
@@ -243,11 +244,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
            {
                const Float* p_in_global_block_offset =
                    p_in_global +
-                    in_c_h_w_n_global_desc.Get1dIndex(
+                    in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
                        0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin);
                const Float* p_wei_global_block_offset =
-                    p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin);
+                    p_wei_global +
+                    wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
                // LDS double buffer: preload data into LDS
                {
@@ -399,17 +401,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
            }
 #endif
-            threadwise_tensor_slice_copy(
+            threadwise_tensor_slice_copy(out_10d_thread_desc,
-                out_10d_thread_desc,
+                                         p_out_thread,
-                p_out_thread,
+                                         out_10d_global_desc,
-                out_10d_global_desc,
+                                         p_out_global +
-                p_out_global +
+                                             out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
-                    out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                                 k_block_data_begin + k_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                                                 ho_block_data_begin + ho_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin,
+                                                 wo_block_data_begin + wo_thread_data_begin,
-                                                       n_block_data_begin + n_thread_data_begin),
+                                                 n_block_data_begin + n_thread_data_begin),
-                out_10d_thread_desc.GetLengths(),
+                                         out_10d_thread_desc.GetLengths(),
-                Number<OutThreadCopyDataPerWrite_N>{});
+                                         Number<OutThreadCopyDataPerWrite_N>{});
        }).else_([&](auto fwd) {
            static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                              GemmNPerThreadSubC % NPerThread == 0,
@@ -450,17 +452,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
            }
 #endif
-            threadwise_tensor_slice_copy(
+            threadwise_tensor_slice_copy(out_10d_thread_desc,
-                out_10d_thread_desc,
+                                         p_out_thread,
-                p_out_thread,
+                                         out_10d_global_desc,
-                out_10d_global_desc,
+                                         p_out_global +
-                p_out_global +
+                                             out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
-                    out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                                 k_block_data_begin + k_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                                                 ho_block_data_begin + ho_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin,
+                                                 wo_block_data_begin + wo_thread_data_begin,
-                                                       n_block_data_begin + n_thread_data_begin),
+                                                 n_block_data_begin + n_thread_data_begin),
-                out_10d_thread_desc.GetLengths(),
+                                         out_10d_thread_desc.GetLengths(),
-                Number<OutThreadCopyDataPerWrite_N>{});
+                                         Number<OutThreadCopyDataPerWrite_N>{});
        });
    }
 };
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
@@ -86,10 +86,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
-        constexpr auto block_work_desc = make_ConstantTensorDescriptor(
+        constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
-        const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());
+        const auto block_work_multi_id =
+            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
        const index_t n_block_data_begin  = block_work_multi_id[0] * NPerBlock;
        const index_t k_block_data_begin  = block_work_multi_id[1] * KPerBlock;
@@ -101,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        // global tensor view
        constexpr auto wei_c_k_global_desc =
-            make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
+            make_ranked_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
        // LDS tensor view
        //   be careful of alignment
@@ -110,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);
-        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
+        constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockReorderDataPerWrite_N>{});
@@ -119,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");
-        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
+        constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
        // blockwise copy
@@ -241,11 +242,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
            {
                const Float* p_in_global_block_offset =
                    p_in_global +
-                    in_n_c_h_w_global_desc.Get1dIndex(
+                    in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
                        n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x);
                const Float* p_wei_global_block_offset =
-                    p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin);
+                    p_wei_global +
+                    wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
                // LDS double buffer: preload data into LDS
                {
@@ -359,13 +361,13 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock;
        const index_t n_thread_data_begin  = c_thread_mtx_begin.col % NPerBlock;
-        static_if<GemmNPerThreadSubC <= NPerBlock>{}([&](auto f_dummy) { // f_dummy do nothing but
+        static_if<GemmNPerThreadSubC <= NPerBlock>{}([&](auto fwd) { // fwd do nothing but
                                                                         // perfect forwarding.
                                                                         // Using this trick to
            // make this lambda a generic lambda, so it won't be compiled until
            // instantiated
            static_assert(
-                (f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0),
+                (fwd(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0),
                "wrong!");
            // output is a 10d tensor
@@ -373,12 +375,13 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
            constexpr index_t N1 = NPerBlock / N2;
            constexpr index_t W2 =
-                (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC);
+                (GemmNLevel0Cluster * GemmNLevel1Cluster) / fwd(NPerBlock / GemmNPerThreadSubC);
            constexpr index_t W1 = WoPerBlock / W2;
            constexpr index_t K2 = GemmMPerThreadSubC;
            constexpr index_t K1 = KPerBlock / KPerThread;
+#if 0
            constexpr auto out_10d_global_desc =
                make_ConstantTensorDescriptor(Sequence<K / (K1 * K2),
                                                       K1,
@@ -387,12 +390,23 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
                                                       Wo / (W1 * W2),
                                                       W1,
                                                       W2,
-                                                       N / f_dummy(N1 * N2),
+                                                       N / fwd(N1 * N2),
                                                       N1,
                                                       N2>{});
            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
                Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, 1, 1, N2>{});
+#else
+            constexpr auto out_10d_global_desc = fwd(out_k_h_w_n_global_desc)
+                                                     .Fold(I3, Number<N1>{}, Number<N2>{})
+                                                     .Fold(I2, Number<W1>{}, Number<W2>{})
+                                                     .Fold(I0, Number<K1>{}, Number<K2>{});
+            constexpr auto out_10d_thread_desc = fwd(out_k_h_w_n_thread_desc)
+                                                     .Fold(I3, Number<1>{}, Number<N2>{})
+                                                     .Fold(I2, Number<W1>{}, Number<1>{})
+                                                     .Fold(I0, Number<1>{}, Number<K2>{});
+#endif
 #if 0
                if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
@@ -407,19 +421,19 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
                }
 #endif
-            threadwise_tensor_slice_copy(
+            threadwise_tensor_slice_copy(out_10d_thread_desc,
-                out_10d_thread_desc,
+                                         p_out_thread,
-                p_out_thread,
+                                         out_10d_global_desc,
-                out_10d_global_desc,
+                                         p_out_global +
-                p_out_global +
+                                             out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
-                    out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                                 k_block_data_begin + k_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                                                 ho_block_data_begin + ho_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin,
+                                                 wo_block_data_begin + wo_thread_data_begin,
-                                                       n_block_data_begin + n_thread_data_begin),
+                                                 n_block_data_begin + n_thread_data_begin),
-                out_10d_thread_desc.GetLengths(),
+                                         out_10d_thread_desc.GetLengths(),
-                Number<OutThreadCopyDataPerWrite_N>{});
+                                         Number<OutThreadCopyDataPerWrite_N>{});
-        }).else_([&](auto f_dummy) {
+        }).else_([&](auto fwd) {
-            static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
+            static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                              GemmNPerThreadSubC % NPerThread == 0,
                          "wrong!");
@@ -428,16 +442,30 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
            constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock;
            constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster;
-            constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3);
+            constexpr index_t W1 = WoPerBlock / fwd(W2 * W3);
            constexpr index_t K2 = GemmMPerThreadSubC;
            constexpr index_t K1 = KPerBlock / KPerThread;
-            constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor(
+#if 0
+            constexpr auto out_10d_global_desc = make_packed_ConstantTensorDescriptor(
                Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});
-            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
+            constexpr auto out_10d_thread_desc = make_packed_ConstantTensorDescriptor(
                Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
+#else
+            constexpr auto out_10d_global_desc =
+                fwd(out_k_h_w_n_global_desc)
+                    .Fold(I3, Number<N1>{})
+                    .Fold(I2, Number<W1>{}, Number<W2>{}, Number<W3>{})
+                    .Fold(I0, Number<K1>{}, Number<K2>{});
+            constexpr auto out_10d_thread_desc =
+                fwd(out_k_h_w_n_thread_desc)
+                    .Fold(I3, Number<N1>{})
+                    .Fold(I2, Number<W1>{}, Number<1>{}, Number<W3>{})
+                    .Fold(I0, Number<1>{}, Number<K2>{});
+#endif
 #if 0
                if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
@@ -457,17 +485,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
                }
 #endif
-            threadwise_tensor_slice_copy(
+            threadwise_tensor_slice_copy(out_10d_thread_desc,
-                out_10d_thread_desc,
+                                         p_out_thread,
-                p_out_thread,
+                                         out_10d_global_desc,
-                out_10d_global_desc,
+                                         p_out_global +
-                p_out_global +
+                                             out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
-                    out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                                 k_block_data_begin + k_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                                                 ho_block_data_begin + ho_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin,
+                                                 wo_block_data_begin + wo_thread_data_begin,
-                                                       n_block_data_begin + n_thread_data_begin),
+                                                 n_block_data_begin + n_thread_data_begin),
-                out_10d_thread_desc.GetLengths(),
+                                         out_10d_thread_desc.GetLengths(),
-                Number<OutThreadCopyDataPerWrite_N>{});
+                                         Number<OutThreadCopyDataPerWrite_N>{});
        });
    }
 };
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
@@ -86,10 +86,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
-        constexpr auto block_work_desc = make_ConstantTensorDescriptor(
+        constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
-        const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());
+        const auto block_work_multi_id =
+            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
        const index_t n_block_data_begin  = block_work_multi_id[0] * NPerBlock;
        const index_t k_block_data_begin  = block_work_multi_id[1] * KPerBlock;
@@ -109,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);
-        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
+        constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockReorderDataPerWrite_N>{});
@@ -118,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");
-        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
+        constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
        // blockwise copy
@@ -240,11 +241,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
            {
                const Float* p_in_global_block_offset =
                    p_in_global +
-                    in_n_c_h_w_global_desc.Get1dIndex(
+                    in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
                        n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x);
                const Float* p_wei_global_block_offset =
-                    p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin);
+                    p_wei_global +
+                    wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
                // LDS double buffer: preload data into LDS
                {
@@ -407,10 +409,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
                p_out_thread,
                out_10d_global_desc,
                p_out_global +
-                    out_n_k_h_w_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+                    out_n_k_h_w_global_desc.GetOffsetFromMultiIndex(
-                                                       k_block_data_begin + k_thread_data_begin,
+                        n_block_data_begin + n_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                        k_block_data_begin + k_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin),
+                        ho_block_data_begin + ho_thread_data_begin,
+                        wo_block_data_begin + wo_thread_data_begin),
                out_10d_thread_desc.GetLengths(),
                map_out_global2thread);
            // Number<OutThreadCopyDataPerWrite_W>{});
@@ -461,10 +464,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
                p_out_thread,
                out_10d_global_desc,
                p_out_global +
-                    out_n_k_h_w_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+                    out_n_k_h_w_global_desc.GetOffsetFromMultiIndex(
-                                                       k_block_data_begin + k_thread_data_begin,
+                        n_block_data_begin + n_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                        k_block_data_begin + k_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin),
+                        ho_block_data_begin + ho_thread_data_begin,
+                        wo_block_data_begin + wo_thread_data_begin),
                out_10d_thread_desc.GetLengths(),
                map_out_global2thread);
            // Number<OutThreadCopyDataPerWrite_W>{});

--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp
@@ -236,11 +236,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
 #if 1
        const Float* p_in_global_block_offset =
            p_in_global +
-            in_n_c_h_w_global_desc.Get1dIndex(
+            in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
                n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin);
        const Float* p_wei_global_block_offset =
-            p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
+            p_wei_global +
+            wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
        for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
                    p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
@@ -251,23 +252,27 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
                for(index_t x = 0; x < X; ++x)
                {
 #if 1
-                    blockwise_in_copy_reorder.Run(p_in_global_block_offset +
+                    blockwise_in_copy_reorder.Run(
-                                                      in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x),
+                        p_in_global_block_offset +
-                                                  p_in_block);
+                            in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x),
+                        p_in_block);
-                    blockwise_wei_copy.Run(p_wei_global_block_offset +
-                                               wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0),
+                    blockwise_wei_copy.Run(
-                                           p_wei_block);
+                        p_wei_global_block_offset +
+                            wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
+                        p_wei_block);
 #else
                    Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()];
                    Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
                    blockwise_in_copy_reorder.RunLoadRegisterClipboard(
-                        p_in_global_block_offset + in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x),
+                        p_in_global_block_offset +
+                            in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x),
                        p_in_clipboard);
                    blockwise_wei_copy.RunLoadRegisterClipboard(
-                        p_wei_global_block_offset + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0),
+                        p_wei_global_block_offset +
+                            wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
                        p_wei_clipboard);
                    blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block);
@@ -291,11 +296,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
            {
                const Float* p_in_global_block_offset =
                    p_in_global +
-                    in_n_c_h_w_global_desc.Get1dIndex(
+                    in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
                        n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x);
                const Float* p_wei_global_block_offset =
-                    p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin);
+                    p_wei_global +
+                    wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
                for(index_t c_block_data_begin = 0; c_block_data_begin < C;
                    c_block_data_begin += CPerBlock,
@@ -390,17 +396,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
                }
 #endif
-            threadwise_tensor_slice_copy(
+            threadwise_tensor_slice_copy(out_10d_thread_desc,
-                out_10d_thread_desc,
+                                         p_out_thread,
-                p_out_thread,
+                                         out_10d_global_desc,
-                out_10d_global_desc,
+                                         p_out_global +
-                p_out_global +
+                                             out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
-                    out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                                 k_block_data_begin + k_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                                                 ho_block_data_begin + ho_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin,
+                                                 wo_block_data_begin + wo_thread_data_begin,
-                                                       n_block_data_begin + n_thread_data_begin),
+                                                 n_block_data_begin + n_thread_data_begin),
-                out_10d_thread_desc.GetLengths(),
+                                         out_10d_thread_desc.GetLengths(),
-                Number<OutThreadCopyDataPerWrite_N>{});
+                                         Number<OutThreadCopyDataPerWrite_N>{});
        }).else_([&](auto f_dummy) {
            static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                              GemmNPerThreadSubC % NPerThread == 0,
@@ -440,17 +446,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
                }
 #endif
-            threadwise_tensor_slice_copy(
+            threadwise_tensor_slice_copy(out_10d_thread_desc,
-                out_10d_thread_desc,
+                                         p_out_thread,
-                p_out_thread,
+                                         out_10d_global_desc,
-                out_10d_global_desc,
+                                         p_out_global +
-                p_out_global +
+                                             out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
-                    out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                                 k_block_data_begin + k_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                                                 ho_block_data_begin + ho_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin,
+                                                 wo_block_data_begin + wo_thread_data_begin,
-                                                       n_block_data_begin + n_thread_data_begin),
+                                                 n_block_data_begin + n_thread_data_begin),
-                out_10d_thread_desc.GetLengths(),
+                                         out_10d_thread_desc.GetLengths(),
-                Number<OutThreadCopyDataPerWrite_N>{});
+                                         Number<OutThreadCopyDataPerWrite_N>{});
        });
    }
 };
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
@@ -86,7 +86,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
        constexpr auto block_work_desc = make_ConstantTensorDescriptor(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
-        const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());
+        const auto block_work_multi_id =
+            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
        const index_t n_block_data_begin  = block_work_multi_id[0] * NPerBlock;
        const index_t k_block_data_begin  = block_work_multi_id[1] * KPerBlock;
@@ -234,11 +235,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
 #if 0
        const Float* p_in_global_block_offset =
            p_in_global +
-            in_n_c_h_w_global_desc.Get1dIndex(
+            in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
                n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin);
        const Float* p_wei_global_block_offset =
-            p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
+            p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
        for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
                    p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
@@ -250,22 +251,22 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                {
 #if 1
                    blockwise_in_copy_reorder.Run(p_in_global_block_offset +
-                                                      in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x),
+                                                      in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x),
                                                  p_in_block);
                    blockwise_wei_copy.Run(p_wei_global_block_offset +
-                                               wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0),
+                                               wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
                                           p_wei_block);
 #else
                    Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()];
                    Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
                    blockwise_in_copy_reorder.RunLoadRegisterClipboard(
-                        p_in_global_block_offset + in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x),
+                        p_in_global_block_offset + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x),
                        p_in_clipboard);
                    blockwise_wei_copy.RunLoadRegisterClipboard(
-                        p_wei_global_block_offset + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0),
+                        p_wei_global_block_offset + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
                        p_wei_clipboard);
                    blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block);
@@ -289,11 +290,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
            {
                const Float* p_in_global_block_offset =
                    p_in_global +
-                    in_n_c_h_w_global_desc.Get1dIndex(
+                    in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
                        n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x);
                const Float* p_wei_global_block_offset =
-                    p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin);
+                    p_wei_global +
+                    wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
                for(index_t c_block_data_begin = 0; c_block_data_begin < C;
                    c_block_data_begin += CPerBlock,
@@ -395,10 +397,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                p_out_thread,
                out_10d_global_desc,
                p_out_global +
-                    out_n_k_h_w_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+                    out_n_k_h_w_global_desc.GetOffsetFromMultiIndex(
-                                                       k_block_data_begin + k_thread_data_begin,
+                        n_block_data_begin + n_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                        k_block_data_begin + k_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin),
+                        ho_block_data_begin + ho_thread_data_begin,
+                        wo_block_data_begin + wo_thread_data_begin),
                out_10d_thread_desc.GetLengths(),
                map_out_global2thread);
            // Number<OutThreadCopyDataPerWrite_W>{});
@@ -444,10 +447,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                p_out_thread,
                out_10d_global_desc,
                p_out_global +
-                    out_n_k_h_w_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+                    out_n_k_h_w_global_desc.GetOffsetFromMultiIndex(
-                                                       k_block_data_begin + k_thread_data_begin,
+                        n_block_data_begin + n_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                        k_block_data_begin + k_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin),
+                        ho_block_data_begin + ho_thread_data_begin,
+                        wo_block_data_begin + wo_thread_data_begin),
                out_10d_thread_desc.GetLengths(),
                map_out_global2thread);
            // Number<OutThreadCopyDataPerWrite_W>{});

--- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
@@ -193,10 +193,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
        __shared__ Float p_wei_block[wei_block_space];
        const Float* p_in_global_block_offset =
-            p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
+            p_in_global + in_cb_global_desc.GetOffsetFromMultiIndex(0, b_block_data_begin);
        const Float* p_wei_global_block_offset =
-            p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
+            p_wei_global +
+            wei_cyxk_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
        // register
        Float p_out_thread[out_kb_thread_desc.GetElementSpace()];
@@ -236,7 +237,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
 #elif 1
                    blockwise_gemm.Run_asm
 #endif
-                        (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
+                        (p_wei_block + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
                         p_in_block + y * Wi + x,
                         p_out_thread);
                }
@@ -267,8 +268,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
                if(n_data < N && h_data < Ho && w_data < Wo)
                {
-                    p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] =
+                    p_out_global[out_khwn_global_desc.GetOffsetFromMultiIndex(
-                        p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
+                        k_data, h_data, w_data, n_data)] =
+                        p_out_thread[out_kb_thread_desc.GetOffsetFromMultiIndex(k, b)];
                }
            }
        }

--- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
@@ -198,10 +198,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
        __shared__ Float p_wei_block_double[2 * wei_block_space];
        const Float* p_in_global_block_offset =
-            p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
+            p_in_global + in_cb_global_desc.GetOffsetFromMultiIndex(0, b_block_data_begin);
        const Float* p_wei_global_block_offset =
-            p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
+            p_wei_global +
+            wei_cyxk_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
        // preload data into LDS
        {
@@ -269,7 +270,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
 #elif 0
                        blockwise_gemm.Run_asm
 #endif
-                            (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
+                            (p_wei_block_now +
+                                 wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
                             p_in_block_now + y * Wi + x,
                             p_out_thread);
                    }
@@ -310,7 +312,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
 #elif 0
                    blockwise_gemm.Run_asm
 #endif
-                        (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
+                        (p_wei_block_double +
+                             wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
                         p_in_block_double + y * Wi + x,
                         p_out_thread);
                }
@@ -336,7 +339,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
                    blockwise_gemm.Run_asm
 #endif
                        (p_wei_block_double + wei_block_space +
-                             wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
+                             wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
                         p_in_block_double + in_block_space + y * Wi + x,
                         p_out_thread);
                }
@@ -365,14 +368,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
            constexpr auto out_kb_global_desc = make_ConstantTensorDescriptor(Sequence<K, B>{});
-            threadwise_6d_tensor_copy(
+            threadwise_6d_tensor_copy(out_6d_thread_desc,
-                out_6d_thread_desc,
+                                      p_out_thread,
-                p_out_thread,
+                                      out_6d_global_desc,
-                out_6d_global_desc,
+                                      p_out_global +
-                p_out_global +
+                                          out_kb_global_desc.GetOffsetFromMultiIndex(
-                    out_kb_global_desc.Get1dIndex(k_thread_data_begin, b_thread_data_begin),
+                                              k_thread_data_begin, b_thread_data_begin),
-                out_6d_thread_desc.GetLengths(),
+                                      out_6d_thread_desc.GetLengths(),
-                Number<OutThreadCopyDataPerWrite>{});
+                                      Number<OutThreadCopyDataPerWrite>{});
        }
        else
        {
@@ -393,9 +396,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
                    if(n_data < N && h_data < Ho && w_data < Wo)
                    {
-                        p_out_global[out_khwn_global_desc.Get1dIndex(
+                        p_out_global[out_khwn_global_desc.GetOffsetFromMultiIndex(
                            k_data, h_data, w_data, n_data)] =
-                            p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
+                            p_out_thread[out_kb_thread_desc.GetOffsetFromMultiIndex(k, b)];
                    }
                }
            }

--- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
@@ -83,7 +83,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
        constexpr auto block_work_desc =
            make_ConstantTensorDescriptor(Sequence<KBlockWork, BBlockWork>{});
-        const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());
+        const auto block_work_multi_id =
+            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
        const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock;
        const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
@@ -219,10 +220,10 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
            {
                // calculate origin of block input and weight tensor on global memory
                const Float* p_in_block_on_global =
-                    p_in_global + in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x);
+                    p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x);
                const Float* p_wei_block_on_global =
-                    p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0);
+                    p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0);
                for(index_t
                        c_block_data_on_global = 0;
@@ -285,7 +286,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
            //     origin of thread tensor in global memory
            const index_t p_out_thread_on_global =
                p_out_global +
-                out_k_n1_b_n2_global_merged_desc.Get1dIndex(
+                out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                    k_thread_data_on_global, 0, 0, 0); // dst origin on merged global tensor
            // copy

--- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
@@ -190,18 +190,19 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
        c_block_data_begin += CPerBlock, __syncthreads())
    {
        // copy input tensor to LDS
-        blockwise_in_copy.Run(p_in_vec_global +
+        blockwise_in_copy.Run(
-                                  in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin,
+            p_in_vec_global +
-                                                                     c_block_data_begin,
+                in_nchw_vec_global_desc.GetOffsetFromMultiIndex(n_block_data_begin,
-                                                                     hi_block_data_begin,
+                                                                c_block_data_begin,
-                                                                     wi_block_data_begin),
+                                                                hi_block_data_begin,
-                              p_in_vec_block);
+                                                                wi_block_data_begin),
+            p_in_vec_block);
        // copy weight tensor to LDS
-        blockwise_wei_copy.Run(
+        blockwise_wei_copy.Run(p_wei_vec_global +
-            p_wei_vec_global +
+                                   wei_kcyx_vec_global_desc.GetOffsetFromMultiIndex(
-                wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
+                                       k_block_data_begin, c_block_data_begin, 0, 0),
-            p_wei_vec_block);
+                               p_wei_vec_block);
        __syncthreads();
@@ -212,26 +213,28 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
            threadwise_direct_convolution_2(
                in_nchw_vec_thread_block_desc,
                p_in_vec_block +
-                    in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
+                    in_nchw_vec_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin,
-                                                      c_thread_data,
+                                                                   c_thread_data,
-                                                      hi_thread_data_begin,
+                                                                   hi_thread_data_begin,
-                                                      wi_thread_data_begin),
+                                                                   wi_thread_data_begin),
                wei_kcyx_vec_thread_block_desc,
                p_wei_vec_block +
-                    wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                    wei_kcyx_vec_block_desc.GetOffsetFromMultiIndex(
+                        k_thread_data_begin, c_thread_data, 0, 0),
                out_nkhw_thread_desc,
                p_out_thread);
 #elif 0
            threadwise_direct_convolution_3(
                in_nchw_vec_thread_block_desc,
                p_in_vec_block +
-                    in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
+                    in_nchw_vec_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin,
-                                                      c_thread_data,
+                                                                   c_thread_data,
-                                                      hi_thread_data_begin,
+                                                                   hi_thread_data_begin,
-                                                      wi_thread_data_begin),
+                                                                   wi_thread_data_begin),
                wei_kcyx_vec_thread_block_desc,
                p_wei_vec_block +
-                    wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                    wei_kcyx_vec_block_desc.GetOffsetFromMultiIndex(
+                        k_thread_data_begin, c_thread_data, 0, 0),
                out_nkhw_thread_desc,
                p_out_thread);
 #endif
@@ -239,14 +242,14 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
    }
    // copy output tensor from register to global mem
-    threadwise_4d_tensor_copy(
+    threadwise_4d_tensor_copy(out_nkhw_thread_desc,
-        out_nkhw_thread_desc,
+                              p_out_thread,
-        p_out_thread,
+                              out_nkhw_global_desc,
-        out_nkhw_global_desc,
+                              p_out_global +
-        p_out_global +
+                                  out_nkhw_global_desc.GetOffsetFromMultiIndex(
-            out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+                                      n_block_data_begin + n_thread_data_begin,
-                                            k_block_data_begin + k_thread_data_begin,
+                                      k_block_data_begin + k_thread_data_begin,
-                                            ho_block_data_begin + ho_thread_data_begin,
+                                      ho_block_data_begin + ho_thread_data_begin,
-                                            wo_block_data_begin + wo_thread_data_begin),
+                                      wo_block_data_begin + wo_thread_data_begin),
-        out_nkhw_thread_desc.GetLengths());
+                              out_nkhw_thread_desc.GetLengths());
 }
--- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
@@ -217,7 +217,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
    threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
    const Float* p_wei_global_block_begin =
-        p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin);
+        p_wei_global + wei_ek_global_desc.GetOffsetFromMultiIndex(0, k_block_data_begin);
    for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
                p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0),
@@ -251,10 +251,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
            {
                auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
-                blockwise_batch_gemm.Run(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
+                blockwise_batch_gemm.Run(
-                                         p_in_block + in_chwn_block_desc.Get1dIndex(0, y, x, 0),
+                    p_wei_block + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
-                                         p_out_thread,
+                    p_in_block + in_chwn_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
-                                         f_accum);
+                    p_out_thread,
+                    f_accum);
            }
        }
    }
@@ -284,10 +285,10 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
        p_out_thread,
        out_khwn_global_desc,
        p_out_global +
-            out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+            out_khwn_global_desc.GetOffsetFromMultiIndex(k_block_data_begin + k_thread_data_begin,
-                                            ho_block_data_begin + ho_thread_data_begin,
+                                                         ho_block_data_begin + ho_thread_data_begin,
-                                            wo_block_data_begin + wo_thread_data_begin,
+                                                         wo_block_data_begin + wo_thread_data_begin,
-                                            n_block_data_begin + n_thread_data_begin),
+                                                         n_block_data_begin + n_thread_data_begin),
        out_hkwn_thread_desc.GetLengths(),
        reorder_khwn_from_hkwn);
 }
--- a/src/include/tensor.hpp
+++ b/src/include/tensor.hpp
@@ -93,7 +93,7 @@ struct TensorDescriptor
    const std::vector<std::size_t>& GetStrides() const;
    template <class... Is>
-    std::size_t Get1dIndex(Is... is) const
+    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->GetNumOfDimension());
        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
@@ -246,13 +246,13 @@ struct Tensor
    template <class... Is>
    T& operator()(Is... is)
    {
-        return mData[mDesc.Get1dIndex(is...)];
+        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }
    template <class... Is>
    const T& operator()(Is... is) const
    {
-        return mData[mDesc.Get1dIndex(is...)];
+        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }
    typename std::vector<T>::iterator begin() { return mData.begin(); }

--- a/src/include/threadwise_2d_tensor_op.hip.hpp
+++ b/src/include/threadwise_2d_tensor_op.hip.hpp
@@ -20,7 +20,7 @@ __device__ void threadwise_2d_tensor_pointwise_operation_unary(Desc, Float* __re
    {
        for(index_t did1 = 0; did1 < desc.GetLength(I1); ++did1)
        {
-            const index_t dindex = desc.Get1dIndex(did0, did1);
+            const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1);
            f(p[dindex]);
        }
@@ -53,11 +53,11 @@ __device__ void threadwise_2d_tensor_pointwise_operation_binary_reorder_by_get_d
    {
        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
        {
-            const index_t aindex = src_desc.Get1dIndex(did0, did1);
+            const index_t aindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
            const index_t did[2] = {did0, did1};
-            const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
+            const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1]);
            f(p_src[aindex], p_dst[bindex]);
        }
@@ -127,7 +127,7 @@ __device__ void threadwise_2d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
    {
        for(index_t did1 = 0; did1 < did1_end; ++did1)
        {
-            const index_t dindex = desc.Get1dIndex(did0, did1);
+            const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1);
            const index_t sindex = dindex + nshift * desc.GetStride(IDim{});

--- a/src/include/threadwise_4d_tensor_op.hip.hpp
+++ b/src/include/threadwise_4d_tensor_op.hip.hpp
@@ -26,7 +26,7 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re
            {
                for(index_t did3 = 0; did3 < desc.GetLength(I3); ++did3)
                {
-                    const index_t dindex = desc.Get1dIndex(did0, did1, did2, did3);
+                    const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);
                    f(p[dindex]);
                }
@@ -75,12 +75,12 @@ __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_given_ds
            {
                for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
                {
-                    const index_t aindex = src_desc.Get1dIndex(did0, did1, did2, did3);
+                    const index_t aindex = src_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);
                    const index_t did[4] = {did0, did1, did2, did3};
                    const index_t bindex =
-                        dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
+                        dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
                    f(p_src[aindex], p_dst[bindex]);
@@ -178,7 +178,7 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
            {
                for(index_t did3 = 0; did3 < did3_end; ++did3)
                {
-                    const index_t dindex = desc.Get1dIndex(did0, did1, did2, did3);
+                    const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);
                    const index_t sindex = dindex + nshift * desc.GetStride(IDim{});

--- a/src/include/threadwise_direct_convolution.hip.hpp
+++ b/src/include/threadwise_direct_convolution.hip.hpp
@@ -46,11 +46,14 @@ __device__ void threadwise_direct_convolution_1(InDesc,
                                const index_t hi = ho + y;
                                const index_t wi = wo + x;
-                                const index_t in_index = in_desc.Get1dIndex(n, c, hi, wi);
+                                const index_t in_index =
+                                    in_desc.GetOffsetFromMultiIndex(n, c, hi, wi);
-                                const index_t wei_index = wei_desc.Get1dIndex(k, c, y, x);
+                                const index_t wei_index =
+                                    wei_desc.GetOffsetFromMultiIndex(k, c, y, x);
-                                const index_t out_index = out_desc.Get1dIndex(n, k, ho, wo);
+                                const index_t out_index =
+                                    out_desc.GetOffsetFromMultiIndex(n, k, ho, wo);
                                fused_multiply_accumulate(
                                    p_out[out_index], p_wei[wei_index], p_in[in_index]);
@@ -143,14 +146,14 @@ __device__ void threadwise_direct_convolution_3(InDesc,
    {
        // read first input
        threadwise_4d_tensor_copy(in_desc,
-                                  p_in + in_desc.Get1dIndex(0, 0, y, 0),
+                                  p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, 0),
                                  in_reg_desc,
                                  p_in_reg,
                                  in_reg_desc.GetLengths());
        // read first 1x1 weight
        threadwise_4d_tensor_copy(wei_desc,
-                                  p_wei + wei_desc.Get1dIndex(0, 0, y, 0),
+                                  p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, 0),
                                  wei_reg_desc,
                                  p_wei_reg,
                                  wei_reg_desc.GetLengths());
@@ -164,7 +167,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
        {
            // read new weight
            threadwise_4d_tensor_copy(wei_desc,
-                                      p_wei + wei_desc.Get1dIndex(0, 0, y, x),
+                                      p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, x),
                                      wei_reg_desc,
                                      p_wei_reg,
                                      wei_reg_desc.GetLengths());
@@ -175,10 +178,10 @@ __device__ void threadwise_direct_convolution_3(InDesc,
            // read new input
            threadwise_4d_tensor_copy(
                in_desc,
-                p_in + in_desc.Get1dIndex(0, 0, y, x + in_reg_desc.GetLength(I3) - 1),
+                p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, x + in_reg_desc.GetLength(I3) - 1),
                in_reg_desc,
                p_in_reg +
-                    in_reg_desc.Get1dIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
+                    in_reg_desc.GetOffsetFromMultiIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
                in_desc_reg_new_read.GetLengths());
            // do 1x1 conv
@@ -196,14 +199,14 @@ __device__ void threadwise_direct_convolution_3(InDesc,
        {
            // read new weight
            threadwise_4d_tensor_copy(wei_desc,
-                                      p_wei + wei_desc.Get1dIndex(0, 0, y, x),
+                                      p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, x),
                                      wei_reg_desc,
                                      p_wei_reg,
                                      wei_reg_desc.GetLengths());
            // read new input
            threadwise_4d_tensor_copy(in_desc,
-                                      p_in + in_desc.Get1dIndex(0, 0, y, x),
+                                      p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, x),
                                      in_reg_desc,
                                      p_in_reg,
                                      in_reg_desc.GetLengths());

--- a/src/include/threadwise_gemm.hip.hpp
+++ b/src/include/threadwise_gemm.hip.hpp
@@ -9,7 +9,7 @@ __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
    {
        for(index_t j = 0; j < Matrix::NCol(); ++j)
        {
-            const index_t id = Matrix::Get1dIndex(i, j);
+            const index_t id = Matrix::GetOffsetFromMultiIndex(i, j);
            p_thread[id]     = 0;
        }
    }
@@ -39,8 +39,8 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
    {
        for(index_t j = 0; j < NCol; j += DataPerRead)
        {
-            const index_t src_index = src_mtx.Get1dIndex(i, j);
+            const index_t src_index = src_mtx.GetOffsetFromMultiIndex(i, j);
-            const index_t dst_index = dst_mtx.Get1dIndex(i, j);
+            const index_t dst_index = dst_mtx.GetOffsetFromMultiIndex(i, j);
            *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
                *reinterpret_cast<const vector_t*>(&p_src[src_index]);
@@ -83,9 +83,9 @@ __device__ void threadwise_gemm(MatrixA,
            {
                for(index_t j = 0; j < N; ++j)
                {
-                    const index_t aindex = a_mtx.Get1dIndex(k, i); // A is transposed
+                    const index_t aindex = a_mtx.GetOffsetFromMultiIndex(k, i); // A is transposed
-                    const index_t bindex = b_mtx.Get1dIndex(k, j);
+                    const index_t bindex = b_mtx.GetOffsetFromMultiIndex(k, j);
-                    const index_t cindex = c_mtx.Get1dIndex(i, j);
+                    const index_t cindex = c_mtx.GetOffsetFromMultiIndex(i, j);
                    p_c_thread[cindex] += p_a_thread[aindex] * p_b_thread[bindex];
                }

--- a/src/include/threadwise_tensor_slice_op.hip.hpp
+++ b/src/include/threadwise_tensor_slice_op.hip.hpp
@@ -19,7 +19,7 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc,
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
-    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
+    constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
 #if 0
    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
@@ -53,9 +53,9 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc,
        static_for<0, nRead, 1>{}([&](auto IRead) {
            constexpr auto multi_id = decltype(Ids){}.PushBack(Number<IRead.Get() * DataPerRead>{});
-            const index_t src_index = src_desc.Get1dIndex(multi_id);
+            const index_t src_index = src_desc.GetOffsetFromMultiIndex(multi_id);
-            const index_t dst_index = dst_desc.Get1dIndex(multi_id);
+            const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(multi_id);
            *(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
                *(reinterpret_cast<const vector_t*>(&p_src[src_index]));
@@ -84,9 +84,9 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v1(SrcDesc,
    ford<SrcOpLengths>{}([&](auto src_multi_id) {
        const auto dst_multi_id = reorder_array_given_new2old(src_multi_id, MapDst2Src{});
-        const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
+        const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id);
-        const index_t src_index = src_desc.Get1dIndex(src_multi_id);
+        const index_t src_index = src_desc.GetOffsetFromMultiIndex(src_multi_id);
        p_dst[dst_index] = p_src[src_index];
    });
@@ -115,9 +115,9 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v2(SrcDesc,
    ford<decltype(dst_op_lengths)>{}([&](auto dst_multi_id) {
        const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
-        const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
+        const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id);
-        const index_t src_index = src_desc.Get1dIndex(src_multi_id);
+        const index_t src_index = src_desc.GetOffsetFromMultiIndex(src_multi_id);
        p_dst[dst_index] = p_src[src_index];
    });
@@ -177,7 +177,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
                const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
-                const index_t src_index = src_desc.Get1dIndex(src_multi_id);
+                const index_t src_index = src_desc.GetOffsetFromMultiIndex(src_multi_id);
                vector_type<Float, DstDataPerWrite>::SetScalar(
                    dst_vec_data, p_src[src_index], IDstData);
@@ -186,7 +186,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
            // write data
            const auto dst_multi_id = ids.PushBack(IWrite.Get() * DstDataPerWrite);
-            const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
+            const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id);
            *(reinterpret_cast<vector_t*>(&p_dst[dst_index])) = dst_vec_data;
        });
@@ -204,5 +204,21 @@ threadwise_tensor_slice_copy_generic(SrcDesc,
                                     SliceLengths,
                                     DimAccessOrder)
 {
-    // not implemented
+    constexpr auto src_desc = SrcDesc{};
+    constexpr auto dst_desc = DstDesc{};
+    constexpr auto slice_lengths_in_access_order =
+        SliceLengths{}.ReorderGivenNew2Old(DimAccessOrder{});
+    ford<decltype(slice_lengths_in_access_order)>{}([&](auto data_multi_id_in_access_order) {
+        const auto data_multi_id =
+            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
+        const index_t dst_index =
+            dst_desc.GetOffsetFromMultiIndex(src_multi_offset + data_multi_id);
+        const index_t src_index =
+            src_desc.GetOffsetFromMultiIndex(dst_multi_offset + data_multi_id);
+        p_dst[dst_index] = p_src[src_index];
+    });
 }