"...composable_kernel-1.git" did not exist on "2778e99758e149a6cb5309ca307bf7c1e61a562f"
Commit acd7082f authored by Chao Liu

adding ConstantMergedTensorDescriptor, refactoring ConstantTensorDescriptor, Sequence

parent cd29b09a
@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
     constexpr index_t X = wei_kcyx_desc.GetLength(I3);
 
     // reorder weight
-    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
     Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
         std::thread::hardware_concurrency());
 
     // reorder input
-    auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
+    auto in_chwn_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
     ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
     Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
@@ -64,7 +64,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
         std::thread::hardware_concurrency());
 
     // output
-    auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
+    auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
     ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
     Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
...
@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
     constexpr index_t X = wei_kcyx_desc.GetLength(I3);
 
     // reorder weight
-    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
     Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -50,7 +50,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
         std::thread::hardware_concurrency());
 
     // output
-    auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
+    auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
     ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
     Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
...
@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t X = wei_kcyx_desc.GetLength(I3);
 
     // reorder weight
-    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
     Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
...
@@ -548,8 +548,8 @@ int main(int argc, char* argv[])
     auto lower_pads = Sequence<HPad, WPad>{};
     auto upper_pads = Sequence<HPad, WPad>{};
 
-    auto in_nchw_desc  = make_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
-    auto wei_kcyx_desc = make_ConstantTensorDescriptor(Sequence<K, C, Y, X>{});
+    auto in_nchw_desc  = make_packed_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
+    auto wei_kcyx_desc = make_packed_ConstantTensorDescriptor(Sequence<K, C, Y, X>{});
     auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
         in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads);
...
@@ -16,6 +16,8 @@ struct Array
     {
     }
 
+    __host__ __device__ constexpr index_t GetSize() const { return NSize; }
+
     __host__ __device__ const TData& operator[](index_t i) const { return mData[i]; }
 
     __host__ __device__ TData& operator[](index_t i) { return mData[i]; }
@@ -67,6 +69,23 @@ __host__ __device__ auto reorder_array_given_old2new(const Array<TData, NSize>&
     return new_array;
 }
 
+template <class TData, index_t NSize, class ExtractSeq>
+__host__ __device__ auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)
+{
+    Array<TData, ExtractSeq::GetSize()> new_array;
+
+    constexpr index_t new_size = ExtractSeq::GetSize();
+
+    static_assert(new_size <= NSize, "wrong! too many extract");
+
+    static_for<0, new_size, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        new_array[i]        = old_array[ExtractSeq{}.Get(I)];
+    });
+
+    return new_array;
+}
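A hedged usage sketch of the new extract_array (the values and the brace-initialization of Array are assumptions, not from the diff):

    // pick original dimensions {0, 2, 3} out of a 4-entry array
    Array<index_t, 4> lengths{{16, 3, 28, 28}}; // assumed aggregate initialization
    auto partial = extract_array(lengths, Sequence<0, 2, 3>{});
    // partial is Array<index_t, 3> holding {16, 28, 28}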
 template <class TData, index_t NSize>
 __host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
                                              const Array<TData, NSize>& b)
...
@@ -21,7 +21,7 @@ struct ConstantMatrixDescriptor
     __host__ __device__ constexpr index_t GetElementSpace() const { return NRow_ * RowStride_; }
 
-    __host__ __device__ index_t Get1dIndex(index_t irow, index_t icol) const
+    __host__ __device__ index_t GetOffsetFromMultiIndex(index_t irow, index_t icol) const
     {
         return irow * RowStride_ + icol;
     }
...
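The Get1dIndex -> GetOffsetFromMultiIndex rename recurs mechanically through the blockwise-copy and GEMM files below; only the name changes, not the arithmetic. A minimal standalone sketch of the mapping (this free function is hypothetical, written here only to pin down the formula):

    template <index_t RowStride>
    __host__ __device__ constexpr index_t offset_from_multi_index(index_t irow, index_t icol)
    {
        return irow * RowStride + icol; // same body as the renamed method above
    }

    static_assert(offset_from_multi_index<8>(2, 3) == 19, "row 2, col 3 of an 8-wide row-major matrix");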
@@ -2,94 +2,118 @@
 #include "common.hip.hpp"
 #include "ConstantTensorDescriptor.hip.hpp"
 
-// TensorDesc: ConstantTensorDescriptor<...>
-// MergedDimRanges: Sequence<FirstMergedDim, LastMergedDim>
-template <class TensorDesc, class... MergedDimRanges>
+// OriginalTensorDesc : ConstantTensorDescriptor<...>
+//     it's the tensor whose dimensions are to be merged
+// OriginalDimMergeSeqs : Sequence<...>...
+//     each is a sequence of original dimensions (of OriginalTensorDesc) to be merged
+template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
 struct ConstantMergedTensorDescriptor
 {
-    static constexpr index_t nOriginalDim = GetNumOfOriginalDimension();
-    static constexpr index_t nDim         = GetNumOfDimension();
+    static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};
+
+    static constexpr index_t nDim         = std::tuple_size<decltype(mOriginalDimMergeSeqs)>::value;
+    static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension();
 
-    template <class... Is>
     __host__ __device__ constexpr ConstantMergedTensorDescriptor()
     {
-        constexpr auto merged_dim_ranges = std::make_tuple(MergedDimRanges{}...);
-
-        static_for<0, sizeof...(MergedDimRanges), 1>{}([&](auto I) {
-            constexpr index_t i = I.Get();
-            constexpr auto merged_dim_range = std::get<i>(merged_dim_ranges);
-
-            static_assert(merged_dim_range.GetSize() == 2,
-                          "wrong! should specify first and last dimension to be merged");
-            static_assert(merged_dim_range.Get(Number<0>{}) < GetNumOfUnmergedDimension(),
-                          "wrong!");
-            static_assert(merged_dim_range.Get(Number<1>{}) < GetNumOfUnmergedDimension(),
-                          "wrong!");
-            static_assert(merged_dim_range.Get(Number<0>{}) <= merged_dim_range.Get(Number<1>{}),
-                          "wrong!");
-        });
+        static_assert(nDim <= nOriginalDim, "wrong!");
+
+        // TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most
+        // OriginalTensorDesc::nDim number of dimensions
+
+        // TODO: check there is no duplication in OriginalDimMergeSeqs
+
+        // TODO: check OriginalDimMergeSeqs contains all original dimensions
     }
 
-    __host__ __device__ static constexpr index_t GetNumOfDimension()
-    {
-        constexpr auto merged_dim_ranges = std::make_tuple(MergedDimRanges...);
-
-        struct f_calculate_num_of_lost_dim
-        {
-            __host__ __device__ constexpr index_t operator()(auto I) const
-            {
-                constexpr index_t i = I.Get();
-                constexpr auto merged_dim_range = std::get<i>(merged_dim_ranges);
-                return merged_dim_range.Get(Number<1>{}) - merged_dim_range.Get(Number<0>{});
-            }
-        };
-
-        constexpr index_t num_lost_dim = static_const_reduce_n<sizeof...(MergedDimRanges)>{}(
-            f_calculate_num_of_lost_dim, std::plus<index_t>{});
-
-        return TensorDesc::GetNumOfDimension() - num_lost_dim;
-    }
+    __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; }
 
-    __host__ __device__ static constexpr index_t GetNumOfOriginalDimension()
-    {
-        return TensorDesc::GetNumOfDimension();
-    }
+    __host__ __device__ static constexpr index_t GetNumOfOriginalDimension() { return nOriginalDim; }
 
-    template <index_t IDim>
-    __host__ __device__ static constexpr bool IsMergedDimension(Number<IDim>)
-    {
-        // not implemented
-    }
+    template <index_t IDim>
+    __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number<IDim>)
+    {
+        return (std::get<IDim>(mOriginalDimMergeSeqs).GetSize() > 1);
+    }
 
-    template <index_t IDim>
-    __host__ __device__ static constexpr bool GetLength(Number<IDim>)
-    {
-        // not implemented
-    }
+    template <index_t IDim>
+    __host__ __device__ static constexpr index_t GetLength(Number<IDim>)
+    {
+        constexpr auto original_dims_partial = std::get<IDim>(mOriginalDimMergeSeqs);
+
+        return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize();
+    }
 
-    template <index_t IDim>
-    __host__ __device__ static constexpr bool GetStride(Number<IDim>)
-    {
-        static_assert(!IsMergedDimension(Number<IDim>{}),
-                      "wrong! stride of a merged dimension is undefined");
-        // not implemented
-    }
+    template <index_t IDim>
+    __host__ __device__ static constexpr index_t GetStride(Number<IDim>)
+    {
+        static_assert(!ContainMultipleOriginalDimensions(Number<IDim>{}),
+                      "wrong! stride of a merged dimension is undefined");
+
+        constexpr auto idim_original = std::get<IDim>(mOriginalDimMergeSeqs).Front();
+
+        return OriginalTensorDesc::GetStride(Number<idim_original>{});
+    }
 
-    template <class... Is>
-    __host__ __device__ auto MultiIndex2OriginalMultiIndex(Is... is) const
-    {
-        // not implemented
-    }
+    __host__ __device__ static constexpr auto GetLengths()
+    {
+        return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs{}).GetElementSize()...>{};
+    }
+
+    __host__ __device__ static constexpr index_t GetElementSize()
+    {
+        return OriginalTensorDesc::GetElementSize();
+    }
+
+    __host__ __device__ static auto
+    GetOriginalMultiIndexFromMultiIndex(Array<index_t, nDim> multi_id)
+    {
+        Array<index_t, nOriginalDim> original_multi_id;
+
+        static_for<0, nDim, 1>{}([&](auto IDim) {
+            constexpr index_t idim = IDim.Get();
+
+            constexpr auto original_dims_partial = std::get<idim>(mOriginalDimMergeSeqs);
+
+            // get partial original-multi-id corresponding to this merged dimension
+            constexpr auto original_multi_id_partial =
+                OriginalTensorDesc::Extract(original_dims_partial)
+                    .GetMultiIndexFrom1dIndex(multi_id[idim]);
+
+            // make sure compiler unrolls this loop and propagates all the constants
+            for(index_t i = 0; i < original_dims_partial.GetSize(); ++i)
+            {
+                index_t idim_original = original_dims_partial[i];
+
+                original_multi_id[idim_original] = original_multi_id_partial[i];
+            }
+        });
+
+        return original_multi_id;
+    }
 
-    template <class... Is>
-    __host__ __device__ auto OriginalMultiIndex2MultiIndex(Is... is) const
-    {
-        // not implemented
-    }
+    __host__ __device__ static index_t GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)
+    {
+        const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);
+
+        return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
+    }
+
+    template <index_t... Is>
+    __host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is)
+    {
+        return GetOffsetFromMultiIndex(Array<index_t, nDim>{is...});
+    }
+
+    __host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
+    {
+        constexpr auto dummy_desc = make_packed_ConstantTensorDescriptor(GetLengths());
+
+        return dummy_desc.GetMultiIndexFrom1dIndex(id);
+    }
 };
 
-template <class TensorDesc, class... MergedDimRanges>
-constexpr auto make_ConstantMergedTensorDescriptor(TensorDesc, MergedDimRanges...)
+template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
+constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...)
 {
-    return ConstantMergedTensorDescriptor<TensorDesc, MergedDimRanges...>{};
+    return ConstantMergedTensorDescriptor<OriginalTensorDesc, OriginalDimMergeSeqs...>{};
 }
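A hedged usage sketch (not from this commit) of the new merged descriptor, merging H and W of a packed NCHW tensor; the concrete lengths are illustrative:

    // assumed: a packed [N, C, H, W] = [2, 3, 4, 5] descriptor
    constexpr auto nchw_desc = make_packed_ConstantTensorDescriptor(Sequence<2, 3, 4, 5>{});

    // merge dims 2 and 3 (H, W) into one; keep N and C as-is -> a 3d [N, C, H*W] view
    constexpr auto merged_desc = make_ConstantMergedTensorDescriptor(
        nchw_desc, Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{});

    static_assert(merged_desc.GetNumOfDimension() == 3, "N, C, H*W");
    static_assert(merged_desc.GetLength(Number<2>{}) == 20, "merged length is 4 * 5");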
@@ -9,76 +9,100 @@ struct Sequence
     static constexpr index_t mSize = sizeof...(Is);
 
-    const index_t mData[mSize + 1] = {
-        Is..., 0}; // the last element is dummy, to prevent compiler complain on empty Sequence
-
     __host__ __device__ static constexpr index_t GetSize() { return mSize; }
 
     template <index_t I>
-    __host__ __device__ constexpr index_t Get(Number<I>) const
+    __host__ __device__ static constexpr index_t Get(Number<I>)
     {
+        static_assert(I < mSize, "wrong! I too large");
+
+        // the last dummy element is to prevent compiler complain about empty Sequence
+        const index_t mData[mSize + 1] = {Is..., 0};
         return mData[I];
     }
 
-    __host__ __device__ index_t operator[](index_t i) const { return mData[i]; }
+    __host__ __device__ index_t operator[](index_t i) const
+    {
+        const index_t mData[mSize + 1] = {Is..., 0};
+        return mData[i];
+    }
 
     template <index_t... IRs>
-    __host__ __device__ constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/) const
+    __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
     {
-        static_assert(mSize == sizeof...(IRs), "mSize not consistent");
-
-        constexpr auto old = Type{};
-
-        return Sequence<old.Get(Number<IRs>{})...>{};
+#if 0 // require sequence_sort, which is not implemented yet
+        static_assert(is_same<sequence_sort<Sequence<IRs...>>::SortedSeqType,
+                              arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value,
+                      "wrong! invalid new2old map");
+#endif
+
+        return Sequence<Type{}.Get(Number<IRs>{})...>{};
     }
 
-    template <index_t... IRs>
-    __host__ __device__ constexpr auto ReorderGivenOld2New(Sequence<IRs...> /*old2new*/) const
+#if 0 // require sequence_sort, which is not implemented yet
+    template <class MapOld2New>
+    __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New /*old2new*/)
     {
-        // TODO: don't know how to implement this
-        printf("Sequence::ReorderGivenOld2New not implemented");
-        assert(false);
+        static_assert(is_same<sequence_sort<MapOld2New>::SortedSeqType,
+                              arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value,
+                      "wrong! invalid old2new map");
+
+        constexpr auto map_new2old = typename sequence_map_inverse<MapOld2New>::SeqMapType{};
+
+        return ReorderGivenNew2Old(map_new2old);
     }
+#endif
 
-    __host__ __device__ constexpr auto Reverse() const;
+    __host__ __device__ static constexpr auto Reverse();
 
-    __host__ __device__ constexpr index_t Front() const { return mData[0]; }
+    __host__ __device__ static constexpr index_t Front()
+    {
+        const index_t mData[mSize + 1] = {Is..., 0};
+        return mData[0];
+    }
 
-    __host__ __device__ constexpr index_t Back() const { return mData[mSize - 1]; }
+    __host__ __device__ static constexpr index_t Back()
+    {
+        const index_t mData[mSize + 1] = {Is..., 0};
+        return mData[mSize - 1];
+    }
 
     template <index_t I>
-    __host__ __device__ constexpr auto PushFront(Number<I>) const
+    __host__ __device__ static constexpr auto PushFront(Number<I>)
     {
         return Sequence<I, Is...>{};
     }
 
     template <index_t I>
-    __host__ __device__ constexpr auto PushBack(Number<I>) const
+    __host__ __device__ static constexpr auto PushBack(Number<I>)
     {
         return Sequence<Is..., I>{};
     }
 
-    __host__ __device__ constexpr auto PopFront() const;
+    __host__ __device__ static constexpr auto PopFront();
 
-    __host__ __device__ constexpr auto PopBack() const;
+    __host__ __device__ static constexpr auto PopBack();
 
     template <index_t... Xs>
-    __host__ __device__ constexpr auto Append(Sequence<Xs...>) const
+    __host__ __device__ static constexpr auto Append(Sequence<Xs...>)
     {
         return Sequence<Is..., Xs...>{};
     }
 
     template <index_t... Ns>
-    __host__ __device__ constexpr auto Extract(Number<Ns>...) const
+    __host__ __device__ static constexpr auto Extract(Number<Ns>...)
    {
         return Sequence<Type{}.Get(Number<Ns>{})...>{};
     }
 
     template <index_t... Ns>
-    __host__ __device__ constexpr auto Extract(Sequence<Ns...>) const
+    __host__ __device__ static constexpr auto Extract(Sequence<Ns...>)
     {
         return Sequence<Type{}.Get(Number<Ns>{})...>{};
     }
+
+    template <index_t I, index_t X>
+    __host__ __device__ static constexpr auto Modify(Number<I>, Number<X>);
 };
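With mData moved out of the class and the members made static, every query works on the type alone; a hedged sketch of the effect:

    using S = Sequence<8, 16, 4>;
    static_assert(S::GetSize() == 3, "three entries");
    static_assert(S::Get(Number<1>{}) == 16, "no instance needed any more");
    static_assert(S{}.Back() == 4, "calling through an instance still works");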
 template <class, class>
@@ -91,43 +115,36 @@ struct sequence_merge<Sequence<Xs...>, Sequence<Ys...>>
 };
 
 template <index_t IBegin, index_t NSize, index_t Increment>
-struct increasing_sequence_gen_impl
+struct arithmetic_sequence_gen_impl
 {
     static constexpr index_t NSizeLeft = NSize / 2;
 
     using SeqType = typename sequence_merge<
-        typename increasing_sequence_gen_impl<IBegin, NSizeLeft, Increment>::SeqType,
-        typename increasing_sequence_gen_impl<IBegin + NSizeLeft * Increment,
+        typename arithmetic_sequence_gen_impl<IBegin, NSizeLeft, Increment>::SeqType,
+        typename arithmetic_sequence_gen_impl<IBegin + NSizeLeft * Increment,
                                               NSize - NSizeLeft,
                                               Increment>::SeqType>::SeqType;
 };
 
 template <index_t IBegin, index_t Increment>
-struct increasing_sequence_gen_impl<IBegin, 1, Increment>
+struct arithmetic_sequence_gen_impl<IBegin, 1, Increment>
 {
     using SeqType = Sequence<IBegin>;
 };
 
 template <index_t IBegin, index_t Increment>
-struct increasing_sequence_gen_impl<IBegin, 0, Increment>
+struct arithmetic_sequence_gen_impl<IBegin, 0, Increment>
 {
     using SeqType = Sequence<>;
 };
 
 template <index_t IBegin, index_t IEnd, index_t Increment>
-struct increasing_sequence_gen
+struct arithmetic_sequence_gen
 {
     using SeqType =
-        typename increasing_sequence_gen_impl<IBegin, IEnd - IBegin, Increment>::SeqType;
+        typename arithmetic_sequence_gen_impl<IBegin, IEnd - IBegin, Increment>::SeqType;
 };
 
-template <index_t IBegin, index_t IEnd, index_t Increment>
-__host__ __device__ constexpr auto
-make_increasing_sequence(Number<IBegin>, Number<IEnd>, Number<Increment>)
-{
-    return typename increasing_sequence_gen<IBegin, IEnd, Increment>::SeqType{};
-}
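The generator is only renamed here (increasing_sequence_gen -> arithmetic_sequence_gen); a hedged sketch of the step-1 case it is used for above:

    // indices [0, 4) -> Sequence<0, 1, 2, 3>, as in the identity maps used by the sort checks
    using Iota = typename arithmetic_sequence_gen<0, 4, 1>::SeqType;
    static_assert(is_same<Iota, Sequence<0, 1, 2, 3>>::value, "is_same assumed from this codebase");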
 template <class, class>
 struct sequence_reverse_inclusive_scan;
@@ -161,8 +178,8 @@ struct sequence_split
 {
     static constexpr index_t NSize = Seq{}.GetSize();
 
-    using range0 = typename increasing_sequence_gen<0, I, 1>::SeqType;
-    using range1 = typename increasing_sequence_gen<I, NSize, 1>::SeqType;
+    using range0 = typename arithmetic_sequence_gen<0, I, 1>::SeqType;
+    using range1 = typename arithmetic_sequence_gen<I, NSize, 1>::SeqType;
 
     using SeqType0 = typename sequence_extract<Seq, range0>::SeqType;
     using SeqType1 = typename sequence_extract<Seq, range1>::SeqType;
@@ -191,6 +208,63 @@ struct sequence_reverse<Sequence<I0, I1>>
     using SeqType = Sequence<I1, I0>;
 };
+#if 0 // not fully implemented
+template <class KeySeq0, class ValSeq0, class KeySeq1, class ValSeq1>
+struct sequence_sort_merge_impl;
+
+template <index_t Key0,
+          index_t... Keys0,
+          index_t Val0,
+          index_t... Vals0,
+          index_t Key1,
+          index_t... Keys1,
+          index_t Val1,
+          index_t... Vals1>
+struct sequence_sort_merge_impl<Sequence<Key0, Keys0...>,
+                                Sequence<Val0, Vals0...>,
+                                Sequence<Key1, Keys1...>,
+                                Sequence<Val1, Vals1...>>
+{
+};
+
+template <class>
+struct sequence_sort;
+
+template <index_t... Is>
+struct sequence_sort<Sequence<Is...>>
+{
+    using OriginalSeqType        = Sequence<Is...>;
+    using SortedSeqType          = xxxxx;
+    using MapSorted2OriginalType = xxx;
+};
+
+template <class Seq, bool IsValidSeqMap>
+struct sequence_map_inverse_impl;
+
+// impl for valid map, no impl for invalid map
+template <index_t... Is>
+struct sequence_map_inverse_impl<Sequence<Is...>, true>
+{
+    using SeqMapType = typename sequence_sort<Sequence<Is...>>::MapSorted2OriginalType;
+};
+
+template <class>
+struct sequence_map_inverse;
+
+template <index_t... Is>
+struct sequence_map_inverse<Sequence<Is...>>
+{
+    // TODO: make sure the map to be inversed is valid: [0, sizeof...(Is))
+    static constexpr bool is_valid_sequence_map =
+        is_same<typename sequence_sort<Sequence<Is...>>::SortedSeqType,
+                typename arithmetic_sequence_gen<0, sizeof...(Is), 1>::SeqType>::value;
+
+    // make compiler fail, if is_valid_sequence_map != true
+    using SeqMapType =
+        typename sequence_map_inverse_impl<Sequence<Is...>, is_valid_sequence_map>::SeqMapType;
+};
+#endif
 template <index_t... Xs, index_t... Ys>
 __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>)
 {
@@ -243,7 +317,7 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Number<Y>)
 template <index_t... Xs, index_t Y>
 __host__ __device__ constexpr auto operator-(Sequence<Xs...>, Number<Y>)
 {
-#if 0 // doesn't compile
+#if 0 // TODO: turn it on. Doesn't compile
     constexpr auto seq_x = Sequence<Xs...>{};
 
     static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) {
@@ -313,14 +387,13 @@ __host__ __device__ constexpr auto operator%(Number<Y>, Sequence<Xs...>)
 template <index_t I, index_t... Is>
 __host__ __device__ constexpr auto sequence_pop_front(Sequence<I, Is...>)
 {
-    static_assert(sizeof...(Is) > 0, "empty Sequence!");
     return Sequence<Is...>{};
 }
 
 template <class Seq>
 __host__ __device__ constexpr auto sequence_pop_back(Seq)
 {
-    static_assert(Seq{}.GetSize() > 0, "empty Sequence!");
+    static_assert(Seq{}.GetSize() > 0, "wrong! cannot pop an empty Sequence!");
     return sequence_pop_front(Seq{}.Reverse()).Reverse();
 }
@@ -349,16 +422,16 @@ transform_sequences(F f, Sequence<Xs...>, Sequence<Ys...>, Sequence<Zs...>)
     return Sequence<f(Xs, Ys, Zs)...>{};
 }
 
-template <index_t... Is>
-__host__ __device__ constexpr auto Sequence<Is...>::PopFront() const
+template <class Seq, class Reduce>
+__host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce)
 {
-    return sequence_pop_front(Type{});
+    return typename sequence_reverse_inclusive_scan<Seq, Reduce>::SeqType{};
 }
 
-template <index_t... Is>
-__host__ __device__ constexpr auto Sequence<Is...>::PopBack() const
+template <class Seq, class Reduce>
+__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce)
 {
-    return sequence_pop_back(Type{});
+    return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}).Reverse();
 }
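A hedged sketch of the relocated scan helpers; the add functor below is hypothetical (the codebase's own reduce functors would be used), and the expected result assumes the scan accumulates from the back:

    struct add
    {
        __host__ __device__ constexpr index_t operator()(index_t x, index_t y) const { return x + y; }
    };

    // reverse inclusive scan of <1, 2, 3> with +: <1+2+3, 2+3, 3> = <6, 5, 3>
    constexpr auto scanned = reverse_inclusive_scan_sequence(Sequence<1, 2, 3>{}, add{});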
 template <class Seq>
@@ -381,19 +454,32 @@ __host__ __device__ constexpr index_t
 }
 
 template <index_t... Is>
-__host__ __device__ constexpr auto Sequence<Is...>::Reverse() const
+__host__ __device__ constexpr auto Sequence<Is...>::PopFront()
 {
-    return typename sequence_reverse<Sequence<Is...>>::SeqType{};
+    return sequence_pop_front(Type{});
 }
 
-template <class Seq, class Reduce>
-__host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce)
+template <index_t... Is>
+__host__ __device__ constexpr auto Sequence<Is...>::PopBack()
 {
-    return typename sequence_reverse_inclusive_scan<Seq, Reduce>::SeqType{};
+    return sequence_pop_back(Type{});
 }
 
-template <class Seq, class Reduce>
-__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce)
+template <index_t... Is>
+__host__ __device__ constexpr auto Sequence<Is...>::Reverse()
 {
-    return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}).Reverse();
+    return typename sequence_reverse<Sequence<Is...>>::SeqType{};
 }
+
+template <index_t... Is>
+template <index_t I, index_t X>
+__host__ __device__ constexpr auto Sequence<Is...>::Modify(Number<I>, Number<X>)
+{
+    static_assert(I < GetSize(), "wrong!");
+
+    using seq_split = sequence_split<Type, I>;
+
+    constexpr auto seq_left  = typename seq_split::SeqType0{};
+    constexpr auto seq_right = typename seq_split::SeqType1{}.PopFront();
+
+    return seq_left.PushBack(Number<X>{}).Append(seq_right);
+}
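A hedged sketch of the new Modify, which splits at I, replaces the element there, and reassembles:

    // replace element 1 of <8, 16, 4> with 32 -> Sequence<8, 32, 4>
    constexpr auto modified = Sequence<8, 16, 4>::Modify(Number<1>{}, Number<32>{});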
@@ -33,7 +33,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
         const index_t did1 = is / desc.GetStride(I1);
 
-        const index_t dindex = dst_desc.Get1dIndex(did0, did1);
+        const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
 
         f(p_dst[dindex]);
     }
@@ -52,7 +52,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
             const index_t did1 = is / desc.GetStride(I1);
 
-            const index_t dindex = dst_desc.Get1dIndex(did0, did1);
+            const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
 
             f(p_dst[dindex]);
         }
@@ -102,9 +102,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
         did[1] = is / ref_desc.GetStride(I1);
 
-        const index_t aindex = src_desc.Get1dIndex(did[0], did[1]);
-        const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
+        const index_t aindex = src_desc.GetOffsetFromMultiIndex(did[0], did[1]);
+        const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1]);
 
         f(p_src[aindex], p_dst[bindex]);
     }
@@ -125,9 +125,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
            did[1] = is / ref_desc.GetStride(I1);
 
-            const index_t aindex = src_desc.Get1dIndex(did[0], did[1]);
-            const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
+            const index_t aindex = src_desc.GetOffsetFromMultiIndex(did[0], did[1]);
+            const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1]);
 
             f(p_src[aindex], p_dst[bindex]);
         }
@@ -224,8 +224,10 @@ struct Blockwise2dTensorCopy1
             did[1] = is / ref_desc.GetStride(I1);
 
-            const index_t src_index = src_desc.Get1dIndex(did[0], did[1] * DataPerRead);
-            const index_t dst_index = dst_desc.Get1dIndex(did[0], did[1] * DataPerRead);
+            const index_t src_index =
+                src_desc.GetOffsetFromMultiIndex(did[0], did[1] * DataPerRead);
+            const index_t dst_index =
+                dst_desc.GetOffsetFromMultiIndex(did[0], did[1] * DataPerRead);
 
             *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
                 *(reinterpret_cast<const vector_t*>(p_src + src_index));
@@ -328,8 +330,8 @@ struct Blockwise2dTensorCopy2
             {
                 index_t did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;
 
-                const index_t sindex = src_desc.Get1dIndex(did0, did1);
-                const index_t dindex = dst_desc.Get1dIndex(did0, did1);
+                const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
+                const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
 
                 *(reinterpret_cast<Float4*>(p_dst + dindex)) =
                     *(reinterpret_cast<const Float4*>(p_src + sindex));
@@ -341,8 +343,8 @@ struct Blockwise2dTensorCopy2
                 index_t did1 =
                     Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1;
 
-                const index_t sindex = src_desc.Get1dIndex(did0, did1);
-                const index_t dindex = dst_desc.Get1dIndex(did0, did1);
+                const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
+                const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
 
                 *(reinterpret_cast<Float2*>(p_dst + dindex)) =
                     *(reinterpret_cast<const Float2*>(p_src + sindex));
@@ -354,8 +356,8 @@ struct Blockwise2dTensorCopy2
                 index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
                                d1v1loop * ThreadPerDim1 + mThreadId1;
 
-                const index_t sindex = src_desc.Get1dIndex(did0, did1);
-                const index_t dindex = dst_desc.Get1dIndex(did0, did1);
+                const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
+                const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
 
                 p_dst[dindex] = p_src[sindex];
             }
@@ -368,8 +370,8 @@ struct Blockwise2dTensorCopy2
             if(did1 < L1)
             {
-                const index_t sindex = src_desc.Get1dIndex(did0, did1);
-                const index_t dindex = dst_desc.Get1dIndex(did0, did1);
+                const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
+                const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
 
                 p_dst[dindex] = p_src[sindex];
             }
@@ -389,8 +391,8 @@ struct Blockwise2dTensorCopy2
             {
                 index_t did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;
 
-                const index_t sindex = src_desc.Get1dIndex(did0, did1);
-                const index_t dindex = dst_desc.Get1dIndex(did0, did1);
+                const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
+                const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
 
                 *(reinterpret_cast<Float4*>(p_dst + dindex)) =
                     *(reinterpret_cast<const Float4*>(p_src + sindex));
@@ -402,8 +404,8 @@ struct Blockwise2dTensorCopy2
                 index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 +
                                2 * mThreadId1;
 
-                const index_t sindex = src_desc.Get1dIndex(did0, did1);
-                const index_t dindex = dst_desc.Get1dIndex(did0, did1);
+                const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
+                const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
 
                 *(reinterpret_cast<Float2*>(p_dst + dindex)) =
                     *(reinterpret_cast<const Float2*>(p_src + sindex));
@@ -415,8 +417,8 @@ struct Blockwise2dTensorCopy2
                 index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
                                d1v1loop * ThreadPerDim1 + mThreadId1;
 
-                const index_t sindex = src_desc.Get1dIndex(did0, did1);
-                const index_t dindex = dst_desc.Get1dIndex(did0, did1);
+                const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
+                const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
 
                 p_dst[dindex] = p_src[sindex];
             }
@@ -429,8 +431,8 @@ struct Blockwise2dTensorCopy2
            if(did1 < L1)
            {
-                const index_t sindex = src_desc.Get1dIndex(did0, did1);
-                const index_t dindex = dst_desc.Get1dIndex(did0, did1);
+                const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
+                const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
 
                 p_dst[dindex] = p_src[sindex];
            }
@@ -497,8 +499,10 @@ struct Blockwise2dTensorCopy3
         const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
         const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;
 
-        mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead);
-        mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead);
+        mSrcMyThreadOffset =
+            SrcDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
+        mDstMyThreadOffset =
+            DstDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
     }
 
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
...
@@ -71,8 +71,10 @@ struct Blockwise3dTensorCopy1
             did[2] = is / ref_desc.GetStride(I2);
 
-            const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead);
-            const index_t dst_index = dst_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead);
+            const index_t src_index =
+                src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2] * DataPerRead);
+            const index_t dst_index =
+                dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2] * DataPerRead);
 
             *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
                 *(reinterpret_cast<const vector_t*>(p_src + src_index));
@@ -167,12 +169,13 @@ struct Blockwise3dTensorCopy3
        }
 
         constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(ThreadPerDims{});
-        const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());
+        const auto thread_multi_id =
+            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
 
-        mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(
+        mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(
             thread_multi_id[0], thread_multi_id[1], thread_multi_id[2] * DataPerRead);
-        mDstMyThreadOffset = DstDesc{}.Get1dIndex(
+        mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(
             thread_multi_id[0], thread_multi_id[1], thread_multi_id[2] * DataPerRead);
     }
@@ -214,14 +217,14 @@ struct Blockwise3dTensorCopy3
                 for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2)
                 {
                     const index_t src_offset =
-                        SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
-                                             iloop_d1 * thread_per_d1,
-                                             iloop_d2 * thread_per_d2 * DataPerRead);
+                        SrcDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
+                                                          iloop_d1 * thread_per_d1,
+                                                          iloop_d2 * thread_per_d2 * DataPerRead);
 
                     const index_t dst_offset =
-                        DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
-                                             iloop_d1 * thread_per_d1,
-                                             iloop_d2 * thread_per_d2 * DataPerRead);
+                        DstDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
+                                                          iloop_d1 * thread_per_d1,
+                                                          iloop_d2 * thread_per_d2 * DataPerRead);
 
                     *(reinterpret_cast<vector_t*>(&p_dst[dst_offset + mDstMyThreadOffset])) = *(
                         reinterpret_cast<const vector_t*>(&p_src[src_offset + mSrcMyThreadOffset]));
@@ -295,12 +298,12 @@ struct Blockwise3dTensorCopy3
                 for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2)
                 {
                     const index_t src_offset =
-                        SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
-                                             iloop_d1 * thread_per_d1,
-                                             iloop_d2 * thread_per_d2 * DataPerRead);
+                        SrcDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
+                                                          iloop_d1 * thread_per_d1,
+                                                          iloop_d2 * thread_per_d2 * DataPerRead);
 
-                    const index_t clipboard_offset =
-                        clipboard_desc.Get1dIndex(iloop_d0, iloop_d1, iloop_d2 * DataPerRead);
+                    const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
+                        iloop_d0, iloop_d1, iloop_d2 * DataPerRead);
 
                     *(reinterpret_cast<vector_t*>(&p_clipboard[clipboard_offset])) = *(
                         reinterpret_cast<const vector_t*>(&p_src[src_offset + mSrcMyThreadOffset]));
@@ -350,13 +353,13 @@ struct Blockwise3dTensorCopy3
 #pragma unroll
                 for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2)
                 {
-                    const index_t clipboard_offset =
-                        clipboard_desc.Get1dIndex(iloop_d0, iloop_d1, iloop_d2 * DataPerRead);
+                    const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
                        iloop_d0, iloop_d1, iloop_d2 * DataPerRead);
 
                     const index_t dst_offset =
-                        DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
-                                             iloop_d1 * thread_per_d1,
-                                             iloop_d2 * thread_per_d2 * DataPerRead);
+                        DstDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
+                                                          iloop_d1 * thread_per_d1,
+                                                          iloop_d2 * thread_per_d2 * DataPerRead);
 
                     *(reinterpret_cast<vector_t*>(&p_dst[dst_offset + mDstMyThreadOffset])) =
                         *(reinterpret_cast<const vector_t*>(&p_clipboard[clipboard_offset]));
...
@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
     constexpr auto dst_desc = DstDesc{};
 
-    constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
+    constexpr auto desc = make_packed_ConstantTensorDescriptor(dst_desc.GetLengths());
 
 #if 0
     if(get_thread_local_1d_id() == 0)
@@ -43,7 +43,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
         const index_t did3 = is / desc.GetStride(I3);
 
-        const index_t dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
+        const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);
 
         f(p_dst[dindex]);
     }
@@ -70,7 +70,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
            const index_t did3 = is / desc.GetStride(I3);
 
-            const index_t dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
+            const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);
 
             f(p_dst[dindex]);
         }
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
     constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};
-    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
+    constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
 
     constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
@@ -132,9 +132,10 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
         did[3] = is / ref_desc.GetStride(I3);
 
-        const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
-        const index_t dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
+        const index_t src_index = src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
+        const index_t dst_index =
+            dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
 
         f(p_src[src_index], p_dst[dst_index]);
     }
@@ -163,9 +164,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
            did[3] = is / ref_desc.GetStride(I3);
 
-            const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
-            const index_t dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
+            const index_t src_index =
+                src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
+            const index_t dst_index =
+                dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
 
             f(p_src[src_index], p_dst[dst_index]);
         }
@@ -256,7 +259,7 @@ struct Blockwise4dTensorCopy1
         constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead);
 
         constexpr auto ref_desc =
-            make_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
+            make_packed_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
 
         constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
@@ -278,9 +281,9 @@ struct Blockwise4dTensorCopy1
            did[3] = is / ref_desc.GetStride(I3);
 
             const index_t src_index =
-                src_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead);
+                src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3] * DataPerRead);
             const index_t dst_index =
-                dst_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead);
+                dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3] * DataPerRead);
 
             *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
                 *(reinterpret_cast<const vector_t*>(p_src + src_index));
@@ -333,19 +336,19 @@ struct BlockwiseChwnTensorCopyPadded
         constexpr auto src_desc = SrcDesc{};
         constexpr auto dst_desc = DstDesc{};
-        constexpr auto ref_desc = make_ConstantTensorDescriptor(DstOpLengths{});
+        constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(DstOpLengths{});
 
         constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0);
         constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1);
 
         constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
 
-        const Float* p_src_tmp =
-            p_src +
-            src_desc.Get1dIndex(c_block_data_begin,
-                                (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
-                                (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
-                                n_block_data_begin);
+        const Float* p_src_tmp = p_src +
+                                 src_desc.GetOffsetFromMultiIndex(
+                                     c_block_data_begin,
+                                     (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
+                                     (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
+                                     n_block_data_begin);
 
 #if 0
         if(get_thread_local_1d_id() == 0)
@@ -389,13 +392,13 @@ struct BlockwiseChwnTensorCopyPadded
            did[3] = is / ref_desc.GetStride(I3);
 
-            const index_t bindex = dst_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
+            const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
 
             p_dst[bindex] =
                 (did[1] < h_block_pad_low || did[1] + h_block_pad_up >= ref_desc.GetLength(I1) ||
                  did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2))
                     ? Float(0)
-                    : p_src_tmp[src_desc.Get1dIndex(did[0], did[1], did[2], did[3])];
+                    : p_src_tmp[src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3])];
         }
 
         constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
@@ -422,14 +425,16 @@ struct BlockwiseChwnTensorCopyPadded
                did[3] = is / ref_desc.GetStride(I3);
 
-                const index_t bindex = dst_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
+                const index_t bindex =
+                    dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
 
                 p_dst[bindex] =
                     (did[1] < h_block_pad_low ||
                     did[1] + h_block_pad_up >= ref_desc.GetLength(I1) ||
                     did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2))
                        ? Float(0)
-                        : p_src_tmp[src_desc.Get1dIndex(did[0], did[1], did[2], did[3])];
+                        : p_src_tmp[src_desc.GetOffsetFromMultiIndex(
+                              did[0], did[1], did[2], did[3])];
             }
         }
     }
@@ -505,18 +510,19 @@ struct Blockwise4dTensorCopy3
            }
        }
 
-        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(ThreadPerDims{});
-        const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());
+        constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(ThreadPerDims{});
+        const auto thread_multi_id =
+            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
 
-        mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(thread_multi_id[0],
-                                                  thread_multi_id[1],
-                                                  thread_multi_id[2],
-                                                  thread_multi_id[3] * DataPerRead);
+        mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(thread_multi_id[0],
+                                                               thread_multi_id[1],
+                                                               thread_multi_id[2],
+                                                               thread_multi_id[3] * DataPerRead);
 
-        mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_multi_id[0],
-                                                  thread_multi_id[1],
-                                                  thread_multi_id[2],
-                                                  thread_multi_id[3] * DataPerRead);
+        mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(thread_multi_id[0],
+                                                               thread_multi_id[1],
+                                                               thread_multi_id[2],
+                                                               thread_multi_id[3] * DataPerRead);
     }
 
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
@@ -564,17 +570,17 @@ struct Blockwise4dTensorCopy3
 #pragma unroll
                     for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
                     {
-                        const index_t src_offset =
-                            SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
-                                                 iloop_d1 * thread_per_d1,
-                                                 iloop_d2 * thread_per_d2,
-                                                 iloop_d3 * thread_per_d3 * DataPerRead);
+                        const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
+                            iloop_d0 * thread_per_d0,
+                            iloop_d1 * thread_per_d1,
+                            iloop_d2 * thread_per_d2,
+                            iloop_d3 * thread_per_d3 * DataPerRead);
 
-                        const index_t dst_offset =
-                            DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
-                                                 iloop_d1 * thread_per_d1,
-                                                 iloop_d2 * thread_per_d2,
-                                                 iloop_d3 * thread_per_d3 * DataPerRead);
+                        const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
+                            iloop_d0 * thread_per_d0,
+                            iloop_d1 * thread_per_d1,
+                            iloop_d2 * thread_per_d2,
+                            iloop_d3 * thread_per_d3 * DataPerRead);
 
                         *(reinterpret_cast<vector_t*>(&p_dst[dst_offset + mDstMyThreadOffset])) =
                             *(reinterpret_cast<const vector_t*>(
@@ -646,7 +652,7 @@ struct Blockwise4dTensorCopy3
         constexpr index_t nloop_d2 = L2 / thread_per_d2;
         constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
 
-        constexpr auto clipboard_desc = make_ConstantTensorDescriptor(
+        constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
             Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
 
 #pragma unroll
@@ -661,13 +667,13 @@ struct Blockwise4dTensorCopy3
 #pragma unroll
                     for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
                     {
-                        const index_t src_offset =
-                            SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
-                                                 iloop_d1 * thread_per_d1,
-                                                 iloop_d2 * thread_per_d2,
-                                                 iloop_d3 * thread_per_d3 * DataPerRead);
+                        const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
+                            iloop_d0 * thread_per_d0,
+                            iloop_d1 * thread_per_d1,
+                            iloop_d2 * thread_per_d2,
+                            iloop_d3 * thread_per_d3 * DataPerRead);
 
-                        const index_t clipboard_offset = clipboard_desc.Get1dIndex(
-                            iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead);
+                        const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
+                            iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead);
 
                         *(reinterpret_cast<vector_t*>(&p_clipboard[clipboard_offset])) =
@@ -713,7 +719,7 @@ struct Blockwise4dTensorCopy3
         constexpr index_t nloop_d2 = L2 / thread_per_d2;
         constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
 
-        constexpr auto clipboard_desc = make_ConstantTensorDescriptor(
+        constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
            Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
 
 #pragma unroll
@@ -728,14 +734,14 @@ struct Blockwise4dTensorCopy3
 #pragma unroll
                    for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
                    {
-                        const index_t clipboard_offset = clipboard_desc.Get1dIndex(
-                            iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead);
+                        const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
+                            iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead);
 
-                        const index_t dst_offset =
-                            DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
-                                                 iloop_d1 * thread_per_d1,
-                                                 iloop_d2 * thread_per_d2,
-                                                 iloop_d3 * thread_per_d3 * DataPerRead);
+                        const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
+                            iloop_d0 * thread_per_d0,
+                            iloop_d1 * thread_per_d1,
+                            iloop_d2 * thread_per_d2,
+                            iloop_d3 * thread_per_d3 * DataPerRead);
 
                         *(reinterpret_cast<vector_t*>(&p_dst[dst_offset + mDstMyThreadOffset])) =
                             *(reinterpret_cast<const vector_t*>(&p_clipboard[clipboard_offset]));
...
@@ -87,10 +87,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
        const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());

        mMyThreadOffsetA = c_thread_mtx_index.batch * BlockMatrixStrideA +
-                          a_block_mtx.Get1dIndex(0, c_thread_mtx_index.row);
+                          a_block_mtx.GetOffsetFromMultiIndex(0, c_thread_mtx_index.row);

        mMyThreadOffsetB = c_thread_mtx_index.batch * BlockMatrixStrideB +
-                          b_block_mtx.Get1dIndex(0, c_thread_mtx_index.col);
+                          b_block_mtx.GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);

#if 0
        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
@@ -221,10 +221,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                threadwise_matrix_copy(
                    a_block_mtx,
                    p_a_block +
-                       a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
+                       a_block_mtx.GetOffsetFromMultiIndex(k_begin,
+                                                           m_repeat * MPerLevel1Cluster) +
                        ib * BlockMatrixStrideA + mMyThreadOffsetA,
                    a_thread_mtx,
-                   p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                   p_a_thread +
+                       a_thread_mtx.GetOffsetFromMultiIndex(0, m_repeat * MPerThreadSubC),
                    a_thread_sub_mtx.GetLengths(),
                    Number<DataPerReadA>{});
            }
@@ -238,10 +240,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                threadwise_matrix_copy(
                    b_block_mtx,
                    p_b_block +
-                       b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
+                       b_block_mtx.GetOffsetFromMultiIndex(k_begin,
+                                                           n_repeat * NPerLevel1Cluster) +
                        ib * BlockMatrixStrideB + mMyThreadOffsetB,
                    b_thread_mtx,
-                   p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                   p_b_thread +
+                       b_thread_mtx.GetOffsetFromMultiIndex(0, n_repeat * NPerThreadSubC),
                    b_thread_sub_mtx.GetLengths(),
                    Number<DataPerReadB>{});
            }
@@ -343,9 +347,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
        reg_a[0] = *reinterpret_cast<const Float4*>(&p_a_block[mMyThreadOffsetA]);
        reg_b[0] = *reinterpret_cast<const Float4*>(&p_b_block[mMyThreadOffsetB]);
        reg_b[1] = *reinterpret_cast<const Float4*>(
-           &p_b_block[b_block_mtx.Get1dIndex(0, NPerLevel1Cluster) + mMyThreadOffsetB]);
+           &p_b_block[b_block_mtx.GetOffsetFromMultiIndex(0, NPerLevel1Cluster) +
+                      mMyThreadOffsetB]);
        reg_a[1] = *reinterpret_cast<const Float4*>(
-           &p_a_block[a_block_mtx.Get1dIndex(0, MPerLevel1Cluster) + mMyThreadOffsetA]);
+           &p_a_block[a_block_mtx.GetOffsetFromMultiIndex(0, MPerLevel1Cluster) +
+                      mMyThreadOffsetA]);

        outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
        outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
@@ -353,15 +359,17 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
        for(index_t k = 1; k < K; ++k)
        {
            reg_a[0] = *reinterpret_cast<const Float4*>(
-               &p_a_block[a_block_mtx.Get1dIndex(k, 0) + mMyThreadOffsetA]);
+               &p_a_block[a_block_mtx.GetOffsetFromMultiIndex(k, 0) + mMyThreadOffsetA]);
            outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
            reg_b[0] = *reinterpret_cast<const Float4*>(
-               &p_b_block[b_block_mtx.Get1dIndex(k, 0) + mMyThreadOffsetB]);
+               &p_b_block[b_block_mtx.GetOffsetFromMultiIndex(k, 0) + mMyThreadOffsetB]);
            outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
            reg_b[1] = *reinterpret_cast<const Float4*>(
-               &p_b_block[b_block_mtx.Get1dIndex(k, NPerLevel1Cluster) + mMyThreadOffsetB]);
+               &p_b_block[b_block_mtx.GetOffsetFromMultiIndex(k, NPerLevel1Cluster) +
+                          mMyThreadOffsetB]);
            reg_a[1] = *reinterpret_cast<const Float4*>(
-               &p_a_block[a_block_mtx.Get1dIndex(k, MPerLevel1Cluster) + mMyThreadOffsetA]);
+               &p_a_block[a_block_mtx.GetOffsetFromMultiIndex(k, MPerLevel1Cluster) +
+                          mMyThreadOffsetA]);

            outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
            outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
        }
@@ -489,7 +497,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
        const index_t c_thread_offset =
            c_thread_mtx_begin.batch * BlockMatrixStrideC +
-           c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);
+           c_block_mtx.GetOffsetFromMultiIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);

        for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
        {
@@ -498,12 +506,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                threadwise_matrix_copy(
                    c_thread_sub_mtx,
                    p_c_thread +
-                       c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
-                                                   n_repeat * NPerLevel1Cluster),
+                       c_thread_sub_mtx.GetOffsetFromMultiIndex(m_repeat * MPerLevel1Cluster,
+                                                                n_repeat * NPerLevel1Cluster),
                    c_block_mtx,
                    p_c_block +
-                       c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
-                                              n_repeat * NPerLevel1Cluster) +
+                       c_block_mtx.GetOffsetFromMultiIndex(m_repeat * MPerLevel1Cluster,
+                                                           n_repeat * NPerLevel1Cluster) +
                        c_thread_offset,
                    c_thread_sub_mtx.GetLengths());
            }
...
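The double-buffered k-loop above keeps two `Float4` slices of A and B in registers and accumulates an 8x8 thread tile through 4x4 outer products. A scalar reference for the assumed contract of `outerProduct4x4` (the real routine is vectorized; this only pins down the arithmetic):

```cpp
#include <cstdio>

struct Float4
{
    float v[4];
};

// Scalar reference: row i of the output tile accumulates a.v[i] * b.v[j].
void outerProduct4x4_ref(
    const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4& c2, Float4& c3)
{
    Float4* c[4] = {&c0, &c1, &c2, &c3};

    for(int i = 0; i < 4; ++i)
        for(int j = 0; j < 4; ++j)
            c[i]->v[j] += a.v[i] * b.v[j];
}

int main()
{
    Float4 a{{1, 2, 3, 4}}, b{{10, 20, 30, 40}};
    Float4 c[4] = {};
    outerProduct4x4_ref(a, b, c[0], c[1], c[2], c[3]);
    std::printf("%.0f\n", c[2].v[1]); // 60 = a[2] * b[1]
}
```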
@@ -51,8 +51,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
        auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());

-       mMyThreadOffsetA = BlockMatrixA::Get1dIndex(0, c_thread_mtx_index.row);
-       mMyThreadOffsetB = BlockMatrixB::Get1dIndex(0, c_thread_mtx_index.col);
+       mMyThreadOffsetA = BlockMatrixA::GetOffsetFromMultiIndex(0, c_thread_mtx_index.row);
+       mMyThreadOffsetB = BlockMatrixB::GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);
    }

    __device__ static auto GetThreadMatrixCLengths()
@@ -248,10 +248,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
            {
                threadwise_matrix_copy(
                    a_block_mtx,
-                   p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
+                   p_a_block +
+                       a_block_mtx.GetOffsetFromMultiIndex(k_begin, m_repeat * MPerLevel1Cluster) +
                        mMyThreadOffsetA,
                    a_thread_mtx,
-                   p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                   p_a_thread + a_thread_mtx.GetOffsetFromMultiIndex(0, m_repeat * MPerThreadSubC),
                    a_thread_sub_mtx.GetLengths(),
                    Number<DataPerReadA>{});
            }
@@ -262,10 +263,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
            {
                threadwise_matrix_copy(
                    b_block_mtx,
-                   p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
+                   p_b_block +
+                       b_block_mtx.GetOffsetFromMultiIndex(k_begin, n_repeat * NPerLevel1Cluster) +
                        mMyThreadOffsetB,
                    b_thread_mtx,
-                   p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                   p_b_thread + b_thread_mtx.GetOffsetFromMultiIndex(0, n_repeat * NPerThreadSubC),
                    b_thread_sub_mtx.GetLengths(),
                    Number<DataPerReadB>{});
            }
...
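Both gemm variants address their sub-tiles with the same bookkeeping: within the block tile, consecutive repeats sit MPerLevel1Cluster (or NPerLevel1Cluster) rows apart, while within the thread tile they sit MPerThreadSubC apart. A toy check of that arithmetic (all numbers hypothetical):

```cpp
#include <cstdio>

int main()
{
    // Hypothetical tile parameters.
    constexpr int MPerThreadSubC    = 4;  // rows a thread owns per repeat
    constexpr int MPerLevel1Cluster = 32; // row distance between repeats in the block tile
    constexpr int MRepeat           = 2;

    for(int m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
    {
        std::printf("repeat %d: read block row %d, write thread row %d\n",
                    m_repeat,
                    m_repeat * MPerLevel1Cluster, // source offset inside the block tile
                    m_repeat * MPerThreadSubC);   // destination offset inside the thread tile
    }
}
```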
@@ -11,7 +11,7 @@ template <index_t BlockSize,
          class SliceLengths,
          class SubLengths,
          class ClusterLengths,
-         class ThreadArrangeOrder,
+         class ThreadClusterArrangeOrder,
          class SrcAccessOrder,
          class DstAccessOrder>
struct BlockwiseTensorSliceCopy_generic_v1
@@ -21,35 +21,142 @@ struct BlockwiseTensorSliceCopy_generic_v1
    index_t mSrcMyThreadOffset;
    index_t mDstMyThreadOffset;

-   __device__ BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_multi_id_offset,
-                                                  Array<index_t, nDim> dst_block_multi_id_offset)
+   __device__ BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_multi_offset,
+                                                  Array<index_t, nDim> dst_block_multi_offset)
    {
-       // only support SrcSubLengths.GetLength() == 1 on merged dimension, for now
-       // check SrcDataPerRead should be 1, if last dimension is a merged dimension
        // check NDim consistent
+       static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), "wrong");
+
+       constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(
+           ClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+
+       // BlockSize
+       static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
+
+       // divide work
+       static_for<0, nDim, 1>{}([&](auto IDim) {
+           static_assert(SliceLengths{}.Get(IDim) % SubLengths{}.Get(IDim) == 0,
+                         "wrong! cannot evenly divide sliced tensor into sub-tensor");
+       });
+
+       constexpr auto thread_work_desc =
+           make_packed_ConstantTensorDescriptor(SliceLengths{} / SubLengths{});
+
+       static_for<0, nDim, 1>{}([&](auto IDim) {
+           static_assert(thread_work_desc.GetLength(IDim) % thread_cluster_desc.GetLength(IDim) ==
+                             0,
+                         "wrong! cannot evenly divide work to cluster");
+       });
+
+       // only support SubLengths.Get() == 1 on merged dimension, for now
+       static_for<0, nDim, 1>{}([&](auto IDim) {
+           static_if<(SrcDesc::ContainMultipleOriginalDimensions(IDim) ||
+                      DstDesc::ContainMultipleOriginalDimensions(IDim))>{}([&](auto fwd) {
+               static_assert(fwd(SubLengths{}).Get(IDim) == 1,
+                             "wrong! Sub-Lengths on merged dimension should be 1");
+           });
+       });

-       // calculate mSrcMyThreadOffset
-       // calculate mDstMyThreadOffset
+       // calculate mSrcMyThreadOffset, mDstMyThreadOffset
+       const auto thread_cluster_multi_id =
+           thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
+
+       const auto data_cluster_multi_id =
+           reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});
+
+       const auto thread_data_multi_offset = data_cluster_multi_id * SubLengths{};
+
+       mSrcMyThreadOffset =
+           SrcDesc::GetOffsetFromMultiIndex(src_block_multi_offset + thread_data_multi_offset);
+       mDstMyThreadOffset =
+           DstDesc::GetOffsetFromMultiIndex(dst_block_multi_offset + thread_data_multi_offset);
    }

-   __device__ static constexpr index_t GetRegisterClipboardSize() {}
+   __device__ static constexpr index_t GetRegisterClipboardSize()
+   {
+       constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
+
+       constexpr auto thread_tensor_desc =
+           make_packed_ConstantTensorDescriptor(SubLengths{} * repeat_lengths);
+
+       return thread_tensor_desc.GetElementSpaceSize();
+   }

    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
                                             Float* __restrict__ p_clipboard) const
    {
+       constexpr auto thread_sub_tensor_lengths = SubLengths{};
+
+       constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
+
+       constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
+
+       constexpr auto thread_tensor_desc =
+           make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
+
+       static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
+           constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
+
+           constexpr auto src_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
+
+           constexpr auto clipboard_data_multi_offset =
+               repeat_multi_id * thread_sub_tensor_lengths;
+
+           constexpr index_t src_offset =
+               SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_offset);
+
+           constexpr index_t clipboard_offset =
+               thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset);
+
+           threadwise_tensor_slice_copy_generic(SrcDesc{},
+                                                p_src + src_offset + mSrcMyThreadOffset,
+                                                zero_array<index_t, nDim>{},
+                                                thread_tensor_desc,
+                                                p_clipboard + clipboard_offset,
+                                                zero_array<index_t, nDim>{},
+                                                thread_sub_tensor_lengths,
+                                                SrcAccessOrder{});
+       });
    }

    __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
                                              Float* __restrict__ p_dst) const
    {
+       constexpr auto thread_sub_tensor_lengths = SubLengths{};
+
+       constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
+
+       constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
+
+       constexpr auto thread_tensor_desc =
+           make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
+
+       static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
+           constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
+
+           constexpr auto clipboard_data_multi_offset =
+               repeat_multi_id * thread_sub_tensor_lengths;
+
+           constexpr auto dst_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
+
+           constexpr index_t clipboard_offset =
+               thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset);
+
+           constexpr index_t dst_offset =
+               DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_offset);
+
+           threadwise_tensor_slice_copy_generic(thread_tensor_desc,
+                                                p_clipboard + clipboard_offset,
+                                                zero_array<index_t, nDim>{},
+                                                DstDesc{},
+                                                p_dst + dst_offset + mDstMyThreadOffset,
+                                                zero_array<index_t, nDim>{},
+                                                thread_sub_tensor_lengths,
+                                                DstAccessOrder{});
+       });
    }

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        Float p_clipboard[GetRegisterClipboardSize()];

        RunLoadRegisterClipboard(p_src, p_clipboard);
        RunStoreRegisterClipboard(p_clipboard, p_dst);
    }
};
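The new constructor's offset computation reduces to three steps: unfold the flat thread id into a cluster multi-index, reorder it by the arrange order, and scale by the per-thread sub-lengths. A self-contained 2-d illustration of the same decomposition (all names and numbers hypothetical):

```cpp
#include <array>
#include <cstdio>

constexpr int nDim = 2;

// Unfold a flat id into the multi-index of a packed shape (innermost fastest).
std::array<int, nDim> multi_index_from_1d(int id, std::array<int, nDim> lengths)
{
    std::array<int, nDim> idx{};
    for(int d = nDim - 1; d >= 0; --d)
    {
        idx[d] = id % lengths[d];
        id /= lengths[d];
    }
    return idx;
}

int main()
{
    const std::array<int, nDim> cluster_lengths{4, 16}; // 64 threads per block
    const std::array<int, nDim> sub_lengths{1, 4};      // data per thread per dimension

    const int tid  = 37;
    const auto cid = multi_index_from_1d(tid, cluster_lengths); // {2, 5}

    // The thread's first data coordinate inside the block slice.
    std::printf("thread %d -> cluster (%d, %d) -> data begin (%d, %d)\n",
                tid, cid[0], cid[1], cid[0] * sub_lengths[0], cid[1] * sub_lengths[1]);
}
```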
@@ -39,7 +39,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_cluster_lengths =
            src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);

-       constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(thread_cluster_lengths);
+       constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(thread_cluster_lengths);

        // sanity check: data type
        static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
@@ -105,7 +105,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
            }
        }

-       const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());
+       const auto thread_multi_id =
+           thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

        // compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
        // registers, or only one copy???
@@ -115,17 +116,21 @@ struct BlockwiseTensorSliceReorderCopy_v3
        static_for<0, nDim, 1>{}([&](auto IDim) {
            constexpr auto I = decltype(IDim){};
            constexpr index_t i = I.Get();

-           // compiler: will it really compute index here, or be merged with Get1dIndex and
+           // compiler: will it really compute index here, or be merged with
+           // GetOffsetFromMultiIndex and
            // optimized away???
            src_data_multi_id[i] *= src_sub_lengths.Get(I);
        });

-       // compiler: will it really compute index here, or be merged with Get1dIndex and
+       // compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex
+       // and
        // optimized away???
        const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src);

-       mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id + src_block_data_multi_id_begin);
-       mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id + dst_block_data_multi_id_begin);
+       mSrcMyThreadOffset =
+           src_desc.GetOffsetFromMultiIndex(src_data_multi_id + src_block_data_multi_id_begin);
+       mDstMyThreadOffset =
+           dst_desc.GetOffsetFromMultiIndex(dst_data_multi_id + dst_block_data_multi_id_begin);
    }

    __device__ static constexpr index_t GetRegisterClipboardSize()
@@ -142,7 +147,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

-       constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
+       constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);

        return thread_tensor_desc.GetElementSpace();
    }
@@ -162,7 +167,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

-       constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
+       constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
@@ -171,9 +176,9 @@ struct BlockwiseTensorSliceReorderCopy_v3
            constexpr auto clipboard_data_multi_id = repeat_multi_id * thread_sub_tensor_lengths;

-           constexpr index_t src_offset = SrcDesc{}.Get1dIndex(src_data_multi_id);
+           constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_id);
            constexpr index_t clipboard_offset =
-               thread_tensor_desc.Get1dIndex(clipboard_data_multi_id);
+               thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);

            threadwise_tensor_slice_copy(SrcDesc{},
                                         p_src + src_offset + mSrcMyThreadOffset,
@@ -199,7 +204,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

-       constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
+       constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
@@ -212,9 +217,9 @@ struct BlockwiseTensorSliceReorderCopy_v3
            constexpr auto dst_data_multi_id = src_data_multi_id.ReorderGivenNew2Old(MapDst2Src{});

            constexpr index_t clipboard_offset =
-               thread_tensor_desc.Get1dIndex(clipboard_data_multi_id);
+               thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);

-           constexpr index_t dst_offset = DstDesc{}.Get1dIndex(dst_data_multi_id);
+           constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id);

            // write in the order of dst
#if 1
...
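GetRegisterClipboardSize in both copy classes is a pure product: each thread buffers SubLengths * RepeatLengths elements per dimension, where RepeatLengths = SliceLengths / (SubLengths * ClusterLengths). Toy numbers make the coverage invariant visible (values hypothetical):

```cpp
int main()
{
    // One dimension of a hypothetical blockwise copy.
    constexpr int SliceLength   = 128; // elements to move in this dimension
    constexpr int SubLength     = 4;   // elements per thread per pass
    constexpr int ClusterLength = 16;  // threads cooperating in this dimension

    constexpr int RepeatLength = SliceLength / (SubLength * ClusterLength); // 2 passes
    constexpr int ThreadLength = SubLength * RepeatLength;                  // 8 registers

    static_assert(ThreadLength * ClusterLength == SliceLength,
                  "every element is owned by exactly one (thread, repeat) pair");
    return 0;
}
```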
@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
    constexpr auto HO = HI + 1 - Y;
    constexpr auto WO = WI + 1 - X;

-   return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
+   return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
}

template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
    constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
    constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;

-   return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
+   return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
}

template <class InDesc, class WeiDesc, class OutDesc>
...
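Both helpers encode the stride-1 output-size rule, HO = HI + HPadLow + HPadUp + 1 - Y (pads are zero in the unpadded variant, and W is analogous). A quick stand-alone check of the formula:

```cpp
#include <cstdio>

// Stride-1 convolution output size; pads default to 0 for the "valid" case.
constexpr int conv_out_size(int in, int filter, int pad_low = 0, int pad_up = 0)
{
    return in + pad_low + pad_up + 1 - filter;
}

int main()
{
    std::printf("%d\n", conv_out_size(28, 3));       // 26: valid 3x3 convolution
    std::printf("%d\n", conv_out_size(28, 3, 1, 1)); // 28: "same" padding for 3x3
}
```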
@@ -180,18 +180,19 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
            c_block_data_begin += CPerBlock, __syncthreads())
        {
            // copy input tensor to LDS
-           blockwise_in_copy.Run(p_in_global +
-                                     in_nchw_global_desc.Get1dIndex(n_block_data_begin,
-                                                                    c_block_data_begin,
-                                                                    hi_block_data_begin,
-                                                                    wi_block_data_begin),
-                                 p_in_block);
+           blockwise_in_copy.Run(
+               p_in_global +
+                   in_nchw_global_desc.GetOffsetFromMultiIndex(n_block_data_begin,
+                                                               c_block_data_begin,
+                                                               hi_block_data_begin,
+                                                               wi_block_data_begin),
+               p_in_block);

            // copy weight tensor to LDS
-           blockwise_wei_copy.Run(
-               p_wei_global +
-                   wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
-               p_wei_block);
+           blockwise_wei_copy.Run(p_wei_global +
+                                      wei_kcyx_global_desc.GetOffsetFromMultiIndex(
+                                          k_block_data_begin, c_block_data_begin, 0, 0),
+                                  p_wei_block);

            __syncthreads();
@@ -202,26 +203,28 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
                threadwise_direct_convolution_2(
                    in_nchw_thread_block_desc,
                    p_in_block +
-                       in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
-                                                     c_thread_data,
-                                                     hi_thread_data_begin,
-                                                     wi_thread_data_begin),
+                       in_nchw_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin,
+                                                                  c_thread_data,
+                                                                  hi_thread_data_begin,
+                                                                  wi_thread_data_begin),
                    wei_kcyx_thread_block_desc,
                    p_wei_block +
-                       wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                       wei_kcyx_block_desc.GetOffsetFromMultiIndex(
+                           k_thread_data_begin, c_thread_data, 0, 0),
                    out_nkhw_thread_desc,
                    p_out_thread);
#elif 0
                threadwise_direct_convolution_3(
                    in_nchw_thread_block_desc,
                    p_in_block +
-                       in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
-                                                     c_thread_data,
-                                                     hi_thread_data_begin,
-                                                     wi_thread_data_begin),
+                       in_nchw_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin,
+                                                                  c_thread_data,
+                                                                  hi_thread_data_begin,
+                                                                  wi_thread_data_begin),
                    wei_kcyx_thread_block_desc,
                    p_wei_block +
-                       wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                       wei_kcyx_block_desc.GetOffsetFromMultiIndex(
+                           k_thread_data_begin, c_thread_data, 0, 0),
                    out_nkhw_thread_desc,
                    p_out_thread);
#endif
@@ -229,16 +232,16 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
            }

        // copy output tensor from register to global mem
-       threadwise_tensor_slice_copy(
-           out_nkhw_thread_desc,
-           p_out_thread,
-           out_nkhw_global_desc,
-           p_out_global +
-               out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
-                                               k_block_data_begin + k_thread_data_begin,
-                                               ho_block_data_begin + ho_thread_data_begin,
-                                               wo_block_data_begin + wo_thread_data_begin),
-           out_nkhw_thread_desc.GetLengths(),
-           Number<1>{});
+       threadwise_tensor_slice_copy(out_nkhw_thread_desc,
+                                    p_out_thread,
+                                    out_nkhw_global_desc,
+                                    p_out_global +
+                                        out_nkhw_global_desc.GetOffsetFromMultiIndex(
+                                            n_block_data_begin + n_thread_data_begin,
+                                            k_block_data_begin + k_thread_data_begin,
+                                            ho_block_data_begin + ho_thread_data_begin,
+                                            wo_block_data_begin + wo_thread_data_begin),
+                                    out_nkhw_thread_desc.GetLengths(),
+                                    Number<1>{});
    }
};
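The write-back composes coordinates at two granularities before flattening: the block's begin plus the thread's begin in each of N, K, Ho, Wo. Stated as a tiny helper (illustrative only, not the kernel's code):

```cpp
#include <cstdio>

struct Coord4
{
    int n, k, h, w;
};

// Global coordinate of a thread's first output element: block begin + thread begin.
Coord4 global_begin(const Coord4& block, const Coord4& thread)
{
    return {block.n + thread.n, block.k + thread.k, block.h + thread.h, block.w + thread.w};
}

int main()
{
    const Coord4 g = global_begin({0, 64, 8, 0}, {2, 4, 1, 3});
    std::printf("(%d, %d, %d, %d)\n", g.n, g.k, g.h, g.w); // (2, 68, 9, 3)
}
```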
@@ -221,11 +221,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
        const Float* p_in_global_block_offset =
            p_in_global +
-           in_c_h_w_n_global_desc.Get1dIndex(
+           in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
                0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);

        const Float* p_wei_global_block_offset =
-           p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
+           p_wei_global +
+           wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);

        for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
                    p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0),
@@ -261,8 +262,8 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
#else
                blockwise_batch_gemm.Run_asm
#endif
-                   (p_wei_block + wei_c_y_x_k_block_desc.Get1dIndex(0, y, x, 0),
-                    p_in_block + in_c_h_w_n_block_desc.Get1dIndex(0, y, x, 0),
+                   (p_wei_block + wei_c_y_x_k_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
+                    p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
                     p_out_thread);
            }
        }
@@ -325,17 +326,17 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
            }
#endif

-           threadwise_tensor_slice_copy(
-               out_10d_thread_desc,
-               p_out_thread,
-               out_10d_global_desc,
-               p_out_global +
-                   out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
-                                                      ho_block_data_begin + ho_thread_data_begin,
-                                                      wo_block_data_begin + wo_thread_data_begin,
-                                                      n_block_data_begin + n_thread_data_begin),
-               out_10d_thread_desc.GetLengths(),
-               Number<OutThreadCopyDataPerWrite_N>{});
+           threadwise_tensor_slice_copy(out_10d_thread_desc,
+                                        p_out_thread,
+                                        out_10d_global_desc,
+                                        p_out_global +
+                                            out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
+                                                k_block_data_begin + k_thread_data_begin,
+                                                ho_block_data_begin + ho_thread_data_begin,
+                                                wo_block_data_begin + wo_thread_data_begin,
+                                                n_block_data_begin + n_thread_data_begin),
+                                        out_10d_thread_desc.GetLengths(),
+                                        Number<OutThreadCopyDataPerWrite_N>{});
        }).else_([&](auto f_dummy) {
            static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                              GemmNPerThreadSubC % NPerThread == 0,
@@ -375,17 +376,17 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
            }
#endif

-           threadwise_tensor_slice_copy(
-               out_10d_thread_desc,
-               p_out_thread,
-               out_10d_global_desc,
-               p_out_global +
-                   out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
-                                                      ho_block_data_begin + ho_thread_data_begin,
-                                                      wo_block_data_begin + wo_thread_data_begin,
-                                                      n_block_data_begin + n_thread_data_begin),
-               out_10d_thread_desc.GetLengths(),
-               Number<OutThreadCopyDataPerWrite_N>{});
+           threadwise_tensor_slice_copy(out_10d_thread_desc,
+                                        p_out_thread,
+                                        out_10d_global_desc,
+                                        p_out_global +
+                                            out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
+                                                k_block_data_begin + k_thread_data_begin,
+                                                ho_block_data_begin + ho_thread_data_begin,
+                                                wo_block_data_begin + wo_thread_data_begin,
+                                                n_block_data_begin + n_thread_data_begin),
+                                        out_10d_thread_desc.GetLengths(),
+                                        Number<OutThreadCopyDataPerWrite_N>{});
        });
    }
};
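threadwise_tensor_slice_copy walks an n-d window between two descriptors, and the trailing Number<> argument widens the innermost accesses. A minimal 2-d scalar analogue of the access pattern (not the library routine; a vectorized variant would step the inner loop by that width):

```cpp
#include <array>
#include <cstdio>

// Copy a lengths[0] x lengths[1] window; row strides come from each layout.
void slice_copy_2d(const float* src, int src_row_stride,
                   float* dst, int dst_row_stride,
                   std::array<int, 2> lengths)
{
    for(int i = 0; i < lengths[0]; ++i)
        for(int j = 0; j < lengths[1]; ++j)
            dst[i * dst_row_stride + j] = src[i * src_row_stride + j];
}

int main()
{
    float src[4 * 8] = {};
    float dst[2 * 4] = {};
    src[1 * 8 + 2]   = 7.0f;

    slice_copy_2d(&src[1 * 8 + 1], 8, dst, 4, {2, 3}); // copy a 2x3 window at (1,1)
    std::printf("%.1f\n", dst[1]); // 7.0: src(1,2) landed at dst(0,1)
}
```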
@@ -230,11 +230,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
#if 1
        const Float* p_in_global_block_offset =
            p_in_global +
-           in_c_h_w_n_global_desc.Get1dIndex(
+           in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
                0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);

        const Float* p_wei_global_block_offset =
-           p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
+           p_wei_global +
+           wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);

        for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
                    p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0),
@@ -242,22 +243,24 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
        {
            for(index_t y = 0; y < Y; ++y)
            {
-               blockwise_in_copy.Run(p_in_global_block_offset +
-                                         in_c_h_w_n_global_desc.Get1dIndex(0, y, 0, 0),
-                                     p_in_block);
+               blockwise_in_copy.Run(
+                   p_in_global_block_offset +
+                       in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0),
+                   p_in_block);

-               blockwise_wei_copy.Run(p_wei_global_block_offset +
-                                          wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, 0),
-                                      p_wei_block);
+               blockwise_wei_copy.Run(
+                   p_wei_global_block_offset +
+                       wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0),
+                   p_wei_block);

                __syncthreads();

                for(index_t x = 0; x < X; ++x)
                {
-                   blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0),
-                                            p_in_block +
-                                                in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0),
-                                            p_out_thread);
+                   blockwise_batch_gemm.Run(
+                       p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0),
+                       p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0),
+                       p_out_thread);
                }

                __syncthreads();
@@ -269,11 +272,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
        {
            const Float* p_in_global_block_offset =
                p_in_global +
-               in_c_h_w_n_global_desc.Get1dIndex(
+               in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
                    0, hi_block_data_begin + y, wi_block_data_begin, n_block_data_begin);

            const Float* p_wei_global_block_offset =
-               p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, k_block_data_begin);
+               p_wei_global +
+               wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, k_block_data_begin);

            for(index_t
                    c_block_data_begin = 0;
@@ -290,10 +294,10 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
                for(index_t x = 0; x < X; ++x)
                {
-                   blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0),
-                                            p_in_block +
-                                                in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0),
-                                            p_out_thread);
+                   blockwise_batch_gemm.Run(
+                       p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0),
+                       p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0),
+                       p_out_thread);
                }

                __syncthreads();
@@ -358,17 +362,17 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
            }
#endif

-           threadwise_tensor_slice_copy(
-               out_10d_thread_desc,
-               p_out_thread,
-               out_10d_global_desc,
-               p_out_global +
-                   out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
-                                                      ho_block_data_begin + ho_thread_data_begin,
-                                                      wo_block_data_begin + wo_thread_data_begin,
-                                                      n_block_data_begin + n_thread_data_begin),
-               out_10d_thread_desc.GetLengths(),
-               Number<OutThreadCopyDataPerWrite_N>{});
+           threadwise_tensor_slice_copy(out_10d_thread_desc,
+                                        p_out_thread,
+                                        out_10d_global_desc,
+                                        p_out_global +
+                                            out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
+                                                k_block_data_begin + k_thread_data_begin,
+                                                ho_block_data_begin + ho_thread_data_begin,
+                                                wo_block_data_begin + wo_thread_data_begin,
+                                                n_block_data_begin + n_thread_data_begin),
+                                        out_10d_thread_desc.GetLengths(),
+                                        Number<OutThreadCopyDataPerWrite_N>{});
        }).else_([&](auto f_dummy) {
            static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                              GemmNPerThreadSubC % NPerThread == 0,
@@ -408,17 +412,17 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
            }
#endif

-           threadwise_tensor_slice_copy(
-               out_10d_thread_desc,
-               p_out_thread,
-               out_10d_global_desc,
-               p_out_global +
-                   out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
-                                                      ho_block_data_begin + ho_thread_data_begin,
-                                                      wo_block_data_begin + wo_thread_data_begin,
-                                                      n_block_data_begin + n_thread_data_begin),
-               out_10d_thread_desc.GetLengths(),
-               Number<OutThreadCopyDataPerWrite_N>{});
+           threadwise_tensor_slice_copy(out_10d_thread_desc,
+                                        p_out_thread,
+                                        out_10d_global_desc,
+                                        p_out_global +
+                                            out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
+                                                k_block_data_begin + k_thread_data_begin,
+                                                ho_block_data_begin + ho_thread_data_begin,
+                                                wo_block_data_begin + wo_thread_data_begin,
+                                                n_block_data_begin + n_thread_data_begin),
+                                        out_10d_thread_desc.GetLengths(),
+                                        Number<OutThreadCopyDataPerWrite_N>{});
        });
    }
};
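v1r2's main loop stages one filter row y into LDS, runs X shifted gemms from it, and brackets the LDS write and read phases with barriers. The control flow in isolation (stand-in callables; not the kernel itself):

```cpp
#include <cstdio>

// Skeleton of the (y, x) filter-tap pipeline; copy_in/copy_wei/gemm/sync are
// stand-ins for the blockwise copies, the batched gemm, and __syncthreads().
template <class CopyIn, class CopyWei, class Gemm, class Sync>
void tap_loop(int Y, int X, CopyIn copy_in, CopyWei copy_wei, Gemm gemm, Sync sync)
{
    for(int y = 0; y < Y; ++y)
    {
        copy_in(y);  // stage input row y into LDS
        copy_wei(y); // stage weight row y into LDS
        sync();      // LDS writes must be visible before any thread reads

        for(int x = 0; x < X; ++x)
            gemm(x); // accumulate using the x-shifted LDS views

        sync();      // protect LDS before the next iteration overwrites it
    }
}

int main()
{
    tap_loop(3, 3,
             [](int y) { std::printf("copy in  row %d\n", y); },
             [](int y) { std::printf("copy wei row %d\n", y); },
             [](int x) { std::printf("gemm tap %d\n", x); },
             [] { std::printf("barrier\n"); });
}
```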