"vscode:/vscode.git/clone" did not exist on "475b21b4f76924ef812f90911dece401bd20b502"
Commit b37cb71f authored by Wen-Heng (Jack) Chung

Enable bwd wrw

parent c5143bca
@@ -2,465 +2,57 @@
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif
namespace ck {
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor.
// The memory layout (ordering of dimensions) can be different between src and dst.
// This function assumes each thread is reading and writing a normal (not merged) tensor,
// to simplify index calculations. To satisfy this assumption, the user needs to make sure
// that, on a merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-length. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are performed
// in the constructor of BlockwiseGenericTensorSliceCopy_v1
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SliceLengths,
class SubLengths,
class ThreadClusterLengths,
class ThreadClusterArrangeOrder,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v1
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
static constexpr index_t nOriginalDimSrc =
SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
static constexpr index_t nOriginalDimDst =
DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
// per-thread offset
index_t mThreadSrcOffset;
index_t mThreadDstOffset;
// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside constructor, and would be
// updated if slicing-window is moved. However, they will not be used if you always move
// the slicing-window along a non-merged dimension. In that case, compiler should be
// able to remove these calculation.
// TODO: make sure compiler would actually remove them in that case
// partial offset in each (merged) dimension
Array<index_t, nDim> mThreadSrcPartialOffsets;
Array<index_t, nDim> mThreadDstPartialOffsets;
// multi-id of original tensor
Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;
__device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_id_begin,
Array<index_t, nDim> dst_block_data_id_begin)
{
// check NDim consistency
static_assert(
nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
"wrong");
// check thread arrange order and read/write access order are valid
static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong!");
// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
// divide work
constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};
static_for<0, nDim, 1>{}([&](auto IDim) {
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into cluster");
});
constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
// additional check for merged dimension
static_for<0, nDim, 1>{}([&](auto IDim_) {
// src
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};
// on a merged dimension that contains multiple original dimensions,
// the length of the last original dimension needs to be evenly divisible by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr auto idim_last_original_src =
SrcDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) %
SubLengths::Get(IDim) ==
0,
"wrong!");
// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths shoud be 1 on merged dimension");
});
// dst
static_if<DstDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};
// on a merged dimension that contains multiple original dimensions,
// the length of the last original dimension needs to be evenly divisible by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr auto idim_last_original_dst =
DstDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) %
SubLengths::Get(IDim) ==
0,
"wrong!");
// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths shoud be 1 on merged dimension");
});
});
// calculate mThreadSrcOffset, mThreadDstOffset
const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
const auto data_cluster_id =
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
const auto thread_data_id_begin = data_cluster_id * SubLengths{};
// original multi-id
mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
src_block_data_id_begin + thread_data_id_begin);
mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
dst_block_data_id_begin + thread_data_id_begin);
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
});
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto dst_partial_original_dims =
DstDesc::GetContainedOriginalDimensions(IDim);
constexpr auto dst_partial_original_desc =
DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);
mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
});
// complete offset
mThreadSrcOffset = accumulate_on_array(
mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
mThreadDstOffset = accumulate_on_array(
mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}
__device__ static constexpr auto GetRegisterBufferDescriptor()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
}
__device__ static constexpr index_t GetRegisterBufferSize()
{
return GetRegisterBufferDescriptor().GetElementSpace();
}
template <class TData>
__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
TData* __restrict__ p_buffer) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
constexpr auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
constexpr index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
constexpr index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
const index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#endif
// Position the origin of the per-thread window at the point where the multi-index
// of the SrcDesc (which might be a merged tensor) is all-zero. This threadwise slice copy
// assumes each thread is copying a normal (not merged) tensor.
// To satisfy this assumption, the user needs to make sure that, on a merged dimension
// that contains multiple original dimensions, the length of the last original
// dimension is evenly divisible by its sub-length. Also, the repeat-length on
// the merged dimension needs to be 1. These sanity checks are performed in the constructor
// of BlockwiseGenericTensorSliceCopy_v1
ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
decltype(thread_buffer_desc),
SubLengths,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcDataPerAccess,
1>(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
.Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
});
}
template <class TData>
__device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
TData* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;
constexpr index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;
const index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#endif
// Position the origin of the per-thread window at the point where the multi-index
// of the SrcDesc (which might be a merged tensor) is all-zero. This threadwise slice copy
// assumes each thread is copying a normal (not merged) tensor.
// To satisfy this assumption, the user needs to make sure that, on a merged dimension
// that contains multiple original dimensions, the length of the last original
// dimension is evenly divisible by its sub-length. Also, the repeat-length on
// the merged dimension needs to be 1. These sanity checks are performed in the constructor
// of BlockwiseGenericTensorSliceCopy_v1
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
DstDesc,
SubLengths,
DstDimAccessOrder,
DstVectorAccessDim,
1,
DstDataPerAccess>(
make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
.Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
});
}
template <class TData>
__device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
{
TData p_buffer[GetRegisterBufferSize()];
RunLoadRegisterBuffer(p_src, p_buffer);
RunStoreRegisterBuffer(p_buffer, p_dst);
}
// When moving the slicing window along a merged dimension, if the strides of the
// original dimensions contained by the merged dimension are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in the positive direction (and vice versa for movement in the negative
// direction). As a result, there is the possibility that the offset calculation may
// result in unsigned integer underflow (due to the "-" operation). However, this hazard
// should not happen, as long as the user makes sure the slicing window is not moved out
// of the boundary of the tensor being sliced. This function doesn't do a runtime sanity
// check for an out-of-bound slicing window, for performance reasons.
template <index_t IDim_, index_t StepSize, bool PositiveDirection>
__device__ void MoveSlicingWindowOnSourceTensor(
Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
{
constexpr auto IDim = Number<IDim_>{};
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
// logic for a merged dimension; it also works for a non-merged dimension, but its logic
// may be unnecessarily complicated for the compiler to remove calculations that are
// useless for a non-merged dimension
// extract partial original dimensions
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
// calculate new partial original multi-id
auto old_src_partial_original_id =
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);
auto new_src_partial_original_id =
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
old_src_partial_original_id, StepSize, direction);
// update "mThreadSrcOriginalMultiId"
static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) {
constexpr auto IDimOriginal = src_partial_original_dims[I];
mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I];
});
// calculate new partial offset on this merged dimension
const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim];
const index_t new_src_partial_offset =
src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id);
// update "mThreadSrcPartialOffsets"
mThreadSrcPartialOffsets(IDim) = new_src_partial_offset;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
}).Else([&](auto) {
// Logic for a non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
// which are being calculated here, will never be used later. In this case, the compiler
// should be able to remove these calculations.
// TODO: make sure the compiler would actually remove them in this case.
// It is the user's responsibility to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be a hazard like
// unsigned integer underflow. There is NO runtime sanity check to prevent the hazard.
constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
static_if<PositiveDirection>{}([&](auto fwd) {
mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(IDimOriginal) += StepSize;
mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
}).Else([&](auto fwd) {
mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize;
mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
});
});
}
template <class T, bool PositiveDirection>
__device__ void
MoveSrcSlicingWindow(T step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
{
static_for<0, nDim, 1>{}([&](auto idim) {
if(step_sizes[idim] != 0)
{
MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction);
}
});
}
};
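// A standalone host-side sketch (hypothetical sizes, not this library's API) of how the
// blockwise copy above divides work: SliceLengths must factor into
// SubLengths * ThreadClusterLengths * repeat_lengths, and BlockSize must equal the product
// of ThreadClusterLengths. All names and numbers below are illustrative assumptions.
#include <array>
#include <cassert>
#include <cstdio>

int main()
{
    constexpr int nDim = 2;
    constexpr std::array<int, nDim> slice_lengths          = {8, 128}; // hypothetical slice
    constexpr std::array<int, nDim> sub_lengths            = {4, 1};   // per-thread tile
    constexpr std::array<int, nDim> thread_cluster_lengths = {2, 128}; // threads per dim

    int block_size = 1;
    for(int d = 0; d < nDim; ++d)
    {
        const int data_per_cluster = sub_lengths[d] * thread_cluster_lengths[d];
        assert(slice_lengths[d] % data_per_cluster == 0 &&
               "cannot evenly divide sliced tensor into cluster");
        const int repeat = slice_lengths[d] / data_per_cluster;
        std::printf("dim %d: repeat_length = %d\n", d, repeat);
        block_size *= thread_cluster_lengths[d];
    }
    std::printf("required BlockSize = %d\n", block_size); // 256 for these numbers
}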
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths,
class SubLengths,
class ThreadClusterLengths,
class ThreadClusterArrangeOrder,
class SrcDimAccessOrder,
class DstDimAccessOrder,
typename BlockSrcDesc,
typename BlockDstDesc,
typename BlockSliceLengths,
typename ThreadSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v2
struct BlockwiseGenericTensorSliceCopy_v4
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
__device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
DstCoordinate dst_block_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize(),
static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseGenericTensorSliceCopy_v4(const Index& src_block_slice_origin,
const Index& dst_block_slice_origin)
{
static_assert(nDim == BlockSrcDesc::GetNumOfDimension() &&
nDim == BlockDstDesc::GetNumOfDimension() &&
nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(
is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// map threads to cluster
constexpr auto thread_cluster_desc =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
"wrong! BlockSize not consistent with ThreadClusterLengths");
const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());
const auto data_cluster_id =
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
const auto thread_data_id_begin = data_cluster_id * SubLengths{};
const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
@@ -469,438 +61,166 @@ struct BlockwiseGenericTensorSliceCopy_v2
mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
}
__device__ static constexpr index_t GetRegisterBufferSize()
__device__ static constexpr index_t GetThreadBufferSize()
{
return RegisterBufferDesc::GetElementSpace();
return ThreadBufferDesc::GetElementSpace();
}
template <class TData>
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
template <typename BlockSrcData,
typename ThreadBufferData,
AddressSpace BlockSrcAddressSpace,
AddressSpace ThreadBufferAddressSpace>
__device__ void
RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer,
integral_constant<AddressSpace, BlockSrcAddressSpace>,
integral_constant<AddressSpace, ThreadBufferAddressSpace>) const
{
constexpr auto block_src_address_space =
integral_constant<AddressSpace, BlockSrcAddressSpace>{};
constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseLoad.Run_optimized_src_address_calculation(
p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
}
else
{
mThreadwiseLoad.Run(
p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
}
}
template <typename BlockSrcData, typename ThreadBufferData>
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
RunLoadThreadBuffer(
p_block_src, p_thread_buffer, generic_address_space, generic_address_space);
}
template <typename ThreadBufferData,
typename BlockDstData,
AddressSpace ThreadBufferAddressSpace,
AddressSpace BlockDstAddressSpace>
__device__ void
RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, ThreadBufferAddressSpace>,
integral_constant<AddressSpace, BlockDstAddressSpace>) const
{
constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
constexpr auto block_dst_address_space =
integral_constant<AddressSpace, BlockDstAddressSpace>{};
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseStore.Run_optimized_dst_address_calculation(
p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
}
else
{
mThreadwiseStore.Run(
p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
}
}
template <typename ThreadBufferData, typename BlockDstData>
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
RunStoreThreadBuffer(
p_thread_buffer, p_block_dst, generic_address_space, generic_address_space);
}
template <typename BlockSrcData,
typename BlockDstData,
AddressSpace BlockSrcAddressSpace,
AddressSpace BlockDstAddressSpace>
__device__ void
Run(const BlockSrcData* p_block_src,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, BlockSrcAddressSpace> block_src_address_space,
integral_constant<AddressSpace, BlockDstAddressSpace> block_dst_address_space) const
{
mThreadwiseLoad.Run(p_src, p_buffer);
}
BlockSrcData p_thread_buffer[GetThreadBufferSize()];
template <class TData>
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
{
mThreadwiseStore.Run(p_buffer, p_dst);
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
RunLoadThreadBuffer(
p_block_src, p_thread_buffer, block_src_address_space, generic_address_space);
// if there is type conversion, it's done during store
RunStoreThreadBuffer(
p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space);
}
template <class TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
template <typename BlockSrcData, typename BlockDstData>
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
{
TData p_buffer[GetRegisterBufferSize()];
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
mThreadwiseLoad.Run(p_src, p_buffer);
mThreadwiseStore.Run(p_buffer, p_dst);
Run(p_block_src, p_block_dst, generic_address_space, generic_address_space);
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void
MoveSrcSlicingWindow(T step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
MoveSrcSliceWindow(const T& step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
{
mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void
MoveDstSlicingWindow(T step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
MoveDstSliceWindow(const T& step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
{
mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
}
private:
using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
using ThreadwiseLoad =
ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
RegisterBufferDesc,
SrcCoordinate,
NormalTensorCoordinate<RegisterBufferDesc>,
SubLengths,
SrcDimAccessOrder,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcVectorAccessDim,
SrcDataPerAccess,
1>;
using ThreadwiseStore =
ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc,
DstDesc,
NormalTensorCoordinate<RegisterBufferDesc>,
DstCoordinate,
SubLengths,
DstDimAccessOrder,
DstDimAccessOrder,
DstVectorAccessDim,
DstVectorAccessDim,
1,
DstDataPerAccess>;
using ThreadBufferDesc = decltype(make_native_tensor_descriptor_packed(ThreadSliceLengths{}));
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<BlockSrcDesc,
ThreadBufferDesc,
ThreadSliceLengths,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcDataPerAccess,
1>;
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
BlockDstDesc,
ThreadSliceLengths,
DstDimAccessOrder,
DstVectorAccessDim,
1,
DstDataPerAccess>;
ThreadwiseLoad mThreadwiseLoad;
ThreadwiseStore mThreadwiseStore;
};
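// A simplified, single-threaded host sketch (not this library's API) of the load-then-store
// pipeline used by the blockwise copy above: data is first read into a small per-thread
// buffer, then written to the destination; any type conversion would happen on the store
// side. The buffer size, array sizes and function name below are hypothetical.
#include <cstdio>

template <int ThreadBufferSize>
void copy_through_thread_buffer(const float* p_block_src, float* p_block_dst, int total)
{
    for(int begin = 0; begin < total; begin += ThreadBufferSize)
    {
        float p_thread_buffer[ThreadBufferSize]; // stands in for the register buffer

        // analogous to RunLoadThreadBuffer: src -> thread buffer
        for(int i = 0; i < ThreadBufferSize; ++i)
            p_thread_buffer[i] = p_block_src[begin + i];

        // analogous to RunStoreThreadBuffer: thread buffer -> dst
        for(int i = 0; i < ThreadBufferSize; ++i)
            p_block_dst[begin + i] = p_thread_buffer[i];
    }
}

int main()
{
    float src[16], dst[16] = {};
    for(int i = 0; i < 16; ++i)
        src[i] = float(i);
    copy_through_thread_buffer<4>(src, dst, 16);
    std::printf("dst[15] = %f\n", dst[15]);
}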
// this will be deprecated
// slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst
// For now, only support SubLengths[...] == 1 on a merged dimension
template <index_t BlockSize,
class Float,
class SrcDesc,
class DstDesc,
class SliceLengths,
class SubLengths,
class DataClusterLengths,
class ThreadClusterArrangeOrder,
class SrcAccessOrder,
class DstAccessOrder,
index_t SrcDataPerRead,
index_t DstDataPerWrite>
struct BlockwiseGenericTensorSliceCopy_v1_deprecated
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
static constexpr index_t nOriginalDimSrc =
SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
static constexpr index_t nOriginalDimDst =
DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
// per-thread offset
index_t mThreadSrcOffset;
index_t mThreadDstOffset;
// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside constructor, and would be
// updated if slicing-window is moved. However, they will not be used if you always move
// the slicing-window along a non-merged dimension. In that case, compiler should be
// able to remove these calculation.
// TODO: make sure compiler would actually remove them in that case
// partial offset in each (merged) dimension
Array<index_t, nDim> mThreadSrcPartialOffsets;
Array<index_t, nDim> mThreadDstPartialOffsets;
// multi-id of original tensor
Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;
__device__ BlockwiseGenericTensorSliceCopy_v1_deprecated(
Array<index_t, nDim> src_block_data_multi_id_begin,
Array<index_t, nDim> dst_block_data_multi_id_begin)
{
// check NDim consistency
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SubLengths::GetSize() && nDim == DataClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
"wrong");
// check thread arrange order and read/write access order are valid
static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
is_valid_sequence_map<SrcAccessOrder>::value &&
is_valid_sequence_map<DstAccessOrder>::value,
"wrong!");
// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
"wrong! block size doesn't match with thread cluster size.");
// divide work
constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{};
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into sub-tensor");
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into cluster");
});
// for now, only support SubLengths == 1 on a merged dimension that contains
// multiple original dimensions
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
static_assert(SubLengths::Get(IDim) == 1 ||
(!SrcDesc::ContainMultipleOriginalDimensions(IDim) &&
!DstDesc::ContainMultipleOriginalDimensions(IDim)),
"wrong! only support Sub-Length == 1 on a merged dimension");
});
// calculate mThreadSrcOffset, mThreadDstOffset
const auto thread_cluster_multi_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
const auto data_cluster_multi_id =
reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});
const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};
// original multi-id
mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
src_block_data_multi_id_begin + thread_data_multi_id_begin);
mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
dst_block_data_multi_id_begin + thread_data_multi_id_begin);
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
constexpr index_t idim = IDim;
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
mThreadSrcPartialOffsets(idim) = src_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
});
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
constexpr index_t idim = IDim;
constexpr auto dst_partial_original_dims =
DstDesc::GetContainedOriginalDimensions(IDim);
constexpr auto dst_partial_original_desc =
DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);
mThreadDstPartialOffsets(idim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
});
// complete offset
mThreadSrcOffset = accumulate_on_array(
mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
mThreadDstOffset = accumulate_on_array(
mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}
__device__ static constexpr index_t GetRegisterClipboardSize()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
return thread_tensor_desc.GetElementSpace();
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto src_thread_data_multi_id_begin =
repeat_multi_id * data_per_cluster_per_dims;
constexpr auto clipboard_data_multi_id_begin =
repeat_multi_id * thread_sub_tensor_lengths;
constexpr index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
#else
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const index_t src_offset =
SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
#endif
threadwise_generic_tensor_slice_copy_v1_deprecated(SrcDesc{},
p_src + src_offset +
mThreadSrcOffset,
make_zero_array<index_t, nDim>(),
thread_tensor_desc,
p_clipboard + clipboard_offset,
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
SrcAccessOrder{},
Number<SrcDataPerRead>{});
});
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto clipboard_data_multi_id_begin =
repeat_multi_id * thread_sub_tensor_lengths;
constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
constexpr index_t dst_offset =
DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#else
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#endif
threadwise_generic_tensor_slice_copy_v1_deprecated(thread_tensor_desc,
p_clipboard + clipboard_offset,
make_zero_array<index_t, nDim>(),
DstDesc{},
p_dst + dst_offset +
mThreadDstOffset,
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
DstAccessOrder{},
Number<DstDataPerWrite>{});
});
}
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
Float p_clipboard[GetRegisterClipboardSize()];
RunLoadRegisterClipboard(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst);
}
// When moving the slicing window along a merged dimension, if the strides of the
// original dimensions contained by the merged dimension are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in the positive direction (and vice versa for movement in the negative
// direction). As a result, there is the possibility that the offset calculation may
// result in unsigned integer underflow (due to the "-" operation). However, this hazard
// should not happen, as long as the user makes sure the slicing window is not moved out
// of the boundary of the tensor being sliced. This function doesn't do a runtime sanity
// check for an out-of-bound slicing window, for performance reasons.
template <index_t IDim_, index_t StepSize, bool PositiveDirection>
__device__ void MoveSlicingWindowOnSourceTensor(
Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
{
constexpr auto IDim = Number<IDim_>{};
constexpr index_t idim = IDim;
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
// logic for a merged dimension; it also works for a non-merged dimension, but its logic
// may be unnecessarily complicated for the compiler to remove calculations that are
// useless for a non-merged dimension
// extract partial original dimensions
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
// calculate new partial original multi-id
auto old_src_partial_original_multi_id =
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);
auto new_src_partial_original_multi_id =
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
old_src_partial_original_multi_id, StepSize, direction);
// update "mThreadSrcOriginalMultiId"
static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I_) {
constexpr auto I = decltype(I_){};
constexpr index_t idim_original = src_partial_original_dims.Get(I);
mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I];
});
// calculate new partial offset on this merged dimension
const index_t old_src_partial_offset = mThreadSrcPartialOffsets[idim];
const index_t new_src_partial_offset =
src_partial_original_desc.GetOffsetFromMultiIndex(
new_src_partial_original_multi_id);
// update "mThreadSrcPartialOffsets"
mThreadSrcPartialOffsets(idim) = new_src_partial_offset;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
}).Else([&](auto) {
// Logic for a non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
// which are being calculated here, will never be used later. In this case, the compiler
// should be able to remove these calculations.
// TODO: make sure the compiler would actually remove them in this case.
// It is the user's responsibility to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be a hazard like
// unsigned integer underflow. There is NO runtime sanity check to prevent the hazard.
constexpr index_t idim_original = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
static_if<PositiveDirection>{}([&](auto fwd) {
mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(idim_original) += StepSize;
mThreadSrcPartialOffsets(idim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
}).Else([&](auto fwd) {
mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(idim_original) -= StepSize;
mThreadSrcPartialOffsets(idim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
});
});
}
};
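// A tiny host sketch of the "+ before -" update used above when moving the slicing window
// on a merged dimension: with an unsigned offset type, (old_offset + new_partial) -
// old_partial stays in range even when new_partial < old_partial, whereas computing
// (new_partial - old_partial) first would underflow. All values are hypothetical.
#include <cstdint>
#include <cstdio>

int main()
{
    using index_t = uint32_t;

    index_t thread_src_offset = 100; // current total offset
    const index_t old_partial = 40;  // old partial offset on the merged dimension
    const index_t new_partial = 25;  // new partial offset (smaller than the old one)

    // wrong order: (new_partial - old_partial) underflows to a huge unsigned value
    // index_t bad = thread_src_offset + (new_partial - old_partial);

    // safe order used by MoveSlicingWindowOnSourceTensor: do "+" before "-"
    thread_src_offset = (thread_src_offset + new_partial) - old_partial;

    std::printf("updated offset = %u\n", thread_src_offset); // 85
}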
} // namespace ck
#endif
@@ -7,98 +7,160 @@
namespace ck {
template <class Float, class Matrix>
template <typename Float, class Matrix>
__device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
{
for(index_t i = 0; i < Matrix::NRow(); ++i)
{
for(index_t j = 0; j < Matrix::NCol(); ++j)
{
const index_t id = Matrix::GetOffsetFromMultiIndex(i, j);
const index_t id = Matrix::CalculateOffset(i, j);
p_thread[id] = Float(0);
}
}
}
template <class Float,
class SrcMatrix,
class DstMatrix,
index_t NRow,
index_t NCol,
index_t DataPerRead>
__device__ void threadwise_matrix_copy(SrcMatrix,
const Float* __restrict__ p_src,
DstMatrix,
Float* __restrict__ p_dst,
Sequence<NRow, NCol>,
Number<DataPerRead>)
template <typename SrcMatrix,
typename DstMatrix,
index_t NSliceRow,
index_t NSliceCol,
index_t DataPerAccess>
struct ThreadwiseMatrixSliceCopy
{
static_assert(NCol % DataPerRead == 0, "wrong! should be NCol % DataPerRead == 0");
__device__ constexpr ThreadwiseMatrixSliceCopy()
{
static_assert(SrcMatrix::RowStride() % DataPerAccess == 0 &&
DstMatrix::RowStride() % DataPerAccess == 0,
"wrong! wrong alignment");
static_assert(NSliceCol % DataPerAccess == 0,
"wrong! should be NSliceCol % DataPerAccess == 0");
}
constexpr auto src_mtx = SrcMatrix{};
constexpr auto dst_mtx = DstMatrix{};
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
for(index_t i = 0; i < NRow; ++i)
template <typename Data>
__device__ static void Run(const Data* p_src, Data* p_dst)
{
for(index_t j = 0; j < NCol; j += DataPerRead)
using vector_t = typename vector_type<Data, DataPerAccess>::MemoryType;
for(index_t i = 0; i < NSliceRow; ++i)
{
const index_t src_index = src_mtx.GetOffsetFromMultiIndex(i, j);
const index_t dst_index = dst_mtx.GetOffsetFromMultiIndex(i, j);
for(index_t j = 0; j < NSliceCol; j += DataPerAccess)
{
const index_t src_index = SrcMatrix::CalculateOffset(i, j);
const index_t dst_index = DstMatrix::CalculateOffset(i, j);
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
}
}
}
}
};
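// A standalone host sketch (assumed row-major layout and a hard-coded 4-wide vector type,
// not this library's descriptors) of the vectorized inner loop above: each row of an
// NSliceRow x NSliceCol tile is copied DataPerAccess elements at a time through a wider
// vector type. The float4_t struct and copy_tile function below are hypothetical.
#include <cstdio>

struct float4_t { float x[4]; }; // stands in for vector_type<float, 4>::MemoryType

template <int NSliceRow, int NSliceCol, int DataPerAccess, int RowStride>
void copy_tile(const float* p_src, float* p_dst)
{
    static_assert(NSliceCol % DataPerAccess == 0, "NSliceCol % DataPerAccess == 0");
    static_assert(DataPerAccess == 4, "this sketch hard-codes a 4-wide vector");
    using vector_t = float4_t;

    for(int i = 0; i < NSliceRow; ++i)
        for(int j = 0; j < NSliceCol; j += DataPerAccess)
        {
            const int src_index = i * RowStride + j; // plays the role of CalculateOffset(i, j)
            const int dst_index = i * RowStride + j;
            *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
                *reinterpret_cast<const vector_t*>(&p_src[src_index]);
        }
}

int main()
{
    float src[2 * 8], dst[2 * 8] = {};
    for(int i = 0; i < 16; ++i)
        src[i] = float(i);
    copy_tile<2, 8, 4, 8>(src, dst); // 2x8 tile, row stride 8
    std::printf("dst[12] = %f\n", dst[12]);
}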
template <class MatrixA,
class MatrixB,
class MatrixC,
bool TransA,
bool TransB,
bool TransC,
class FloatA,
class FloatB,
class FloatC>
__device__ void threadwise_gemm(MatrixA,
integral_constant<bool, TransA>,
const FloatA* __restrict__ p_a_thread,
MatrixB,
integral_constant<bool, TransB>,
const FloatB* __restrict__ p_b_thread,
MatrixC,
integral_constant<bool, TransC>,
FloatC* __restrict__ p_c_thread)
// C += transpose(A) * B
// Elements of the matrices can be vectorized data
template <typename MatrixA, typename MatrixB, typename MatrixC>
struct ThreadwiseGemmTransANormalBNormalC
{
static_if<TransA && (!TransB) && (!TransC)>{}([&](auto) {
constexpr auto a_mtx = MatrixA{};
constexpr auto b_mtx = MatrixB{};
constexpr auto c_mtx = MatrixC{};
__device__ constexpr ThreadwiseGemmTransANormalBNormalC()
{
static_assert(MatrixA::NRow() == MatrixB::NRow() && MatrixA::NCol() == MatrixC::NRow() &&
MatrixB::NCol() == MatrixC::NCol(),
"wrong!");
}
constexpr index_t M = c_mtx.NRow();
constexpr index_t N = c_mtx.NCol();
constexpr index_t K = a_mtx.NRow(); // A is transposed
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
constexpr index_t M = MatrixC::NRow();
constexpr index_t N = MatrixC::NCol();
constexpr index_t K = MatrixA::NRow(); // A is transposed
for(index_t k = 0; k < K; ++k)
{
for(index_t i = 0; i < M; ++i)
for(index_t m = 0; m < M; ++m)
{
for(index_t j = 0; j < N; ++j)
for(index_t n = 0; n < N; ++n)
{
const index_t aindex = a_mtx.GetOffsetFromMultiIndex(k, i); // A is transposed
const index_t bindex = b_mtx.GetOffsetFromMultiIndex(k, j);
const index_t cindex = c_mtx.GetOffsetFromMultiIndex(i, j);
const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed
const index_t bindex = MatrixB::CalculateOffset(k, n);
const index_t cindex = MatrixC::CalculateOffset(m, n);
p_c_thread[cindex] += math::inner_product_with_conversion<FloatC>{}(
p_a_thread[aindex], p_b_thread[bindex]);
p_c[cindex] +=
inner_product_with_conversion<FloatC>{}(p_a[aindex], p_b[bindex]);
}
}
}
}).Else([&](auto fwd) {
// not implemented
static_assert(fwd(false), "wrong! support for this config is not implemented");
});
}
}
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
constexpr index_t M = MatrixC::NRow();
constexpr index_t N = MatrixC::NCol();
constexpr index_t K = MatrixA::NRow(); // A is transposed
static_assert(N == 4 || N == 2, "wrong! this config not supported by asm yet");
for(index_t k = 0; k < K; ++k)
{
for(index_t m = 0; m < M; ++m)
{
const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed
static_if<N == 2>{}([&](auto) {
const index_t bindex_0 = MatrixB::CalculateOffset(k, 0);
const index_t bindex_1 = MatrixB::CalculateOffset(k, 1);
const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);
__outer_product_1x2(
p_a[aindex], p_b[bindex_0], p_b[bindex_1], p_c[cindex_0], p_c[cindex_1]);
});
static_if<N == 4>{}([&](auto) {
const index_t bindex_0 = MatrixB::CalculateOffset(k, 0);
const index_t bindex_1 = MatrixB::CalculateOffset(k, 1);
const index_t bindex_2 = MatrixB::CalculateOffset(k, 2);
const index_t bindex_3 = MatrixB::CalculateOffset(k, 3);
const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);
const index_t cindex_2 = MatrixC::CalculateOffset(m, 2);
const index_t cindex_3 = MatrixC::CalculateOffset(m, 3);
__outer_product_1x4(p_a[aindex],
p_b[bindex_0],
p_b[bindex_1],
p_b[bindex_2],
p_b[bindex_3],
p_c[cindex_0],
p_c[cindex_1],
p_c[cindex_2],
p_c[cindex_3]);
});
}
}
}
#endif
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
constexpr bool has_amd_asm = is_same<FloatC, float>{} &&
((is_same<FloatA, float>{} && is_same<FloatB, float>{}) ||
(is_same<FloatA, half2_t>{} && is_same<FloatB, half2_t>{}) ||
(is_same<FloatA, half4_t>{} && is_same<FloatB, half4_t>{}));
static_if<has_amd_asm>{}([&](auto fwd) {
Run_amd_asm(p_a, p_b, fwd(p_c));
}).Else([&](auto) { Run_source(p_a, p_b, p_c); });
#else
Run_source(p_a, p_b, p_c);
#endif
}
};
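// A host-side sketch of the Run_source loop above: C (M x N) += transpose(A) * B, where A
// is stored K x M and B is stored K x N, both with unit column stride. The plain-array
// layout, gemm_tA name and the sample data below are illustrative assumptions.
#include <cstdio>

template <int M, int N, int K>
void gemm_tA(const float* a /*K x M*/, const float* b /*K x N*/, float* c /*M x N*/)
{
    for(int k = 0; k < K; ++k)
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
                c[m * N + n] += a[k * M + m] * b[k * N + n]; // A is transposed
}

int main()
{
    constexpr int M = 2, N = 2, K = 3;
    const float a[K * M] = {1, 2, 3, 4, 5, 6};
    const float b[K * N] = {1, 0, 0, 1, 1, 1};
    float c[M * N]       = {};
    gemm_tA<M, N, K>(a, b, c);
    std::printf("c = [%g %g; %g %g]\n", c[0], c[1], c[2], c[3]);
}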
} // namespace ck
#endif
@@ -2,336 +2,218 @@
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp"
#include "float_types.h"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif
namespace ck {
// This version uses multi-index transformation
// This threadwise copy allows vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows the order of access to be different on src and dst.
// It uses registers as a buffer to hold all data moving from src to dst.
// It is designed for copying a small amount of data, and src and dst are
// device memory or LDS.
// When copying a large amount of data, let's hope the compiler will reduce the
// registers used for the buffer.
template <class SrcDesc,
class DstDesc,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
// It allows the vector size to be different on src and dst.
// The dimensions of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is a register, and
// the other is device memory or LDS
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename DimAccessOrder,
index_t VectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r1
struct ThreadwiseGenericTensorSliceCopy_v4r2
{
static constexpr index_t nDim = SliceLengths::GetSize();
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1(
Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
using SrcCoord = typename TensorCoordinate<SrcDesc>::type;
using DstCoord = typename TensorCoordinate<DstDesc>::type;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(const Index& src_slice_origin,
const Index& dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() &&
nDim == DstDimAccessOrder::GetSize(),
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
nDim == DimAccessOrder::Size(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! map is not valid");
static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_assert(is_valid_sequence_map<DimAccessOrder>{}, "wrong! map is not valid");
static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_assert(
SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
"wrong! cannot evenly divide");
// TODO: sanity-check if vectorized memory access is allowed on src and dst
}
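// A small standalone sketch (hypothetical values, placed here only as an illustration) of
// the long-vector sizing used by this copy: the per-access buffer is
// lcm(SrcDataPerAccess, DstDataPerAccess) elements long, so it can be filled with whole
// src vectors and drained with whole dst vectors. The gcd_/lcm_ helpers are assumptions.
#include <cstdio>

constexpr int gcd_(int a, int b) { return b == 0 ? a : gcd_(b, a % b); }
constexpr int lcm_(int a, int b) { return a / gcd_(a, b) * b; }

int main()
{
    constexpr int SrcDataPerAccess = 4; // e.g. 4-wide loads
    constexpr int DstDataPerAccess = 2; // e.g. 2-wide stores
    constexpr int long_vector_size = lcm_(SrcDataPerAccess, DstDataPerAccess); // 4

    std::printf("long_vector_size = %d: %d src loads, %d dst stores per long vector\n",
                long_vector_size,
                long_vector_size / SrcDataPerAccess,
                long_vector_size / DstDataPerAccess);
}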
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1()
: ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array<index_t, nDim>(),
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2()
: ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
__device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
__device__ void SetSrcSliceOrigin(SrcCoord src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
__device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
__device__ void SetDstSliceOrigin(DstCoord dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
}
template <class SrcData, class DstData>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
// Will do padding check on src data: Read 0 if src data is in the padding area.
// Will do padding check on dst data: No write if dst data is in the padding area.
template <typename SrcData,
typename DstData,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void Run(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
SrcData p_src_buffer_[buffer_desc.GetElementSpace()];
SrcData* p_src_buffer = p_src_buffer_;
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
// copy data from src into src buffer
{
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
constexpr auto vector_access_dim = Number<VectorAccessDim>{};
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto src_access_lengths = SliceLengths::Modify(
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
constexpr auto src_data_begin_id = src_access_id.Modify(
src_vector_access_dim,
src_access_id[src_vector_access_dim] * src_data_per_access);
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
auto long_vector_access_id) {
// load vector from src
const src_vector_t src_vector_data =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
// data id w.r.t slicing-window
auto long_vector_data_begin_id = long_vector_access_id;
long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim];
// unpack vector into buffer
static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(src_vector_access_dim,
i);
// buffer to hold a src long-vector
SrcData p_src_long_vector[long_vector_size];
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
p_src_buffer[buffer_offset] =
reinterpret_cast<const SrcData*>(&src_vector_data)[i];
});
});
#else
ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
auto src_data_begin_id = src_access_id;
src_data_begin_id(src_vector_access_dim) =
src_access_id[src_vector_access_dim] * src_data_per_access;
// load data from src to the long-vector buffer
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
const index_t buffer_offset = i * src_data_per_access;
// load vector from src
const src_vector_t src_vector_data =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
// unpack vector into buffer
for(index_t i = 0; i < SrcDataPerAccess; ++i)
// Check the src vector's padding situation; only check the first data in this src
// vector. It's the user's responsibility to make sure all data in the src vector
// has the same padding situation
if(src_coord.IsUpperIndexMappedToValidOffset())
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(src_vector_access_dim) = i;
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
p_src_buffer[buffer_offset] =
reinterpret_cast<const SrcData*>(&src_vector_data)[i];
}
});
static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>(
fwd(p_src), src_coord.GetOffset(), 0);
#else
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
#endif
}
// copy data from buffer to dst
{
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto dst_access_lengths = SliceLengths::Modify(
dst_vector_access_dim,
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
constexpr auto dst_data_begin_id = dst_access_id.Modify(
dst_vector_access_dim,
dst_access_id[dst_vector_access_dim] * dst_data_per_access);
dst_vector_t dst_vector_data;
// pack vector from buffer and type conversion
static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(dst_vector_access_dim,
i);
}).Else([&](auto) {
// src can be all kinds of memory-space.
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
});
}
}
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
// SrcData to DstData type conversion is done here
reinterpret_cast<DstData*>(&dst_vector_data)[i] =
type_convert<DstData>{}(p_src_buffer[buffer_offset]);
});
for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}
const index_t dst_offset =
DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
// store vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) = dst_vector_data;
});
#else
ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
auto dst_data_begin_id = dst_access_id;
dst_data_begin_id(dst_vector_access_dim) =
dst_access_id[dst_vector_access_dim] * dst_data_per_access;
const index_t buffer_offset = i * dst_data_per_access;
dst_vector_t dst_vector_data;
const auto dst_coord = mDstSliceOrigin + (long_vector_data_begin_id + scalar_id);
// pack vector from buffer and type conversion
for(index_t i = 0; i < DstDataPerAccess; ++i)
// Check the dst vector's padding situation; only the first element of this dst
// vector is checked. It is the user's responsibility to make sure all data in
// the dst vector has the same padding situation.
if(dst_coord.IsUpperIndexMappedToValidOffset())
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(dst_vector_access_dim) = i;
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
// SrcData to DstData type conversion is done here
reinterpret_cast<DstData*>(&dst_vector_data)[i] =
type_convert<DstData>{}(p_src_buffer[buffer_offset]);
}
const index_t dst_offset =
DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
// store vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) = dst_vector_data;
});
static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<DstData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
fwd(p_dst),
dst_coord.GetOffset(),
0);
#else
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
#endif
}
}
private:
Array<index_t, nDim> mSrcSliceOrigin;
Array<index_t, nDim> mDstSliceOrigin;
};
// This threadwise copy allows vector access of src and dst.
// It allows the vector size to be different on src and dst.
// The dimension of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is a register buffer, and
// the other is device memory or LDS.
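// Hedged usage sketch (descriptor types and numeric parameters below are
// assumptions for illustration, not taken from this commit):
//   using Copy = ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
//                                                      DstDesc,
//                                                      Sequence<1, 4>, // SliceLengths
//                                                      Sequence<0, 1>, // DimAccessOrder
//                                                      1,              // VectorAccessDim
//                                                      4,              // SrcDataPerAccess
//                                                      4>;             // DstDataPerAccess
//   Copy copy(make_zero_array<index_t, 2>(), make_zero_array<index_t, 2>());
//   copy.Run(p_src, p_dst); // p_src, p_dst: raw pointers to src/dst data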
template <class SrcDesc,
class DstDesc,
class SliceLengths,
class DimAccessOrder,
index_t VectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r2
{
static constexpr index_t nDim = SliceLengths::GetSize();
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2(
Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == DimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
static_assert(
SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto vector_access_dim = Number<VectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}).Else([&](auto fwd) {
static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}).Else([&](auto fwd) {
static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}).Else([&](auto) {
// dst can be all kinds of memory-space
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
});
}
}
});
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2()
: ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
template <typename SrcData, typename DstData>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
{
}
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
__device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
Run(p_src, p_dst, generic_address_space, generic_address_space);
}
__device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
// Modify a length to 1 if the corresponding Mask entry is false.
// Used for isolating linear dimensions from non-linear dimensions.
template <index_t... Lengths, index_t... Mask>
__device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
{
mDstSliceOrigin = dst_slice_origin;
return Sequence<(Mask ? Lengths : 1)...>{};
}
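// For example (values assumed for illustration):
//   mask_lengths(Sequence<4, 8, 2>{}, Sequence<1, 0, 1>{}) returns
//   Sequence<4, 1, 2>{}, i.e. the length of every masked-out dimension
//   collapses to 1.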
template <class SrcData, class DstData>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
// p_src must be global memory; p_dst can be any memory space.
// The user should make sure p_src is a block-invariant pointer, because
// buffer_load is used for loading from global memory into the register buffer.
// Padding check on src data: read 0 if the src data is in the padding area.
// Padding check on dst data: no write if the dst data is in the padding area.
// This version is optimized for the address calculation of the src tensor.
// TODO: this function does not compile to the expected ISA.
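// Rough sketch of the optimization (not a statement about the generated ISA):
//   src_offset = src_nonlinear_coord.GetOffset()  // run-time, per thread
//              + src_linear_offset                // compile-time offset-diff
// so the loops over src's linear dimensions only add constant offsets to a
// base address computed once per non-linear step.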
template <typename SrcData,
typename DstData,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void
Run_optimized_src_address_calculation(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
......@@ -346,81 +228,107 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
auto long_vector_access_id) {
// separate linear dimensions from non-linear dimensions
constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask();
constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();
// data id w.r.t slicing-window
constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify(
vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size);
static_assert(src_linear_dim_mask.At(VectorAccessDim) ||
long_vector_size == SrcDataPerAccess,
"Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
"would drop");
// buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size];
DstData p_dst_long_vector[long_vector_size];
// separate steps into linear and non-linear components, according to the src tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_linear_dim_mask);
// load data from src to the long-vector buffer
static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) {
constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
vector_access_dim, i * src_data_per_access);
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
constexpr auto nonlinear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);
constexpr index_t buffer_offset = i * src_data_per_access;
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
});
// loop over src's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {
// type conversion
for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}
// calculate step-sizes along src's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
// store data from the long-vector buffer to dst
static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) {
constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
vector_access_dim, i * dst_data_per_access);
// move src coordinate along nonlinear dimensions
// this coordinate contains run-time per-thread offset
const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;
constexpr index_t buffer_offset = i * dst_data_per_access;
// loop over src's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
});
});
#else
ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
[&](auto long_vector_access_id) {
// data id w.r.t slicing-window
auto long_vector_data_begin_id = long_vector_access_id;
long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim];
// step-sizes along src's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
// buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size];
DstData p_dst_long_vector[long_vector_size];
// load data from src to the long-vector buffer
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
// Loop over VectorAccessDim, and load data from src to the
// long-vector buffer.
// If VectorAccessDim is src's linear dimension, then src's
// offset-diff due to this looping is known at compile-time. If
// VectorAccessDim is src's nonlinear dimension, then src's
// offset-diff due to this looping is only known at run-time. For best
// performance, VectorAccessDim should be src's linear dimension.
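// For example (strides assumed for illustration): if VectorAccessDim is a
// linear dimension with stride 1, the offset-diff contributed by iteration i
// is simply i * src_data_per_access, and the whole linear part folds into the
// compile-time src_linear_offset used below; on a non-linear dimension the
// offset must be recomputed from the coordinate at run time.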
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
const index_t buffer_offset = i * src_data_per_access;
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
// move src coordinate along linear dimensions
const auto src_coord =
src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is src compile-time offset
const index_t src_linear_offset =
src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else
// this is src compile-time offset
const index_t src_linear_offset =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
#endif
// Check the src vector's padding situation; only the first element of
// this src vector is checked. It is the user's responsibility to make sure
// all data in the src vector has the same padding situation.
if(src_coord.IsUpperIndexMappedToValidOffset())
{
static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_BUFFER_ADDRESSING
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>(
p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
#else
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(
&p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
#endif
}).Else([&](auto) {
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(
&p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
});
}
}
// type conversion
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
......@@ -434,371 +342,222 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
const index_t buffer_offset = i * dst_data_per_access;
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));
// dst offset is calculated here, without explicitly separating into
// compile-time and per-thread component
const auto dst_coord = mDstSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
// Check the dst vector's padding situation; only the first element of
// this dst vector is checked. It is the user's responsibility to make sure
// all data in the dst vector has the same padding situation.
if(dst_coord.IsUpperIndexMappedToValidOffset())
{
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
}
}
});
#endif
}
private:
Array<index_t, nDim> mSrcSliceOrigin;
Array<index_t, nDim> mDstSliceOrigin;
};
// This threadwise copy allows vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows the order of access to be different on src and dst.
// It uses registers as a buffer to hold all data moving from src to dst.
// It is designed for copying a small amount of data, where src and dst are
// device memory or LDS.
// When copying a large amount of data, we rely on the compiler to reduce the
// registers used for the buffer.
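// Hedged usage sketch (coordinate/descriptor types and numeric parameters are
// assumptions for illustration, not taken from this commit):
//   using Copy = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
//                                                      DstDesc,
//                                                      SrcCoordinate,
//                                                      DstCoordinate,
//                                                      Sequence<2, 4>, // SliceLengths
//                                                      Sequence<0, 1>, // SrcDimAccessOrder
//                                                      Sequence<1, 0>, // DstDimAccessOrder
//                                                      1,              // SrcVectorAccessDim
//                                                      0,              // DstVectorAccessDim
//                                                      4,              // SrcDataPerAccess
//                                                      1>;             // DstDataPerAccess
//   Copy copy(src_slice_origin_coord, dst_slice_origin_coord);
//   copy.Run(p_src, p_dst);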
template <class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v2r1
{
static constexpr index_t nDim = SliceLengths::GetSize();
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin,
DstCoordinate dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() &&
nDim == DstDimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! map is not valid");
static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1()
: ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
__device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
__device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
});
}
template <class TDesc, class Lengths>
struct IsolateMergedDimLengths
// p_src can be any memory space; p_dst must be global memory.
// The user should make sure p_dst is a block-invariant pointer, because
// buffer_store is used for storing data from the register buffer into global memory.
// Padding check on src data: read 0 if the src data is in the padding area.
// Padding check on dst data: no write if the dst data is in the padding area.
// This version is optimized for the address calculation of the dst tensor.
// TODO: this function does not compile to the expected ISA.
template <typename SrcData,
typename DstData,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void
Run_optimized_dst_address_calculation(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{
template <class IDim>
__device__ constexpr index_t operator()(IDim idim) const
{
return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1;
}
};
template <class SrcTData, class DstTData>
__device__ void Run(const SrcTData* p_src, DstTData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
SrcTData p_buffer_[buffer_desc.GetElementSpace()];
SrcTData* p_buffer = p_buffer_;
// copy data from src into buffer
{
using src_vector_t = typename vector_type<SrcTData, SrcDataPerAccess>::MemoryType;
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto src_access_lengths = SliceLengths::Modify(
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
// Offsets w.r.t. merged dimensions need to be calculated at run-time. Offsets w.r.t.
// normal dimensions are known at compile time.
// Below is a hack to isolate merged dimension ids from normal dimension ids, so the
// corresponding offsets can be calculated separately at run-time and compile-time.
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// merged dimensions, and has value = 1 on normal dimensions;
// src_normal_dim_access_lengths has the same value as src_access_lengths on src's
// normal dimensions, and has value = 1 on merged dimensions.
constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
nDim,
IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
constexpr auto src_normal_dim_access_lengths =
src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
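// Worked example (lengths and dimension kinds assumed for illustration):
//   if src_access_lengths = Sequence<4, 8, 2> and only dimension 1 is a merged
//   dimension, then src_merged_dim_access_lengths = Sequence<1, 8, 1> and
//   src_normal_dim_access_lengths = Sequence<4, 1, 2>; their element-wise
//   product recovers src_access_lengths.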
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
// offset w.r.t. merged dimension need to be computed at run-time
static_ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_merged_dim_access_id_) {
constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){};
constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify(
src_vector_access_dim,
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access);
const SrcTData* p_src_tmp =
p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
// offset w.r.t. normal dimension can be computed at compile-time
static_ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_normal_dim_access_id_) {
constexpr auto src_normal_dim_access_id = decltype(src_normal_dim_access_id_){};
constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify(
src_vector_access_dim,
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access);
constexpr index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
// load vector from src
const src_vector_t vector_data =
*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
// unpack vector into buffer
static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
src_vector_access_dim, i);
constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
p_buffer[buffer_offset] =
reinterpret_cast<const SrcTData*>(&vector_data)[i];
});
});
});
#else
ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
[&](auto src_merged_dim_access_id) {
auto src_merged_dim_data_id = src_merged_dim_access_id;
src_merged_dim_data_id(src_vector_access_dim) =
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
const SrcTData* p_src_tmp =
p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
constexpr auto vector_access_dim = Number<VectorAccessDim>{};
// these should be compile-time known
ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_normal_dim_access_id) {
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
auto src_normal_dim_data_id = src_normal_dim_access_id;
src_normal_dim_data_id(src_vector_access_dim) =
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access;
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
const index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
// load vector from src
const src_vector_t vector_data =
*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
// separate linear dimensions from non-linear dimensions
constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask();
constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();
// unpack vector into buffer
for(index_t i = 0; i < SrcDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(src_vector_access_dim) = i;
static_assert(dst_linear_dim_mask.At(VectorAccessDim) ||
long_vector_size == DstDataPerAccess,
"Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
"would drop");
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
// separate steps into linear and non-linear components, according to the dst tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_linear_dim_mask);
p_buffer[buffer_offset] =
reinterpret_cast<const SrcTData*>(&vector_data)[i];
}
});
});
#endif
}
constexpr auto nonlinear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_nonlinear_dim_mask);
// copy data from buffer into dst
{
using dst_vector_t = typename vector_type<DstTData, DstDataPerAccess>::MemoryType;
// loop over dst's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
// calculate step-sizes along dst's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
constexpr auto dst_access_lengths = SliceLengths::Modify(
dst_vector_access_dim,
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
// move dst coordinate along nonlinear dimensions
// this coordinate contains run-time per-thread offset
const auto dst_nonlinear_coord = mDstSliceOrigin + nonlinear_dim_data_steps;
constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
nDim,
IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
// loop over dst's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {
constexpr auto dst_normal_dim_access_lengths =
dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
// step-sizes along dst's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
// offset w.r.t. merged dimension need to be computed at run-time
static_ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_merged_dim_access_id_) {
// buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size];
constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){};
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify(
dst_vector_access_dim,
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
// Loop over VectorAccessDim, and load data from src to the
// long-vector buffer.
// If VectorAccessDim is dst's linear dimension, then dst's
// offset-diff due to this looping is known at compile-time. If
// VectorAccessDim is dst's nonlinear dimension, then dst's
// offset-diff due to this looping is only known at run-time. For best
// performance, VectorAccessDim should be dst's linear dimension.
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
DstTData* p_dst_tmp =
p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
const index_t buffer_offset = i * src_data_per_access;
// offset w.r.t. normal dimension can be computed at compile-time
static_ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_normal_dim_access_id_) {
constexpr auto dst_normal_dim_access_id = decltype(dst_normal_dim_access_id_){};
// src offset is calculated here, without explicitly separating into
// compile-time and per-thread component
const auto src_coord = mSrcSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify(
dst_vector_access_dim,
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
// Check the src vector's padding situation; only the first element of
// this src vector is checked. It is the user's responsibility to make sure
// all data in the src vector has the same padding situation.
if(src_coord.IsUpperIndexMappedToValidOffset())
{
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
}
}
dst_vector_t vector_data{};
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
// pack vector from buffer
static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
dst_vector_access_dim, i);
for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}
constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
reinterpret_cast<DstTData*>(&vector_data)[i] =
type_convert<DstTData>{}(p_buffer[buffer_offset]);
});
const index_t buffer_offset = i * dst_data_per_access;
constexpr index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
// move dst coordinate along linear dimensions
const auto dst_coord =
dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
// write vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
});
});
#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is dst compile-time offset
const index_t dst_linear_offset =
dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else
// offset w.r.t. merged dimension need to be computed at run-time
ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_merged_dim_access_id) {
auto dst_merged_dim_data_id = dst_merged_dim_access_id;
dst_merged_dim_data_id(dst_vector_access_dim) =
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
DstTData* p_dst_tmp =
p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
// offset w.r.t. normal dimension can be computed at compile-time
ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_normal_dim_access_id) {
auto dst_normal_dim_data_id = dst_normal_dim_access_id;
dst_normal_dim_data_id(dst_vector_access_dim) =
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
dst_vector_t vector_data{};
// this is dst compile-time offset
const index_t dst_linear_offset =
dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
#endif
// pack vector from buffer
for(index_t i = 0; i < DstDataPerAccess; ++i)
// Check the dst vector's padding situation; only the first element of
// this dst vector is checked. It is the user's responsibility to make sure
// all data in the dst vector has the same padding situation.
if(dst_coord.IsUpperIndexMappedToValidOffset())
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(dst_vector_access_dim) = i;
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
reinterpret_cast<DstTData*>(&vector_data)[i] =
type_convert<DstTData>{}(p_buffer[buffer_offset]);
static_if<DstAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<DstData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
p_dst,
dst_nonlinear_coord.GetOffset(),
dst_linear_offset);
#else
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
#endif
}).Else([&](auto) {
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
});
}
const index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
// write vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
});
}
});
});
}
__device__ static constexpr bool HasWorkingOptimizedAddressCalculation()
{
#if CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION // tweaking
return true;
#else
return false;
#endif
}
}
// T can be Sequence or Array
template <class T, bool PositiveDirection>
__device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
template <typename T, bool PositiveDirection>
__device__ void MoveSrcSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{
const auto step_sizes = to_array(step_sizes_);
static_if<PositiveDirection>{}([&](auto) {
mSrcSliceOrigin += step_sizes;
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
}
template <class T, bool PositiveDirection>
__device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
template <typename T, bool PositiveDirection>
__device__ void MoveDstSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{
const auto step_sizes = to_array(step_sizes_);
static_if<PositiveDirection>{}([&](auto) {
mDstSliceOrigin += step_sizes;
}).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
}
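// Hedged usage sketch (the object name "copy" and the step value KPerBlock are
// assumptions): advance the src slice window along dimension 1 on each
// outer-loop iteration, e.g.
//   copy.MoveSrcSliceWindow(Sequence<0, KPerBlock, 0>{}, integral_constant<bool, true>{});
// and pass integral_constant<bool, false>{} to move in the negative direction.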
private:
SrcCoordinate mSrcSliceOrigin;
DstCoordinate mDstSliceOrigin;
SrcCoord mSrcSliceOrigin;
DstCoord mDstSliceOrigin;
};
} // namespace ck
......
......@@ -4,12 +4,9 @@
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#include "bfloat16_dev.hpp"
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INLINE_ASM 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
namespace ck {
......
......@@ -24,6 +24,21 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
return os;
}
template <class Range>
std::ostream& LogRangeStrided(std::ostream& os, Range&& range, std::string delim, size_t stride)
{
bool first = true;
for(size_t idx = 0; idx < range.size(); idx += stride)
{
if(first)
first = false;
else
os << delim;
os << range[idx];
}
return os;
}
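// Example usage (values are illustrative): print every 4th element of a range
//   std::vector<float> v(16, 1.f);
//   LogRangeStrided(std::cout << "v: ", v, ",", 4) << std::endl;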
typedef enum {
Half = 0,
Float = 1,
......
......@@ -1000,12 +1000,14 @@ int main(int argc, char* argv[])
}
#if CONV_DIRECTION_FWD_DATA // fwd data
check_error(out_nkhw_host, out_nkhw_device);
#elif CONV_DIRECTION_BWD_WEIT // bwd wrw
//#elif CONV_DIRECTION_BWD_WEIT // bwd wrw
#elif CONV_DIRECTION_BWD_WEIT
check_error(wei_kcyx_host, wei_kcyx);
#endif
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_device : ", out_nkhw.mData, ",") << std::endl;
//LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
//LogRangeStrided(std::cout << "out_nkhw_device : ", out_nkhw.mData, ",", 64) << std::endl;
LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
......