Commit 52423948 authored by Jehandad Khan

Merge branch 'master' into jd_redux

parents b97af4ec 98a2cfcc
@@ -6,7 +6,7 @@
namespace ck {
template <class Lengths>
__host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
__host__ __device__ constexpr auto calculate_tensor_strides_packed_old(Lengths)
{
return reverse_inclusive_scan_sequence(
Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
@@ -14,12 +14,12 @@ __host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
}
template <class Lengths, index_t Align>
__host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number<Align>)
__host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, Number<Align>)
{
constexpr index_t L_back_align =
Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
return calculate_tensor_strides_packed(
return calculate_tensor_strides_packed_old(
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
}
@@ -96,13 +96,12 @@ struct ConstantTensorDescriptor
__host__ __device__ static constexpr auto GetElementSize()
{
return Number<accumulate_on_sequence(
Lengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
return Number<reduce_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
}
__host__ __device__ static constexpr auto GetElementSpace()
{
constexpr index_t element_space_unaligned = accumulate_on_sequence(
constexpr index_t element_space_unaligned = reduce_on_sequence(
(GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
return Number<element_space_unaligned>{};
@@ -155,7 +154,7 @@ struct ConstantTensorDescriptor
constexpr auto multi_id = Sequence<Is...>{};
return Number<accumulate_on_sequence(
return Number<reduce_on_sequence(
multi_id * GetStrides(), math::plus<index_t>{}, Number<0>{})>{};
}
@@ -178,7 +177,7 @@ struct ConstantTensorDescriptor
{
constexpr auto IDim = IDim_{};
constexpr index_t stride = PackedStrides::Get(IDim);
multi_id.Set(IDim, id / stride);
multi_id(IDim) = id / stride;
id -= multi_id[IDim] * stride;
}
};
@@ -187,12 +186,12 @@ struct ConstantTensorDescriptor
{
Array<index_t, nDim> multi_id;
using PackedStrides = decltype(calculate_tensor_strides_packed(GetLengths()));
using PackedStrides = decltype(calculate_tensor_strides_packed_old(GetLengths()));
// calculate index in each of the dimensions in the order of their dimension
static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id));
multi_id.Set(Number<nDim - 1>{}, id / PackedStrides::Get(Number<nDim - 1>{}));
multi_id(Number<nDim - 1>{}) = id / PackedStrides::Get(Number<nDim - 1>{});
return multi_id;
}
@@ -204,7 +203,7 @@ struct ConstantTensorDescriptor
}
// This function doesn't do carry check on the highest dimension for positive stepping (or
// borrow check on the lowest dimension for negative stepping) , for performance reason. It is
// borrow check on the highest dimension for negative stepping) , for performance reason. It is
// the user's responsibility to make sure the result "new_mutli_id" is not out-of-bound on the
// highest dimension for positive stepping (or on the lowest dimension for negative stepping)
template <bool PositiveDirection>
@@ -304,14 +303,73 @@ struct ConstantTensorDescriptor
GetStrides().PushBack(leaf_tensor::GetStrides()))>{};
}
template <index_t IDimVector, index_t DataPerVector>
struct lambda_IsVectorizationAllowed
{
bool& is_allowed;
__host__ __device__ constexpr lambda_IsVectorizationAllowed(bool& is_allowed_)
: is_allowed(is_allowed_)
{
}
template <index_t IDim_>
__host__ __device__ constexpr void operator()(Number<IDim_>) const
{
constexpr auto IDim = Number<IDim_>{};
if(IDimVector != IDim && Strides::Get(IDim) % DataPerVector != 0)
{
is_allowed = false;
}
}
};
template <index_t IDimVector, index_t DataPerVector>
__host__ __device__ static constexpr bool IsVectorizationAllowed(Number<IDimVector>,
Number<DataPerVector>)
{
bool is_allowed = (Strides{}[IDimVector] == 1 || DataPerVector == 1) &&
Lengths{}[IDimVector] % DataPerVector == 0;
static_for<0, nDim, 1>{}(
lambda_IsVectorizationAllowed<IDimVector, DataPerVector>{is_allowed});
return is_allowed;
}
template <index_t IDim, index_t DataPerVector>
__host__ __device__ static constexpr auto Vectorize(Number<IDim>, Number<DataPerVector>)
{
constexpr auto idim = Number<IDim>{};
constexpr auto data_per_vector = Number<DataPerVector>{};
static_assert(IsVectorizationAllowed(idim, data_per_vector), "wrong!");
using vectorized_lengths =
decltype(Lengths::Modify(Number<IDim>{}, Number<Lengths{}[IDim] / DataPerVector>{}));
using vectorized_strides =
decltype((Strides{} / Number<DataPerVector>{}).Modify(Number<IDim>{}, Number<1>{}));
return ConstantTensorDescriptor<vectorized_lengths, vectorized_strides>{};
}
template <index_t IDim, index_t SliceLen>
__host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
{
using slice_lengths = decltype(Lengths{}.Modify(Number<IDim>{}, Number<SliceLen>{}));
using slice_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLen>{}));
return ConstantTensorDescriptor<slice_lengths, Strides>{};
}
template <index_t... Is>
__host__ __device__ static constexpr auto Slice(Sequence<Is...> slice_lengths)
{
static_assert(slice_lengths.GetSize() == nDim, "wrong!");
return ConstantTensorDescriptor<decltype(slice_lengths), Strides>{};
}
template <index_t IDim, index_t SliceLength, index_t SliceStride>
__host__ __device__ static constexpr auto
StridedSlice(Number<IDim>, Number<SliceLength>, Number<SliceStride>)
@@ -330,7 +388,7 @@ struct ConstantTensorDescriptor
constexpr auto fold_intervals = Sequence<FoldIntervals...>{};
constexpr index_t fold_intervals_product =
accumulate_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});
reduce_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});
constexpr auto unfold_length = GetLength(Number<IDim>{});
constexpr auto unfold_stride = GetStride(Number<IDim>{});
@@ -388,7 +446,7 @@ struct ConstantTensorDescriptor
static_assert(Type::Extract(middle).AreDimensionsContinuous(), "wrong! not unfoldable");
// unfolded length, stride
constexpr index_t unfold_length = accumulate_on_sequence(
constexpr index_t unfold_length = reduce_on_sequence(
GetLengths().Extract(middle), math::multiplies<index_t>{}, Number<1>{});
constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
@@ -409,7 +467,7 @@ struct ConstantTensorDescriptor
__host__ __device__ static constexpr auto Pack()
{
using packed_strides = decltype(calculate_tensor_strides_packed(Lengths{}));
using packed_strides = decltype(calculate_tensor_strides_packed_old(Lengths{}));
return ConstantTensorDescriptor<Lengths, packed_strides>{};
}
@@ -431,7 +489,7 @@ struct ConstantTensorDescriptor
template <class Lengths>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths)
{
using Strides = decltype(calculate_tensor_strides_packed(Lengths{}));
using Strides = decltype(calculate_tensor_strides_packed_old(Lengths{}));
return ConstantTensorDescriptor<Lengths, Strides>{};
}
@@ -444,7 +502,7 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
template <class Lengths, index_t Align>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>)
{
using Strides = decltype(calculate_tensor_strides_aligned(Lengths{}, Number<Align>{}));
using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number<Align>{}));
return ConstantTensorDescriptor<Lengths, Strides>{};
}
#ifndef CK_DIMENSION_HPP
#define CK_DIMENSION_HPP
#include "common_header.hpp"
namespace ck {
template <index_t Length>
struct Dimension
{
__host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
};
template <index_t Length, index_t Stride>
struct NativeDimension
{
__host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
__host__ __device__ static constexpr auto GetStride() { return Number<Stride>{}; }
__host__ __device__ static constexpr index_t CalculateOffset(index_t i) { return i * Stride; }
__host__ __device__ static constexpr index_t CalculateOffsetDiff(index_t i_diff)
{
return i_diff * Stride;
}
};
} // namespace ck
#endif
#ifndef CK_TENSOR_COORDINATE_HELPER_HPP
#define CK_TENSOR_COORDINATE_HELPER_HPP
#include "tensor_coordiante_v2.hpp"
namespace ck {
template <typename TensorDesc>
__host__ __device__ constexpr auto
make_tensor_coordinate_v2(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx)
{
return typename TensorCoordinate<TensorDesc>::type(idx);
}
} // namespace ck
#endif
#ifndef CK_TENSOR_VIEW_HPP
#define CK_TENSOR_VIEW_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_coordinate_deprecated.hpp"
namespace ck {
// TensorDesc is ConstantTensorDescriptor or ConstantMergedTensorDescriptor
template <class TensorDesc, class TData>
struct NormalTensorView
{
using type = NormalTensorView;
using tensor_desc_type = TensorDesc;
using coordinate_type = typename NormalTensorCoordinate_deprecated<TensorDesc>::type;
using data_type = TData;
static constexpr auto nDim = TensorDesc::GetNumOfDimension();
__host__ __device__ constexpr NormalTensorView(TData* p_data) : mpData{p_data} {}
__host__ __device__ constexpr NormalTensorView() : NormalTensorView{nullptr} {}
__host__ __device__ static constexpr auto GetNumOfDimension() { return nDim; }
__host__ __device__ static constexpr auto GetLengths() { return TensorDesc::GetLengths(); }
__host__ __device__ const TData& operator[](coordinate_type coord) const
{
return mpData[coord.GetOffset()];
}
__host__ __device__ TData& operator()(coordinate_type coord) const
{
return mpData[coord.GetOffset()];
}
template <class IDim, class DataPerVector>
__host__ __device__ static constexpr auto IsVectorizationAllowed(IDim, DataPerVector)
{
return TensorDesc::IsVectorizationAllowed(IDim{}, DataPerVector{});
}
template <class IDim, class DataPerVector>
__host__ __device__ auto Vectorize(IDim idim, DataPerVector data_per_vector) const
{
static_assert(IsVectorizationAllowed(idim, data_per_vector), "wrong!");
using vector_t = typename vector_type<TData, data_per_vector>::MemoryType;
return NormalTensorView<decltype(TensorDesc::Vectorize(idim, data_per_vector)), vector_t>(
reinterpret_cast<vector_t*>(mpData));
}
template <index_t... Is>
__host__ __device__ auto Slice(coordinate_type slice_origin, Sequence<Is...> slice_lengths)
{
static_assert(slice_lengths.GetSize() == nDim, "wrong!");
return NormalTensorView<decltype(TensorDesc::Slice(slice_lengths)), TData>(
mpData + slice_origin.GetOffset());
}
template <class IDim, class SliceLen>
__host__ __device__ auto
Slice(coordinate_type slice_origin, IDim idim, SliceLen slice_len) const
{
return NormalTensorView<decltype(TensorDesc::Slice(idim, slice_len)), TData>(
mpData + slice_origin.GetOffset());
}
// slice_window is a slicing window on "*this"
template <class SliceWindow, class T, bool PositiveDirection>
__device__ void MoveSliceWindow(SliceWindow& slice_window,
T step_sizes,
integral_constant<bool, PositiveDirection>)
{
if(PositiveDirection)
{
slice_window.mpData += coordinate_type{step_sizes}.GetOffset();
}
else
{
slice_window.mpData -= coordinate_type{step_sizes}.GetOffset();
}
}
// private:
data_type* mpData;
};
template <class... Xs, class TData>
__host__ __device__ constexpr auto make_TensorView(ConstantTensorDescriptor<Xs...>, TData* p_data)
{
return NormalTensorView<ConstantTensorDescriptor<Xs...>, TData>{p_data};
}
} // namespace ck
#endif
#ifndef CK_TENSOR_VISIT_HPP
#define CK_TENSOR_VISIT_HPP
#include "common_header.hpp"
#include "dimension.hpp"
#include "dimension_transform.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_coordinate.hpp"
namespace ck {
template <class TensorDescriptor>
struct TensorVisit
{
using Index = typename TensorDescriptor::Index;
using Coordinate = typename TensorCoordinate<TensorDescriptor>::type;
__host__ __device__ static void Run_v1(Index idx_begin)
{
const auto coord_begin = Coordinate(idx_begin);
ford<TensorDescriptor::GetLengths()>{}(
[&](auto idx_diff) { index_t offset = (coord_begin + idx_diff).GetOffset(); });
}
__host__ __device__ static void Run_v2(Index idx_begin)
{
const auto coord_begin = Coordinate(idx_begin);
ford<TensorDescriptor::GetLengths()>{}([&](auto idx_diff) {
index_t offset_diff = coord_begin.GetOffsetDiff(idx_diff);
index_t offset = coord_begin.GetOffset() + offset_diff;
});
}
__host__ __device__ static void Run_v3(Index idx_begin)
{
const auto coord_begin = Coordinate(idx_begin);
constexpr auto linear_dimensions = TensorDescriptor::GetLinearDimensions();
constexpr auto nonlinear_dimensions = TensorDescriptor::GetNonLinearDimensions();
constexpr auto lengths = TensorDescriptor::GetLengths();
constexpr auto linear_dimension_lengths_hack =
lambda_HackLengths{}(lengths, linear_dimensions);
constexpr auto nonlinear_dimension_lengths_hack =
lambda_HackLengths{}(lengths, nonlinear_dimensions);
ford<nonlinear_dimension_lengths_hack>{}([&](auto idx_diff_nonlinear_hack) {
// run-time component
index_t offset_diff_nonlinear = coord_begin.GetOffsetDiff(idx_diff_nonlinear_hack);
ford<linear_dimension_lengths_hack>{}([&](auto idx_diff_linear_hack) {
// compile-time component
index_t offset_diff_linear = coord_begin.GetOffsetDiff(idx_diff_linear_hack);
index_t offset =
coord_begin.GetOffset() + offset_diff_nonlinear + offset_diff_linear;
});
});
}
__host__ __device__ static void Run_v4(Index idx_begin)
{
const auto coord_begin = Coordinate(idx_begin);
constexpr auto linear_dimensions = TensorDescriptor::GetLinearDimensions();
constexpr auto nonlinear_independent_dimension_groups =
TensorDescriptor::GetNonLinearIndependentDimensionGroups();
constexpr auto lengths = TensorDescriptor::GetLengths();
constexpr auto linear_dimension_lengths = lambda_HackLengths{}(lengths, linear_dimensions);
// run-time component
index_t offset_diff_nonlinear = 0;
template <index_t NGroup>
struct f_recursion
{
template <index_t IGroup>
__host__ __device__ void Run(Number<IGroup>)
{
constexpr auto nonlinear_independent_dimensions_igroup =
nonlinear_independent_dimension_groups.Get(Number<IGroup>{});
constexpr auto nonlinear_independent_lengths_igroup =
lambda_HackLengths{}(lengths, nonlinear_independent_dimensions_igroup);
ford<nonlinear_independent_lengths_igroup>{}(
[&](auto idx_diff_nonlinear_igroup_hack) {
// run-time component
offset_diff_nonlinear +=
coord_begin.GetOffsetDiff(idx_diff_nonlinear_igroup_hack);
Run(Number<IGroup + 1>{});
});
};
// inner-most work
template <>
__host__ __device__ void Run(Number<NGroup>)
{
ford<linear_dimension_lengths>{}([&](auto idx_diff_linear_hack) {
// compile-time component
index_t offset_diff_linear = coord_begin.GetOffsetDiff(idx_diff_linear_hack);
index_t offset =
coord_begin.GetOffset() + offset_diff_nonlinear + offset_diff_linear;
});
}
};
f_recursion<nonlinear_independent_dimension_groups.GetSize()>{}.Run(Number<0>{});
}
};
} // namespace ck
#endif
@@ -563,7 +563,7 @@ struct Blockwise2dTensorCopy3
}
}
__device__ constexpr index_t GetRegisterClipboardSize() const
__device__ constexpr index_t GetRegisterBufferSize() const
{
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
@@ -579,8 +579,8 @@ struct Blockwise2dTensorCopy3
return DataPerRead * (L0 + thread_per_d0 - 1) / thread_per_d0;
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -630,8 +630,8 @@ struct Blockwise2dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -681,8 +681,8 @@ struct Blockwise2dTensorCopy3
}
#if CK_USE_AMD_INLINE_ASM
__device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
Float* p_clipboard) const
__device__ void RunLoadRegisterBuffer_asm(const Float* __restrict__ p_src,
Float* p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -741,8 +741,8 @@ struct Blockwise2dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard_asm(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer_asm(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -162,7 +162,7 @@ struct Blockwise3dTensorCopy3
"wrrong! BlockSize is not big enough for ThreadPerDims!");
constexpr index_t num_active_thread =
accumulate_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{});
reduce_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{});
if(BlockSize > num_active_thread)
{
@@ -237,7 +237,7 @@ struct Blockwise3dTensorCopy3
}
}
__device__ static constexpr index_t GetRegisterClipboardSize()
__device__ static constexpr index_t GetRegisterBufferSize()
{
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
@@ -260,8 +260,8 @@ struct Blockwise3dTensorCopy3
return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2;
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -316,8 +316,8 @@ struct Blockwise3dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};