Unverified commit 52c3fe05, authored by Chao Liu, committed by GitHub

Refactor for MIOpen integration (#4)

Refactor so that multi-index transformation and padding support can be brought into MIOpen
parent 9aaeacc8
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP #define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
#include "tensor_descriptor.hpp" #include "tensor_descriptor.hpp"
namespace ck { namespace ck {
...@@ -32,6 +32,11 @@ struct ConstantMatrixDescriptor ...@@ -32,6 +32,11 @@ struct ConstantMatrixDescriptor
return irow * RowStride_ + icol; return irow * RowStride_ + icol;
} }
__host__ __device__ static index_t CalculateOffset(index_t irow, index_t icol)
{
return GetOffsetFromMultiIndex(irow, icol);
}
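
A quick worked example of the row-major offset formula above (the 4x8 shape and stride are illustrative assumptions, not taken from this commit):

    // ConstantMatrixDescriptor with 4 rows, 8 columns, RowStride_ == 8:
    //   CalculateOffset(2, 3) == GetOffsetFromMultiIndex(2, 3) == 2 * 8 + 3 == 19
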
template <index_t SubNRow, index_t SubNCol> template <index_t SubNRow, index_t SubNCol>
__host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>, __host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
Number<SubNCol>) Number<SubNCol>)
...@@ -54,9 +59,10 @@ __host__ __device__ constexpr auto ...@@ -54,9 +59,10 @@ __host__ __device__ constexpr auto
} }
template <typename... Ts> template <typename... Ts>
__host__ __device__ constexpr auto make_ConstantMatrixDescriptor(ConstantTensorDescriptor<Ts...>) __host__ __device__ constexpr auto
make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated<Ts...>)
{ {
using TDesc = ConstantTensorDescriptor<Ts...>; using TDesc = ConstantTensorDescriptor_deprecated<Ts...>;
static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); static_assert(TDesc::GetNumOfDimension() == 2, "wrong");
static_assert(TDesc::GetStrides()[1] == 1, "wrong"); static_assert(TDesc::GetStrides()[1] == 1, "wrong");
return ConstantMatrixDescriptor<TDesc::GetLengths()[0], return ConstantMatrixDescriptor<TDesc::GetLengths()[0],
......
#ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP #ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_DEPRECATED_HPP
#define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP #define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_DEPRECATED_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
namespace ck { namespace ck {
// OriginalTensorDesc : ConstantTensorDescriptor<...> // OriginalTensorDesc : ConstantTensorDescriptor_deprecated<...>
// it's the tensor whose dimensions are to be merged // it's the tensor whose dimensions are to be merged
// OriginalDimMergeSeqs : Sequence<...>... // OriginalDimMergeSeqs : Sequence<...>...
// each is a sequence of original dimensions (of OriginalTensorDesc) to be merged // each is a sequence of original dimensions (of OriginalTensorDesc) to be merged
template <class OriginalTensorDesc, class... OriginalDimMergeSeqs> template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
struct ConstantMergedTensorDescriptor struct ConstantMergedTensorDescriptor_deprecated
{ {
using Type = ConstantMergedTensorDescriptor; using Type = ConstantMergedTensorDescriptor_deprecated;
static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{}; static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};
static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs); static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs);
static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension(); static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension();
__host__ __device__ constexpr ConstantMergedTensorDescriptor() __host__ __device__ constexpr ConstantMergedTensorDescriptor_deprecated()
{ {
static_assert(nDim <= nOriginalDim, "wrong!"); static_assert(nDim <= nOriginalDim, "wrong!");
...@@ -189,7 +189,7 @@ struct ConstantMergedTensorDescriptor ...@@ -189,7 +189,7 @@ struct ConstantMergedTensorDescriptor
{ {
constexpr auto lengths = GetLengths(); constexpr auto lengths = GetLengths();
constexpr auto strides = calculate_tensor_strides_packed(lengths); constexpr auto strides = calculate_tensor_strides_packed(lengths);
return ConstantTensorDescriptor<decltype(lengths), decltype(strides)>{}; return ConstantTensorDescriptor_deprecated<decltype(lengths), decltype(strides)>{};
} }
}; };
...@@ -197,7 +197,7 @@ template <class OriginalTensorDesc, class... OriginalDimMergeSeqs> ...@@ -197,7 +197,7 @@ template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
__host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, __host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc,
OriginalDimMergeSeqs...) OriginalDimMergeSeqs...)
{ {
return ConstantMergedTensorDescriptor<OriginalTensorDesc, OriginalDimMergeSeqs...>{}; return ConstantMergedTensorDescriptor_deprecated<OriginalTensorDesc, OriginalDimMergeSeqs...>{};
} }
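
To make the merge-sequence convention documented at the top of this file concrete, here is a hedged usage sketch (the 8x16x4 lengths are an illustrative assumption):

    // original 3-d tensor, packed layout
    constexpr auto desc_3d = make_ConstantTensorDescriptor_packed(Sequence<8, 16, 4>{});
    // 2-d merged view: dimension 0 is kept, dimensions 1 and 2 are merged into one length-64 dimension
    constexpr auto desc_2d =
        make_ConstantMergedTensorDescriptor(desc_3d, Sequence<0>{}, Sequence<1, 2>{});
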
template <class TDesc> template <class TDesc>
......
#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_HPP #ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_DEPRECATED_HPP
#define CK_CONSTANT_TENSOR_DESCRIPTOR_HPP #define CK_CONSTANT_TENSOR_DESCRIPTOR_DEPRECATED_HPP
#include "common_header.hpp" #include "common_header.hpp"
namespace ck { namespace ck {
template <class Lengths> template <class Lengths>
__host__ __device__ constexpr auto calculate_tensor_strides_packed_old(Lengths) __host__ __device__ constexpr auto calculate_tensor_strides_packed_deprecated(Lengths)
{ {
return reverse_inclusive_scan_sequence( return reverse_inclusive_scan_sequence(
Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{}) Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
...@@ -19,18 +19,18 @@ __host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, ...@@ -19,18 +19,18 @@ __host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths,
constexpr index_t L_back_align = constexpr index_t L_back_align =
Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align); Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
return calculate_tensor_strides_packed_old( return calculate_tensor_strides_packed_deprecated(
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{})); Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
} }
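
As a quick numeric check of the two stride helpers above (lengths and alignment chosen for illustration):

    // packed: strides are a reverse inclusive product scan of the trailing lengths
    //   Lengths {2, 3, 4}           -> Strides {12, 4, 1}
    // aligned: the last length is first rounded up to a multiple of Align
    //   Lengths {2, 3, 4}, Align 8  -> Strides {24, 8, 1}
    constexpr auto packed_strides  = calculate_tensor_strides_packed_deprecated(Sequence<2, 3, 4>{});
    constexpr auto aligned_strides = calculate_tensor_strides_aligned_old(Sequence<2, 3, 4>{}, Number<8>{});
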
template <class Lengths, class Strides> template <class Lengths, class Strides>
struct ConstantTensorDescriptor struct ConstantTensorDescriptor_deprecated
{ {
using Type = ConstantTensorDescriptor; using Type = ConstantTensorDescriptor_deprecated;
static constexpr index_t nDim = Lengths::GetSize(); static constexpr index_t nDim = Lengths::GetSize();
__host__ __device__ constexpr ConstantTensorDescriptor() __host__ __device__ constexpr ConstantTensorDescriptor_deprecated()
{ {
static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent"); static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent");
} }
...@@ -186,7 +186,7 @@ struct ConstantTensorDescriptor ...@@ -186,7 +186,7 @@ struct ConstantTensorDescriptor
{ {
Array<index_t, nDim> multi_id; Array<index_t, nDim> multi_id;
using PackedStrides = decltype(calculate_tensor_strides_packed_old(GetLengths())); using PackedStrides = decltype(calculate_tensor_strides_packed_deprecated(GetLengths()));
// calculate index in each of the dimensions in the order of their dimension // calculate index in each of the dimensions in the order of their dimension
static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id)); static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id));
...@@ -284,7 +284,7 @@ struct ConstantTensorDescriptor ...@@ -284,7 +284,7 @@ struct ConstantTensorDescriptor
using extract_lengths = decltype(Lengths::Extract(extract_dims...)); using extract_lengths = decltype(Lengths::Extract(extract_dims...));
using extract_strides = decltype(Strides::Extract(extract_dims...)); using extract_strides = decltype(Strides::Extract(extract_dims...));
return ConstantTensorDescriptor<extract_lengths, extract_strides>{}; return ConstantTensorDescriptor_deprecated<extract_lengths, extract_strides>{};
} }
template <index_t... IDims> template <index_t... IDims>
...@@ -294,13 +294,13 @@ struct ConstantTensorDescriptor ...@@ -294,13 +294,13 @@ struct ConstantTensorDescriptor
} }
template <class... Ts> template <class... Ts>
__host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor<Ts...>) __host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor_deprecated<Ts...>)
{ {
using leaf_tensor = ConstantTensorDescriptor<Ts...>; using leaf_tensor = ConstantTensorDescriptor_deprecated<Ts...>;
return ConstantTensorDescriptor<decltype(GetLengths().PushBack(leaf_tensor::GetLengths())), return ConstantTensorDescriptor_deprecated<
decltype( decltype(GetLengths().PushBack(leaf_tensor::GetLengths())),
GetStrides().PushBack(leaf_tensor::GetStrides()))>{}; decltype(GetStrides().PushBack(leaf_tensor::GetStrides()))>{};
} }
template <index_t IDimVector, index_t DataPerVector> template <index_t IDimVector, index_t DataPerVector>
...@@ -351,7 +351,7 @@ struct ConstantTensorDescriptor ...@@ -351,7 +351,7 @@ struct ConstantTensorDescriptor
using vectorized_strides = using vectorized_strides =
decltype((Strides{} / Number<DataPerVector>{}).Modify(Number<IDim>{}, Number<1>{})); decltype((Strides{} / Number<DataPerVector>{}).Modify(Number<IDim>{}, Number<1>{}));
return ConstantTensorDescriptor<vectorized_lengths, vectorized_strides>{}; return ConstantTensorDescriptor_deprecated<vectorized_lengths, vectorized_strides>{};
} }
template <index_t IDim, index_t SliceLen> template <index_t IDim, index_t SliceLen>
...@@ -359,7 +359,7 @@ struct ConstantTensorDescriptor ...@@ -359,7 +359,7 @@ struct ConstantTensorDescriptor
{ {
using slice_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLen>{})); using slice_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLen>{}));
return ConstantTensorDescriptor<slice_lengths, Strides>{}; return ConstantTensorDescriptor_deprecated<slice_lengths, Strides>{};
} }
template <index_t... Is> template <index_t... Is>
...@@ -367,7 +367,7 @@ struct ConstantTensorDescriptor ...@@ -367,7 +367,7 @@ struct ConstantTensorDescriptor
{ {
static_assert(slice_lengths.GetSize() == nDim, "wrong!"); static_assert(slice_lengths.GetSize() == nDim, "wrong!");
return ConstantTensorDescriptor<decltype(slice_lengths), Strides>{}; return ConstantTensorDescriptor_deprecated<decltype(slice_lengths), Strides>{};
} }
template <index_t IDim, index_t SliceLength, index_t SliceStride> template <index_t IDim, index_t SliceLength, index_t SliceStride>
...@@ -379,7 +379,7 @@ struct ConstantTensorDescriptor ...@@ -379,7 +379,7 @@ struct ConstantTensorDescriptor
using new_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLength>{})); using new_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLength>{}));
using new_strides = decltype(Strides::Modify(Number<IDim>{}, Number<new_stride>{})); using new_strides = decltype(Strides::Modify(Number<IDim>{}, Number<new_stride>{}));
return ConstantTensorDescriptor<new_lengths, new_strides>{}; return ConstantTensorDescriptor_deprecated<new_lengths, new_strides>{};
} }
template <index_t IDim, index_t... FoldIntervals> template <index_t IDim, index_t... FoldIntervals>
...@@ -418,7 +418,7 @@ struct ConstantTensorDescriptor ...@@ -418,7 +418,7 @@ struct ConstantTensorDescriptor
constexpr auto new_strides = constexpr auto new_strides =
GetStrides().Extract(left).PushBack(fold_strides).PushBack(GetStrides().Extract(right)); GetStrides().Extract(left).PushBack(fold_strides).PushBack(GetStrides().Extract(right));
return ConstantTensorDescriptor<decltype(new_lengths), decltype(new_strides)>{}; return ConstantTensorDescriptor_deprecated<decltype(new_lengths), decltype(new_strides)>{};
} }
template <index_t IDim, index_t... FoldIntervals> template <index_t IDim, index_t... FoldIntervals>
...@@ -462,54 +462,55 @@ struct ConstantTensorDescriptor ...@@ -462,54 +462,55 @@ struct ConstantTensorDescriptor
.PushBack(Number<unfold_stride>{}) .PushBack(Number<unfold_stride>{})
.PushBack(GetStrides().Extract(right)); .PushBack(GetStrides().Extract(right));
return ConstantTensorDescriptor<decltype(new_lengths), decltype(new_strides)>{}; return ConstantTensorDescriptor_deprecated<decltype(new_lengths), decltype(new_strides)>{};
} }
__host__ __device__ static constexpr auto Pack() __host__ __device__ static constexpr auto Pack()
{ {
using packed_strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); using packed_strides = decltype(calculate_tensor_strides_packed_deprecated(Lengths{}));
return ConstantTensorDescriptor<Lengths, packed_strides>{}; return ConstantTensorDescriptor_deprecated<Lengths, packed_strides>{};
} }
template <class MapNew2Old> template <class MapNew2Old>
__host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old) __host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old)
{ {
return ConstantTensorDescriptor<decltype(Lengths::ReorderGivenNew2Old(MapNew2Old{})), return ConstantTensorDescriptor_deprecated<
decltype(Strides::ReorderGivenNew2Old(MapNew2Old{}))>{}; decltype(Lengths::ReorderGivenNew2Old(MapNew2Old{})),
decltype(Strides::ReorderGivenNew2Old(MapNew2Old{}))>{};
} }
template <class MapOld2New> template <class MapOld2New>
__host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New) __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New)
{ {
return ConstantTensorDescriptor<decltype(Lengths::ReorderGivenOld2New(MapOld2New{})), return ConstantTensorDescriptor_deprecated<
decltype(Strides::ReorderGivenOld2New(MapOld2New{}))>{}; decltype(Lengths::ReorderGivenOld2New(MapOld2New{})),
decltype(Strides::ReorderGivenOld2New(MapOld2New{}))>{};
} }
}; };
template <class Lengths> template <class Lengths>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths) __host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths)
{ {
using Strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); using Strides = decltype(calculate_tensor_strides_packed_deprecated(Lengths{}));
return ConstantTensorDescriptor<Lengths, Strides>{}; return ConstantTensorDescriptor_deprecated<Lengths, Strides>{};
} }
template <class Lengths, class Strides> template <class Lengths, class Strides>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides) __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides)
{ {
return ConstantTensorDescriptor<Lengths, Strides>{}; return ConstantTensorDescriptor_deprecated<Lengths, Strides>{};
} }
template <class Lengths, index_t Align> template <class Lengths, index_t Align>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>) __host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>)
{ {
using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number<Align>{})); using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number<Align>{}));
return ConstantTensorDescriptor<Lengths, Strides>{}; return ConstantTensorDescriptor_deprecated<Lengths, Strides>{};
} }
template <index_t... Lengths, index_t... Strides> template <index_t... Lengths, index_t... Strides>
__host__ __device__ void __host__ __device__ void print_ConstantTensorDescriptor(
print_ConstantTensorDescriptor(const char* s, const char* s, ConstantTensorDescriptor_deprecated<Sequence<Lengths...>, Sequence<Strides...>>)
ConstantTensorDescriptor<Sequence<Lengths...>, Sequence<Strides...>>)
{ {
constexpr index_t ndim = sizeof...(Lengths); constexpr index_t ndim = sizeof...(Lengths);
......
#ifndef CK_PRINT_TENSOR_DESCRIPTOR_HPP
#define CK_PRINT_TENSOR_DESCRIPTOR_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
namespace ck {
template <typename... NativeDimensions>
__host__ __device__ void
print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
{
print_tensor_descriptor_impl(s, desc.GetLengths(), desc.GetStrides());
}
template <typename... Ts>
__host__ __device__ void print_tensor_descriptor(const char* s,
const TransformedTensorDescriptor<Ts...>& desc)
{
print_tensor_descriptor_impl(s, desc.GetLengths());
}
template <index_t... Lengths, index_t... Strides>
__host__ __device__ void
print_tensor_descriptor_impl(const char* s, Sequence<Lengths...>, Sequence<Strides...>)
{
constexpr index_t nDim = sizeof...(Lengths);
static_assert(nDim > 0 && nDim <= 12, "wrong!");
static_if<nDim == 1>{}([&](auto) {
printf("%s dim %u, lengths {%u}, strides {%u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 2>{}([&](auto) {
printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 3>{}([&](auto) {
printf(
"%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 4>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 5>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 6>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 7>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 8>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 9>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u "
"%u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 10>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 11>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u "
"%u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 12>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u "
"%u %u %u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
}
template <index_t... Lengths>
__host__ __device__ void print_tensor_descriptor_impl(const char* s, Sequence<Lengths...>)
{
constexpr index_t nDim = sizeof...(Lengths);
static_assert(nDim > 0 && nDim <= 12, "wrong!");
static_if<nDim == 1>{}([&](auto) { printf("%s dim %u, lengths {%u}\n", s, nDim, Lengths...); });
static_if<nDim == 2>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 3>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 4>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 5>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 6>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u}, \n", s, nDim, Lengths...); });
static_if<nDim == 7>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 8>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 9>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 10>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 11>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 12>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
}
} // namespace ck
#endif
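
A hedged usage sketch of the printing helpers in this new header (the 2x3 descriptor is an illustrative assumption):

    // prints: "A: dim 2, lengths {2 3}, strides {3 1}"
    constexpr auto desc = make_native_tensor_descriptor(Sequence<2, 3>{}, Sequence<3, 1>{});
    print_tensor_descriptor("A:", desc);
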
#ifndef CK_TENSOR_COORDINATE_V2_HPP #ifndef CK_TENSOR_COORDINATE_HPP
#define CK_TENSOR_COORDINATE_V2_HPP #define CK_TENSOR_COORDINATE_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "dimension.hpp" #include "dimension.hpp"
...@@ -8,9 +8,24 @@ ...@@ -8,9 +8,24 @@
namespace ck { namespace ck {
// A "tensor cooridnate" is an opaque object that represents a "point of location" inside a tensor
// At the bare minimun, user should be able to query the following information from a tensor
// coordinate:
// 1. Tensor descriptor
// 2. Location, represented in the form of multi-index
// 3. Location, represented in the form of the offset to the origin of the tensor
// 4. If the location is inside invalid area or not, i.e. the padding area of an implicitly padded
// tensor is considered invalid, because the padding area doesn't have any physical memory
// allocation
// A tensor cooridnate also provides following functionality:
// 1. Given step size in each dimension, update itself, or return a new tensor cooridnate, so user
// can freely move the "point of location" inside the tensor
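
A hedged sketch of the interface described above; make_tensor_coordinate and IsUpperIndexMappedToValidOffset appear elsewhere in this commit, while the GetOffset accessor and the MultiIndex initializer syntax are assumptions:

    // desc: a NativeTensorDescriptor or TransformedTensorDescriptor
    auto coord = make_tensor_coordinate(desc, MultiIndex<2>{{1, 2}}); // location as a multi-index
    const index_t offset = coord.GetOffset();                         // location as offset to the origin
    const bool valid = coord.IsUpperIndexMappedToValidOffset();       // false inside padding
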
// wrapper class for NativeTensorCoordinate and TransformedTensorCoordinate
template <typename TensorDesc> template <typename TensorDesc>
struct TensorCoordinate; struct TensorCoordinate;
// tensor coordinate for native tensor
template <typename NativeTensorDesc> template <typename NativeTensorDesc>
struct NativeTensorCoordinate struct NativeTensorCoordinate
{ {
...@@ -78,12 +93,10 @@ struct NativeTensorCoordinate ...@@ -78,12 +93,10 @@ struct NativeTensorCoordinate
return coord; return coord;
} }
#if 0 // tweaking
__host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff) __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
{ {
return tensor_desc_type::CalculateOffsetDiff(idx_diff); return tensor_desc_type::CalculateOffsetDiff(idx_diff);
} }
#endif
__host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; } __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }
...@@ -96,6 +109,7 @@ struct NativeTensorCoordinate ...@@ -96,6 +109,7 @@ struct NativeTensorCoordinate
index_t mOffset; index_t mOffset;
}; };
// tensor coordinate for transformed tensor
template <typename TransformedTensorDesc> template <typename TransformedTensorDesc>
struct TransformedTensorCoordinate struct TransformedTensorCoordinate
{ {
...@@ -177,10 +191,10 @@ struct TransformedTensorCoordinate ...@@ -177,10 +191,10 @@ struct TransformedTensorCoordinate
return coord_up; return coord_up;
} }
#if 0 // tweaking
// Calculate offset diff without updating tensor-coordinate // Calculate offset diff without updating tensor-coordinate
// If idx_up_diff is know at compile time, and has only non-zero entries on linear dimensions, // If idx_up_diff is know at compile time, and has only non-zero entries on linear dimensions,
// then all calculation can be done at compile-time. // then all calculation can be done at compile-time.
// TODO: this function is not compiled to expected ISA
__host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const __host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const
{ {
// For transformation of multi-index difference, not all transformation functions need to // For transformation of multi-index difference, not all transformation functions need to
...@@ -191,7 +205,6 @@ struct TransformedTensorCoordinate ...@@ -191,7 +205,6 @@ struct TransformedTensorCoordinate
return GetLowerCoordinate().CalculateOffsetDiff(idx_low_diff); return GetLowerCoordinate().CalculateOffsetDiff(idx_low_diff);
} }
#endif
__host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
{ {
......
...@@ -2,12 +2,12 @@ ...@@ -2,12 +2,12 @@
#define CK_TENSOR_COORDINATE_DEPRECATED_HPP #define CK_TENSOR_COORDINATE_DEPRECATED_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMergedTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor_deprecated.hpp"
namespace ck { namespace ck {
// TensorDesc is ConstantTensorDescriptor // TensorDesc is ConstantTensorDescriptor_deprecated
template <class TensorDesc> template <class TensorDesc>
struct NormalTensorCoordinate_deprecated struct NormalTensorCoordinate_deprecated
{ {
...@@ -95,18 +95,19 @@ struct NormalTensorCoordinate_deprecated ...@@ -95,18 +95,19 @@ struct NormalTensorCoordinate_deprecated
index_t mOffset; index_t mOffset;
}; };
// TensorDesc is ConstantMergedTensorDescriptor // TensorDesc is ConstantMergedTensorDescriptor_deprecated
template <class TensorDesc> template <class TensorDesc>
struct MergedTensorCoordinate struct MergedTensorCoordinate_deprecated
{ {
using type = MergedTensorCoordinate; using type = MergedTensorCoordinate_deprecated;
using tensor_desc_type = TensorDesc; using tensor_desc_type = TensorDesc;
static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
static constexpr index_t nOriginalDim = static constexpr index_t nOriginalDim =
tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension(); tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();
__host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index) __host__
__device__ constexpr MergedTensorCoordinate_deprecated(Array<index_t, nDim> tensor_index)
: mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)} : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
{ {
// partial offset on each dimension // partial offset on each dimension
...@@ -127,8 +128,8 @@ struct MergedTensorCoordinate ...@@ -127,8 +128,8 @@ struct MergedTensorCoordinate
} }
template <class... Xs> template <class... Xs>
__host__ __device__ constexpr MergedTensorCoordinate(Xs... xs) __host__ __device__ constexpr MergedTensorCoordinate_deprecated(Xs... xs)
: MergedTensorCoordinate(Array<index_t, nDim>{xs...}) : MergedTensorCoordinate_deprecated(Array<index_t, nDim>{xs...})
{ {
} }
...@@ -311,7 +312,7 @@ struct MergedTensorCoordinate ...@@ -311,7 +312,7 @@ struct MergedTensorCoordinate
// dimensions, and those merged dimensions, that would never be involved in index // dimensions, and those merged dimensions, that would never be involved in index
// arithmetic after construction of TensorCoordinate. // arithmetic after construction of TensorCoordinate.
// TODO: refactor TensorCoordinate, after introducing the concept of "dimensions" // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions"
// and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to // and simplify implementation of ConstantMergedTensorDescriptor_deprecated, so we don't need to
// count on compiler to optimize away those register memory for us // count on compiler to optimize away those register memory for us
Array<index_t, nOriginalDim> mOriginalIndex; Array<index_t, nOriginalDim> mOriginalIndex;
Array<index_t, nDim> mPartialOffsets; Array<index_t, nDim> mPartialOffsets;
...@@ -326,16 +327,17 @@ struct TensorCoordinate_deprecated ...@@ -326,16 +327,17 @@ struct TensorCoordinate_deprecated
private: private:
template <class... Ts> template <class... Ts>
__host__ __device__ static constexpr auto __host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>) MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated<Ts...>)
{ {
return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>(); return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor_deprecated<Ts...>>();
} }
template <class... Ts> template <class... Ts>
__host__ __device__ static constexpr auto __host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>) MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated<Ts...>)
{ {
return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>(); return MergedTensorCoordinate_deprecated<
ConstantMergedTensorDescriptor_deprecated<Ts...>>();
} }
public: public:
......
#ifndef CK_TENSOR_COORDINATE_HELPER_HPP #ifndef CK_TENSOR_COORDINATE_HELPER_HPP
#define CK_TENSOR_COORDINATE_HELPER_HPP #define CK_TENSOR_COORDINATE_HELPER_HPP
#include "tensor_coordiante_v2.hpp" #include "tensor_coordiante_hpp"
namespace ck { namespace ck {
template <typename TensorDesc> template <typename TensorDesc>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
make_tensor_coordinate_v2(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx) make_tensor_coordinate(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx)
{ {
return typename TensorCoordinate<TensorDesc>::type(idx); return typename TensorCoordinate<TensorDesc>::type(idx);
} }
......
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
namespace ck { namespace ck {
// tensor descriptor for "native tensor"
// A "native tensor" is a "true" tensor that can be represented by Lengths and Strides
template <typename... NativeDimensions> template <typename... NativeDimensions>
struct NativeTensorDescriptor struct NativeTensorDescriptor
{ {
...@@ -113,12 +115,10 @@ struct NativeTensorDescriptor ...@@ -113,12 +115,10 @@ struct NativeTensorDescriptor
__host__ __device__ static constexpr auto GetNonLinearDimensions() { return Sequence<>{}; } __host__ __device__ static constexpr auto GetNonLinearDimensions() { return Sequence<>{}; }
#if 0
__host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
{ {
return Tuple<>{}; return Tuple<>{};
} }
#endif
__host__ __device__ static constexpr bool __host__ __device__ static constexpr bool
IsUpperIndexMappedToValidOffset(const Index& /* idx */) IsUpperIndexMappedToValidOffset(const Index& /* idx */)
...@@ -127,14 +127,11 @@ struct NativeTensorDescriptor ...@@ -127,14 +127,11 @@ struct NativeTensorDescriptor
} }
}; };
// LowerTensorDescriptor
// Transforms: Tuple<DimensionTransforms...>
// LowerDimensionIds: Tuple<Sequence<...>>
// UpperDimensionIds: Tuple<Sequence<...>>
template <typename LowTensorDescriptor,
          typename Transforms,
          typename LowDimensionIds,
          typename UpDimensionIds>
// Tensor descriptor for "transformed tensor"
template <typename LowTensorDescriptor, // NativeTensorDescriptor or TransformedTensorDescriptor
          typename Transforms,          // Tuple<MultiIndexTransforms...>
          typename LowDimensionIds,     // Tuple<Sequence<...>>
          typename UpDimensionIds>      // Tuple<Sequence<...>>
struct TransformedTensorDescriptor struct TransformedTensorDescriptor
{ {
using type = TransformedTensorDescriptor; using type = TransformedTensorDescriptor;
...@@ -412,6 +409,7 @@ struct TransformedTensorDescriptor ...@@ -412,6 +409,7 @@ struct TransformedTensorDescriptor
{ {
#if 0 #if 0
// create tuple of linear dimension masks, for all transformations // create tuple of linear dimension masks, for all transformations
// TODO: this doesn't compile, because transform_tuples() complains about constexpr
constexpr auto tuple_of_linear_dimension_mask = constexpr auto tuple_of_linear_dimension_mask =
transform_tuples(lambda_get_linear_dimension_mask_of_single_tranform{}, transform_tuples(lambda_get_linear_dimension_mask_of_single_tranform{},
Transforms{}, Transforms{},
...@@ -419,7 +417,7 @@ struct TransformedTensorDescriptor ...@@ -419,7 +417,7 @@ struct TransformedTensorDescriptor
UpDimensionIds{}); UpDimensionIds{});
#else #else
// create tuple of linear dimension masks, for all transformations // create tuple of linear dimension masks, for all transformations
// TODO: this is a hack, transform_tuples() doesn't compile, complain about constexpr // TODO: this is a hack
constexpr auto tuple_of_linear_dimension_mask = dummy_transform_tuples_impl( constexpr auto tuple_of_linear_dimension_mask = dummy_transform_tuples_impl(
lambda_get_linear_dimension_mask_of_single_tranform{}, lambda_get_linear_dimension_mask_of_single_tranform{},
Transforms{}, Transforms{},
...@@ -465,7 +463,7 @@ struct TransformedTensorDescriptor ...@@ -465,7 +463,7 @@ struct TransformedTensorDescriptor
#if 0 #if 0
__host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
{ {
// not implemented // TODO: not implemented
} }
#endif #endif
......
...@@ -63,10 +63,11 @@ template <typename LowerTensorDescriptor, ...@@ -63,10 +63,11 @@ template <typename LowerTensorDescriptor,
index_t... LowerLengths, index_t... LowerLengths,
index_t... LowerDimensionIds, index_t... LowerDimensionIds,
index_t... UpperDimensionIds> index_t... UpperDimensionIds>
__host__ __device__ constexpr auto reorder_tensor_descriptor_impl(LowerTensorDescriptor, __host__ __device__ constexpr auto
Sequence<LowerLengths...>, reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
Sequence<LowerDimensionIds...>, Sequence<LowerLengths...>,
Sequence<UpperDimensionIds...>) Sequence<LowerDimensionIds...>,
Sequence<UpperDimensionIds...>)
{ {
return TransformedTensorDescriptor<LowerTensorDescriptor, return TransformedTensorDescriptor<LowerTensorDescriptor,
Tuple<PassThrough<LowerLengths>...>, Tuple<PassThrough<LowerLengths>...>,
...@@ -74,17 +75,40 @@ __host__ __device__ constexpr auto reorder_tensor_descriptor_impl(LowerTensorDes ...@@ -74,17 +75,40 @@ __host__ __device__ constexpr auto reorder_tensor_descriptor_impl(LowerTensorDes
Tuple<Sequence<UpperDimensionIds>...>>{}; Tuple<Sequence<UpperDimensionIds>...>>{};
} }
template <typename LowerTensorDescriptor, typename MapLower2Upper> // reorder a NativeTensorDescriptor
template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLower2Upper)
{
static_assert(is_valid_sequence_map<MapLower2Upper>{},
"wrong! MapLower2Upper is not a valid map");
constexpr auto old_desc = NativeTensorDescriptor<Ts...>{};
static_assert(old_desc.GetNumOfDimension() == MapLower2Upper::Size(), "wrong!");
constexpr auto new_lengths = old_desc.GetLengths().ReorderGivenOld2New(MapLower2Upper{});
constexpr auto new_strides = old_desc.GetStrides().ReorderGivenOld2New(MapLower2Upper{});
return make_native_tensor_descriptor(new_lengths, new_strides);
}
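
A hedged usage sketch of the NativeTensorDescriptor overload above (the NCHW-to-NHWC map and the lengths/strides are illustrative assumptions; MapLower2Upper[i] is read as the upper position of lower dimension i):

    // lower (old) dimension order: N C H W
    constexpr auto desc_nchw =
        make_native_tensor_descriptor(Sequence<1, 8, 4, 4>{}, Sequence<128, 16, 4, 1>{});
    // N->0, C->3, H->1, W->2 yields an N H W C view of the same memory
    constexpr auto desc_nhwc =
        reorder_tensor_descriptor_given_lower2upper(desc_nchw, Sequence<0, 3, 1, 2>{});
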
// reorder a TransformedTensorDescriptor
template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(LowerTensorDescriptor, MapLower2Upper) reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor<Ts...>, MapLower2Upper)
{ {
static_assert(is_valid_sequence_map<MapLower2Upper>{}, static_assert(is_valid_sequence_map<MapLower2Upper>{},
"wrong! MapLower2Upper is not a valid map"); "wrong! MapLower2Upper is not a valid map");
return reorder_tensor_descriptor_impl( constexpr auto low_desc = TransformedTensorDescriptor<Ts...>{};
LowerTensorDescriptor{},
LowerTensorDescriptor::GetLengths(), static_assert(low_desc.GetNumOfDimension() == MapLower2Upper::Size(), "wrong!");
typename arithmetic_sequence_gen<0, LowerTensorDescriptor::GetNumOfDimension(), 1>::type{},
return reorder_transformed_tensor_descriptor_impl(
low_desc,
low_desc.GetLengths(),
typename arithmetic_sequence_gen<0, low_desc.GetNumOfDimension(), 1>::type{},
MapLower2Upper{}); MapLower2Upper{});
} }
...@@ -97,7 +121,7 @@ __host__ __device__ constexpr auto ...@@ -97,7 +121,7 @@ __host__ __device__ constexpr auto
} }
template <typename Lengths, typename Strides> template <typename Lengths, typename Strides>
__host__ __device__ constexpr bool AreDimensionsUnfoldable(Lengths, Strides) __host__ __device__ constexpr bool are_dimensions_unfoldable(Lengths, Strides)
{ {
static_assert(Lengths::Size() == Strides::Size(), "wrong!"); static_assert(Lengths::Size() == Strides::Size(), "wrong!");
...@@ -129,7 +153,7 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript ...@@ -129,7 +153,7 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript
constexpr auto right = typename arithmetic_sequence_gen<LastUnfoldDim + 1, nDim, 1>::type{}; constexpr auto right = typename arithmetic_sequence_gen<LastUnfoldDim + 1, nDim, 1>::type{};
// sanity-check if unfoldable // sanity-check if unfoldable
static_assert(AreDimensionsUnfoldable(desc.GetLengths(middle), desc.GetStrides(middle)), static_assert(are_dimensions_unfoldable(desc.GetLengths(middle), desc.GetStrides(middle)),
"wrong! not unfoldable"); "wrong! not unfoldable");
// unfolded length, stride // unfolded length, stride
...@@ -148,30 +172,6 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript ...@@ -148,30 +172,6 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript
return make_native_tensor_descriptor(new_lengths, new_strides); return make_native_tensor_descriptor(new_lengths, new_strides);
} }
#if 0
// not implemented
template <typename LowerTensorDescriptor,
typename PadDimensionIds,
typename LeftPads,
typename RightPads>
__host__ __device__ constexpr auto
pad_tensor_descriptor(LowerTensorDescriptor, PadLowerDimensionIds, LeftPads, RightPads)
{
constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension();
constexpr auto non_pad_low_dim_ids = xxx;
return transform_tensor_descriptor(
LowerTensorDescriptor{},
make_tuple(Pad<decltype(LowerTensorDescriptor::GetLengths(PadLowerDimensionIds{})),
LeftPads,
RightPads>{})
.PushBack(PassThrough<xxxx>...),
make_tuple(PadLowerDimensionIds{}).PushBack(xxxx),
sequence_to_tuple(typename arithmetic_sequence_gen<0, nDim, 1> i::type{}));
}
#endif
// a cluster maps a 1-d index to an N-d index // a cluster maps a 1-d index to an N-d index
template <typename Lengths, typename ArrangeOrder> template <typename Lengths, typename ArrangeOrder>
struct ClusterDescriptor struct ClusterDescriptor
...@@ -205,169 +205,7 @@ template <typename Lengths, ...@@ -205,169 +205,7 @@ template <typename Lengths,
__host__ __device__ constexpr auto make_cluster_descriptor( __host__ __device__ constexpr auto make_cluster_descriptor(
Lengths, ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{}) Lengths, ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
{ {
return ClusterDescriptor<Lengths, ArrangeOrder>{}; return ClusterDescriptor<Lengths, decltype(order)>{};
}
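
A hedged usage sketch of make_cluster_descriptor (the 8x32 arrangement is an illustrative assumption); note that returning ClusterDescriptor<Lengths, decltype(order)> keeps the returned type in sync with the defaulted order argument:

    // map a 1-d thread id onto an 8x32 cluster, default arrange order
    constexpr auto cluster = make_cluster_descriptor(Sequence<8, 32>{});
    // same lengths with an explicitly reversed arrange order
    constexpr auto cluster_reordered = make_cluster_descriptor(Sequence<8, 32>{}, Sequence<1, 0>{});
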
template <typename... NativeDimensions>
__host__ __device__ void
print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
{
print_tensor_descriptor_impl(s, desc.GetLengths(), desc.GetStrides());
}
template <typename... Ts>
__host__ __device__ void print_tensor_descriptor(const char* s,
const TransformedTensorDescriptor<Ts...>& desc)
{
print_tensor_descriptor_impl(s, desc.GetLengths());
}
template <index_t... Lengths, index_t... Strides>
__host__ __device__ void
print_tensor_descriptor_impl(const char* s, Sequence<Lengths...>, Sequence<Strides...>)
{
constexpr index_t nDim = sizeof...(Lengths);
static_assert(nDim > 0 && nDim <= 12, "wrong!");
static_if<nDim == 1>{}([&](auto) {
printf("%s dim %u, lengths {%u}, strides {%u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 2>{}([&](auto) {
printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 3>{}([&](auto) {
printf(
"%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 4>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 5>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 6>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 7>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 8>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 9>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u "
"%u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 10>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 11>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u "
"%u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 12>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u "
"%u %u %u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
}
template <index_t... Lengths>
__host__ __device__ void print_tensor_descriptor_impl(const char* s, Sequence<Lengths...>)
{
constexpr index_t nDim = sizeof...(Lengths);
static_assert(nDim > 0 && nDim <= 12, "wrong!");
static_if<nDim == 1>{}([&](auto) { printf("%s dim %u, lengths {%u}\n", s, nDim, Lengths...); });
static_if<nDim == 2>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 3>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 4>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 5>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 6>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u}, \n", s, nDim, Lengths...); });
static_if<nDim == 7>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 8>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 9>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 10>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 11>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 12>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
} }
} // namespace ck } // namespace ck
......
...@@ -68,64 +68,118 @@ struct BlockwiseGenericTensorSliceCopy_v4 ...@@ -68,64 +68,118 @@ struct BlockwiseGenericTensorSliceCopy_v4
template <typename BlockSrcData, template <typename BlockSrcData,
typename ThreadBufferData, typename ThreadBufferData,
address_space_t BlockSrcAddressSpace = address_space_t::generic, AddressSpace BlockSrcAddressSpace,
address_space_t ThreadBufferAddressSpace = address_space_t::generic> AddressSpace ThreadBufferAddressSpace>
__device__ void
RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer,
integral_constant<AddressSpace, BlockSrcAddressSpace>,
integral_constant<AddressSpace, ThreadBufferAddressSpace>) const
{
constexpr auto block_src_address_space =
integral_constant<AddressSpace, BlockSrcAddressSpace>{};
constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseLoad.Run_optimized_src_address_calculation(
p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
}
else
{
mThreadwiseLoad.Run(
p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
}
}
template <typename BlockSrcData, typename ThreadBufferData>
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer) const ThreadBufferData* p_thread_buffer) const
{ {
#if 1 constexpr auto generic_address_space =
mThreadwiseLoad.template Run<BlockSrcData, integral_constant<AddressSpace, AddressSpace::generic>{};
ThreadBufferData,
BlockSrcAddressSpace, RunLoadThreadBuffer(
ThreadBufferAddressSpace>(p_block_src, p_thread_buffer); p_block_src, p_thread_buffer, generic_address_space, generic_address_space);
#else // tweaking
mThreadwiseLoad.template Run_optimized_src_address_calculation<BlockSrcData,
ThreadBufferData,
BlockSrcAddressSpace,
ThreadBufferAddressSpace>(
p_block_src, p_thread_buffer);
#endif
} }
template <typename ThreadBufferData, template <typename ThreadBufferData,
typename BlockDstData, typename BlockDstData,
address_space_t ThreadBufferAddressSpace = address_space_t::generic, AddressSpace ThreadBufferAddressSpace,
address_space_t BlockDstAddressSpace = address_space_t::generic> AddressSpace BlockDstAddressSpace>
__device__ void
RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, ThreadBufferAddressSpace>,
integral_constant<AddressSpace, BlockDstAddressSpace>) const
{
constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
constexpr auto block_dst_address_space =
integral_constant<AddressSpace, BlockDstAddressSpace>{};
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseStore.Run_optimized_dst_address_calculation(
p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
}
else
{
mThreadwiseStore.Run(
p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
}
}
template <typename ThreadBufferData, typename BlockDstData>
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst) const BlockDstData* p_block_dst) const
{ {
#if 1 constexpr auto generic_address_space =
mThreadwiseStore.template Run<ThreadBufferData, integral_constant<AddressSpace, AddressSpace::generic>{};
BlockDstData,
ThreadBufferAddressSpace, RunStoreThreadBuffer(
BlockDstAddressSpace>(p_thread_buffer, p_block_dst); p_thread_buffer, p_block_dst, generic_address_space, generic_address_space);
#else // tweaking
mThreadwiseStore.template Run_optimized_dst_address_calculation<ThreadBufferData,
BlockDstData,
ThreadBufferAddressSpace,
BlockDstAddressSpace>(
p_thread_buffer, p_block_dst);
#endif
} }
template <typename BlockSrcData, template <typename BlockSrcData,
typename BlockDstData, typename BlockDstData,
address_space_t BlockSrcAddressSpace = address_space_t::generic, AddressSpace BlockSrcAddressSpace,
address_space_t BlockDstAddressSpace = address_space_t::generic> AddressSpace BlockDstAddressSpace>
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const __device__ void
Run(const BlockSrcData* p_block_src,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, BlockSrcAddressSpace> block_src_address_space,
integral_constant<AddressSpace, BlockDstAddressSpace> block_dst_address_space) const
{ {
BlockSrcData p_thread_buffer[GetThreadBufferSize()]; BlockSrcData p_thread_buffer[GetThreadBufferSize()];
RunLoadThreadBuffer<BlockSrcData, constexpr auto generic_address_space =
BlockSrcData, integral_constant<AddressSpace, AddressSpace::generic>{};
BlockSrcAddressSpace,
address_space_t::generic>(p_block_src, p_thread_buffer); RunLoadThreadBuffer(
p_block_src, p_thread_buffer, block_src_address_space, generic_address_space);
// if there is type conversion, it's done during store // if there is type conversion, it's done during store
RunStoreThreadBuffer<BlockSrcData, RunStoreThreadBuffer(
BlockDstData, p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space);
address_space_t::generic, }
BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
template <typename BlockSrcData, typename BlockDstData>
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
Run(p_block_src, p_block_dst, generic_address_space, generic_address_space);
} }
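
A hedged sketch of how a kernel now selects address spaces at the call site; AddressSpace::global and AddressSpace::lds are assumed enumerators (only AddressSpace::generic appears in this diff), and blockwise_copy, p_global_src, p_lds_dst are hypothetical names:

    constexpr auto global_space = integral_constant<AddressSpace, AddressSpace::global>{};
    constexpr auto lds_space    = integral_constant<AddressSpace, AddressSpace::lds>{};
    // explicitly tagged copy: global memory -> LDS
    blockwise_copy.Run(p_global_src, p_lds_dst, global_space, lds_space);
    // the two-argument overload still works and defaults both sides to generic
    blockwise_copy.Run(p_global_src, p_lds_dst);
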
template <typename T, bool PositiveDirection> template <typename T, bool PositiveDirection>
......
...@@ -2,15 +2,11 @@ ...@@ -2,15 +2,11 @@
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP #define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMergedTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor_deprecated.hpp"
#include "tensor_coordinate_deprecated.hpp" #include "tensor_coordinate_deprecated.hpp"
#include "threadwise_generic_tensor_slice_copy_deprecated.hpp" #include "threadwise_generic_tensor_slice_copy_deprecated.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif
namespace ck { namespace ck {
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor // Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
...@@ -20,7 +16,7 @@ namespace ck { ...@@ -20,7 +16,7 @@ namespace ck {
// that, on a merged dimension that contains multiple original dimensions, the length of // that, on a merged dimension that contains multiple original dimensions, the length of
// the last original dimension needs to be evenly divisible by its sub-lengths. Also, the // the last original dimension needs to be evenly divisible by its sub-lengths. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are performed // repeat-length on the merged dimension needs to be 1. These sanity checks are performed
// in constructor of BlockwiseGenericTensorSliceCopy_v1 // in constructor of BlockwiseGenericTensorSliceCopy_v1_deprecated
template <index_t BlockSize, template <index_t BlockSize,
typename SrcDesc, typename SrcDesc,
typename DstDesc, typename DstDesc,
...@@ -34,7 +30,7 @@ template <index_t BlockSize, ...@@ -34,7 +30,7 @@ template <index_t BlockSize,
index_t DstVectorAccessDim, index_t DstVectorAccessDim,
index_t SrcDataPerAccess, index_t SrcDataPerAccess,
index_t DstDataPerAccess> index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v1 struct BlockwiseGenericTensorSliceCopy_v1_deprecated
{ {
static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
...@@ -62,7 +58,8 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -62,7 +58,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId; Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId; Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;
__device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_id_begin, __device__
BlockwiseGenericTensorSliceCopy_v1_deprecated(Array<index_t, nDim> src_block_data_id_begin,
Array<index_t, nDim> dst_block_data_id_begin) Array<index_t, nDim> dst_block_data_id_begin)
{ {
// check NDim consistency // check NDim consistency
...@@ -196,14 +193,14 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -196,14 +193,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths); return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
} }
__device__ static constexpr index_t GetRegisterBufferSize() __device__ static constexpr index_t GetThreadBufferSize()
{ {
return GetRegisterBufferDescriptor().GetElementSpace(); return GetRegisterBufferDescriptor().GetElementSpace();
} }
template <typename TData> template <typename TData>
__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src, __device__ void RunLoadThreadBuffer(const TData* __restrict__ p_src,
TData* __restrict__ p_buffer) const TData* __restrict__ p_buffer) const
{ {
constexpr auto thread_sub_tensor_lengths = SubLengths{}; constexpr auto thread_sub_tensor_lengths = SubLengths{};
...@@ -244,22 +241,22 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -244,22 +241,22 @@ struct BlockwiseGenericTensorSliceCopy_v1
// that contains multiple original dimensions, the length of the last original // that contains multiple original dimensions, the length of the last original
// dimension needs to be evenly divisible by its sub-lengths. Also, the repeat-length on // dimension needs to be evenly divisible by its sub-lengths. Also, the repeat-length on
// the merged dimension needs to be 1. These sanity checks are performed in constructor // the merged dimension needs to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1 // of BlockwiseGenericTensorSliceCopy_v1_deprecated
ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc, ThreadwiseGenericTensorSliceCopy_v1r2_deprecated<SrcDesc,
decltype(thread_buffer_desc), decltype(thread_buffer_desc),
SubLengths, SubLengths,
SrcDimAccessOrder, SrcDimAccessOrder,
SrcVectorAccessDim, SrcVectorAccessDim,
SrcDataPerAccess, SrcDataPerAccess,
1>(make_zero_array<index_t, nDim>(), 1>(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>()) make_zero_array<index_t, nDim>())
.Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
}); });
} }
template <typename TData> template <typename TData>
__device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer, __device__ void RunStoreThreadBuffer(const TData* __restrict__ p_buffer,
TData* __restrict__ p_dst) const TData* __restrict__ p_dst) const
{ {
constexpr auto thread_sub_tensor_lengths = SubLengths{}; constexpr auto thread_sub_tensor_lengths = SubLengths{};
...@@ -299,14 +296,14 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -299,14 +296,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
// that contains multiple original dimensions, the length of the last original // that contains multiple original dimensions, the length of the last original
// dimension needs to be evenly divisible by its sub-lengths. Also, the repeat-length on // dimension needs to be evenly divisible by its sub-lengths. Also, the repeat-length on
// the merged dimension needs to be 1. These sanity checks are performed in constructor // the merged dimension needs to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1 // of BlockwiseGenericTensorSliceCopy_v1_deprecated
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc), ThreadwiseGenericTensorSliceCopy_v1r2_deprecated<decltype(thread_buffer_desc),
DstDesc, DstDesc,
SubLengths, SubLengths,
DstDimAccessOrder, DstDimAccessOrder,
DstVectorAccessDim, DstVectorAccessDim,
1, 1,
DstDataPerAccess>( DstDataPerAccess>(
make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>()) make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
.Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
}); });
...@@ -315,10 +312,10 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -315,10 +312,10 @@ struct BlockwiseGenericTensorSliceCopy_v1
template <typename TData> template <typename TData>
__device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
{ {
TData p_buffer[GetRegisterBufferSize()]; TData p_buffer[GetThreadBufferSize()];
RunLoadRegisterBuffer(p_src, p_buffer); RunLoadThreadBuffer(p_src, p_buffer);
RunStoreRegisterBuffer(p_buffer, p_dst); RunStoreThreadBuffer(p_buffer, p_dst);
} }
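For illustration only, a minimal host-side sketch (names and loop structure assumed, not taken from this diff) of the two-stage pattern that Run expresses above: the slice is first staged into a per-thread buffer by RunLoadThreadBuffer, then written out by RunStoreThreadBuffer, so the two sides can use different access orders and vector widths.

#include <cstddef>
#include <vector>

template <typename TData>
void run_copy_sketch(const TData* p_src, TData* p_dst, std::size_t thread_buffer_size)
{
    // stands in for "TData p_buffer[GetThreadBufferSize()]"
    std::vector<TData> buffer(thread_buffer_size);

    // RunLoadThreadBuffer: source -> thread buffer
    for(std::size_t i = 0; i < thread_buffer_size; ++i)
        buffer[i] = p_src[i];

    // RunStoreThreadBuffer: thread buffer -> destination
    for(std::size_t i = 0; i < thread_buffer_size; ++i)
        p_dst[i] = buffer[i];
}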
// When moving the slicing windows along a merged dimension, if the strides of the // When moving the slicing windows along a merged dimension, if the strides of the
...@@ -432,14 +429,14 @@ template <index_t BlockSize, ...@@ -432,14 +429,14 @@ template <index_t BlockSize,
index_t DstVectorAccessDim, index_t DstVectorAccessDim,
index_t SrcDataPerAccess, index_t SrcDataPerAccess,
index_t DstDataPerAccess> index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v2 struct BlockwiseGenericTensorSliceCopy_v2_deprecated
{ {
static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
using Index = MultiIndex<nDim>; using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin, __device__ constexpr BlockwiseGenericTensorSliceCopy_v2_deprecated(
const Index& dst_block_slice_origin) const Index& src_block_slice_origin, const Index& dst_block_slice_origin)
{ {
static_assert( static_assert(
nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
...@@ -478,42 +475,96 @@ struct BlockwiseGenericTensorSliceCopy_v2 ...@@ -478,42 +475,96 @@ struct BlockwiseGenericTensorSliceCopy_v2
return ThreadBufferDesc::GetElementSpace(); return ThreadBufferDesc::GetElementSpace();
} }
template <typename SrcData, template <typename BlockSrcData,
typename DstData, typename ThreadBufferData,
address_space_t BlockSrcAddressSpace = address_space_t::generic, AddressSpace BlockSrcAddressSpace,
address_space_t ThreadBufferAddressSpace = address_space_t::generic> AddressSpace ThreadBufferAddressSpace>
__device__ void RunLoadThreadBuffer(const SrcData* p_block_src, DstData* p_thread_buffer) const __device__ void
RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer,
integral_constant<AddressSpace, BlockSrcAddressSpace>,
integral_constant<AddressSpace, ThreadBufferAddressSpace>) const
{ {
mThreadwiseLoad constexpr auto block_src_address_space =
.template Run<SrcData, DstData, BlockSrcAddressSpace, ThreadBufferAddressSpace>( integral_constant<AddressSpace, BlockSrcAddressSpace>{};
p_block_src, p_thread_buffer); constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
mThreadwiseLoad.Run(
p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
} }
template <typename SrcData, template <typename BlockSrcData, typename ThreadBufferData>
typename DstData, __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
address_space_t ThreadBufferAddressSpace = address_space_t::generic, ThreadBufferData* p_thread_buffer) const
address_space_t BlockDstAddressSpace = address_space_t::generic>
__device__ void RunStoreThreadBuffer(const SrcData* p_thread_buffer, DstData* p_block_dst) const
{ {
mThreadwiseStore constexpr auto generic_address_space =
.template Run<SrcData, DstData, ThreadBufferAddressSpace, BlockDstAddressSpace>( integral_constant<AddressSpace, AddressSpace::generic>{};
p_thread_buffer, p_block_dst);
RunLoadThreadBuffer(
p_block_src, p_thread_buffer, generic_address_space, generic_address_space);
} }
template <typename SrcData, template <typename ThreadBufferData,
typename DstData, typename BlockDstData,
address_space_t BlockSrcAddressSpace = address_space_t::generic, AddressSpace ThreadBufferAddressSpace,
address_space_t BlockDstAddressSpace = address_space_t::generic> AddressSpace BlockDstAddressSpace>
__device__ void Run(const SrcData* p_block_src, DstData* p_block_dst) const __device__ void
RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, ThreadBufferAddressSpace>,
integral_constant<AddressSpace, BlockDstAddressSpace>) const
{ {
SrcData p_thread_buffer[GetThreadBufferSize()]; constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
constexpr auto block_dst_address_space =
integral_constant<AddressSpace, BlockDstAddressSpace>{};
RunLoadThreadBuffer<SrcData, SrcData, BlockSrcAddressSpace, address_space_t::generic>( mThreadwiseStore.Run(
p_block_src, p_thread_buffer); p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
}
template <typename ThreadBufferData, typename BlockDstData>
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
RunStoreThreadBuffer(
p_thread_buffer, p_block_dst, generic_address_space, generic_address_space);
}
template <typename BlockSrcData,
typename BlockDstData,
AddressSpace BlockSrcAddressSpace,
AddressSpace BlockDstAddressSpace>
__device__ void
Run(const BlockSrcData* p_block_src,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, BlockSrcAddressSpace> block_src_address_space,
integral_constant<AddressSpace, BlockDstAddressSpace> block_dst_address_space) const
{
BlockSrcData p_thread_buffer[GetThreadBufferSize()];
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
RunLoadThreadBuffer(
p_block_src, p_thread_buffer, block_src_address_space, generic_address_space);
// if there is type conversion, it's done during store // if there is type conversion, it's done during store
RunStoreThreadBuffer<SrcData, DstData, address_space_t::generic, BlockDstAddressSpace>( RunStoreThreadBuffer(
p_thread_buffer, p_block_dst); p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space);
}
template <typename BlockSrcData, typename BlockDstData>
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
Run(p_block_src, p_block_dst, generic_address_space, generic_address_space);
} }
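The overload set above follows a tag-dispatch convention: the address space travels as a compile-time value wrapped in integral_constant, tagged overloads do the real work, and untagged overloads forward with generic tags on both sides. A small stand-alone sketch of the same idea, with all enum and function names assumed rather than taken from the codebase:

#include <type_traits>

enum class AddrSpaceDemo { generic, global, lds };

template <AddrSpaceDemo AS>
using addr_space_tag = std::integral_constant<AddrSpaceDemo, AS>;

template <AddrSpaceDemo SrcAS, AddrSpaceDemo DstAS>
void copy_sketch(const float* p_src, float* p_dst, addr_space_tag<SrcAS>, addr_space_tag<DstAS>)
{
    // the real copy would pick buffer_load/buffer_store here when an address space is global
    *p_dst = *p_src;
}

// untagged overload: defaults both sides to the generic address space
inline void copy_sketch(const float* p_src, float* p_dst)
{
    copy_sketch(p_src,
                p_dst,
                addr_space_tag<AddrSpaceDemo::generic>{},
                addr_space_tag<AddrSpaceDemo::generic>{});
}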
template <typename T, bool PositiveDirection> template <typename T, bool PositiveDirection>
...@@ -533,25 +584,25 @@ struct BlockwiseGenericTensorSliceCopy_v2 ...@@ -533,25 +584,25 @@ struct BlockwiseGenericTensorSliceCopy_v2
private: private:
using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc, using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated<SrcDesc,
ThreadBufferDesc, ThreadBufferDesc,
SubLengths, SubLengths,
SrcDimAccessOrder, SrcDimAccessOrder,
SrcDimAccessOrder, SrcDimAccessOrder,
SrcVectorAccessDim, SrcVectorAccessDim,
SrcVectorAccessDim, SrcVectorAccessDim,
SrcDataPerAccess, SrcDataPerAccess,
1>; 1>;
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1<ThreadBufferDesc, using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated<ThreadBufferDesc,
DstDesc, DstDesc,
SubLengths, SubLengths,
DstDimAccessOrder, DstDimAccessOrder,
DstDimAccessOrder, DstDimAccessOrder,
DstVectorAccessDim, DstVectorAccessDim,
DstVectorAccessDim, DstVectorAccessDim,
1, 1,
DstDataPerAccess>; DstDataPerAccess>;
ThreadwiseLoad mThreadwiseLoad; ThreadwiseLoad mThreadwiseLoad;
ThreadwiseStore mThreadwiseStore; ThreadwiseStore mThreadwiseStore;
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define CK_THREADWISE_DIRECT_CONVOLUTION_HPP #define CK_THREADWISE_DIRECT_CONVOLUTION_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
#include "threadwise_tensor_slice_copy.hpp" #include "threadwise_tensor_slice_copy.hpp"
namespace ck { namespace ck {
......
...@@ -3,102 +3,164 @@ ...@@ -3,102 +3,164 @@
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantMatrixDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp"
#include "math.hpp"
namespace ck { namespace ck {
template <class Float, class Matrix> template <typename Float, class Matrix>
__device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread) __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
{ {
for(index_t i = 0; i < Matrix::NRow(); ++i) for(index_t i = 0; i < Matrix::NRow(); ++i)
{ {
for(index_t j = 0; j < Matrix::NCol(); ++j) for(index_t j = 0; j < Matrix::NCol(); ++j)
{ {
const index_t id = Matrix::GetOffsetFromMultiIndex(i, j); const index_t id = Matrix::CalculateOffset(i, j);
p_thread[id] = Float(0); p_thread[id] = Float(0);
} }
} }
} }
template <class Float, template <typename SrcMatrix,
class SrcMatrix, typename DstMatrix,
class DstMatrix, index_t NSliceRow,
index_t NRow, index_t NSliceCol,
index_t NCol, index_t DataPerAccess>
index_t DataPerRead> struct ThreadwiseMatrixSliceCopy
__device__ void threadwise_matrix_copy(SrcMatrix,
const Float* __restrict__ p_src,
DstMatrix,
Float* __restrict__ p_dst,
Sequence<NRow, NCol>,
Number<DataPerRead>)
{ {
static_assert(NCol % DataPerRead == 0, "wrong! should be NCol % == DataPerRead == 0"); __device__ constexpr ThreadwiseMatrixSliceCopy()
{
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType; static_assert(SrcMatrix::RowStride() % DataPerAccess == 0 &&
DstMatrix::RowStride() % DataPerAccess == 0,
constexpr auto src_mtx = SrcMatrix{}; "wrong! wrong alignment");
constexpr auto dst_mtx = DstMatrix{}; static_assert(NSliceCol % DataPerAccess == 0,
"wrong! should be NSliceCol % DataPerAccess == 0");
}
for(index_t i = 0; i < NRow; ++i) template <typename Data>
__device__ static void Run(const Data* p_src, Data* p_dst)
{ {
for(index_t j = 0; j < NCol; j += DataPerRead) using vector_t = typename vector_type<Data, DataPerAccess>::MemoryType;
for(index_t i = 0; i < NSliceRow; ++i)
{ {
const index_t src_index = src_mtx.GetOffsetFromMultiIndex(i, j); for(index_t j = 0; j < NSliceCol; j += DataPerAccess)
const index_t dst_index = dst_mtx.GetOffsetFromMultiIndex(i, j); {
const index_t src_index = SrcMatrix::CalculateOffset(i, j);
const index_t dst_index = DstMatrix::CalculateOffset(i, j);
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) = *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]); *reinterpret_cast<const vector_t*>(&p_src[src_index]);
}
} }
} }
} };
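A hypothetical usage note for ThreadwiseMatrixSliceCopy (descriptor type names are placeholders, not from this diff): copying a 4 x 8 sub-tile between two row-major thread-local matrices with two elements per vector access would look roughly like this.

// SrcMtx and DstMtx stand for ConstantMatrixDescriptor specializations whose RowStride
// is a multiple of DataPerAccess; the names are illustrative only.
// ThreadwiseMatrixSliceCopy<SrcMtx, DstMtx, /*NSliceRow*/ 4, /*NSliceCol*/ 8, /*DataPerAccess*/ 2>
//     ::Run(p_src_thread, p_dst_thread);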
template <class MatrixA, // C += transpose(A) * B
class MatrixB, // Element of matrix can be vectorized data
class MatrixC, template <typename MatrixA, typename MatrixB, typename MatrixC>
bool TransA, struct ThreadwiseGemmTransANormalBNormalC
bool TransB,
bool TransC,
class FloatA,
class FloatB,
class FloatC>
__device__ void threadwise_gemm(MatrixA,
integral_constant<bool, TransA>,
const FloatA* __restrict__ p_a_thread,
MatrixB,
integral_constant<bool, TransB>,
const FloatB* __restrict__ p_b_thread,
MatrixC,
integral_constant<bool, TransC>,
FloatC* __restrict__ p_c_thread)
{ {
static_if<TransA && (!TransB) && (!TransC)>{}([&](auto) { __device__ constexpr ThreadwiseGemmTransANormalBNormalC()
constexpr auto a_mtx = MatrixA{}; {
constexpr auto b_mtx = MatrixB{}; static_assert(MatrixA::NRow() == MatrixB::NRow() && MatrixA::NCol() == MatrixC::NRow() &&
constexpr auto c_mtx = MatrixC{}; MatrixB::NCol() == MatrixC::NCol(),
"wrong!");
}
constexpr index_t M = c_mtx.NRow(); template <typename FloatA, typename FloatB, typename FloatC>
constexpr index_t N = c_mtx.NCol(); __device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
constexpr index_t K = a_mtx.NRow(); // A is transposed {
constexpr index_t M = MatrixC::NRow();
constexpr index_t N = MatrixC::NCol();
constexpr index_t K = MatrixA::NRow(); // A is transposed
for(index_t k = 0; k < K; ++k) for(index_t k = 0; k < K; ++k)
{ {
for(index_t i = 0; i < M; ++i) for(index_t m = 0; m < M; ++m)
{ {
for(index_t j = 0; j < N; ++j) for(index_t n = 0; n < N; ++n)
{ {
const index_t aindex = a_mtx.GetOffsetFromMultiIndex(k, i); // A is transposed const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed
const index_t bindex = b_mtx.GetOffsetFromMultiIndex(k, j); const index_t bindex = MatrixB::CalculateOffset(k, n);
const index_t cindex = c_mtx.GetOffsetFromMultiIndex(i, j); const index_t cindex = MatrixC::CalculateOffset(m, n);
p_c_thread[cindex] += p_a_thread[aindex] * p_b_thread[bindex]; p_c[cindex] +=
inner_product_with_conversion<FloatC>{}(p_a[aindex], p_b[bindex]);
} }
} }
} }
}).Else([&](auto fwd) { }
// not implemented
static_assert(fwd(false), "wrong! support for this config is not implemented"); #if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
}); template <typename FloatA, typename FloatB, typename FloatC>
} __device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
constexpr index_t M = MatrixC::NRow();
constexpr index_t N = MatrixC::NCol();
constexpr index_t K = MatrixA::NRow(); // A is transposed
static_assert(N == 4 || N == 2, "wrong! this config is not supported by asm yet");
for(index_t k = 0; k < K; ++k)
{
for(index_t m = 0; m < M; ++m)
{
const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed
static_if<N == 2>{}([&](auto) {
const index_t bindex_0 = MatrixB::CalculateOffset(k, 0);
const index_t bindex_1 = MatrixB::CalculateOffset(k, 1);
const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);
__outer_product_1x2(
p_a[aindex], p_b[bindex_0], p_b[bindex_1], p_c[cindex_0], p_c[cindex_1]);
});
static_if<N == 4>{}([&](auto) {
const index_t bindex_0 = MatrixB::CalculateOffset(k, 0);
const index_t bindex_1 = MatrixB::CalculateOffset(k, 1);
const index_t bindex_2 = MatrixB::CalculateOffset(k, 2);
const index_t bindex_3 = MatrixB::CalculateOffset(k, 3);
const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);
const index_t cindex_2 = MatrixC::CalculateOffset(m, 2);
const index_t cindex_3 = MatrixC::CalculateOffset(m, 3);
__outer_product_1x4(p_a[aindex],
p_b[bindex_0],
p_b[bindex_1],
p_b[bindex_2],
p_b[bindex_3],
p_c[cindex_0],
p_c[cindex_1],
p_c[cindex_2],
p_c[cindex_3]);
});
}
}
}
#endif
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
constexpr bool has_amd_asm = is_same<FloatC, float>{} &&
((is_same<FloatA, float>{} && is_same<FloatB, float>{}) ||
(is_same<FloatA, half2_t>{} && is_same<FloatB, half2_t>{}) ||
(is_same<FloatA, half4_t>{} && is_same<FloatB, half4_t>{}));
static_if<has_amd_asm>{}([&](auto fwd) {
Run_amd_asm(p_a, p_b, fwd(p_c));
}).Else([&](auto) { Run_source(p_a, p_b, p_c); });
#else
Run_source(p_a, p_b, p_c);
#endif
}
};
} // namespace ck } // namespace ck
#endif #endif
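As a plain host-side reference of what Run_source computes (strides assumed row-major and passed explicitly; this is not part of the kernel code): C += transpose(A) * B with A stored K x M, B stored K x N, and C stored M x N, using the same k-m-n loop order.

// a: K x M with row stride lda (read transposed), b: K x N with row stride ldb,
// c: M x N with row stride ldc, accumulated in place.
void gemm_At_B_reference(const float* a, const float* b, float* c,
                         int M, int N, int K, int lda, int ldb, int ldc)
{
    for(int k = 0; k < K; ++k)
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
                c[m * ldc + n] += a[k * lda + m] * b[k * ldb + n];
}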
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
#define CK_THREADWISE_GENERIC_TENSOR_OP_HPP #define CK_THREADWISE_GENERIC_TENSOR_OP_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMergedTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor_deprecated.hpp"
namespace ck { namespace ck {
template <class Float, class TDesc> template <class Float, class TDesc>
......
...@@ -6,14 +6,6 @@ ...@@ -6,14 +6,6 @@
#include "tensor_descriptor_helper.hpp" #include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp" #include "tensor_coordinate.hpp"
#ifndef CK_USE_AMD_INTRINSIC
#define CK_USE_AMD_INTRINSIC 1
#endif
#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#endif
namespace ck { namespace ck {
// This version uses multi-index transformation // This version uses multi-index transformation
...@@ -76,9 +68,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -76,9 +68,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Will do padding check on dst data: No write if dst data is in padding area. // Will do padding check on dst data: No write if dst data is in padding area.
template <typename SrcData, template <typename SrcData,
typename DstData, typename DstData,
address_space_t SrcAddressSpace = address_space_t::generic, AddressSpace SrcAddressSpace,
address_space_t DstAddressSpace = address_space_t::generic> AddressSpace DstAddressSpace>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const __device__ void Run(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{ {
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType; using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType; using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
...@@ -122,15 +117,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -122,15 +117,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Check src vector's padding situation, only check the first data in this src // Check src vector's padding situation, only check the first data in this src
// vector. It's the user's responsibility to make sure all data in the src vector // vector. It's the user's responsibility to make sure all data in the src vector
// has // has the same padding situation
// the same padding situation
if(src_coord.IsUpperIndexMappedToValidOffset()) if(src_coord.IsUpperIndexMappedToValidOffset())
{ {
static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) { static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE #if CK_USE_AMD_BUFFER_ADDRESSING
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) = *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>( __buffer_load<SrcData, SrcDataPerAccess>(
p_src, src_coord.GetOffset(), 0); fwd(p_src), src_coord.GetOffset(), 0);
#else #else
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) = *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]); *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
...@@ -163,15 +157,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -163,15 +157,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Check dst vector's padding situation, only check the first data in this dst // Check dst vector's padding situation, only check the first data in this dst
// vector. It's the user's responsibility to make sure all data in the dst vector // vector. It's the user's responsibility to make sure all data in the dst vector
// has // has the same padding situation
// the same padding situation
if(dst_coord.IsUpperIndexMappedToValidOffset()) if(dst_coord.IsUpperIndexMappedToValidOffset())
{ {
static_if<DstAddressSpace == address_space_t::global>{}([&](auto) { static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE #if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<DstData, DstDataPerAccess>( __buffer_store<DstData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]), *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
p_dst, fwd(p_dst),
dst_coord.GetOffset(), dst_coord.GetOffset(),
0); 0);
#else #else
...@@ -188,6 +181,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -188,6 +181,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
}); });
} }
template <typename SrcData, typename DstData>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
Run(p_src, p_dst, generic_address_space, generic_address_space);
}
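A hypothetical call site for the tagged Run above (object and pointer names assumed), copying from global memory into a register-backed destination while leaving the destination side generic:

// constexpr auto global_addr_space  = integral_constant<AddressSpace, AddressSpace::global>{};
// constexpr auto generic_addr_space = integral_constant<AddressSpace, AddressSpace::generic>{};
// threadwise_copy.Run(p_global_src, p_thread_dst, global_addr_space, generic_addr_space);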
// Modify Length to 1, if Mask is set to false // Modify Length to 1, if Mask is set to false
// Used for isolating linear dimension from non-linear dimensions // Used for isolating linear dimension from non-linear dimensions
template <index_t... Lengths, index_t... Mask> template <index_t... Lengths, index_t... Mask>
...@@ -202,12 +204,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -202,12 +204,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on src data: Read 0 if src data is in padding area.
// Will do padding check on dst data: No write if dst data is in padding area. // Will do padding check on dst data: No write if dst data is in padding area.
// This version is optimized for address calculation of src tensor // This version is optimized for address calculation of src tensor
// TODO: this function does not compile to the expected ISA
template <typename SrcData, template <typename SrcData,
typename DstData, typename DstData,
address_space_t SrcAddressSpace = address_space_t::generic, AddressSpace SrcAddressSpace,
address_space_t DstAddressSpace = address_space_t::generic> AddressSpace DstAddressSpace>
__device__ void Run_optimized_src_address_calculation(const SrcData* p_src, __device__ void
DstData* p_dst) const Run_optimized_src_address_calculation(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{ {
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType; using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType; using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
...@@ -287,14 +293,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -287,14 +293,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
const auto src_coord = const auto src_coord =
src_nonlinear_coord + (linear_dim_data_steps + scalar_id); src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
#if 1 // tweaking #if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is src compile-time offset // this is src compile-time offset
const index_t src_linear_offset = const index_t src_linear_offset =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset(); src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else #else
// this is src compile-time offset // this is src compile-time offset
const index_t src_linear_offset = const index_t src_linear_offset =
src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id); src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
#endif #endif
// Check src vector's padding situation, only check the first data in // Check src vector's padding situation, only check the first data in
...@@ -302,8 +308,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -302,8 +308,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// the src vector has the same padding situation // the src vector has the same padding situation
if(src_coord.IsUpperIndexMappedToValidOffset()) if(src_coord.IsUpperIndexMappedToValidOffset())
{ {
static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) { static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE #if CK_USE_AMD_BUFFER_ADDRESSING
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) = *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>( __buffer_load<SrcData, SrcDataPerAccess>(
p_src, src_nonlinear_coord.GetOffset(), src_linear_offset); p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
...@@ -360,12 +366,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -360,12 +366,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on src data: Read 0 if src data is in padding area.
// Will do padding check on dst data: No write if dst data is in padding area. // Will do padding check on dst data: No write if dst data is in padding area.
// This version is optimized for address calculation of dst tensor // This version is optimized for address calculation of dst tensor
// TODO: this function does not compile to the expected ISA
template <typename SrcData, template <typename SrcData,
typename DstData, typename DstData,
address_space_t SrcAddressSpace = address_space_t::generic, AddressSpace SrcAddressSpace,
address_space_t DstAddressSpace = address_space_t::generic> AddressSpace DstAddressSpace>
__device__ void Run_optimized_dst_address_calculation(const SrcData* p_src, __device__ void
DstData* p_dst) const Run_optimized_dst_address_calculation(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{ {
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType; using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType; using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
...@@ -476,14 +486,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -476,14 +486,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
const auto dst_coord = const auto dst_coord =
dst_nonlinear_coord + (linear_dim_data_steps + scalar_id); dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
#if 1 // tweaking #if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is dst compile-time offset // this is dst compile-time offset
const index_t dst_linear_offset = const index_t dst_linear_offset =
dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset(); dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else #else
// this is dst compile-time offset // this is dst compile-time offset
const index_t dst_linear_offset = const index_t dst_linear_offset =
dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id); dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
#endif #endif
// Check dst vector's padding situation, only check the first data in // Check dst vector's padding situation, only check the first data in
...@@ -491,8 +501,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -491,8 +501,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// the dst vector has the same padding situation // the dst vector has the same padding situation
if(dst_coord.IsUpperIndexMappedToValidOffset()) if(dst_coord.IsUpperIndexMappedToValidOffset())
{ {
static_if<DstAddressSpace == address_space_t::global>{}([&](auto) { static_if<DstAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE #if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<DstData, DstDataPerAccess>( __buffer_store<DstData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]), *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
p_dst, p_dst,
...@@ -514,6 +524,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -514,6 +524,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
}); });
} }
__device__ static constexpr bool HasWorkingOptimizedAddressCalculation()
{
#if CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION // tweaking
return true;
#else
return false;
#endif
}
template <typename T, bool PositiveDirection> template <typename T, bool PositiveDirection>
__device__ void MoveSrcSliceWindow(const T& step_sizes_, __device__ void MoveSrcSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>) integral_constant<bool, PositiveDirection>)
......
#ifndef CK_AMD_BUFFER_ADDRESSING_HPP
#define CK_AMD_BUFFER_ADDRESSING_HPP
#include "float_type.hpp"
namespace ck {
// For 128-bit SGPRs in buffer_load and buffer_store instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template <typename T>
union BufferLoadStoreDwordConfig
{
int32x4_t data;
T* address[2];
int32_t range[4];
};
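A minimal sanity-check sketch for the union above (assuming 64-bit pointers, as on the targeted GPUs): all three members alias the same 16 bytes, i.e. the four 32-bit SGPRs that make up the buffer resource descriptor consumed by the buffer instructions.

static_assert(sizeof(BufferLoadStoreDwordConfig<float>) == 16,
              "descriptor must occupy four 32-bit SGPRs");
static_assert(sizeof(int32x4_t) == 16 && sizeof(float*[2]) == 16 && sizeof(int32_t[4]) == 16,
              "all union members must alias the same 128 bits");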
__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load");
__device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");
__device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");
__device__ void __llvm_amdgcn_buffer_store(float vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store");
__device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");
__device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
__buffer_load(const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset);
template <typename T, index_t VectorSize>
__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
T* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset);
template <>
__device__ float __buffer_load<float, 1>(const float* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
float dst;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> src_block_config;
    // fill in dword 0 - 1
    src_block_config.address[0] = const_cast<float*>(p_src_block);
    // fill in dword 2
    src_block_config.range[2] = -1;
    // fill in dword 3
    src_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
dst = __llvm_amdgcn_buffer_load(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
asm volatile(
"\n \
buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset));
#endif
return dst;
}
template <>
__device__ float2_t __buffer_load<float, 2>(const float* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
float2_t dst;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> src_block_config;
    // fill in dword 0 - 1
    src_block_config.address[0] = const_cast<float*>(p_src_block);
    // fill in dword 2
    src_block_config.range[2] = -1;
    // fill in dword 3
    src_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
dst = __llvm_amdgcn_buffer_loadx2(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
asm volatile(
"\n \
buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset));
#endif
return dst;
}
template <>
__device__ float4_t __buffer_load<float, 4>(const float* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
float4_t dst;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> src_block_config;
    // fill in dword 0 - 1
    src_block_config.address[0] = const_cast<float*>(p_src_block);
    // fill in dword 2
    src_block_config.range[2] = -1;
    // fill in dword 3
    src_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
dst = __llvm_amdgcn_buffer_loadx4(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
asm volatile(
"\n \
buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset));
#endif
return dst;
}
template <>
__device__ void __buffer_store<float, 1>(const float& src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> dst_block_config;
    // fill in dword 0 - 1
    dst_block_config.address[0] = p_dst_block;
    // fill in dword 2
    dst_block_config.range[2] = -1;
    // fill in dword 3
    dst_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
__llvm_amdgcn_buffer_store(src,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#else
asm volatile("\n \
buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_config.data),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#endif
}
template <>
__device__ void __buffer_store<float, 2>(const float2_t& src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> dst_block_config;
    // fill in dword 0 - 1
    dst_block_config.address[0] = p_dst_block;
    // fill in dword 2
    dst_block_config.range[2] = -1;
    // fill in dword 3
    dst_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
__llvm_amdgcn_buffer_storex2(src,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#else
asm volatile("\n \
buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_config.data),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#endif
}
template <>
__device__ void __buffer_store<float, 4>(const float4_t& src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> dst_block_config;
    // fill in dword 0 - 1
    dst_block_config.address[0] = p_dst_block;
    // fill in dword 2
    dst_block_config.range[2] = -1;
    // fill in dword 3
    dst_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
__llvm_amdgcn_buffer_storex4(src,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#else
asm volatile("\n \
buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_config.data),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#endif
}
} // namespace ck
#endif
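A hypothetical device-side usage of the specializations above (pointer names assumed; the data offsets are counted in elements and converted to byte offsets inside the helpers):

// float4_t v = __buffer_load<float, 4>(p_src_block, src_thread_data_offset, 0);
// __buffer_store<float, 4>(v, p_dst_block, dst_thread_data_offset, 0);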