Commit 05fd7ff8 authored by Jakub Piasecki's avatar Jakub Piasecki

Merge remote-tracking branch 'origin/develop' into gemm_f16_int8

parents 2784b516 84832fc4
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -14,22 +14,28 @@ namespace wrapper { ...@@ -14,22 +14,28 @@ namespace wrapper {
* \tparam Shape Tuple of Number<> (for compile-time layout) or index_t * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t
* (dynamic layout). It is possible to pass nested shapes * (dynamic layout). It is possible to pass nested shapes
* (e.g. ((4, 2), 2)), nested dimensions are merged. * (e.g. ((4, 2), 2)), nested dimensions are merged.
* \tparam UnnestedDescriptorType Tensor descriptor for unnested shape dims. * \tparam UnrolledDescriptorType Tensor descriptor for unnested shape dims.
*/ */
template <typename Shape, typename UnnestedDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
struct Layout struct Layout
{ {
private: private:
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{}; static constexpr auto I1 = Number<1>{};
// Generate default idxs tuple (idx with all merged nested shapes) /**
* \brief Generate default indices tuple (idx with all merged nested shapes)
*
 * \param shape Shape used to determine the number of indices.
* \return Multi idx tuple with zeros.
*/
template <typename... Ts> template <typename... Ts>
__host__ __device__ constexpr static auto GenerateDefaultIdxsTuple(const Tuple<Ts...>&) __host__ __device__ constexpr static auto
GenerateDefaultIdxsTuple([[maybe_unused]] const Tuple<Ts...>& shape)
{ {
return generate_tuple( return generate_tuple(
[&](auto) { [&](auto) {
if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime()) if constexpr(!remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime())
{ {
// runtime layout // runtime layout
return index_t(0); return index_t(0);
...@@ -43,11 +49,18 @@ struct Layout ...@@ -43,11 +49,18 @@ struct Layout
Number<Tuple<Ts...>::Size()>{}); Number<Tuple<Ts...>::Size()>{});
} }
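// Example (illustrative): for a nested shape such as ((4, 2), 2) the generated
// default index tuple is (0, 0) - one zero per top-level dimension; the zeros
// are index_t for runtime layouts and presumably Number<0> in the elided
// compile-time branch.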
// Generate LowerDims in Compile-time for MergeTrasform using passed Type /**
// If element of Tuple<Ts...> is also tuple, then merge (generate sequence for merge) * \brief Generate lower dims in compile-time for the Merge transform using
// If tuple is element, then pass through (sequence with one element) * provided type. If element of nested Tuple<Ts...> is also a tuple, then
* merge (generate sequence for merge). If tuple is element, then pass
* through (sequence with one element).
*
* \param shape Shape to align.
 * \return LowerDims for MergeTransform.
*/
template <typename Idx, typename... Ts> template <typename Idx, typename... Ts>
__host__ __device__ constexpr static auto GenerateLowerDim(const Tuple<Ts...>&) __host__ __device__ constexpr static auto
GenerateLowerDim([[maybe_unused]] const Tuple<Ts...>& shape)
{ {
if constexpr(Idx::value == 0) if constexpr(Idx::value == 0)
{ {
...@@ -87,11 +100,17 @@ struct Layout ...@@ -87,11 +100,17 @@ struct Layout
} }
} }
// Iterate over nested tuples in shape /**
// Unroll nested tuples to align Tuple<ShapeDims...> to Tuple<IdxDims...> * \brief Iterate over the nested tuples in the shape.
// Example idx: (1, 1), 1, 1 * Unroll nested tuples to align Tuple<ShapeDims...> to Tuple<IdxDims...>
// Example shape: (2, (2, 2)), 2, (2, 2) * Example idx: (1, 1), 1, 1
// Unrolled shape: 2, (2, 2), 2, (2, 2) * Example shape: (2, (2, 2)), 2, (2, 2)
* Unrolled shape: 2, (2, 2), 2, (2, 2)
*
* \param shape Layout shape.
* \param idx Idx to align.
 * \return Aligned shape.
*/
template <typename... ShapeDims, typename... IdxDims> template <typename... ShapeDims, typename... IdxDims>
__host__ __device__ constexpr static auto AlignShapeToIdx(const Tuple<ShapeDims...>& shape, __host__ __device__ constexpr static auto AlignShapeToIdx(const Tuple<ShapeDims...>& shape,
const Tuple<IdxDims...>& idx) const Tuple<IdxDims...>& idx)
...@@ -126,6 +145,13 @@ struct Layout ...@@ -126,6 +145,13 @@ struct Layout
} }
} }
/**
* \brief Merge descriptor to 1D.
*
* \param shape Layout shape.
* \param desc Descriptor to merge.
* \return 1D descriptor.
*/
template <typename... ShapeDims, typename DescriptorToMerge> template <typename... ShapeDims, typename DescriptorToMerge>
__host__ __device__ constexpr static auto MakeMerge1d(const Tuple<ShapeDims...>& shape, __host__ __device__ constexpr static auto MakeMerge1d(const Tuple<ShapeDims...>& shape,
const DescriptorToMerge& desc) const DescriptorToMerge& desc)
...@@ -137,18 +163,41 @@ struct Layout ...@@ -137,18 +163,41 @@ struct Layout
const auto lower_dims = make_tuple(MergeElemsSequence::Reverse()); const auto lower_dims = make_tuple(MergeElemsSequence::Reverse());
const auto upper_dims = make_tuple(Sequence<0>{}); const auto upper_dims = make_tuple(Sequence<0>{});
// Merge to 1d // Merge to 1d
return transform_tensor_descriptor( if constexpr(!remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime())
desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims); {
return transform_tensor_descriptor(
desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims);
}
else
{
// If the descriptor is known at the compilation time,
// use `make_merge_transform_v1_carry_check` because it doesn't use
// memcpy.
return transform_tensor_descriptor(
desc,
make_tuple(make_merge_transform_v1_carry_check(merge_elems)),
lower_dims,
upper_dims);
}
} }
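// Example (illustrative): for shape ((2, 2), 2) the unrolled dims 2, 2, 2 are
// reversed (column-major traversal) and merged into a single dimension, so:
//   using Desc1d = decltype(MakeMerge1d(Shape{}, UnrolledDescriptorType{}));
//   // Desc1d has a single length equal to the whole shape size, here 8.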
// Merge nested shape dims when corresponding index is also nested. /**
// Input desc shape: 2, 2, 2, 2, 2, 2 * \brief Merge nested shape dims when corresponding index is also merged.
// Example idx: 1, 1, 1, 1 * Input desc shape: 2, 2, 2, 2, 2, 2
// Example shape: 2, (2, 2), 2, (2, 2) * Example idx: 1, 1, 1, (1, 1)
// Merged shape: 2, 4, 2, 4 * Example shape: 2, (2, 2), 2, (2, 2)
* Merged shape: 2, 4, 2, 2, 2
*
* \param shape Layout shape.
* \param idxs Indexes to align descriptor.
* \param desc Descriptor to merge.
 * \return Descriptor aligned to idx.
*/
template <typename... ShapeDims, typename... IdxDims, typename DescriptorToMerge> template <typename... ShapeDims, typename... IdxDims, typename DescriptorToMerge>
__host__ __device__ constexpr static auto CreateMergedDescriptor( __host__ __device__ constexpr static auto
const Tuple<ShapeDims...>& shape, const Tuple<IdxDims...>&, DescriptorToMerge& desc) CreateMergedDescriptor(const Tuple<ShapeDims...>& shape,
[[maybe_unused]] const Tuple<IdxDims...>& idxs,
DescriptorToMerge& desc)
{ {
const auto transforms = generate_tuple( const auto transforms = generate_tuple(
[&](auto i) { [&](auto i) {
...@@ -160,7 +209,17 @@ struct Layout ...@@ -160,7 +209,17 @@ struct Layout
// If shape element is tuple and idx element is Number, then merge // If shape element is tuple and idx element is Number, then merge
// Unroll and reverse tuple to traverse column-major // Unroll and reverse tuple to traverse column-major
const auto merge_elems = TupleReverse(UnrollNestedTuple(shape.At(i))); const auto merge_elems = TupleReverse(UnrollNestedTuple(shape.At(i)));
return make_merge_transform(merge_elems); if constexpr(!remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime())
{
return make_merge_transform(merge_elems);
}
else
{
// If the descriptor is known at the compilation time,
// use `make_merge_transform_v1_carry_check` because
// it doesn't use memcpy.
return make_merge_transform_v1_carry_check(merge_elems);
}
} }
else else
{ {
...@@ -185,14 +244,23 @@ struct Layout ...@@ -185,14 +244,23 @@ struct Layout
} }
using Descriptor1dType = using Descriptor1dType =
remove_cvref_t<decltype(MakeMerge1d(Shape{}, UnnestedDescriptorType{}))>; remove_cvref_t<decltype(MakeMerge1d(Shape{}, UnrolledDescriptorType{}))>;
using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>; using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>;
public:
/**
* \brief Transform descriptor to align to passed indexes.
*
* \param shape Layout shape.
* \param idxs Indexes to align descriptor.
* \param naive_descriptor Descriptor to merge.
 * \return Descriptor aligned to idx.
*/
template <typename... ShapeDims, typename... IdxDims> template <typename... ShapeDims, typename... IdxDims>
__host__ __device__ constexpr static auto __host__ __device__ constexpr static auto
TransformDesc(const Tuple<ShapeDims...>& shape, TransformDesc(const Tuple<ShapeDims...>& shape,
const Tuple<IdxDims...>& idx, const Tuple<IdxDims...>& idxs,
const UnnestedDescriptorType& naive_descriptor) const UnrolledDescriptorType& naive_descriptor)
{ {
if constexpr(Tuple<IdxDims...>::Size() == I1) if constexpr(Tuple<IdxDims...>::Size() == I1)
{ {
...@@ -208,19 +276,18 @@ struct Layout ...@@ -208,19 +276,18 @@ struct Layout
static_assert(Tuple<ShapeDims...>::Size() == Tuple<IdxDims...>::Size(), static_assert(Tuple<ShapeDims...>::Size() == Tuple<IdxDims...>::Size(),
"Idx rank and Shape rank must be the same (except 1d)."); "Idx rank and Shape rank must be the same (except 1d).");
// Unroll while IdxDims is nested // Unroll while IdxDims is nested
const auto aligned_shape = AlignShapeToIdx(shape, idx); const auto aligned_shape = AlignShapeToIdx(shape, idxs);
// Transform correct form of shape // Transform correct form of shape
return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), naive_descriptor); return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idxs), naive_descriptor);
} }
} }
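// Example (illustrative): for shape (2, (2, 2)) and idxs (1, 1) the nested
// group is merged because its index is a plain scalar, giving descriptor
// lengths (2, 4); for idxs (1, (1, 1)) the group stays unrolled and the
// lengths are (2, 2, 2).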
using MergedNestsDescriptorType = remove_cvref_t<decltype(TransformDesc( using MergedNestsDescriptorType = remove_cvref_t<decltype(TransformDesc(
Shape{}, DefaultIdxsTupleType{}, UnnestedDescriptorType{}))>; Shape{}, DefaultIdxsTupleType{}, UnrolledDescriptorType{}))>;
public:
__host__ __device__ constexpr auto GetElementSpaceSize() const __host__ __device__ constexpr auto GetElementSpaceSize() const
{ {
return unnested_descriptor_.GetElementSpaceSize(); return unrolled_descriptor_.GetElementSpaceSize();
} }
__host__ __device__ Layout() = delete; __host__ __device__ Layout() = delete;
...@@ -232,16 +299,15 @@ struct Layout ...@@ -232,16 +299,15 @@ struct Layout
* \param unnested_descriptor Descriptor * \param unnested_descriptor Descriptor
*/ */
__host__ __device__ constexpr Layout(const Shape& shape, __host__ __device__ constexpr Layout(const Shape& shape,
const UnnestedDescriptorType& unnested_descriptor) const UnrolledDescriptorType& unnested_descriptor)
: shape_(shape) : unrolled_descriptor_(unnested_descriptor), shape_(shape)
{ {
// Construct if runtime mode // Construct if runtime mode
if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime()) if constexpr(!remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime())
{ {
unnested_descriptor_ = unnested_descriptor; descriptor_1d_ = MakeMerge1d(shape_, unrolled_descriptor_);
descriptor_1d_ = MakeMerge1d(shape_, unnested_descriptor_);
merged_nests_descriptor_ = merged_nests_descriptor_ =
TransformDesc(shape_, DefaultIdxsTupleType{}, unnested_descriptor_); TransformDesc(shape_, DefaultIdxsTupleType{}, unrolled_descriptor_);
} }
} }
...@@ -254,9 +320,9 @@ struct Layout ...@@ -254,9 +320,9 @@ struct Layout
template <typename Idxs> template <typename Idxs>
__host__ __device__ constexpr index_t operator()() const __host__ __device__ constexpr index_t operator()() const
{ {
static_assert(UnnestedDescriptorType::IsKnownAtCompileTime(), static_assert(remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime(),
"Compiletime operator used on runtime layout."); "Compiletime operator used on runtime layout.");
using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, UnnestedDescriptorType{})); using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, UnrolledDescriptorType{}));
using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{})); using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{}));
return TransformedDesc{}.CalculateOffset(UnrolledIdx{}); return TransformedDesc{}.CalculateOffset(UnrolledIdx{});
} }
...@@ -283,7 +349,7 @@ struct Layout ...@@ -283,7 +349,7 @@ struct Layout
else else
{ {
// Custom index, need to transform descriptor // Custom index, need to transform descriptor
const auto transformed_desc = TransformDesc(shape_, Idx, unnested_descriptor_); const auto transformed_desc = TransformDesc(shape_, Idx, unrolled_descriptor_);
return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx)); return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx));
} }
} }
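// Usage sketch (illustrative, assuming packed column-major strides generated
// by make_layout):
//   const auto layout = make_layout(make_tuple(4, 2));
//   const index_t offset = layout(make_tuple(3, 1)); // 3 + 1 * 4 = 7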
...@@ -350,29 +416,55 @@ struct Layout ...@@ -350,29 +416,55 @@ struct Layout
} }
/** /**
* \brief Get default descriptor (with the same size as Shape) * \brief Get descriptor with all nested dimensions merged.
* Example, shape: ((2, 2), 2)
* Descriptor lengths: (4, 2)
* *
* \return Default descriptor. * \note The size of merged descriptor is the same as Layout's shape.
*
* \return Merged nests descriptor.
*/ */
__host__ __device__ constexpr const MergedNestsDescriptorType& GetDefaultDescriptor() const __host__ __device__ constexpr const MergedNestsDescriptorType&
GetMergedNestingDescriptor() const
{ {
return merged_nests_descriptor_; return merged_nests_descriptor_;
} }
/**
 * \brief Get descriptor with all dimensions merged (1D).
* Example, shape: ((2, 2), 2)
* Descriptor lengths: (8)
*
* \return 1D descriptor.
*/
__host__ __device__ constexpr const Descriptor1dType& Get1DDescriptor() const
{
return descriptor_1d_;
}
/** /**
* \brief Get unnested descriptor (with unrolled dims) * \brief Get unnested descriptor (with unrolled dims)
* Example, shape: ((2, 2), 2)
* Descriptor lengths: (2, 2, 2)
* *
* \return Flatten descriptor. * \return Flattened descriptor.
*/ */
__host__ __device__ constexpr const UnnestedDescriptorType& GetUnnestedDescriptor() const __host__ __device__ constexpr const UnrolledDescriptorType& GetUnrolledDescriptor() const
{ {
return unnested_descriptor_; return unrolled_descriptor_;
} }
private: private:
UnnestedDescriptorType unnested_descriptor_; // All dimensions are unrolled
UnrolledDescriptorType unrolled_descriptor_;
// 1D descriptor
Descriptor1dType descriptor_1d_; Descriptor1dType descriptor_1d_;
// All nesting are merged
MergedNestsDescriptorType merged_nests_descriptor_; MergedNestsDescriptorType merged_nests_descriptor_;
// Example, shape: ((2, 2), 2)
// UnrolledDescriptorType lengths: (2, 2, 2)
// Descriptor1dType lengths: (8)
// MergedNestsDescriptorType lengths: (4, 2)
const Shape shape_; const Shape shape_;
}; };
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include "../utils/tensor_utils.hpp" #include "../utils/tensor_utils.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
/** /**
 * \brief Perform generic copy between two tensors. Tensors must have the * \brief Perform generic copy between two tensor partitions (threadwise copy).
* same size. * Tensors must have the same size.
* *
* \param src_tensor Source tensor. * \param src_tensor Source tensor.
* \param dst_tensor Destination tensor. * \param dst_tensor Destination tensor.
...@@ -37,5 +42,134 @@ __host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& ds ...@@ -37,5 +42,134 @@ __host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& ds
} }
} }
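// Usage sketch (illustrative; assumes make_layout/make_tensor from this wrapper
// and global-memory pointers of matching element space size):
//   const auto layout = make_layout(make_tuple(size_m, size_n));
//   auto src = make_tensor<MemoryTypeEnum::Global>(p_src, layout);
//   auto dst = make_tensor<MemoryTypeEnum::Global>(p_dst, layout);
//   copy(src, dst); // generic element-wise copy, sizes must match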
/**
 * \brief Perform optimized copy between two tensor partitions (threadwise copy).
* Tensors must have the same size.
*
* \tparam DimAccessOrderTuple Tuple with dimension access order.
* \tparam VectorDim Dimension for vectorized read and write.
* \tparam ScalarPerVector Number of scalar per vectorized read and write.
* \param src_tensor Source tensor.
* \param dst_tensor Destination tensor.
*/
template <typename DimAccessOrderTuple,
index_t VectorDim,
index_t ScalarPerVector,
typename SrcTensorType,
typename DstTensorType>
__device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
{
static_assert(is_detected<is_tuple, DimAccessOrderTuple>::value);
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
const auto& in_grid_desc = layout(src_tensor).GetUnrolledDescriptor();
const auto& out_grid_desc = layout(dst_tensor).GetUnrolledDescriptor();
using SrcShapeType = remove_cvref_t<decltype(shape(src_tensor))>;
constexpr index_t num_dims = SrcShapeType::Size();
constexpr auto thread_slice_lengths =
generate_sequence_v2([](auto I) { return size(SrcShapeType{}.At(I)); }, Number<num_dims>{});
constexpr auto dim_access_order = generate_sequence_v2(
[](auto I) { return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});
if constexpr(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
{
// Perform a copy between DynamicBuffers
auto transfer = ThreadwiseTensorSliceTransfer_v7<
Tuple<typename SrcTensorType::TensorElementType>,
Tuple<typename DstTensorType::TensorElementType>,
decltype(tie(in_grid_desc)),
decltype(tie(out_grid_desc)),
tensor_operation::element_wise::PassThrough,
Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
decltype(thread_slice_lengths),
decltype(dim_access_order),
VectorDim,
ScalarPerVector,
Sequence<false>,
Sequence<false>>{in_grid_desc,
make_tuple(src_tensor.GetMultiIdxOffsets()),
out_grid_desc,
make_tuple(dst_tensor.GetMultiIdxOffsets()),
tensor_operation::element_wise::PassThrough{}};
transfer.Run(tie(in_grid_desc),
tie(src_tensor.GetBuffer()),
tie(out_grid_desc),
tie(dst_tensor.GetBuffer()));
}
else if constexpr(!SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
{
// Perform copy from StaticBuffer to DynamicBuffer
const auto src_slice_origin_idxs =
generate_tuple([&](auto) { return I0; }, Number<num_dims>{});
auto transfer =
ThreadwiseTensorSliceTransfer_v1r3<typename SrcTensorType::TensorElementType,
typename DstTensorType::TensorElementType,
remove_cvref_t<decltype(in_grid_desc)>,
remove_cvref_t<decltype(out_grid_desc)>,
tensor_operation::element_wise::PassThrough,
decltype(thread_slice_lengths),
decltype(dim_access_order),
VectorDim,
ScalarPerVector,
InMemoryDataOperationEnum::Set,
I1,
true>{out_grid_desc,
dst_tensor.GetMultiIdxOffsets(),
tensor_operation::element_wise::PassThrough{}};
transfer.Run(in_grid_desc,
src_slice_origin_idxs,
src_tensor.GetBuffer(),
out_grid_desc,
dst_tensor.GetBuffer());
}
else if constexpr(SrcTensorType::IsDynamicBuffer && !DstTensorType::IsDynamicBuffer)
{
// Perform copy from DynamicBuffer to StaticBuffer
const auto src_dst_slice_origin =
generate_tuple([&](auto) { return I0; }, Number<num_dims>{});
constexpr auto src_vector_tensor_lengths = generate_sequence_v2(
[&](auto I) {
if constexpr(I == VectorDim)
{
return Number<ScalarPerVector>{};
}
else
{
return I1;
}
},
Number<num_dims>{});
auto transfer =
ThreadwiseTensorSliceTransfer_v4r1<typename SrcTensorType::TensorElementType,
typename DstTensorType::TensorElementType,
remove_cvref_t<decltype(in_grid_desc)>,
remove_cvref_t<decltype(out_grid_desc)>,
decltype(thread_slice_lengths),
decltype(dim_access_order),
decltype(src_vector_tensor_lengths),
decltype(dim_access_order)>{
src_tensor.GetMultiIdxOffsets()};
transfer.Run(in_grid_desc,
src_dst_slice_origin,
src_tensor.GetBuffer(),
out_grid_desc,
src_dst_slice_origin,
dst_tensor.GetBuffer());
}
else
{
// Perform copy between StaticBuffers
copy(src_tensor, dst_tensor);
}
}
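// Usage sketch (illustrative template arguments): vectorized per-thread copy of
// two equally sized partitions with access order (0, 1), vectorized dimension 1
// and 4 scalars per vector access:
//   copy<Tuple<Number<0>, Number<1>>, /*VectorDim=*/1, /*ScalarPerVector=*/4>(
//       src_partition, dst_partition);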
} // namespace wrapper } // namespace wrapper
} // namespace ck } // namespace ck
...@@ -10,189 +10,205 @@ ...@@ -10,189 +10,205 @@
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
namespace detail {
namespace {
/** /**
* \brief Tensor wrapper that performs static and dynamic buffer logic. * \brief Check if Tuple contains Slice object
* *
* \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR). * \return True if tuple contains Slice object.
* \tparam ElementType Element data type.
* \tparam Shape Tensor shape (layout component).
* \tparam UnnestedDescriptorType Unnested descriptor (layout component).
* \tparam NumVectors Number of vectors (only for VGPR, SGPR).
* \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR).
*/ */
template <MemoryTypeEnum BufferAddressSpace, template <typename T>
typename ElementType, __host__ __device__ constexpr bool HasSlice(T&&)
typename Shape,
typename UnnestedDescriptorType,
index_t NumVectors, // param for Register memory
index_t ScalarPerVector // param for Register memory
>
struct Tensor
{ {
private: return is_detected<is_slice, T>::value;
// Check if Tuple contains Slice object }
template <typename T> template <typename... Ts>
__host__ __device__ constexpr static bool IsSlicing(T&&) __host__ __device__ constexpr bool HasSlice(Tuple<Ts...>&&)
{ {
return is_detected<is_slice, T>::value; return (HasSlice(Ts{}) || ...);
} }
template <typename... Ts>
__host__ __device__ constexpr static bool IsSlicing(Tuple<Ts...>&&)
{
return (IsSlicing(Ts{}) || ...);
}
// Calculate new tensor shape after slice /**
template <typename... Ts, typename ShapeTmpType> * \brief Calculate new shape after slice from parent shape.
__host__ __device__ constexpr auto GetShapeFromSlicedTensor(const Tuple<Ts...>& idx, *
const ShapeTmpType& shape) const * \param idxs Tuple of indexes defining slice ranges.
{ * \param shape Shape which will be sliced.
// Pack each value in tuple to remove empty tuples after generation * \return New tensor shape.
auto new_shape = generate_tuple( */
[&](auto i) { template <typename... Ts, typename SlicedShape>
constexpr auto num_i = Number<i>{}; __host__ __device__ constexpr auto GetSlicedShape(const Tuple<Ts...>& idxs,
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value) const SlicedShape& shape)
{ {
if constexpr(!IsSlicing(tuple_element_t<i.value, Tuple<Ts...>>{})) // Pack each value in tuple to remove empty tuples after generation
{ auto new_shape = generate_tuple(
// if tuple does not have any slice then we can remove dimension [&](auto i) {
return Tuple<>{}; constexpr auto num_i = Number<i>{};
} if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
else {
{ if constexpr(!detail::HasSlice(tuple_element_t<i.value, Tuple<Ts...>>{}))
// if tuple then recurrence
return make_tuple(GetShapeFromSlicedTensor(idx.At(num_i), shape.At(num_i)));
}
}
else if constexpr(is_detected<is_slice,
tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
// calculate new dimension
const auto& dim = size(shape.At(num_i));
const auto val = idx.At(num_i).range(dim);
return make_tuple(val);
}
else
{ {
// remove dimension for just value // if tuple does not have any slice then we can remove dimension
return Tuple<>{}; return Tuple<>{};
} }
},
Number<Tuple<Ts...>::Size()>{});
// Remove empty tuples (deleted elements) and return
return UnrollNestedTuple<0, 1>(new_shape);
}
// Generate Freeze for each of nested shape
template <typename T, typename ShapeTmpType>
__host__ __device__ constexpr auto GenerateMultipleFreeze(T idx,
const ShapeTmpType& shape) const
{
const auto unrolled_shape = UnrollNestedTuple(shape);
return generate_tuple(
[&](auto i) {
// dimension offset from idx
const auto dim = unrolled_shape.At(Number<i>{});
const auto dim_idx = idx % dim;
idx /= dim;
return make_freeze_transform(dim_idx);
},
Number<decltype(unrolled_shape)::Size()>{});
}
template <typename... Ts, typename ShapeTmpType>
__host__ __device__ constexpr auto
GetTransformsFromSlicedTensor(const Tuple<Ts...>& idx, const ShapeTmpType& shape) const
{
// Pack each value in tuple to remove empty tuples after generation
auto transforms = generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
return GetTransformsFromSlicedTensor(idx.At(num_i), shape.At(num_i));
}
else if constexpr(is_detected<is_slice,
tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
const auto from = idx.At(num_i).from_;
const auto dim = shape.At(num_i);
const auto range = idx.At(num_i).range(dim);
return make_slice_transform(range, from, from + range);
}
else else
{ {
// remove dimension for just value // if tuple then recurrence
return GenerateMultipleFreeze(idx.At(num_i), shape.At(num_i)); return make_tuple(GetSlicedShape(idxs.At(num_i), shape.At(num_i)));
} }
}, }
Number<Tuple<Ts...>::Size()>{}); else if constexpr(is_detected<is_slice, tuple_element_t<i.value, Tuple<Ts...>>>::value)
// Remove empty tuples (deleted elements) and return {
return UnrollNestedTuple(transforms); // calculate new dimension
} const auto& dim = size(shape.At(num_i));
const auto val = idxs.At(num_i).range(dim);
return make_tuple(val);
}
else
{
// remove dimension for just value
return Tuple<>{};
}
},
Number<Tuple<Ts...>::Size()>{});
// Remove empty tuples (deleted elements) and return
return UnrollNestedTuple<0, 1>(new_shape);
}
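// Example (illustrative): for shape (4, (8, 2)) and idxs (slice(0, 2), 3) the
// sliced dimension keeps its range 2 and the nested group indexed by a plain
// scalar is dropped, so the new shape is (2).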
/**
* \brief Generate Freeze for each of nested shape.
*
* \param idx Tuple of start indices for slice.
 * \param shape Shape which will be frozen.
* \return Generated freeze transforms.
*/
template <typename T, typename Shape>
__host__ __device__ constexpr auto GenerateMultipleFreeze(T idx, const Shape& shape)
{
const auto unrolled_shape = UnrollNestedTuple(shape);
return generate_tuple(
[&](auto i) {
// dimension offset from idx
const auto dim = unrolled_shape.At(Number<i>{});
const auto dim_idx = idx % dim;
idx /= dim;
return make_freeze_transform(dim_idx);
},
Number<decltype(unrolled_shape)::Size()>{});
}
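// Example (illustrative): for a scalar idx 5 and nested shape (2, (2, 2)) the
// unrolled dims are 2, 2, 2 and the generated transforms are Freeze(1),
// Freeze(0), Freeze(1) (5 = 1 + 0 * 2 + 1 * 4, column-major decomposition).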
/**
* \brief Generate transforms for slice tensor.
*
* \param idx Tuple of start indices for slice.
* \param shape Shape which will be sliced.
* \return Generated transforms.
*/
template <typename... Ts, typename Shape>
__host__ __device__ constexpr auto GenerateSliceTransforms(const Tuple<Ts...>& idx,
const Shape& shape)
{
// Pack each value in tuple to remove empty tuples after generation
auto transforms = generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
return GenerateSliceTransforms(idx.At(num_i), shape.At(num_i));
}
else if constexpr(is_detected<is_slice, tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
const auto from = idx.At(num_i).from_;
const auto dim = size<num_i>(shape);
const auto range = idx.At(num_i).range(dim);
return make_slice_transform(range, from, from + range);
}
else
{
// remove dimension for just value
return GenerateMultipleFreeze(idx.At(num_i), shape.At(num_i));
}
},
Number<Tuple<Ts...>::Size()>{});
// Remove empty tuples (deleted elements) and return
return UnrollNestedTuple(transforms);
}
template <index_t i, typename LowerIndex>
__host__ __device__ constexpr auto GetSequenceVal(const ck::Freeze<LowerIndex>&)
{
// There is no output for Freeze transform // There is no output for Freeze transform
template <index_t i, typename LowerIndex> return Sequence<>{};
__host__ __device__ constexpr auto GetSequenceVal(const ck::Freeze<LowerIndex>&) const }
{
return Sequence<>{};
}
template <index_t i, typename LowLength, typename SliceBegin, typename SliceEnd> template <index_t i, typename LowLength, typename SliceBegin, typename SliceEnd>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto GetSequenceVal(const ck::Slice<LowLength, SliceBegin, SliceEnd>&)
GetSequenceVal(const ck::Slice<LowLength, SliceBegin, SliceEnd>&) const {
{ return Sequence<i>{};
return Sequence<i>{}; }
}
template <index_t i> template <index_t i>
__host__ __device__ constexpr auto GenerateUpperDims(const Tuple<>&) const __host__ __device__ constexpr auto GenerateUpperDims(const Tuple<>&)
{
return Tuple<>{};
}
template <index_t i, typename... Transforms>
__host__ __device__ constexpr auto GenerateUpperDims(const Tuple<Transforms...>& transforms)
{
constexpr auto num_transforms = Tuple<Transforms...>::Size();
// Deduce Sequence element for specific transform
const auto current_elem = GetSequenceVal<i>(transforms.At(Number<0>{}));
if constexpr(is_same_v<decltype(current_elem), const Sequence<>>)
{ {
return Tuple<>{}; const auto next_tuple = GenerateUpperDims<i>(TupleSlice<1, num_transforms>(transforms));
return concat_tuple(make_tuple(current_elem), next_tuple);
} }
else
template <index_t i, typename... Transforms>
__host__ __device__ constexpr auto
GenerateUpperDims(const Tuple<Transforms...>& transforms) const
{ {
constexpr auto num_transforms = Tuple<Transforms...>::Size(); // Increase i if current_elem is Slice transform
// Deduce Sequence element for specific transform const auto next_tuple = GenerateUpperDims<i + 1>(TupleSlice<1, num_transforms>(transforms));
const auto currect_elem = GetSequenceVal<i>(transforms.At(Number<0>{})); return concat_tuple(make_tuple(current_elem), next_tuple);
if constexpr(is_same_v<decltype(currect_elem), const Sequence<>>)
{
const auto next_tuple = GenerateUpperDims<i>(TupleSlice<1, num_transforms>(transforms));
return concat_tuple(make_tuple(currect_elem), next_tuple);
}
else
{
// Increase i if current_elem is Slice transform
const auto next_tuple =
GenerateUpperDims<i + 1>(TupleSlice<1, num_transforms>(transforms));
return concat_tuple(make_tuple(currect_elem), next_tuple);
}
} }
}
template <typename... Ts, typename ShapeTmpType, typename FlattenDescriptor> template <typename... Ts, typename Shape, typename FlattenDescriptor>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto GenerateSlicedDescriptor(const Tuple<Ts...>& idx,
GetDescriptorFromSlicedTensor(const Tuple<Ts...>& idx, const Shape& shape,
const ShapeTmpType& shape, const FlattenDescriptor& flatten_desc)
const FlattenDescriptor& flatten_desc) const {
{ constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size();
constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size();
const auto transforms = GetTransformsFromSlicedTensor(idx, shape); const auto transforms = GenerateSliceTransforms(idx, shape);
using TransformsTupleType = decltype(transforms); using TransformsTupleType = decltype(transforms);
const auto lower_dims = const auto lower_dims =
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<old_shape_dims>{}); generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<old_shape_dims>{});
const auto upper_dims = decltype(GenerateUpperDims<0>(TransformsTupleType{})){}; const auto upper_dims = decltype(GenerateUpperDims<0>(TransformsTupleType{})){};
return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
} }
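// Example (illustrative): for shape (4, 8) and idxs (slice(0, 2), 3) the
// transforms are a Slice (range 2, from 0) on dim 0 and a Freeze(3) on dim 1;
// the Freeze produces no upper dimension, so the sliced descriptor is 1D with
// length 2.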
} // namespace
} // namespace detail
/**
* \brief Tensor wrapper that performs static and dynamic buffer logic.
* The tensor is based on a descriptor stored in the Layout. Additionally,
* tensor can be sliced or shifted using multi-index offset.
*
* \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR).
* \tparam ElementType Element data type.
* \tparam Shape Tensor shape (layout component).
* \tparam UnrolledDescriptorType Flatten descriptor (layout component).
*/
template <MemoryTypeEnum BufferAddressSpace,
typename ElementType,
typename Shape,
typename UnrolledDescriptorType>
struct Tensor
{
public: public:
using ElementSpaceSize = decltype(Layout<Shape, UnnestedDescriptorType>{ using ElementSpaceSize = decltype(Layout<Shape, UnrolledDescriptorType>{
Shape{}, UnnestedDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer Shape{}, UnrolledDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer
using TensorElementType = ElementType; // DataType using TensorElementType = ElementType; // DataType
static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace; static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace;
...@@ -200,134 +216,207 @@ struct Tensor ...@@ -200,134 +216,207 @@ struct Tensor
BufferAddressSpace == MemoryTypeEnum ::Vgpr); BufferAddressSpace == MemoryTypeEnum ::Vgpr);
__host__ __device__ Tensor() = delete; __host__ __device__ Tensor() = delete;
__host__ __device__ Tensor(ElementType* pointer, __host__ __device__ constexpr Tensor(ElementType* pointer,
const Layout<Shape, UnnestedDescriptorType>& layout) const Layout<Shape, UnrolledDescriptorType>& layout)
: layout_(layout), : layout_(layout),
buffer_(make_dynamic_buffer<BufferAddressSpace>(pointer, layout.GetElementSpaceSize())) buffer_(make_dynamic_buffer<BufferAddressSpace>(pointer, layout.GetElementSpaceSize())),
multi_idx_offset_(make_zero_multi_index<Shape::Size()>()),
base_offset_(0)
{ {
static_assert(IsDynamicBuffer, "Wrong BufferAddressSpace for register.");
} }
__host__ __device__ Tensor(const Layout<Shape, UnnestedDescriptorType>& layout) __host__ __device__ constexpr Tensor(const Layout<Shape, UnrolledDescriptorType>& layout)
: layout_(layout) : layout_(layout),
multi_idx_offset_(make_zero_multi_index<Shape::Size()>()),
base_offset_(0)
{ {
static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register."); static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register.");
} }
__host__ __device__ constexpr const Layout<Shape, UnnestedDescriptorType>& GetLayout() const __host__ __device__ constexpr const Layout<Shape, UnrolledDescriptorType>& GetLayout() const
{ {
return layout_; return layout_;
} }
// Getter for new sliced tensor /**
template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false> * \brief Get the new sliced tensor.
__host__ __device__ auto operator[](const Tuple<Ts...>& idx) const *
* \param idx Tuple of indices: slice(from,to) or scalar.
* \return Sliced tensor.
*/
template <typename... Ts, enable_if_t<detail::HasSlice(Tuple<Ts...>{}), bool> = false>
__host__ __device__ auto operator[](const Tuple<Ts...>& idx)
{ {
static_assert(IsDynamicBuffer, "Register slice is not supported"); static_assert(IsDynamicBuffer, "Register slice is not supported");
const auto& shape = layout_.GetShape(); const auto& shape = layout_.GetShape();
auto new_shape = GetShapeFromSlicedTensor(idx, shape); auto new_shape = detail::GetSlicedShape(idx, shape);
const auto& flatten_desc = layout_.GetUnnestedDescriptor(); const auto& flatten_desc = layout_.GetUnrolledDescriptor();
auto new_desc = GetDescriptorFromSlicedTensor(idx, shape, flatten_desc); auto new_desc = detail::GenerateSlicedDescriptor(idx, shape, flatten_desc);
const auto new_layout = const auto new_layout =
Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc); Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc);
// Update embed offset
base_offset_ -= new_layout(make_tuple(Number<0>{}));
return make_tensor<BufferAddressSpace>(buffer_.p_data_, new_layout); return make_tensor<BufferAddressSpace>(buffer_.p_data_, new_layout);
} }
template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false> template <typename... Ts, enable_if_t<detail::HasSlice(Tuple<Ts...>{}), bool> = false>
__host__ __device__ auto operator()(const Tuple<Ts...>& idx) const __host__ __device__ auto operator()(const Tuple<Ts...>& idx)
{ {
return this->operator[](idx); return this->operator[](idx);
} }
template <typename... Idxs, enable_if_t<IsSlicing(Tuple<Idxs...>{}), bool> = false> template <typename... Idxs, enable_if_t<detail::HasSlice(Tuple<Idxs...>{}), bool> = false>
__host__ __device__ auto operator()(Idxs... idxs) const __host__ __device__ auto operator()(Idxs... idxs)
{ {
return this->operator[](make_tuple(idxs...)); return this->operator[](make_tuple(idxs...));
} }
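// Usage sketch (illustrative): slicing an 8x8 tensor; scalar indices freeze a
// dimension, slice(from, to) keeps a range of it:
//   auto sub = tensor(slice(0, 2), slice(4, 8)); // 2x4 view
//   auto row = tensor(3, slice(0, 8));           // 1D view, dim 0 frozen at 3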
// Getter for the const value /**
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false> * \brief Getter of the tensor's const value reference.
*
* \param idx Tuple of indices.
* \return Requested value.
*/
template <typename... Ts, enable_if_t<!detail::HasSlice(Tuple<Ts...>{}), bool> = false>
__host__ __device__ const ElementType& operator[](const Tuple<Ts...>& idx) const __host__ __device__ const ElementType& operator[](const Tuple<Ts...>& idx) const
{ {
if constexpr(IsDynamicBuffer) if constexpr(IsDynamicBuffer)
{ {
const index_t offset = layout_(idx); const index_t offset = layout_(idx) + base_offset_;
return buffer_[offset]; return buffer_[offset];
} }
else else
{ {
constexpr index_t offset = Layout<Shape, UnnestedDescriptorType>{ constexpr index_t index_offset = Layout<Shape, UnrolledDescriptorType>{
Shape{}, Shape{},
UnnestedDescriptorType{}}.template operator()<Tuple<Ts...>>(); UnrolledDescriptorType{}}.template operator()<Tuple<Ts...>>();
return buffer_[Number<offset>{}]; // Calculate and apply base offset in compile-time
constexpr index_t base_offset = Layout<Shape, UnrolledDescriptorType>{
Shape{},
UnrolledDescriptorType{}}.template operator()<MultiIndex<Shape::Size()>>();
return buffer_[Number<index_offset + base_offset>{}];
} }
} }
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false> template <typename... Ts, enable_if_t<!detail::HasSlice(Tuple<Ts...>{}), bool> = false>
__host__ __device__ const ElementType& operator()(const Tuple<Ts...>& idx) const __host__ __device__ const ElementType& operator()(const Tuple<Ts...>& idx) const
{ {
return this->operator[](idx); return this->operator[](idx);
} }
template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false> template <typename... Idxs, enable_if_t<!detail::HasSlice(Tuple<Idxs...>{}), bool> = false>
__host__ __device__ const ElementType& operator()(Idxs... idxs) const __host__ __device__ const ElementType& operator()(Idxs... idxs) const
{ {
return this->operator[](make_tuple(idxs...)); return this->operator[](make_tuple(idxs...));
} }
// Getter for the value reference /**
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false> * \brief Getter of tensor value reference.
*
* \param idx Tuple of indices.
* \return Requested value.
*/
template <typename... Ts, enable_if_t<!detail::HasSlice(Tuple<Ts...>{}), bool> = false>
__host__ __device__ ElementType& operator[](const Tuple<Ts...>& idx) __host__ __device__ ElementType& operator[](const Tuple<Ts...>& idx)
{ {
if constexpr(IsDynamicBuffer) if constexpr(IsDynamicBuffer)
{ {
const index_t offset = layout_(idx); const index_t offset = layout_(idx) + base_offset_;
return buffer_(offset); return buffer_(offset);
} }
else else
{ {
constexpr index_t offset = Layout<Shape, UnnestedDescriptorType>{ constexpr index_t index_offset = Layout<Shape, UnrolledDescriptorType>{
Shape{},
UnrolledDescriptorType{}}.template operator()<Tuple<Ts...>>();
// Apply embed offset (calculate in compiletime)
constexpr index_t base_offset = Layout<Shape, UnrolledDescriptorType>{
Shape{}, Shape{},
UnnestedDescriptorType{}}.template operator()<Tuple<Ts...>>(); UnrolledDescriptorType{}}.template operator()<MultiIndex<Shape::Size()>>();
return buffer_(Number<offset>{}); return buffer_(Number<index_offset + base_offset>{});
} }
} }
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false> template <typename... Ts, enable_if_t<!detail::HasSlice(Tuple<Ts...>{}), bool> = false>
__host__ __device__ ElementType& operator()(const Tuple<Ts...>& idx) __host__ __device__ ElementType& operator()(const Tuple<Ts...>& idx)
{ {
return this->operator[](idx); return this->operator[](idx);
} }
template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false> template <typename... Idxs, enable_if_t<!detail::HasSlice(Tuple<Idxs...>{}), bool> = false>
__host__ __device__ ElementType& operator()(Idxs... idxs) __host__ __device__ ElementType& operator()(Idxs... idxs)
{ {
return this->operator[](make_tuple(idxs...)); return this->operator[](make_tuple(idxs...));
} }
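// Usage sketch (illustrative): element access through the multi-index
// accessors; for register (static buffer) tensors the offset is resolved at
// compile time.
//   dst_tensor(make_tuple(1, 2)) = src_tensor(1, 2);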
__host__ __device__ constexpr auto GetDefaultDescriptor() /**
* \brief Get descriptor with all nested dimensions merged.
*
* \return Merged nests descriptor.
*/
__host__ __device__ constexpr auto GetMergedNestingDescriptor()
{ {
return layout_.GetDefaultDescriptor(); return layout_.GetMergedNestingDescriptor();
} }
/**
* \brief Get pointer to the data.
*
* \return Pointer.
*/
__host__ __device__ ElementType* GetPointer() const { return buffer_.p_data_; } __host__ __device__ ElementType* GetPointer() const { return buffer_.p_data_; }
__host__ __device__ constexpr auto& GetBuffer() { return buffer_; }
__host__ __device__ constexpr auto& GetBuffer() const { return buffer_; }
/**
* \brief Get multi index offset to the data.
*
* \return Multi index offset.
*/
__host__ __device__ constexpr auto& GetMultiIdxOffsets() const { return multi_idx_offset_; }
/**
* \brief Apply multi index offset on the tensor.
*
* \param multi_idx_offset Multi index offset.
*/
template <typename MultiIdxOffsets>
__host__ __device__ constexpr void SetMultiIdxOffset(const MultiIdxOffsets multi_idx_offset)
{
multi_idx_offset_ = multi_idx_offset;
base_offset_ += layout_(multi_idx_offset);
}
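// Usage sketch (illustrative, assuming ck's make_multi_index helper): shifting
// the tensor view to a partition origin; the linear base_offset_ is updated
// once so subsequent element accesses only add the local index.
//   tensor.SetMultiIdxOffset(make_multi_index(block_m * tile_m, block_n * tile_n));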
private: private:
using DynamicBufferType = DynamicBuffer<BufferAddressSpace, using DynamicBufferType = DynamicBuffer<BufferAddressSpace,
ElementType, ElementType,
ElementSpaceSize, ElementSpaceSize,
true /*InvalidElementUseNumericalZeroValue*/>; true /*InvalidElementUseNumericalZeroValue*/>;
using StaticBufferType = using StaticBufferType = StaticBuffer<BufferAddressSpace,
StaticBufferTupleOfVector<BufferAddressSpace, ElementType,
ElementType, size(Shape{}),
NumVectors, true /*InvalidElementUseNumericalZeroValue*/>;
ScalarPerVector,
true /*InvalidElementUseNumericalZeroValue*/>;
// If register use static buffer, else use dynamic buffer // If register use static buffer, else use dynamic buffer
using Buffer = std::conditional_t<IsDynamicBuffer, DynamicBufferType, StaticBufferType>; using Buffer = std::conditional_t<IsDynamicBuffer, DynamicBufferType, StaticBufferType>;
const Layout<Shape, UnnestedDescriptorType> layout_; const Layout<Shape, UnrolledDescriptorType> layout_;
Buffer buffer_; Buffer buffer_;
// We use multi_idx_offset_ to enable the creation of a descriptor in
// compile time for partitions or tiles if tile shape and thread layout
// is known at compile time (We can use the same descriptor for each
// thread). Additionally, the copy between the static and dynamic buffer
// requires a descriptor known at compile time, so we can shift data using
// such multi_idx_offset_.
MultiIndex<Shape::Size()> multi_idx_offset_;
// Base offset and multi index offset are corresponding to exactly the
// same element in tensor ( and in physical memory ). Multi index offset
// is multi dimensional index. However base offset is calculated using
// tensor descriptor (thus all it's transforms) and is linear (1D).
// We store base_offset_ to avoid multiple recalculations.
index_t base_offset_;
}; };
} // namespace wrapper } // namespace wrapper
......
...@@ -22,14 +22,19 @@ namespace wrapper { ...@@ -22,14 +22,19 @@ namespace wrapper {
// Disable from doxygen docs generation // Disable from doxygen docs generation
/// @cond /// @cond
// forward declaration // forward declaration
template <typename Shape, typename UnnestedDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
struct Layout; struct Layout;
template <typename T> template <typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple()); using is_tuple = decltype(std::declval<T&>().IsTuple());
namespace { namespace {
// Generate packed (column-major) strides if not passed /**
* \brief Generate packed (column-major) strides if not passed
*
* \param shape Tensor shape.
* \return Generated column-major strides.
*/
template <typename... Ts> template <typename... Ts>
__host__ __device__ constexpr static auto __host__ __device__ constexpr static auto
GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape) GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
...@@ -50,9 +55,16 @@ GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape) ...@@ -50,9 +55,16 @@ GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
Number<decltype(unrolled_shape)::Size()>{}); Number<decltype(unrolled_shape)::Size()>{});
} }
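// Example (illustrative): for shape (2, 3, 4) the generated packed column-major
// strides are (1, 2, 6) - each stride is the product of the preceding lengths.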
/**
* \brief Create naive tensor descriptor from nested shape.
*
* \param shape Tensor shape.
* \param strides Tensor strides.
* \return Unrolled descriptor
*/
template <typename LayoutShape, typename LayoutStrides> template <typename LayoutShape, typename LayoutStrides>
__host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shape, __host__ __device__ constexpr auto MakeUnrolledDescriptor(const LayoutShape& shape,
const LayoutStrides& strides) const LayoutStrides& strides)
{ {
const auto unrolled_shape = UnrollNestedTuple(shape); const auto unrolled_shape = UnrollNestedTuple(shape);
if constexpr(is_same_v<LayoutStrides, Tuple<>>) if constexpr(is_same_v<LayoutStrides, Tuple<>>)
...@@ -86,8 +98,8 @@ __host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shap ...@@ -86,8 +98,8 @@ __host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shap
template <typename Shape, typename Strides> template <typename Shape, typename Strides>
__host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides) __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides)
{ {
using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Strides{})); using UnrolledDescriptorType = decltype(MakeUnrolledDescriptor(Shape{}, Strides{}));
return Layout<Shape, UnnestedDescriptorType>(shape, MakeFlattenDescriptor(shape, strides)); return Layout<Shape, UnrolledDescriptorType>(shape, MakeUnrolledDescriptor(shape, strides));
} }
/** /**
...@@ -100,15 +112,19 @@ __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides ...@@ -100,15 +112,19 @@ __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides
template <typename Shape> template <typename Shape>
__host__ __device__ constexpr auto make_layout(const Shape& shape) __host__ __device__ constexpr auto make_layout(const Shape& shape)
{ {
using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Tuple<>{})); using UnrolledDescriptorType = decltype(MakeUnrolledDescriptor(Shape{}, Tuple<>{}));
return Layout<Shape, UnnestedDescriptorType>(shape, MakeFlattenDescriptor(shape, Tuple<>{})); return Layout<Shape, UnrolledDescriptorType>(shape, MakeUnrolledDescriptor(shape, Tuple<>{}));
} }
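// Usage sketch (illustrative): nested shapes are allowed; strides may be given
// explicitly or generated as packed column-major when omitted:
//   const auto explicit_strides = make_layout(make_tuple(4, 8), make_tuple(8, 1));
//   const auto packed           = make_layout(make_tuple(make_tuple(2, 2), 8));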
// Layout helpers // Layout helpers
// get // get
// Get dim (could be returned from get with empty Idxs)
/** /**
* \private * \private
* \brief Get dim.
*
* \param dim Dimension.
* \return Returned the same dimension.
*/ */
template <typename T> template <typename T>
__host__ __device__ T constexpr get(const T& dim) __host__ __device__ T constexpr get(const T& dim)
...@@ -178,7 +194,7 @@ __host__ __device__ constexpr auto get(const Layout<Shape, FlattenDesc>& layout) ...@@ -178,7 +194,7 @@ __host__ __device__ constexpr auto get(const Layout<Shape, FlattenDesc>& layout)
}, },
Number<old_shape_dims>{}); Number<old_shape_dims>{});
const auto& flatten_desc = layout.GetUnnestedDescriptor(); const auto& flatten_desc = layout.GetUnrolledDescriptor();
auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
return Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc); return Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc);
} }
...@@ -197,9 +213,12 @@ __host__ __device__ constexpr auto get(const T& elem) ...@@ -197,9 +213,12 @@ __host__ __device__ constexpr auto get(const T& elem)
} }
// size // size
// Get dim size (could be returned from get function)
/** /**
* \private * \private
* \brief Get size.
*
* \param dim Size.
* \return Returned the same size.
*/ */
template <typename T> template <typename T>
__host__ __device__ T constexpr size(const T& dim) __host__ __device__ T constexpr size(const T& dim)
...@@ -214,8 +233,8 @@ __host__ __device__ T constexpr size(const T& dim) ...@@ -214,8 +233,8 @@ __host__ __device__ T constexpr size(const T& dim)
* \param layout Layout to get Shape of. * \param layout Layout to get Shape of.
 * \return Requested length. * \return Requested length.
*/ */
template <index_t idx, typename Shape, typename UnnestedDescriptorType> template <index_t idx, typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto size(const Layout<Shape, UnnestedDescriptorType>& layout) __host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType>& layout)
{ {
return layout.template GetLength<idx>(); return layout.template GetLength<idx>();
} }
...@@ -240,8 +259,8 @@ __host__ __device__ constexpr auto size(const Tuple<ShapeDims...>& shape) ...@@ -240,8 +259,8 @@ __host__ __device__ constexpr auto size(const Tuple<ShapeDims...>& shape)
* \param layout Layout to calculate shape size. * \param layout Layout to calculate shape size.
 * \return Requested size. * \return Requested size.
*/ */
template <typename Shape, typename UnnestedDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto size(const Layout<Shape, UnnestedDescriptorType>& layout) __host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType>& layout)
{ {
return layout.GetLengths(); return layout.GetLengths();
} }
...@@ -280,9 +299,9 @@ __host__ __device__ constexpr auto size(const T& elem) ...@@ -280,9 +299,9 @@ __host__ __device__ constexpr auto size(const T& elem)
* \param layout Layout to calculate rank. * \param layout Layout to calculate rank.
 * \return Requested rank. * \return Requested rank.
*/ */
template <typename Shape, typename UnnestedDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
rank([[maybe_unused]] const Layout<Shape, UnnestedDescriptorType>& layout) rank([[maybe_unused]] const Layout<Shape, UnrolledDescriptorType>& layout)
{ {
return Shape::Size(); return Shape::Size();
} }
...@@ -302,17 +321,25 @@ __host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...>& t ...@@ -302,17 +321,25 @@ __host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...>& t
/** /**
* \private * \private
* \brief Rank for scalar
*
* \param dim Dimension scalar.
* \return Returned 1.
*/ */
template <index_t IDim> template <index_t IDim>
__host__ __device__ constexpr index_t rank(const Number<IDim>&) __host__ __device__ constexpr index_t rank([[maybe_unused]] const Number<IDim>& dim)
{ {
return 1; return 1;
} }
/** /**
* \private * \private
* \brief Rank for scalar
*
* \param dim Dimension scalar.
* \return Returned 1.
*/ */
__host__ __device__ constexpr index_t rank(const index_t&) { return 1; } __host__ __device__ constexpr index_t rank([[maybe_unused]] const index_t& dim) { return 1; }
/** /**
* \brief Hierarchical rank. * \brief Hierarchical rank.
...@@ -334,8 +361,8 @@ __host__ __device__ constexpr auto rank(const T& elem) ...@@ -334,8 +361,8 @@ __host__ __device__ constexpr auto rank(const T& elem)
* \param layout Layout to calculate depth. * \param layout Layout to calculate depth.
 * \return Requested depth. * \return Requested depth.
*/ */
template <typename Shape, typename UnnestedDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto depth(const Layout<Shape, UnnestedDescriptorType>& layout) __host__ __device__ constexpr auto depth(const Layout<Shape, UnrolledDescriptorType>& layout)
{ {
const auto& shape = layout.GetShape(); const auto& shape = layout.GetShape();
return TupleDepth(shape); return TupleDepth(shape);
...@@ -355,17 +382,25 @@ __host__ __device__ constexpr auto depth(const Tuple<Dims...>& tuple) ...@@ -355,17 +382,25 @@ __host__ __device__ constexpr auto depth(const Tuple<Dims...>& tuple)
/** /**
* \private * \private
* \brief Depth for scalar
*
* \param dim Scalar.
* \return Returned 0.
*/ */
template <index_t IDim> template <index_t IDim>
__host__ __device__ constexpr index_t depth(const Number<IDim>&) __host__ __device__ constexpr index_t depth([[maybe_unused]] const Number<IDim>& dim)
{ {
return 0; return 0;
} }
/** /**
* \private * \private
* \brief Depth for scalar
*
* \param dim Scalar.
* \return Returned 0.
*/ */
__host__ __device__ constexpr index_t depth(const index_t&) { return 0; } __host__ __device__ constexpr index_t depth([[maybe_unused]] const index_t& dim) { return 0; }
/** /**
* \brief Hierarchical depth. * \brief Hierarchical depth.
......
...@@ -6,12 +6,22 @@ ...@@ -6,12 +6,22 @@
#include "tensor_utils.hpp" #include "tensor_utils.hpp"
#include "layout_utils.hpp" #include "layout_utils.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
namespace { namespace {
// Calculate shape for partition based on number of threads per each dim and
// previous shape /**
* \brief Calculate shape for partition based on the number of threads per
* dimension and the previous shape.
*
* \param shape Base tensor shape.
* \param thread_lengths Tuple of thread lengths.
* \return Partition shape.
*/
template <typename... Ts, typename... Ls> template <typename... Ts, typename... Ls>
__host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts...>& shape, __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts...>& shape,
const Tuple<Ls...>& thread_lengths) const Tuple<Ls...>& thread_lengths)
...@@ -20,265 +30,165 @@ __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts.. ...@@ -20,265 +30,165 @@ __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts..
return generate_tuple( return generate_tuple(
[&](auto i) { [&](auto i) {
constexpr auto num_i = Number<i>{}; constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value) const auto slice_len = size<num_i>(shape) / thread_lengths.At(num_i);
{ return slice_len;
// if tuple then recurrence
return CalculateLocalPartitionShape(shape.At(num_i), thread_lengths.At(num_i));
}
else
{
const auto slice_len = shape.At(num_i) / thread_lengths.At(num_i);
return slice_len;
}
},
Number<Tuple<Ts...>::Size()>{});
}
// Calculate shape for partition based on number of threads per each dim,
// previous strides and steps
template <typename... Ts, typename... Ls, typename... Steps, typename FlattenDescType>
__host__ __device__ constexpr auto
CalculateLocalPartitionDescriptor(const Tuple<Ts...>& shape,
const Tuple<Ls...>& thread_lengths,
const Tuple<Steps...>& steps,
const FlattenDescType& flatten_desc)
{
static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "Wrong thread_lengths shape.");
const auto unrolled_thread_lengths = UnrollNestedTuple(thread_lengths);
const auto unrolled_shape = UnrollNestedTuple(shape);
constexpr auto dims = decltype(unrolled_thread_lengths)::Size();
using UnrolledStepsType = decltype(UnrollNestedTuple(steps));
using I1 = Number<1>;
const auto transforms = generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
{
// By default raked partition
const auto partition_stride = unrolled_thread_lengths.At(num_i);
return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
make_tuple(partition_stride));
}
else if constexpr(!is_same_v<tuple_element_t<i.value, UnrolledStepsType>, index_t>)
{
// Compiletime partition
if constexpr(is_same_v<tuple_element_t<i.value, UnrolledStepsType>, I1>)
{
// raked
const auto partition_stride = unrolled_thread_lengths.At(num_i);
return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
make_tuple(partition_stride));
}
else
{
// packed
return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
make_tuple(I1{}));
}
}
else
{
// Runtime partition
if(steps.At(num_i) == 1)
{
// raked
const auto partition_stride = unrolled_thread_lengths.At(num_i);
return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
make_tuple(partition_stride));
}
else
{
// packed
return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
make_tuple(I1{}));
}
}
},
Number<dims>{});
const auto lower_dims =
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<dims>{});
const auto upper_dims =
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<dims>{});
return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
}
template <typename... Ls, typename... Steps>
__host__ __device__ constexpr auto CalculateLayoutOffsetIdxImpl(const Tuple<Ls...>& thread_lengths,
const Tuple<Steps...>& steps,
index_t& thread_id)
{
return generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ls...>>>::value)
{
// if tuple then recurrence
if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
{
return CalculateLayoutOffsetIdxImpl(
thread_lengths.At(num_i), Tuple<>{}, thread_id);
}
else
{
return CalculateLayoutOffsetIdxImpl(
thread_lengths.At(num_i), steps.At(num_i), thread_id);
}
}
else
{
// Update thread_id after each dim
const auto dim_thread_id = thread_id % thread_lengths.At(num_i);
thread_id /= thread_lengths.At(num_i);
if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
{
return dim_thread_id;
}
else
{
// Apply step
return steps.At(num_i) * dim_thread_id;
}
}
}, },
Number<Tuple<Ls...>::Size()>{}); Number<Tuple<Ls...>::Size()>{});
} }
// Convert integer thread_idx to tuple index with steps applied /**
template <typename... Ls, typename... Steps> * \brief Calculate total number of blocks.
__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Ls...>& thread_lengths, *
const Tuple<Steps...>& steps, * \param shape Base tensor shape.
const index_t thread_id) * \param tile_shape Tile shape.
* \return Tuple with blocks number.
*/
template <typename... Ts, typename... Ls>
__host__ __device__ constexpr auto CalculateGridSize(const Tuple<Ts...>& shape,
const Tuple<Ls...>& tile_shape)
{ {
// Create tmp thread_id copy for CalculateLayoutOffsetIdxImpl updates static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "shape and tile_shape must have the same rank.");
index_t thread_id_copy = thread_id; return generate_tuple([&](auto i) { return size<i>(shape) / size<i>(tile_shape); },
return CalculateLayoutOffsetIdxImpl(thread_lengths, steps, thread_id_copy); Number<Tuple<Ls...>::Size()>{});
} }
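For reference, an equivalent standalone C++ sketch of the grid-size calculation above: the number of blocks in each dimension is the tensor extent divided by the tile extent (the values are hypothetical and assumed to divide evenly).

#include <array>
#include <cstdio>

// Illustration only: blocks per dimension = tensor extent / tile extent,
// matching the element-wise division performed by CalculateGridSize.
int main()
{
    const std::array<int, 2> shape      = {512, 256}; // hypothetical tensor extents
    const std::array<int, 2> tile_shape = {128, 64};  // hypothetical tile extents

    std::array<int, 2> grid_size{};
    for(std::size_t i = 0; i < shape.size(); ++i)
    {
        grid_size[i] = shape[i] / tile_shape[i];
    }

    std::printf("grid size: (%d, %d) -> %d blocks total\n",
                grid_size[0], grid_size[1], grid_size[0] * grid_size[1]); // (4, 4) -> 16
    return 0;
}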
// Apply steps to index represented as tuple /**
template <typename... Steps, typename... Idxs> * \brief Calculate scaled offset for new partition/tile.
__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Steps...>& steps, *
const Tuple<Idxs...>& block_idxs) * \param thread_idxs Thread multi index (one index per dimension).
* \param partition_lengths_seq Sequence with the partition lengths.
* \param old_offset_idxs Multi index offset from the base tensor to shift values by.
* \return Offset multi index for the new partition/tile.
*/
template <typename ThreadIdxs, typename PartitionLengthsSeq, typename OldOffsetIdxs>
__host__ __device__ constexpr auto
CalculateOffsetMultiIdxs(const ThreadIdxs& thread_idxs,
const PartitionLengthsSeq& partition_lengths_seq,
const OldOffsetIdxs& old_offset_idxs)
{ {
return generate_tuple( return thread_idxs * partition_lengths_seq + old_offset_idxs;
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Idxs...>>>::value)
{
// if tuple then recurrence
if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
{
return CalculateLayoutOffsetIdx(Tuple<>{}, block_idxs.At(num_i));
}
else
{
return CalculateLayoutOffsetIdx(steps.At(num_i), block_idxs.At(num_i));
}
}
else
{
if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
{
return block_idxs.At(num_i);
}
else
{
// apply step
return steps.At(num_i) * block_idxs.At(num_i);
}
}
},
Number<Tuple<Idxs...>::Size()>{});
} }
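The scaled-offset formula above (thread_idxs * partition_lengths_seq + old_offset_idxs) is an element-wise multiply-add; below is a small standalone sketch with made-up indices and lengths.

#include <array>
#include <cstdio>

// Illustration only: new_offset[d] = thread_idx[d] * partition_length[d] + old_offset[d],
// i.e. the element-wise multiply-add done by CalculateOffsetMultiIdxs.
int main()
{
    const std::array<int, 2> thread_idxs       = {3, 5};   // hypothetical per-dimension thread ids
    const std::array<int, 2> partition_lengths = {64, 2};  // hypothetical partition extents
    const std::array<int, 2> old_offsets       = {0, 128}; // hypothetical prior offsets

    std::array<int, 2> new_offsets{};
    for(std::size_t d = 0; d < new_offsets.size(); ++d)
    {
        new_offsets[d] = thread_idxs[d] * partition_lengths[d] + old_offsets[d];
    }

    std::printf("offset multi index: (%d, %d)\n", new_offsets[0], new_offsets[1]); // (192, 138)
    return 0;
}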
// User passes only shape per block to the make_local_tile function. This function calculates
// block layout based on the shape.
template <typename... Ts, typename... BlockDims>
__host__ __device__ constexpr auto CalculateBlockLengths(const Tuple<Ts...>& shape,
const Tuple<BlockDims...>& tile_shape)
{
return generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
// if tuple then recurrence
return CalculateBlockLengths(shape.At(num_i), tile_shape.At(num_i));
}
else
{
return shape.At(num_i) / tile_shape.At(num_i);
}
},
Number<Tuple<Ts...>::Size()>{});
}
} // namespace } // namespace
/** /**
* \brief Create local partition for thread. * \brief Create local partition for a thread (currently only packed
* partitions are supported).
* *
* \param tensor Tensor for partition. * \param tensor Tensor for partition.
* \param thread_lengths Layout of threads. * \param thread_lengths Layout of threads (must not be nested).
* \param thread_id Thread index represented as integer. * \param thread_id Thread index represented as integer.
* \param steps Thread step (default=1, raked partition)
* \return Partition tensor. * \return Partition tensor.
*/ */
template <typename TensorType, typename ThreadLengthsTuple, typename StepsTuple = Tuple<>> template <typename TensorType, typename ThreadLengthsTuple>
__host__ __device__ constexpr auto make_local_partition(const TensorType& tensor, __host__ __device__ constexpr auto
const ThreadLengthsTuple& thread_lengths, make_local_partition(TensorType& tensor,
const index_t thread_id, [[maybe_unused]] const ThreadLengthsTuple& thread_lengths,
const StepsTuple steps = StepsTuple{}) const index_t thread_id)
{ {
// Create shape, strides and layout for new partition tensor static_assert(!IsNestedTuple(ThreadLengthsTuple{}));
const auto partition_shape = CalculateLocalPartitionShape(shape(tensor), thread_lengths); // Calculate new partition shape
// Create new descriptor and layout const auto& tensor_shape = shape(tensor);
const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor(); constexpr auto partition_shape =
auto partition_desc = CalculateLocalPartitionShape(decltype(tensor_shape){}, ThreadLengthsTuple{});
CalculateLocalPartitionDescriptor(shape(tensor), thread_lengths, steps, flatten_desc); // Create Thread Cluster Descriptor
const auto partition_layout = Layout<decltype(partition_shape), decltype(partition_desc)>( constexpr auto partition_lengths_seq = generate_sequence_v2(
partition_shape, partition_desc); [&](auto I) { return size<I>(partition_shape); }, Number<ThreadLengthsTuple::Size()>{});
// Calculate offset for new partition tensor constexpr auto thread_lengths_seq =
const auto offset_idx = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id); generate_sequence_v2([&](auto I) { return size<I>(ThreadLengthsTuple{}); },
const auto partition_offset = layout(tensor)(offset_idx); Number<ThreadLengthsTuple::Size()>{});
return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + partition_offset, constexpr auto thread_cluster_desc_ = make_cluster_descriptor(thread_lengths_seq);
partition_layout); // Calculate thread idxs and offsets
const auto thread_idxs = thread_cluster_desc_.CalculateBottomIndex(make_multi_index(thread_id));
const auto offset_multi_idxs =
CalculateOffsetMultiIdxs(thread_idxs, partition_lengths_seq, tensor.GetMultiIdxOffsets());
// Create new layout and tensor
auto& flatten_desc = layout(tensor).GetUnrolledDescriptor();
const auto partition_layout =
Layout<remove_reference_t<decltype(partition_shape)>, decltype(flatten_desc)>(
partition_shape, flatten_desc);
auto partition_tensor =
make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), partition_layout);
// Apply offsets
partition_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
return partition_tensor;
} }
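A hedged device-side usage sketch of the new make_local_partition signature. It assumes the CK wrapper headers are available; the kernel name, pointer, extents and strides are hypothetical, and make_layout/make_tensor are used in the same form they appear in this wrapper.

// Hedged sketch (not from this commit): partition a 256x128 global tensor across
// a 4x64 thread grid so each thread owns a 64x2 piece.
__global__ void partition_example(float* p_global)
{
    using namespace ck;
    using namespace ck::wrapper;

    // Compile-time, row-major 256x128 layout (hypothetical extents and strides).
    const auto global_layout = make_layout(make_tuple(Number<256>{}, Number<128>{}),
                                           make_tuple(Number<128>{}, Number<1>{}));
    auto tensor = make_tensor<MemoryTypeEnum::Global>(p_global, global_layout);

    // Non-nested thread lengths and a flat thread id, as required by make_local_partition.
    const auto thread_lengths = make_tuple(Number<4>{}, Number<64>{});
    auto partition            = make_local_partition(tensor, thread_lengths, threadIdx.x);
    (void)partition;
}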
/** /**
* \brief Create local tile for thread block. * \brief Create local tile for a thread block (currently only packed
* tiles are supported).
*
* \note For now, use a 2d tile_shape to achieve the best performance.
*
* *
* \param tensor Tensor for partition. * \param tensor Tensor for partition.
* \param tile_shape Shapes of requested tile. * \param tile_shape Shapes of requested tile.
* \param block_idx Block index represented as tuple. * \param block_id Block index represented as integer.
* \param steps Block step (default=1, raked partition)
* \return Tile tensor. * \return Tile tensor.
*/ */
template <typename TensorType, template <typename TensorType, typename BlockShapeTuple>
typename BlockShapeTuple, __host__ __device__ constexpr auto
typename BlockIdxTuple, make_local_tile(const TensorType& tensor, const BlockShapeTuple& tile_shape, const index_t block_id)
typename StepsTuple = Tuple<>>
__host__ __device__ constexpr auto make_local_tile(const TensorType& tensor,
const BlockShapeTuple& tile_shape,
const BlockIdxTuple& block_idx,
const StepsTuple steps = StepsTuple{})
{ {
// Create block lengths, strides and layout for new tile tensor static_assert(!IsNestedTuple(BlockShapeTuple{}));
const auto block_lengths = CalculateBlockLengths(shape(tensor), tile_shape);
// Create new descriptor and layout constexpr auto I0 = Number<0>{};
const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor(); constexpr auto I1 = Number<1>{};
auto tile_desc = constexpr auto I2 = Number<2>{};
CalculateLocalPartitionDescriptor(tile_shape, block_lengths, steps, flatten_desc);
const auto tile_layout = Layout<remove_reference_t<decltype(tile_shape)>, decltype(tile_desc)>( auto& aligned_desc = layout(tensor).GetMergedNestingDescriptor();
tile_shape, tile_desc);
// Calculate offset for new partition tensor if constexpr(BlockShapeTuple::Size() == I2)
const auto offset_idx = CalculateLayoutOffsetIdx(steps, block_idx); {
const auto tile_offset = layout(tensor)(offset_idx); // Optimized version for 2d tile shape [MxK]
return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + tile_offset, const auto block_2_tile_map =
tile_layout); BlockToCTileMap_M00_N0_M01Adapt<BlockShapeTuple{}.At(I0),
BlockShapeTuple{}.At(I1),
remove_cvref_t<decltype(aligned_desc)>>(aligned_desc);
const auto block_work_idx =
block_2_tile_map.CalculateBottomIndex(make_multi_index(block_id));
const index_t m_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I0] * size<0>(tile_shape));
const index_t k_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * size<1>(tile_shape));
const auto offset_multi_idxs =
make_tuple(m_block_data_idx_on_grid, k_block_data_idx_on_grid);
// Create new layout and tensor
const auto tile_layout =
Layout<remove_reference_t<decltype(tile_shape)>, decltype(aligned_desc)>(tile_shape,
aligned_desc);
auto tile_tensor =
make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), tile_layout);
// Apply offsets
tile_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
return tile_tensor;
}
else
{
// Calculate offsets
// Sequence with data to process per block
constexpr auto tile_shape_seq =
generate_sequence_v2([](auto I) { return size(BlockShapeTuple{}.At(I)); },
Number<BlockShapeTuple::Size()>{});
// Tuple with number of blocks
const auto block_lengths = CalculateGridSize(shape(tensor), tile_shape);
constexpr auto block_cluster_desc_ = make_cluster_descriptor(block_lengths);
const auto block_idxs =
block_cluster_desc_.CalculateBottomIndex(make_multi_index(block_id));
const auto offset_multi_idxs =
CalculateOffsetMultiIdxs(block_idxs, tile_shape_seq, tensor.GetMultiIdxOffsets());
// Create new layout and tensor
const auto tile_layout =
Layout<remove_reference_t<decltype(tile_shape)>, decltype(aligned_desc)>(tile_shape,
aligned_desc);
auto tile_tensor =
make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), tile_layout);
// Apply offsets
tile_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
return tile_tensor;
}
} }
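To make the generic (non-2d) branch concrete, the sketch below decomposes a flat block id into per-dimension block indices and scales them by the tile extents to get the tile origin. The row-major decomposition is an illustrative choice; the actual cluster descriptor and BlockToCTileMap_M00_N0_M01Adapt mapping may order or swizzle blocks differently.

#include <array>
#include <cstdio>

// Illustration only: flat block id -> per-dimension block indices (row-major here)
// -> element offsets, mirroring the offset math of the generic make_local_tile branch.
int main()
{
    const std::array<int, 2> shape      = {512, 256}; // hypothetical tensor extents
    const std::array<int, 2> tile_shape = {128, 64};  // hypothetical tile extents
    const std::array<int, 2> grid       = {shape[0] / tile_shape[0],
                                           shape[1] / tile_shape[1]}; // (4, 4) blocks

    const int block_id = 6; // hypothetical flat block id

    // Row-major decomposition of the flat id (the real mapping may differ).
    const std::array<int, 2> block_idxs = {block_id / grid[1], block_id % grid[1]}; // (1, 2)

    // Tile origin in elements: block index scaled by tile extent per dimension.
    const std::array<int, 2> offsets = {block_idxs[0] * tile_shape[0],
                                        block_idxs[1] * tile_shape[1]}; // (128, 128)

    std::printf("block %d -> tile origin (%d, %d)\n", block_id, offsets[0], offsets[1]);
    return 0;
}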
} // namespace wrapper } // namespace wrapper
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "ck/utility/tuple_helper.hpp" #include "ck/utility/tuple_helper.hpp"
#include "ck/utility/dynamic_buffer.hpp" #include "ck/utility/dynamic_buffer.hpp"
#include "ck/utility/amd_address_space.hpp" #include "ck/utility/amd_address_space.hpp"
#include "ck/utility/multi_index.hpp"
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
...@@ -27,16 +28,12 @@ using MemoryTypeEnum = AddressSpaceEnum; ...@@ -27,16 +28,12 @@ using MemoryTypeEnum = AddressSpaceEnum;
// Disable from doxygen docs generation // Disable from doxygen docs generation
/// @cond /// @cond
// forward declarations // forward declarations
template <typename Shape, typename UnnestedDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
struct Layout; struct Layout;
template <MemoryTypeEnum BufferAddressSpace, template <MemoryTypeEnum BufferAddressSpace,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType, typename UnrolledDescriptorType>
index_t NumVectors, // params for Register memory
index_t ScalarPerVector // param for Register memory
>
struct Tensor; struct Tensor;
template <typename FromType, typename ToType> template <typename FromType, typename ToType>
...@@ -45,13 +42,22 @@ struct Slice ...@@ -45,13 +42,22 @@ struct Slice
__host__ __device__ constexpr Slice() : from_(), to_() {} __host__ __device__ constexpr Slice() : from_(), to_() {}
__host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {} __host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {}
/**
* \brief Calculate slice range.
*
* \param dim Dimension size.
* \return Slice range.
*/
template <typename T> template <typename T>
__host__ __device__ constexpr auto range(const T& dim) const __host__ __device__ constexpr auto range(const T& dim) const
{ {
if constexpr(is_same_v<FromType, index_t> || is_same_v<ToType, index_t> || if constexpr(is_same_v<FromType, index_t> || is_same_v<ToType, index_t> ||
is_same_v<T, index_t>) is_same_v<T, index_t>)
{ {
assert(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_) && "Invalid range"); if(!(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_)))
{
throw std::runtime_error("Invalid range");
}
if(to_ < 0) if(to_ < 0)
{ {
return dim - from_ + to_ + 1; return dim - from_ + to_ + 1;
...@@ -101,40 +107,27 @@ using is_tuple = decltype(std::declval<T&>().IsTuple()); ...@@ -101,40 +107,27 @@ using is_tuple = decltype(std::declval<T&>().IsTuple());
template <MemoryTypeEnum MemoryType, template <MemoryTypeEnum MemoryType,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType> typename UnrolledDescriptorType>
constexpr auto make_tensor(ElementType* pointer, constexpr auto make_tensor(ElementType* pointer,
const Layout<Shape, UnnestedDescriptorType>& layout) const Layout<Shape, UnrolledDescriptorType>& layout)
{ {
return Tensor<MemoryType, return Tensor<MemoryType, ElementType, Shape, UnrolledDescriptorType>(pointer, layout);
ElementType,
Shape,
UnnestedDescriptorType,
0 /*NumVectors*/,
0 /*ScalarPerVector*/>(pointer, layout);
} }
/** /**
* \brief Make SGPR or VGPR tensor function. * \brief Make SGPR or VGPR tensor function.
* *
* \tparam MemoryType Type of memory. * \tparam MemoryType Type of memory.
* \tparam NumVectors Number of vectors.
* \tparam ScalarPerVector Scalars per vector.
* \tparam ElementType Memory data type. * \tparam ElementType Memory data type.
* \return Constructed tensor. * \return Constructed tensor.
*/ */
template <MemoryTypeEnum MemoryType, template <MemoryTypeEnum MemoryType,
index_t NumVectors, typename ElementType,
index_t ScalarPerVector, typename Shape,
typename ElementType> typename UnrolledDescriptorType>
constexpr auto make_register_tensor() constexpr auto make_register_tensor(const Layout<Shape, UnrolledDescriptorType>& layout)
{ {
const auto layout = make_layout(make_tuple(Number<NumVectors>{}), make_tuple(Number<1>{})); return Tensor<MemoryType, ElementType, Shape, UnrolledDescriptorType>(layout);
return Tensor<MemoryType,
ElementType,
Tuple<Number<NumVectors>>,
std::remove_const_t<remove_reference_t<decltype(layout.GetUnnestedDescriptor())>>,
NumVectors,
ScalarPerVector>(layout);
} }
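A hedged sketch of the updated make_register_tensor call, which now takes a layout instead of the NumVectors/ScalarPerVector template parameters. The 8-element VGPR layout, the element type, and the function name below are illustrative assumptions.

// Hedged sketch (illustrative values): build a small compile-time layout and
// request a VGPR-backed tensor for it with the new layout-based signature.
__device__ void register_tensor_example()
{
    using namespace ck;
    using namespace ck::wrapper;

    // Hypothetical 8-element, stride-1 register layout, built as in the previous
    // version of this function.
    const auto reg_layout = make_layout(make_tuple(Number<8>{}), make_tuple(Number<1>{}));
    auto reg_tensor       = make_register_tensor<MemoryTypeEnum::Vgpr, float>(reg_layout);
    (void)reg_tensor;
}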
/** /**
...@@ -146,15 +139,9 @@ constexpr auto make_register_tensor() ...@@ -146,15 +139,9 @@ constexpr auto make_register_tensor()
template <MemoryTypeEnum BufferAddressSpace, template <MemoryTypeEnum BufferAddressSpace,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType, typename UnrolledDescriptorType>
index_t NumVectors, __host__ __device__ constexpr const auto&
index_t ScalarPerVector> layout(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
__host__ __device__ constexpr const auto& layout(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
{ {
return tensor.GetLayout(); return tensor.GetLayout();
} }
...@@ -170,15 +157,9 @@ template <index_t... Idxs, ...@@ -170,15 +157,9 @@ template <index_t... Idxs,
MemoryTypeEnum BufferAddressSpace, MemoryTypeEnum BufferAddressSpace,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType, typename UnrolledDescriptorType>
index_t NumVectors, __host__ __device__ constexpr auto
index_t ScalarPerVector> size(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
__host__ __device__ constexpr auto size(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
{ {
return size<Idxs...>(tensor.GetLayout()); return size<Idxs...>(tensor.GetLayout());
} }
...@@ -194,15 +175,9 @@ template <index_t... Idxs, ...@@ -194,15 +175,9 @@ template <index_t... Idxs,
MemoryTypeEnum BufferAddressSpace, MemoryTypeEnum BufferAddressSpace,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType, typename UnrolledDescriptorType>
index_t NumVectors, __host__ __device__ constexpr auto
index_t ScalarPerVector> rank(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
__host__ __device__ constexpr auto rank(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
{ {
return rank<Idxs...>(tensor.GetLayout()); return rank<Idxs...>(tensor.GetLayout());
} }
...@@ -218,15 +193,9 @@ template <index_t... Idxs, ...@@ -218,15 +193,9 @@ template <index_t... Idxs,
MemoryTypeEnum BufferAddressSpace, MemoryTypeEnum BufferAddressSpace,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType, typename UnrolledDescriptorType>
index_t NumVectors, __host__ __device__ constexpr auto
index_t ScalarPerVector> depth(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
__host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
{ {
return depth<Idxs...>(tensor.GetLayout()); return depth<Idxs...>(tensor.GetLayout());
} }
...@@ -240,15 +209,9 @@ __host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace, ...@@ -240,15 +209,9 @@ __host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
template <MemoryTypeEnum BufferAddressSpace, template <MemoryTypeEnum BufferAddressSpace,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType, typename UnrolledDescriptorType>
index_t NumVectors, __host__ __device__ constexpr const auto&
index_t ScalarPerVector> shape(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
__host__ __device__ constexpr const auto& shape(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
{ {
return shape(tensor.GetLayout()); return shape(tensor.GetLayout());
} }
......
...@@ -265,6 +265,8 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -265,6 +265,8 @@ struct ReferenceColumnToImage : public device::BaseOperator
return 0; return 0;
} }
throw std::runtime_error("Col2Img: number of dimensions should be between 1 and 3.");
return 1;
} }
float Run(const device::BaseArgument* p_arg, float Run(const device::BaseArgument* p_arg,
......
...@@ -313,6 +313,9 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -313,6 +313,9 @@ struct ReferenceConvBwdData : public device::BaseOperator
return 0; return 0;
} }
throw std::runtime_error(
"Conv_bwd_data: number of dimensions must be between 1 and 3.");
return 1;
} }
float Run(const device::BaseArgument* p_arg, float Run(const device::BaseArgument* p_arg,
......
...@@ -265,6 +265,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator ...@@ -265,6 +265,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
return 0; return 0;
} }
throw std::runtime_error("Conv_bwd: number of dimensions must be between 1 and 3.");
return 1;
} }
float Run(const device::BaseArgument* p_arg, float Run(const device::BaseArgument* p_arg,
......
...@@ -360,6 +360,8 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -360,6 +360,8 @@ struct ReferenceConvFwd : public device::BaseOperator
return 0; return 0;
} }
throw std::runtime_error("Conv_fwd: number of dimensions must be between 1 and 3.");
return 1;
} }
float Run(const device::BaseArgument* p_arg, float Run(const device::BaseArgument* p_arg,
......
...@@ -63,12 +63,11 @@ struct ReferenceGemm : public device::BaseOperator ...@@ -63,12 +63,11 @@ struct ReferenceGemm : public device::BaseOperator
const int K = arg.a_m_k_.mDesc.GetLengths()[1]; const int K = arg.a_m_k_.mDesc.GetLengths()[1];
AccDataType v_acc = 0; AccDataType v_acc = 0;
ComputeTypeA v_a = 0;
ComputeTypeB v_b = 0;
for(int k = 0; k < K; ++k) for(int k = 0; k < K; ++k)
{ {
ComputeTypeA v_a;
ComputeTypeB v_b;
// use PassThrough instead of ConvertBF16RTN for reference calculation // use PassThrough instead of ConvertBF16RTN for reference calculation
if constexpr(is_same_v<AElementwiseOperation, if constexpr(is_same_v<AElementwiseOperation,
ck::tensor_operation::element_wise::ConvertBF16RTN>) ck::tensor_operation::element_wise::ConvertBF16RTN>)
...@@ -94,7 +93,7 @@ struct ReferenceGemm : public device::BaseOperator ...@@ -94,7 +93,7 @@ struct ReferenceGemm : public device::BaseOperator
ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b); ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
} }
CDataType v_c; CDataType v_c = 0;
arg.c_element_op_(v_c, v_acc); arg.c_element_op_(v_c, v_acc);
......
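For context on the hunk above, here is a standalone sketch of a naive reference GEMM inner loop in which the per-k operands live inside the loop and the output value is zero-initialized before the elementwise C operation; the float types and the pass-through assignment are stand-ins for the real template parameters and c_element_op_.

#include <cstdio>
#include <vector>

// Illustration only: naive reference GEMM with per-iteration operand variables
// and a zero-initialized output value, mirroring the intent of this hunk.
int main()
{
    const int M = 2, N = 2, K = 3;
    const std::vector<float> a = {1, 2, 3, 4, 5, 6}; // M x K, row-major
    const std::vector<float> b = {1, 0, 0, 1, 1, 1}; // K x N, row-major
    std::vector<float> c(M * N, 0.0f);

    for(int m = 0; m < M; ++m)
    {
        for(int n = 0; n < N; ++n)
        {
            float v_acc = 0.0f;
            for(int k = 0; k < K; ++k)
            {
                const float v_a = a[m * K + k]; // operands declared inside the k-loop
                const float v_b = b[k * N + n];
                v_acc += v_a * v_b;
            }
            float v_c = 0.0f; // zero-initialized before the elementwise op
            v_c = v_acc;      // stand-in for c_element_op_(v_c, v_acc)
            c[m * N + n] = v_c;
        }
    }

    std::printf("c = [%g %g; %g %g]\n", c[0], c[1], c[2], c[3]);
    return 0;
}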
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/numeric.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -229,6 +230,8 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -229,6 +230,8 @@ struct ReferenceImageToColumn : public device::BaseOperator
return 0; return 0;
} }
throw std::runtime_error("Img2Col: number of dimensions should be between 1 and 3.");
return 1;
} }
float Run(const device::BaseArgument* p_arg, float Run(const device::BaseArgument* p_arg,
......
...@@ -106,9 +106,8 @@ struct DeviceOperationInstanceFactory< ...@@ -106,9 +106,8 @@ struct DeviceOperationInstanceFactory<
return op_ptrs; return op_ptrs;
} }
}; };
#endif
} // namespace instance } // namespace instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
...@@ -114,9 +114,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmSt ...@@ -114,9 +114,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmSt
return op_ptrs; return op_ptrs;
} }
}; };
#endif
} // namespace instance } // namespace instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
#ifdef CK_ENABLE_FP32
// FP32
void add_device_groupnorm_bwd_gamma_beta_f32_instances(
std::vector<std::unique_ptr<DeviceNormalizationBwdGammaBeta<F32, F32, F32, F32, F32, 5, 3>>>&);
#endif
template <typename DYDataType,
typename XDataType,
typename MeanInvStdDataType,
typename DGammaDataType,
typename DBetaDataType>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
5,
3>>
{
using DeviceOp = DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
5,
3>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
#ifdef CK_ENABLE_FP32
if constexpr(is_same_v<DYDataType, F32> && is_same_v<XDataType, F32> &&
is_same_v<MeanInvStdDataType, F32> && is_same_v<DGammaDataType, F32> &&
is_same_v<DBetaDataType, F32>)
{
add_device_groupnorm_bwd_gamma_beta_f32_instances(op_ptrs);
}
#endif
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
#ifdef CK_ENABLE_FP16
// FP16
void add_device_layernorm2d_bwd_gamma_beta_f16_instances(
std::vector<std::unique_ptr<DeviceNormalizationBwdGammaBeta<F16, F16, F16, F16, F16, 2, 1>>>&);
#endif
#ifdef CK_ENABLE_FP32
// FP32
void add_device_layernorm2d_bwd_gamma_beta_f32_instances(
std::vector<std::unique_ptr<DeviceNormalizationBwdGammaBeta<F32, F32, F32, F32, F32, 2, 1>>>&);
#endif
template <typename DYDataType,
typename XDataType,
typename MeanInvStdDataType,
typename DGammaDataType,
typename DBetaDataType,
index_t Rank,
index_t NumReduceDim>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
Rank,
NumReduceDim>>
{
using DeviceOp = DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
Rank,
NumReduceDim>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
#ifdef CK_ENABLE_FP16
if constexpr(is_same_v<DYDataType, F16> && is_same_v<XDataType, F16> &&
is_same_v<MeanInvStdDataType, F16> && is_same_v<DGammaDataType, F16> &&
is_same_v<DBetaDataType, F16>)
{
if constexpr(Rank == 2 && NumReduceDim == 1)
{
add_device_layernorm2d_bwd_gamma_beta_f16_instances(op_ptrs);
}
}
#endif
#ifdef CK_ENABLE_FP32
if constexpr(is_same_v<DYDataType, F32> && is_same_v<XDataType, F32> &&
is_same_v<MeanInvStdDataType, F32> && is_same_v<DGammaDataType, F32> &&
is_same_v<DBetaDataType, F32>)
{
if constexpr(Rank == 2 && NumReduceDim == 1)
{
add_device_layernorm2d_bwd_gamma_beta_f32_instances(op_ptrs);
}
}
#endif
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
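A hedged host-side sketch of how such a factory is typically queried. The include path is an assumption, float stands in for the F32 alias, and the rank/reduce-dim pair matches the layernorm2d bwd gamma/beta instances declared above.

// Hedged sketch: enumerate the registered layernorm2d bwd gamma/beta instances
// for FP32 via the factory above.
#include <cstdio>

#include "ck/library/tensor_operation_instance/gpu/normalization_bwd_gamma_beta.hpp" // assumed header path

int main()
{
    using namespace ck::tensor_operation::device;

    // Rank 2, one reduced dimension, FP32 everywhere (matches the declarations above).
    using DeviceOp = DeviceNormalizationBwdGammaBeta<float, float, float, float, float, 2, 1>;

    const auto op_ptrs = instance::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
    std::printf("found %zu instances\n", op_ptrs.size());

    for(const auto& op : op_ptrs)
    {
        std::printf("  %s\n", op->GetTypeString().c_str()); // BaseOperator helper
    }
    return 0;
}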
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
...@@ -57,7 +58,8 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances = std::tuple< ...@@ -57,7 +58,8 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances = std::tuple<
DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffleV2< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 2, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
// pipeline v1, 2 waves // pipeline v1, 2 waves
, ,
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
...@@ -52,7 +53,8 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances = std::tuple< ...@@ -52,7 +53,8 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances = std::tuple<
DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1> DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffleV2< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 2, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
// pipeline v1, 2 waves // pipeline v1, 2 waves
, ,
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
...@@ -57,7 +58,8 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple< ...@@ -57,7 +58,8 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple<
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffleV2< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 2, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
// pipeline v1, 2 waves // pipeline v1, 2 waves
, ,
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
...@@ -52,7 +53,8 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple< ...@@ -52,7 +53,8 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple<
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1> DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffleV2< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 2, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
// pipeline v1, 2 waves // pipeline v1, 2 waves
, ,
......