sync from the public repo

e599063f · illsilin · 5dbbf5d6 · 566b6480 · e599063f · e599063f
Commit e599063f authored May 10, 2024 by illsilin
20 changed files
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r2.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor/static_tensor.hpp"
+#include "ck/utility/is_detected.hpp"
+
+namespace ck {
+
+// Assume:
+//   1. src_desc and dst_desc are not known at compile-time
+//   2. SrcBuffer and DstBuffer are DynamicBuffer
+//   3. src_slice_origin and dst_slice_origin are not known at compile-time,
+//   4. Use thread buffer
+template <typename SliceLengths,
+          typename ElementwiseOperation,
+          typename DstInMemOps, // Sequence
+          typename SrcDatas,
+          typename DstDatas,
+          typename SrcDescs,
+          typename DstDescs,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
+          index_t SrcVectorDim,
+          index_t DstVectorDim,
+          typename SrcsScalarPerVector,         // Sequence
+          typename DstsScalarPerVector,         // Sequence
+          typename SrcsScalarStrideInVector,    // Sequence
+          typename DstsScalarStrideInVector,    // Sequence
+          typename SrcsResetCoordinateAfterRun, // control whether to move back src coordinate after
+                                                // each RunRead(),  will be fused with
+                                                // MoveSrcSliceWindow to save addr computation
+          typename DstsResetCoordinateAfterRun, // control whether to move back dst coordinate after
+                                                // each RunWrite(),  will be fused with
+                                                // MoveDstSliceWindow to save addr computation
+          index_t NumThreadScratch = 1>
+struct ThreadwiseTensorSliceTransfer_v3r2
+{
+    static constexpr index_t nDim = SliceLengths::Size();
+    using Index                   = MultiIndex<nDim>;
+
+    static constexpr index_t nSrc = SrcDescs::Size();
+    static constexpr index_t nDst = DstDescs::Size();
+
+    // return a tuple of coordiantes for a tuple of tensor
+    template <typename Descs,
+              typename Indices,
+              enable_if_t<Descs::Size() == Indices::Size(), bool> = false>
+    static constexpr auto MakeCoordinates(const Descs& descs, const Indices& indices)
+    {
+        return generate_tuple([&](auto i) { return make_tensor_coordinate(descs[i], indices[i]); },
+                              Number<Descs::Size()>{});
+    }
+
+    using SrcCoords = decltype(MakeCoordinates(SrcDescs{}, StaticallyIndexedArray<Index, nSrc>{}));
+    using DstCoords = decltype(MakeCoordinates(DstDescs{}, StaticallyIndexedArray<Index, nDst>{}));
+
+    static constexpr auto I0 = Number<0>{};
+
+    __device__ constexpr ThreadwiseTensorSliceTransfer_v3r2(
+        const SrcDescs& src_descs,
+        const StaticallyIndexedArray<Index, nSrc>& src_slice_origins,
+        const DstDescs& dst_descs,
+        const StaticallyIndexedArray<Index, nDst>& dst_slice_origins,
+        const ElementwiseOperation& element_op)
+        : src_coords_(MakeCoordinates(src_descs, src_slice_origins)),
+          dst_coords_(MakeCoordinates(dst_descs, dst_slice_origins)),
+          element_op_(element_op)
+    {
+    }
+
+    template <typename Indices, enable_if_t<SrcDescs::Size() == Indices::Size(), bool> = false>
+    __device__ void SetSrcSliceOrigins(const SrcDescs& src_descs,
+                                       const Indices& src_slice_origin_idxs)
+    {
+        static_for<0, nSrc, 1>{}([&](auto src_i) {
+            src_coords_(src_i) =
+                make_tensor_coordinate(src_descs.At(src_i), src_slice_origin_idxs[src_i]);
+        });
+    }
+
+    template <typename Indices, enable_if_t<DstDescs::Size() == Indices::Size(), bool> = false>
+    __device__ void SetDstSliceOrigins(const DstDescs& dst_descs,
+                                       const Indices& dst_slice_origin_idxs)
+    {
+        static_for<0, nDst, 1>{}([&](auto dst_i) {
+            dst_coords_(dst_i) =
+                make_tensor_coordinate(dst_descs.At(dst_i), dst_slice_origin_idxs[dst_i]);
+        });
+    }
+
+    template <typename SrcBuffers, index_t ThreadScratchId = 0>
+    __device__ void RunRead(const SrcDescs& src_descs,
+                            const SrcBuffers& src_bufs,
+                            Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        // scalar per access on each dim
+        // TODO: don't use lambda_scalar_per_access
+        constexpr auto src_scalar_per_access_tuple = generate_tuple(
+            [&](auto src_i) {
+                return generate_sequence(
+                    detail::lambda_scalar_per_access<SrcVectorDim,
+                                                     SrcsScalarPerVector::At(src_i)>{},
+                    Number<nDim>{});
+            },
+            Number<nSrc>{});
+
+        constexpr auto src_access_lengths_tuple = generate_tuple(
+            [&](auto src_i) {
+                return SliceLengths{} / src_scalar_per_access_tuple.At(src_i);
+                static_assert(
+                    SliceLengths::At(SrcVectorDim) % SrcsScalarPerVector::At(src_i) == 0,
+                    "SliceLengths[SrcVectorDim] must be divisible by SrcsScalarPerVector");
+            },
+            Number<nSrc>{});
+
+        constexpr auto src_dim_access_order = SrcDimAccessOrder{};
+
+        constexpr auto ordered_src_access_lengths_tuple = generate_tuple(
+            [&](auto src_i) {
+                return container_reorder_given_new2old(src_access_lengths_tuple.At(src_i),
+                                                       src_dim_access_order);
+            },
+            Number<nSrc>{});
+
+        // make forward steps
+        const auto src_forward_steps_tuple = generate_tuple(
+            [&](auto src_i) {
+                return generate_tuple(
+                    [&](auto i) {
+                        Index forward_step_idx;
+
+                        static_for<0, nDim, 1>{}([&](auto j) {
+                            forward_step_idx(j) =
+                                (i.value == j.value) ? src_scalar_per_access_tuple.At(src_i)[i] : 0;
+                        });
+
+                        return make_tensor_coordinate_step(src_descs.At(src_i), forward_step_idx);
+                    },
+                    Number<nDim>{});
+            },
+            Number<nSrc>{});
+
+        // make backward steps
+        const auto src_backward_steps_tuple = generate_tuple(
+            [&](auto src_i) {
+                return generate_tuple(
+                    [&](auto i) {
+                        Index backward_step_idx;
+
+                        static_for<0, nDim, 1>{}([&](auto j) {
+                            backward_step_idx(j) = (i.value == j.value)
+                                                       ? -src_scalar_per_access_tuple.At(src_i)[i]
+                                                       : 0;
+                        });
+
+                        return make_tensor_coordinate_step(src_descs.At(src_i), backward_step_idx);
+                    },
+                    Number<nDim>{});
+            },
+            Number<nSrc>{});
+
+        // loop over tensor and copy
+        static_for<0, nSrc, 1>{}([&](auto src_i) {
+            static_ford<remove_cvref_t<decltype(ordered_src_access_lengths_tuple.At(src_i))>>{}(
+                [&](auto ordered_src_access_idx) {
+                    // judge move forward or move backward
+                    constexpr auto forward_sweep = [&]() {
+                        StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+                        forward_sweep_(I0) = true;
+
+                        static_for<1, nDim, 1>{}([&](auto i) {
+                            index_t tmp = ordered_src_access_idx[I0];
+
+                            static_for<1, i, 1>{}([&](auto j) {
+                                tmp = tmp * ordered_src_access_lengths_tuple[j] +
+                                      ordered_src_access_idx[j];
+                            });
+
+                            forward_sweep_(i) = tmp % 2 == 0;
+                        });
+
+                        return forward_sweep_;
+                    }();
+
+                    // calculate src data index
+                    constexpr auto src_data_idx = [&]() {
+                        Index ordered_idx;
+
+                        static_for<0, nDim, 1>{}([&](auto i) {
+                            ordered_idx(i) = forward_sweep[i]
+                                                 ? ordered_src_access_idx[i]
+                                                 : ordered_src_access_lengths_tuple.At(src_i)[i] -
+                                                       1 - ordered_src_access_idx[i];
+                        });
+
+                        return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
+                               src_scalar_per_access_tuple.At(src_i);
+                    }();
+
+                    constexpr auto src_data_idx_seq =
+                        generate_sequence_v2([&](auto i) { return Number<src_data_idx[i]>{}; },
+                                             Number<src_data_idx.Size()>{});
+
+                    const bool is_src_valid =
+                        coordinate_has_valid_offset_assuming_visible_index_is_valid(
+                            src_descs.At(src_i), src_coords_.At(src_i));
+
+                    using src_vector_type = vector_type_maker_t<tuple_element_t<src_i, SrcDatas>,
+                                                                SrcsScalarPerVector::At(src_i)>;
+                    using src_vector_t    = typename src_vector_type::type;
+
+                    // copy data from src_buf into src_vector_container
+                    auto src_vector_container =
+                        src_vector_type{src_bufs.At(src_i).template Get<src_vector_t>(
+                            src_coords_.At(src_i).GetOffset(), is_src_valid)};
+
+                    // copy data from src_vector_container into src_thread_scratch_
+                    src_thread_scratch_tuple_(thread_scratch_id)
+                        .At(src_i)
+                        .template SetAsType<src_vector_t>(
+                            src_data_idx_seq,
+                            src_vector_container.template AsType<src_vector_t>()[I0]);
+
+                    constexpr auto move_on_dim = [&]() constexpr
+                    {
+                        StaticallyIndexedArray<bool, nDim> move_on_dim_;
+
+                        static_for<0, nDim, 1>{}([&](auto i) {
+                            move_on_dim_(i) = ordered_src_access_idx[i] <
+                                              ordered_src_access_lengths_tuple.At(src_i)[i] - 1;
+
+                            static_for<i + 1, nDim, 1>{}([&](auto j) {
+                                move_on_dim_(i) &=
+                                    ordered_src_access_idx[j] ==
+                                    ordered_src_access_lengths_tuple.At(src_i)[j] - 1;
+                            });
+                        });
+
+                        return move_on_dim_;
+                    }
+                    ();
+
+                    // move src coord
+                    static_for<0, nDim, 1>{}([&](auto i) {
+                        if constexpr(move_on_dim[i])
+                        {
+                            if constexpr(forward_sweep[i])
+                            {
+                                move_tensor_coordinate(
+                                    src_descs.At(src_i),
+                                    src_coords_.At(src_i),
+                                    src_forward_steps_tuple.At(src_i)[src_dim_access_order[i]]);
+                            }
+                            else
+                            {
+                                move_tensor_coordinate(
+                                    src_descs.At(src_i),
+                                    src_coords_.At(src_i),
+                                    src_backward_steps_tuple.At(src_i)[src_dim_access_order[i]]);
+                            }
+                        }
+                    });
+                });
+        });
+
+        static_for<0, nSrc, 1>{}([&](auto src_i) {
+            // move src coordinate back to slice origin (or not)
+            if constexpr(SrcsResetCoordinateAfterRun::At(src_i))
+            {
+                const auto src_reset_step = make_tensor_coordinate_step(
+                    src_descs.At(src_i), GetSrcCoordinateResetStep<src_i>());
+
+                move_tensor_coordinate(src_descs.At(src_i), src_coords_.At(src_i), src_reset_step);
+            }
+        });
+    }
+
+    template <index_t ThreadScratchId>
+    __device__ void
+    TransferDataFromSrcThreadScratchToDstThreadScratch(Number<ThreadScratchId> thread_scratch_id)
+    {
+        // TODO: Add support for CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE
+        // (it requires to add Elementwise support in transpose_vectors)
+        static_ford<SliceLengths>{}([&](auto idx) {
+            const auto src_data_refs = generate_tie(
+                [&](auto src_i) -> const auto& {
+                    return src_thread_scratch_tuple_[thread_scratch_id].At(src_i)[idx];
+                },
+                Number<nSrc>{});
+
+            auto dst_data_refs = generate_tie(
+                [&](auto dst_i) -> auto& { return dst_thread_scratch_tuple_.At(dst_i)(idx); },
+                Number<nDst>{});
+            unpack2(element_op_, dst_data_refs, src_data_refs);
+        });
+    }
+
+    template <typename DstBuffers, index_t ThreadScratchId = 0>
+    __device__ void RunWrite(const DstDescs& dst_descs,
+                             DstBuffers& dst_bufs,
+                             Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        // if there is transpose, it's done here
+        // TODO move this elsewhere
+        TransferDataFromSrcThreadScratchToDstThreadScratch(thread_scratch_id);
+
+        // src scalar per access on each dim
+        // TODO: don't use this
+        constexpr auto dst_scalar_per_access_tuple = generate_tuple(
+            [&](auto dst_i) {
+                return generate_sequence(
+                    detail::lambda_scalar_per_access<DstVectorDim,
+                                                     DstsScalarPerVector::At(dst_i)>{},
+                    Number<nDim>{});
+            },
+            Number<nDst>{});
+
+        constexpr auto dst_access_lengths_tuple = generate_tuple(
+            [&](auto dst_i) { return SliceLengths{} / dst_scalar_per_access_tuple.At(dst_i); },
+            Number<nDst>{});
+
+        constexpr auto dst_dim_access_order = DstDimAccessOrder{};
+
+        constexpr auto ordered_dst_access_lengths_tuple = generate_tuple(
+            [&](auto dst_i) {
+                return container_reorder_given_new2old(dst_access_lengths_tuple.At(dst_i),
+                                                       dst_dim_access_order);
+            },
+            Number<nDst>{});
+
+        // make forward steps
+        const auto dst_forward_steps_tuple = generate_tuple(
+            [&](auto dst_i) {
+                return generate_tuple(
+                    [&](auto i) {
+                        Index forward_step_idx;
+
+                        static_for<0, nDim, 1>{}([&](auto j) {
+                            forward_step_idx(j) =
+                                (i.value == j.value) ? dst_scalar_per_access_tuple.At(dst_i)[i] : 0;
+                        });
+
+                        return make_tensor_coordinate_step(dst_descs.At(dst_i), forward_step_idx);
+                    },
+                    Number<nDim>{});
+            },
+            Number<nDst>{});
+
+        // make backward steps
+        const auto dst_backward_steps_tuple = generate_tuple(
+            [&](auto dst_i) {
+                return generate_tuple(
+                    [&](auto i) {
+                        Index backward_step_idx;
+
+                        static_for<0, nDim, 1>{}([&](auto j) {
+                            backward_step_idx(j) = (i.value == j.value)
+                                                       ? -dst_scalar_per_access_tuple.At(dst_i)[i]
+                                                       : 0;
+                        });
+
+                        return make_tensor_coordinate_step(dst_descs.At(dst_i), backward_step_idx);
+                    },
+                    Number<nDim>{});
+            },
+            Number<nDst>{});
+
+        // loop over tensor and copy
+        static_for<0, nDst, 1>{}([&](auto dst_i) {
+            static_ford<remove_cvref_t<decltype(ordered_dst_access_lengths_tuple.At(dst_i))>>{}(
+                [&](auto ordered_dst_access_idx) {
+                    // judge move forward or move backward
+                    constexpr auto forward_sweep = [&]() {
+                        StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+                        forward_sweep_(I0) = true;
+
+                        static_for<1, nDim, 1>{}([&](auto i) {
+                            index_t tmp = ordered_dst_access_idx[I0];
+
+                            static_for<1, i, 1>{}([&](auto j) {
+                                tmp = tmp * ordered_dst_access_lengths_tuple.At(dst_i)[j] +
+                                      ordered_dst_access_idx[j];
+                            });
+
+                            forward_sweep_(i) = tmp % 2 == 0;
+                        });
+
+                        return forward_sweep_;
+                    }();
+
+                    // calculate dst data index
+                    constexpr auto dst_data_idx = [&]() {
+                        Index ordered_idx;
+
+                        static_for<0, nDim, 1>{}([&](auto i) {
+                            ordered_idx(i) = forward_sweep[i]
+                                                 ? ordered_dst_access_idx[i]
+                                                 : ordered_dst_access_lengths_tuple.At(dst_i)[i] -
+                                                       1 - ordered_dst_access_idx[i];
+                        });
+
+                        return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
+                               dst_scalar_per_access_tuple.At(dst_i);
+                    }();
+
+                    constexpr auto dst_data_idx_seq =
+                        generate_sequence_v2([&](auto i) { return Number<dst_data_idx[i]>{}; },
+                                             Number<dst_data_idx.Size()>{});
+
+                    const bool is_dst_valid =
+                        coordinate_has_valid_offset_assuming_visible_index_is_valid(
+                            dst_descs.At(dst_i), dst_coords_.At(dst_i));
+
+                    using dst_vector_type = vector_type_maker_t<tuple_element_t<dst_i, DstDatas>,
+                                                                DstsScalarPerVector::At(dst_i)>;
+                    using dst_vector_t    = typename dst_vector_type::type;
+
+                    // copy data from dst_thread_scratch_ into dst_vector_container
+                    auto dst_vector_container = dst_vector_type{
+                        dst_thread_scratch_tuple_.At(dst_i).template GetAsType<dst_vector_t>(
+                            dst_data_idx_seq)};
+
+                    constexpr InMemoryDataOperationEnum DstInMemOp =
+                        static_cast<InMemoryDataOperationEnum>(DstInMemOps::At(dst_i.value));
+
+                    // copy data from dst_vector_container to dst_buf
+                    dst_bufs.At(dst_i).template Update<DstInMemOp, dst_vector_t>(
+                        dst_coords_.At(dst_i).GetOffset(),
+                        is_dst_valid,
+                        dst_vector_container.template AsType<dst_vector_t>()[I0]);
+
+                    constexpr auto move_on_dim = [&]() constexpr
+                    {
+                        StaticallyIndexedArray<bool, nDim> move_on_dim_;
+
+                        static_for<0, nDim, 1>{}([&](auto i) {
+                            move_on_dim_(i) = ordered_dst_access_idx[i] <
+                                              ordered_dst_access_lengths_tuple.At(dst_i)[i] - 1;
+
+                            static_for<i + 1, nDim, 1>{}([&](auto j) {
+                                move_on_dim_(i) &=
+                                    ordered_dst_access_idx[j] ==
+                                    ordered_dst_access_lengths_tuple.At(dst_i)[j] - 1;
+                            });
+                        });
+
+                        return move_on_dim_;
+                    }
+                    ();
+
+                    // move dst coord
+                    static_for<0, nDim, 1>{}([&](auto i) {
+                        if constexpr(move_on_dim[i])
+                        {
+                            if constexpr(forward_sweep[i])
+                            {
+                                move_tensor_coordinate(
+                                    dst_descs.At(dst_i),
+                                    dst_coords_.At(dst_i),
+                                    dst_forward_steps_tuple.At(dst_i)[dst_dim_access_order[i]]);
+                            }
+                            else
+                            {
+                                move_tensor_coordinate(
+                                    dst_descs.At(dst_i),
+                                    dst_coords_.At(dst_i),
+                                    dst_backward_steps_tuple.At(dst_i)[dst_dim_access_order[i]]);
+                            }
+                        }
+                    });
+                });
+        });
+
+        // move dst coordinate back to slice origin (or not)
+        static_for<0, nDst, 1>{}([&](auto dst_i) {
+            if constexpr(DstsResetCoordinateAfterRun::At(dst_i))
+            {
+                const auto dst_reset_step = make_tensor_coordinate_step(
+                    dst_descs.At(dst_i), GetDstCoordinateResetStep<dst_i>());
+
+                move_tensor_coordinate(dst_descs.At(dst_i), dst_coords_.At(dst_i), dst_reset_step);
+            }
+        });
+    }
+
+    template <index_t src_i>
+    __device__ static constexpr auto GetSrcCoordinateResetStep()
+    {
+        // scalar per access on each dim
+        // TODO: don't use lambda_scalar_per_access
+        constexpr auto src_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcsScalarPerVector::At(src_i)>{},
+            Number<nDim>{});
+
+        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto src_dim_access_order = SrcDimAccessOrder{};
+
+        constexpr auto ordered_src_access_lengths =
+            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
+
+        // judge move forward or move backward during the last iteration
+        constexpr auto forward_sweep = [&]() {
+            StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+            forward_sweep_(I0) = true;
+
+            static_for<1, nDim, 1>{}([&](auto i) {
+                index_t tmp = ordered_src_access_lengths[I0] - 1;
+
+                static_for<1, i, 1>{}([&](auto j) {
+                    tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
+                });
+
+                forward_sweep_(i) = tmp % 2 == 0;
+            });
+
+            return forward_sweep_;
+        }();
+
+        // calculate src data index after last iteration in RunRead(), if it has not being reset by
+        // RunRead()
+        constexpr auto src_data_idx = [&]() {
+            Index ordered_idx;
+
+            static_for<0, nDim, 1>{}([&](auto i) {
+                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
+            });
+
+            return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
+                   src_scalar_per_access;
+        }();
+
+        //
+        constexpr auto reset_src_data_step = [&]() {
+            Index reset_src_data_step_;
+
+            static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; });
+
+            return reset_src_data_step_;
+        }();
+
+        return reset_src_data_step;
+    }
+
+    template <index_t dst_i>
+    __device__ static constexpr auto GetDstCoordinateResetStep()
+    {
+        // scalar per access on each dim
+        // TODO: don't use lambda_scalar_per_access
+        constexpr auto dst_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<DstVectorDim, DstsScalarPerVector::At(dst_i)>{},
+            Number<nDim>{});
+
+        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dst_dim_access_order = DstDimAccessOrder{};
+
+        constexpr auto ordered_dst_access_lengths =
+            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
+
+        // judge move forward or move backward during the last iteration
+        constexpr auto forward_sweep = [&]() {
+            StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+            forward_sweep_(I0) = true;
+
+            static_for<1, nDim, 1>{}([&](auto i) {
+                index_t tmp = ordered_dst_access_lengths[I0] - 1;
+
+                static_for<1, i, 1>{}([&](auto j) {
+                    tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
+                });
+
+                forward_sweep_(i) = tmp % 2 == 0;
+            });
+
+            return forward_sweep_;
+        }();
+
+        // calculate dst data index after last iteration in RunWrite(), if it has not being reset by
+        // RunWrite()
+        constexpr auto dst_data_idx = [&]() {
+            Index ordered_idx;
+
+            static_for<0, nDim, 1>{}([&](auto i) {
+                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
+            });
+
+            return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
+                   dst_scalar_per_access.At(dst_i);
+        }();
+
+        //
+        constexpr auto reset_dst_data_step = [&]() {
+            Index reset_dst_data_step_;
+
+            static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
+
+            return reset_dst_data_step_;
+        }();
+
+        return reset_dst_data_step;
+    }
+
+    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
+    __device__ void MoveSrcSliceWindow(const SrcDescs& src_descs,
+                                       const Index& src_slice_origin_step_idx)
+    {
+        static_for<0, nSrc, 1>{}([&](auto src_i) {
+            // if src coord was not reset by RunRead(), then need to adjust the step here
+            const auto adjusted_step_idx =
+                SrcsResetCoordinateAfterRun::At(src_i)
+                    ? src_slice_origin_step_idx
+                    : src_slice_origin_step_idx + GetSrcCoordinateResetStep<src_i>();
+
+            // is it OK to construct a new step every time?
+            const auto adjusted_step =
+                make_tensor_coordinate_step(src_descs.At(src_i), adjusted_step_idx);
+
+            move_tensor_coordinate(src_descs.At(src_i), src_coords_.At(src_i), adjusted_step);
+        });
+    }
+
+    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
+    __device__ void MoveDstSliceWindow(const DstDescs& dst_descs,
+                                       const Index& dst_slice_origin_step_idx)
+    {
+        static_for<0, nDst, 1>{}([&](auto dst_i) {
+            // if dst coord was not reset by RunWrite(), then need to adjust the step here
+            const auto adjusted_step_idx =
+                DstsResetCoordinateAfterRun::At(dst_i)
+                    ? dst_slice_origin_step_idx
+                    : dst_slice_origin_step_idx + GetDstCoordinateResetStep<dst_i>();
+
+            // is it OK to construct a new step every time?
+            const auto adjusted_step =
+                make_tensor_coordinate_step(dst_descs.At(dst_i), adjusted_step_idx);
+
+            move_tensor_coordinate(dst_descs.At(dst_i), dst_coords_.At(dst_i), adjusted_step);
+        });
+    }
+
+    template <index_t src_i>
+    __device__ static constexpr auto GetSrcThreadScratchDescriptor()
+    {
+        constexpr auto src_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcsScalarPerVector::At(src_i)>{},
+            Number<nDim>{});
+
+        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto src_access_lengths_and_vector_length =
+            container_push_back(sequence_to_tuple_of_number(src_access_lengths),
+                                Number<SrcsScalarPerVector::At(src_i)>{});
+
+        // 1st stage of transforms
+        constexpr auto desc0 =
+            make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length);
+
+        // 2nd stage of transforms
+        constexpr auto transforms = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == SrcVectorDim)
+                {
+                    return make_merge_transform_v3_division_mod(
+                        make_tuple(src_access_lengths_and_vector_length[i],
+                                   src_access_lengths_and_vector_length[Number<nDim>{}]));
+                }
+                else
+                {
+                    return make_pass_through_transform(src_access_lengths_and_vector_length[i]);
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto low_dim_idss = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == SrcVectorDim)
+                {
+                    return Sequence<i.value, nDim>{};
+                }
+                else
+                {
+                    return Sequence<i.value>{};
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto up_dim_idss =
+            generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
+
+        return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
+    }
+
+    template <index_t dst_i>
+    __device__ static constexpr auto GetDstThreadScratchDescriptor()
+    {
+        // 1st stage of transforms
+        constexpr auto dst_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<DstVectorDim, DstsScalarPerVector::At(dst_i)>{},
+            Number<nDim>{});
+
+        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dst_access_lengths_and_vector_length =
+            container_push_back(sequence_to_tuple_of_number(dst_access_lengths),
+                                Number<DstsScalarPerVector::At(dst_i)>{});
+
+        constexpr auto desc0 =
+            make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length);
+
+        // 2nd stage of transforms
+        constexpr auto transforms = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == DstVectorDim)
+                {
+                    return make_merge_transform_v3_division_mod(
+                        make_tuple(dst_access_lengths_and_vector_length[i],
+                                   dst_access_lengths_and_vector_length[Number<nDim>{}]));
+                }
+                else
+                {
+                    return make_pass_through_transform(dst_access_lengths_and_vector_length[i]);
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto low_dim_idss = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == DstVectorDim)
+                {
+                    return Sequence<i.value, nDim>{};
+                }
+                else
+                {
+                    return Sequence<i.value>{};
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto up_dim_idss =
+            generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
+
+        return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
+    }
+
+    __device__ static constexpr auto MakeSrcThreadScratchTuple()
+    {
+        return generate_tuple(
+            [&](auto src_i) {
+                constexpr auto src_thread_scratch_desc =
+                    decltype(GetSrcThreadScratchDescriptor<src_i>()){};
+                using SrcThreadScratch =
+                    StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
+                                                    tuple_element_t<src_i, SrcDatas>,
+                                                    SrcsScalarPerVector::At(src_i),
+                                                    decltype(src_thread_scratch_desc),
+                                                    true>;
+                return SrcThreadScratch{};
+            },
+            Number<nSrc>{});
+    }
+
+    __device__ static constexpr auto MakeDstThreadScratchTuple()
+    {
+        return generate_tuple(
+            [&](auto dst_i) {
+                constexpr auto dst_thread_scratch_desc =
+                    decltype(GetDstThreadScratchDescriptor<dst_i>()){};
+                using DstThreadScratch =
+                    StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
+                                                    tuple_element_t<dst_i, DstDatas>,
+                                                    DstsScalarPerVector::At(dst_i),
+                                                    decltype(dst_thread_scratch_desc),
+                                                    true>;
+                return DstThreadScratch{};
+            },
+            Number<nDst>{});
+    }
+
+    private:
+    using SrcThreadScratchTuple = decltype(MakeSrcThreadScratchTuple());
+    using DstThreadScratchTuple = decltype(MakeDstThreadScratchTuple());
+
+    StaticallyIndexedArray<SrcThreadScratchTuple, NumThreadScratch> src_thread_scratch_tuple_;
+
+    DstThreadScratchTuple dst_thread_scratch_tuple_;
+
+    SrcCoords src_coords_;
+    DstCoords dst_coords_;
+    const ElementwiseOperation element_op_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -8,9 +8,11 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/tensor_space_filling_curve.hpp"
 #include "ck/utility/is_detected.hpp"
+#include "ck/tensor/static_tensor.hpp"

-namespace ck {
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_util.hpp"

+namespace ck {
 // Thread-level multi-source, multi-destination tensor slice data movement
 // Assume:
 //   1. All sources and destinations are DynamicBuffer
@@ -40,7 +42,8 @@ template <typename SrcDatas,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector,
          typename SrcResetCoordinateAfterRunFlags, // Sequence<bool ...>
-          typename DstResetCoordinateAfterRunFlags> // Sequence<bool ...>
+          typename DstResetCoordinateAfterRunFlags, // Sequence<bool ...>
+          index_t NumThreadScratch = 1>
 struct ThreadwiseTensorSliceTransfer_v7r2
 {
    static constexpr auto I0 = Number<0>{};
@@ -70,16 +73,18 @@ struct ThreadwiseTensorSliceTransfer_v7r2
    static constexpr auto src_scalar_per_access = generate_sequence(
        detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

-    using SrcSpaceFillingCurve = SpaceFillingCurve<SliceLengths,
-                                                   SrcDimAccessOrder,
-                                                   remove_cv_t<decltype(src_scalar_per_access)>>;
-
    static constexpr auto dst_scalar_per_access = generate_sequence(
        detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

+    using SrcSpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                   SrcDimAccessOrder,
+                                                   remove_cv_t<decltype(src_scalar_per_access)>,
+                                                   false>;
+
    using DstSpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                   DstDimAccessOrder,
-                                                   remove_cv_t<decltype(dst_scalar_per_access)>>;
+                                                   remove_cv_t<decltype(dst_scalar_per_access)>,
+                                                   false>;

    __device__ constexpr ThreadwiseTensorSliceTransfer_v7r2(
        const SrcDescs& src_descs,
@@ -135,13 +140,18 @@ struct ThreadwiseTensorSliceTransfer_v7r2
    // SrcDescs: Tuple<const SrcDesc0&, const SrcDesc1&, ...>
    // SrcBuffers: Tuple<const SrcBuffer0&, const SrcBuffer1&, ...>
    template <typename SrcBuffers,
+              index_t ThreadScratchId                                   = 0,
              enable_if_t<SrcDescs::Size() == SrcBuffers::Size(), bool> = false>
-    __device__ void RunRead(const SrcDescs& src_descs, const SrcBuffers& src_bufs)
+    __device__ void RunRead(const SrcDescs& src_descs,
+                            const SrcBuffers& src_bufs,
+                            Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
    {
        // loop over space-filling curve
-        static_for<0, num_access, 1>{}([&](auto iAccess) {
+        static_for<0, src_num_access, 1>{}([&](auto iAccess) {
            auto src_vectors = generate_vectors<SrcDatas, SrcScalarPerVector>();
-            auto dst_vectors = generate_vectors<DstDatas, DstScalarPerVector>();
+            auto elm_vectors = generate_vectors<DstDatas, SrcScalarPerVector>();
+
+            bool oob_val = true;

            // copy data from src_bufs into src_vectors
            static_for<0, nSrc, 1>{}([&](auto i) {
@@ -151,9 +161,10 @@ struct ThreadwiseTensorSliceTransfer_v7r2
                    coordinate_has_valid_offset_assuming_visible_index_is_valid(src_descs[i],
                                                                                src_coords_[i]);

+                oob_val = oob_val & is_src_valid;
+
                src_vectors(i).template AsType<src_vector_t>()(I0) =
-                    src_bufs[i].template Get<src_vector_t>(src_coords_[i].GetOffset(),
-                                                           is_src_valid);
+                    src_bufs[i].template Get<src_vector_t>(src_coords_[i].GetOffset(), true);
            });

            constexpr auto get_elem_op_vec_len = []() {
@@ -199,7 +210,7 @@ struct ThreadwiseTensorSliceTransfer_v7r2

                        using elem_op_vec_t = typename vector_type<DstData, elem_op_vec_len>::type;

-                        return dst_vectors(iDst).template AsType<elem_op_vec_t>()(i);
+                        return elm_vectors(iDst).template AsType<elem_op_vec_t>()(i);
                    },
                    Number<nDst>{});

@@ -214,10 +225,11 @@ struct ThreadwiseTensorSliceTransfer_v7r2
                unpack2(element_op_, dst_data_refs, src_data_refs);
            });

-            dst_vectors_tuple_(iAccess) = dst_vectors;
+            elm_vectors_tuple_(thread_scratch_id)(iAccess) = elm_vectors;
+            oob_vectors_tuple_(thread_scratch_id)(iAccess) = oob_val;

            // move coordinate
-            if constexpr(iAccess.value != num_access - 1)
+            if constexpr(iAccess.value != src_num_access - 1)
            {
                constexpr auto forward_step = SrcSpaceFillingCurve::GetForwardStep(iAccess);

@@ -241,15 +253,140 @@ struct ThreadwiseTensorSliceTransfer_v7r2
        });
    }

+#if 1
+    template <index_t ThreadScratchId = 0>
+    __device__ void OOBCheck(Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        // loop over space-filling curve
+        static_for<0, src_num_access, 1>{}([&](auto iAccess) {
+            auto elm_vectors = elm_vectors_tuple_[thread_scratch_id][iAccess];
+            auto oob_val     = oob_vectors_tuple_[thread_scratch_id][iAccess];
+
+            static_for<0, nDst, 1>{}([&](auto i) {
+                using elm_vector_t = typename remove_cvref_t<decltype(elm_vectors[i])>::type;
+                elm_vectors(i).template AsType<elm_vector_t>()(I0) =
+                    oob_val ? elm_vectors(i).template AsType<elm_vector_t>()[I0] : elm_vector_t{0};
+            });
+
+            elm_vectors_tuple_(thread_scratch_id)(iAccess) = elm_vectors;
+        });
+    }
+#endif
+
+    template <index_t ThreadScratchId = 0>
+    __device__ void
+    TransposeFromElmToDst(Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        using DstData = remove_cvref_t<decltype(DstDatas{}[I0])>;
+
+        using ElmThreadScratch =
+            StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
+                                            DstData,
+                                            SrcScalarPerVector,
+                                            decltype(GetSrcThreadScratchDescriptor()),
+                                            true>;
+        using DstThreadScratch =
+            StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
+                                            DstData,
+                                            DstScalarPerVector,
+                                            decltype(GetDstThreadScratchDescriptor()),
+                                            true>;
+
+        ElmThreadScratch elm_thread_scratch_;
+        DstThreadScratch dst_thread_scratch_;
+
+        elm_thread_scratch_.data_ =
+            bit_cast<decltype(elm_thread_scratch_.data_)>(elm_vectors_tuple_[thread_scratch_id]);
+
+        if constexpr(SrcVectorDim != DstVectorDim &&
+                     ((is_same<half_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) ||
+                      (is_same<f8_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0) ||
+                      (is_same<int8_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
+        {
+            // each transpose does
+            // DstScalarPerVector # of src vectors in src_thread_scratch_
+            // SrcScalarPerVector # of dst vectors in dst_thread_scratch_
+            constexpr index_t num_src_vector = Number<DstScalarPerVector>{};
+            constexpr index_t num_dst_vector = Number<SrcScalarPerVector>{};
+
+            // Assume SrcVectorDim is not the same as DstVectorDim, so we do transpose
+            // TODO: make this logic generic for all scenario
+
+            constexpr auto src_scalar_step_in_vector = generate_sequence(
+                detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
+
+            constexpr auto dst_scalar_step_in_vector = generate_sequence(
+                detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
+
+            constexpr auto scalar_per_access = generate_sequence(
+                detail::lambda_scalar_per_access_for_src_and_dst<SrcVectorDim,
+                                                                 SrcScalarPerVector,
+                                                                 DstVectorDim,
+                                                                 DstScalarPerVector>{},
+                Number<nDim>{});
+
+            constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
+
+            static_ford<decltype(access_lengths)>{}([&](auto access_idx) {
+                constexpr auto data_idx = access_idx * scalar_per_access;
+
+                constexpr auto data_idx_seq = generate_sequence_v2(
+                    [&](auto i) { return Number<data_idx[i]>{}; }, Number<nDim>{});
+
+                using src_vector_t = vector_type_maker_t<DstData, SrcScalarPerVector>;
+                using dst_vector_t = vector_type_maker_t<DstData, DstScalarPerVector>;
+
+                // get DstScalarPerVector # of read-only references to src vectors from
+                // src_thread_scratch_
+                const auto src_vector_refs = generate_tie(
+                    [&](auto i) -> const src_vector_t& {
+                        // i increment corresponds to movement in DstVectorDim
+                        return elm_thread_scratch_.GetVectorTypeReference(
+                            data_idx_seq + i * dst_scalar_step_in_vector);
+                    },
+                    Number<num_src_vector>{});
+
+                // get SrcScalarPerVector # of references to dst vectors from dst_thread_scratch_
+                auto dst_vector_refs = generate_tie(
+                    [&](auto i) -> dst_vector_t& {
+                        // i increment corresponds to movement in SrcVectorDim
+                        return dst_thread_scratch_.GetVectorTypeReference(
+                            data_idx_seq + i * src_scalar_step_in_vector);
+                    },
+                    Number<num_dst_vector>{});
+
+                // do data transpose
+                transpose_vectors<DstData, DstScalarPerVector, SrcScalarPerVector>{}(
+                    src_vector_refs, dst_vector_refs);
+            });
+        }
+        else
+        {
+            static_ford<SliceLengths>{}(
+                [&](auto idx) { dst_thread_scratch_(idx) = elm_thread_scratch_[idx]; });
+        }
+
+        dst_vectors_tuple_(thread_scratch_id) = bit_cast<DstVectorTuple>(dst_thread_scratch_.data_);
+    }
+
    // DstDescs: Tuple<const DstDesc0&, const DstDesc1&, ...>
    // DstBuffers: Tuple<const DstBuffer0&, const DstBuffer1&, ...>
    template <typename DstBuffers,
-              enable_if_t<DstDescs::Size() == DstBuffers::Size(), bool> = false>
-    __device__ void RunWrite(const DstDescs& dst_descs, DstBuffers dst_bufs)
+              index_t ThreadScratchId                                             = 0,
+              enable_if_t<DstDescs::Size() == 1 && DstBuffers::Size() == 1, bool> = false>
+    __device__ void RunWrite(const DstDescs& dst_descs,
+                             DstBuffers dst_bufs,
+                             Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
    {
+        OOBCheck(thread_scratch_id);
+        TransposeFromElmToDst(thread_scratch_id);
+
        // loop over space-filling curve
-        static_for<0, num_access, 1>{}([&](auto iAccess) {
-            auto dst_vectors = dst_vectors_tuple_[iAccess];
+        static_for<0, dst_num_access, 1>{}([&](auto iAccess) {
+            auto dst_vectors = dst_vectors_tuple_[thread_scratch_id][iAccess];

            // copy data from buf_vectors into dst_bufs
            static_for<0, nDst, 1>{}([&](auto i) {
@@ -269,7 +406,7 @@ struct ThreadwiseTensorSliceTransfer_v7r2
            });

            // move coordinate
-            if constexpr(iAccess.value != num_access - 1)
+            if constexpr(iAccess.value != dst_num_access - 1)
            {
                constexpr auto forward_step = DstSpaceFillingCurve::GetForwardStep(iAccess);

@@ -312,28 +449,126 @@ struct ThreadwiseTensorSliceTransfer_v7r2

    __device__ static constexpr auto GetSrcCoordinateResetStep()
    {
-        if constexpr(num_access == 0)
+        if constexpr(src_num_access == 0)
        {
            return typename SrcSpaceFillingCurve::Index{};
        }
        else
        {
-            return SrcSpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
+            return SrcSpaceFillingCurve::GetStepBetween(Number<src_num_access - 1>{}, Number<0>{});
        }
    }

    __device__ static constexpr auto GetDstCoordinateResetStep()
    {
-        if constexpr(num_access == 0)
+        if constexpr(dst_num_access == 0)
        {
            return typename DstSpaceFillingCurve::Index{};
        }
        else
        {
-            return DstSpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
+            return DstSpaceFillingCurve::GetStepBetween(Number<dst_num_access - 1>{}, Number<0>{});
        }
    }

+    __device__ static constexpr auto GetSrcThreadScratchDescriptor()
+    {
+        // constexpr auto src_scalar_per_access = generate_sequence(
+        // detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+
+        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto src_access_lengths_and_vector_length = container_push_back(
+            sequence_to_tuple_of_number(src_access_lengths), Number<SrcScalarPerVector>{});
+
+        // 1st stage of transforms
+        constexpr auto desc0 =
+            make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length);
+
+        // 2nd stage of transforms
+        constexpr auto transforms = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == SrcVectorDim)
+                {
+                    return make_merge_transform_v3_division_mod(
+                        make_tuple(src_access_lengths_and_vector_length[i],
+                                   src_access_lengths_and_vector_length[Number<nDim>{}]));
+                }
+                else
+                {
+                    return make_pass_through_transform(src_access_lengths_and_vector_length[i]);
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto low_dim_idss = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == SrcVectorDim)
+                {
+                    return Sequence<i.value, nDim>{};
+                }
+                else
+                {
+                    return Sequence<i.value>{};
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto up_dim_idss =
+            generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
+
+        return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
+    }
+
+    __device__ static constexpr auto GetDstThreadScratchDescriptor()
+    {
+        // 1st stage of transforms
+        // constexpr auto dst_scalar_per_access = generate_sequence(
+        // detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
+
+        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dst_access_lengths_and_vector_length = container_push_back(
+            sequence_to_tuple_of_number(dst_access_lengths), Number<DstScalarPerVector>{});
+
+        constexpr auto desc0 =
+            make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length);
+
+        // 2nd stage of transforms
+        constexpr auto transforms = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == DstVectorDim)
+                {
+                    return make_merge_transform_v3_division_mod(
+                        make_tuple(dst_access_lengths_and_vector_length[i],
+                                   dst_access_lengths_and_vector_length[Number<nDim>{}]));
+                }
+                else
+                {
+                    return make_pass_through_transform(dst_access_lengths_and_vector_length[i]);
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto low_dim_idss = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == DstVectorDim)
+                {
+                    return Sequence<i.value, nDim>{};
+                }
+                else
+                {
+                    return Sequence<i.value>{};
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto up_dim_idss =
+            generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
+
+        return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
+    }
+
    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
    template <index_t ISrc>
    __device__ void MoveSrcSliceWindow(const SrcDescs& src_descs,
@@ -372,11 +607,20 @@ struct ThreadwiseTensorSliceTransfer_v7r2

    private:
    using SrcVectorsType = decltype(generate_vectors<SrcDatas, SrcScalarPerVector>());
+    using ElmVectorsType = decltype(generate_vectors<DstDatas, SrcScalarPerVector>());
    using DstVectorsType = decltype(generate_vectors<DstDatas, DstScalarPerVector>());

-    static constexpr auto num_access = SrcSpaceFillingCurve::GetNumOfAccess();
+    static constexpr auto src_num_access = SrcSpaceFillingCurve::GetNumOfAccess();
+    static constexpr auto dst_num_access = DstSpaceFillingCurve::GetNumOfAccess();
+
+    using ElmVectorTuple = StaticallyIndexedArray<ElmVectorsType, src_num_access>;
+    using DstVectorTuple = StaticallyIndexedArray<DstVectorsType, dst_num_access>;
+
+    StaticallyIndexedArray<ElmVectorTuple, NumThreadScratch> elm_vectors_tuple_;
+    StaticallyIndexedArray<DstVectorTuple, NumThreadScratch> dst_vectors_tuple_;

-    StaticallyIndexedArray<DstVectorsType, num_access> dst_vectors_tuple_;
+    using OOBVectorTuple = StaticallyIndexedArray<bool, src_num_access>;
+    StaticallyIndexedArray<OOBVectorTuple, NumThreadScratch> oob_vectors_tuple_;

    SrcCoords src_coords_;
    DstCoords dst_coords_;

--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -100,7 +100,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16,

    // Wave mode dependent propety
    static constexpr index_t wave_size = Number<WaveSize>{};
-    // * Fixed in Navi3x, Will be wave mode dependent on Navi4x
+    // * Fixed on gfx11, Will be wave mode dependent for future architectures
    static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4;
    static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4;
    // * num_acc_vgprs_per_wave alone M direction

--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/library/utility/numeric.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
+
+namespace ck {
+namespace tensor_operation {
+
+template <index_t NDimSpatial,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t GemmK1Number,
+          index_t K0PerBlock,
+          device::ConvolutionBackwardWeightSpecialization ConvBackwardWeightSpecialization>
+struct TransformConvBwdWeightToGemm
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+
+    template <index_t NDim, typename enable_if<NDim == 2, bool>::type = false>
+    constexpr static auto
+    make_out_grid_desc(const index_t N,
+                       const index_t Ho,
+                       const index_t Wo,
+                       const index_t K,
+                       const std::array<index_t, NDimSpatial + 3>& output_strides)
+    {
+        const index_t WoStride = output_strides[4];
+        const auto KStride     = Number<1>{};
+        return make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, K),
+                                            make_tuple(WoStride, KStride));
+    }
+
+    template <index_t NDim, typename enable_if<NDim == 2, bool>::type = false>
+    constexpr static auto
+    make_in_grid_desc(const index_t N,
+                      const index_t Hi,
+                      const index_t Wi,
+                      const index_t C,
+                      const std::array<index_t, NDimSpatial + 3>& input_strides)
+    {
+        const index_t NStride  = input_strides[1];
+        const index_t HiStride = input_strides[3];
+        const index_t WiStride = input_strides[4];
+        const auto CStride     = input_strides[2];
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            return make_naive_tensor_descriptor(make_tuple(N * Hi * Wi, C),
+                                                make_tuple(WiStride, CStride));
+        }
+        else
+        {
+            return make_naive_tensor_descriptor(make_tuple(N, Hi, Wi, C),
+                                                make_tuple(NStride, HiStride, WiStride, CStride));
+        }
+    }
+
+    template <index_t NDim, typename enable_if<NDim == 2, bool>::type = false>
+    constexpr static auto
+    make_wei_grid_desc(const index_t K,
+                       const index_t Y,
+                       const index_t X,
+                       const index_t C,
+                       const std::array<index_t, NDimSpatial + 3>& weights_strides)
+    {
+        const auto CStride = Number<1>{};
+        const auto KStride = weights_strides[1];
+        return make_naive_tensor_descriptor(make_tuple(K, Y * X * C), make_tuple(KStride, CStride));
+    }
+
+    template <index_t NDim, typename enable_if<NDim == 3, bool>::type = false>
+    constexpr static auto
+    make_out_grid_desc(const index_t N,
+                       const index_t Do,
+                       const index_t Ho,
+                       const index_t Wo,
+                       const index_t K,
+                       const std::array<index_t, NDimSpatial + 3>& output_strides)
+    {
+        const index_t WoStride = output_strides[5];
+        const auto KStride     = Number<1>{};
+        return make_naive_tensor_descriptor(make_tuple(N * Do * Ho * Wo, K),
+                                            make_tuple(WoStride, KStride));
+    }
+
+    template <index_t NDim, typename enable_if<NDim == 3, bool>::type = false>
+    constexpr static auto
+    make_in_grid_desc(const index_t N,
+                      const index_t Di,
+                      const index_t Hi,
+                      const index_t Wi,
+                      const index_t C,
+                      const std::array<index_t, NDimSpatial + 3>& input_strides)
+    {
+        const index_t NStride  = input_strides[1];
+        const index_t DiStride = input_strides[3];
+        const index_t HiStride = input_strides[4];
+        const index_t WiStride = input_strides[5];
+        const auto CStride     = input_strides[2];
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            return make_naive_tensor_descriptor(make_tuple(N * Di * Hi * Wi, C),
+                                                make_tuple(WiStride, CStride));
+        }
+        else
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(N, Di, Hi, Wi, C),
+                make_tuple(NStride, DiStride, HiStride, WiStride, CStride));
+        }
+    }
+
+    template <index_t NDim, typename enable_if<NDim == 3, bool>::type = false>
+    constexpr static auto
+    make_wei_grid_desc(const index_t K,
+                       const index_t Z,
+                       const index_t Y,
+                       const index_t X,
+                       const index_t C,
+                       const std::array<index_t, NDimSpatial + 3>& weights_strides)
+    {
+        const auto CStride = Number<1>{};
+        const auto KStride = weights_strides[1];
+        return make_naive_tensor_descriptor(make_tuple(K, Z * Y * X * C),
+                                            make_tuple(KStride, CStride));
+    }
+
+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
+        const index_t N,
+        const index_t K,
+        const index_t C,
+        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+        const std::array<index_t, NDimSpatial + 3>& /* input_strides */,
+        const std::array<index_t, NDimSpatial + 3>& /* weights_strides */,
+        const std::array<index_t, NDimSpatial + 3>& /* output_strides */,
+        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<index_t, NDimSpatial>& input_left_pads,
+        const std::array<index_t, NDimSpatial>& input_right_pads,
+        const index_t batch_k)
+    {
+        using namespace ck;
+
+        const index_t Wi            = input_spatial_lengths[0];
+        const index_t Wo            = output_spatial_lengths[0];
+        const index_t X             = filter_spatial_lengths[0];
+        const index_t ConvStrideW   = conv_filter_strides[0];
+        const index_t ConvDilationW = conv_filter_dilations[0];
+        const index_t InLeftPadW    = input_left_pads[0];
+        const index_t InRightPadW   = input_right_pads[0];
+
+        const index_t GemmKTotal = N * Wo;
+        const index_t GemmM      = K;
+        const index_t GemmN      = C * X;
+
+        const auto PadGemmM = MPerBlock - GemmM % MPerBlock;
+        const auto PadGemmN = NPerBlock - GemmN % NPerBlock;
+
+        const index_t GemmKBatch = batch_k;
+        const index_t GemmK0 =
+            math::integer_divide_ceil(GemmKTotal, GemmK1Number * K0PerBlock * GemmKBatch) *
+            K0PerBlock;
+        const index_t GemmKPad = GemmKBatch * GemmK0 * GemmK1Number;
+
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            // A: output tensor
+            const auto out_gemmktotal_gemmm_grid_desc =
+                make_naive_tensor_descriptor_packed(make_tuple(N * Wo, K));
+
+            const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
+                out_gemmktotal_gemmm_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+                out_gemmkpad_gemmm_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+
+            // B: input tensor
+            const auto in_gemmktotal_gemmn_grid_desc =
+                make_naive_tensor_descriptor_packed(make_tuple(N * Wi, C));
+
+            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
+                in_gemmktotal_gemmn_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+                in_gemmkpad_gemmn_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+
+            // C: weight tensor
+            const auto wei_gemmm_gemmn_grid_desc =
+                make_naive_tensor_descriptor_packed(make_tuple(K, X * C));
+
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
+                              wei_gemmm_gemmn_grid_desc);
+        }
+        else
+        {
+            const auto out_gemmktotal_gemmm_grid_desc =
+                make_naive_tensor_descriptor_packed(make_tuple(N * Wo, K));
+            const auto in_n_wi_c_grid_desc =
+                make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C));
+
+            // A: output tensor
+            const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
+                out_gemmktotal_gemmm_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+                out_gemmkpad_gemmm_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+
+            // B: input tensor
+            const auto in_n_wip_c_grid_desc = transform_tensor_descriptor(
+                in_n_wi_c_grid_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            const auto in_n_x_wo_c_grid_desc = transform_tensor_descriptor(
+                in_n_wip_c_grid_desc,
+                make_tuple(
+                    make_pass_through_transform(N),
+                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                    make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+
+            const auto in_gemmktotal_gemmn_grid_desc =
+                transform_tensor_descriptor(in_n_x_wo_c_grid_desc,
+                                            make_tuple(make_merge_transform(make_tuple(X, C)),
+                                                       make_merge_transform(make_tuple(N, Wo))),
+                                            make_tuple(Sequence<1, 3>{}, Sequence<0, 2>{}),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}));
+
+            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
+                in_gemmktotal_gemmn_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+                in_gemmkpad_gemmn_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+
+            // C: weight tensor
+            const auto wei_gemmm_gemmn_grid_desc =
+                make_naive_tensor_descriptor_packed(make_tuple(K, X * C));
+
+            // Padd
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc =
+                transform_tensor_descriptor(
+                    out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                    make_tuple(make_pass_through_transform(GemmKBatch),
+                               make_pass_through_transform(GemmK0),
+                               make_right_pad_transform(GemmM, PadGemmM),
+                               make_pass_through_transform(GemmK1Number)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc =
+                transform_tensor_descriptor(
+                    in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
+                    make_tuple(make_pass_through_transform(GemmKBatch),
+                               make_pass_through_transform(GemmK0),
+                               make_right_pad_transform(GemmN, PadGemmN),
+                               make_pass_through_transform(GemmK1Number)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+            const auto wei_gemmm_gemmn_pad_grid_desc =
+                transform_tensor_descriptor(wei_gemmm_gemmn_grid_desc,
+                                            make_tuple(make_right_pad_transform(GemmM, PadGemmM),
+                                                       make_right_pad_transform(GemmN, PadGemmN)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc,
+                              wei_gemmm_gemmn_pad_grid_desc);
+        }
+    }
+
+    template <index_t NDim, typename enable_if<NDim == 2, bool>::type = false>
+    static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
+        const index_t N,
+        const index_t K,
+        const index_t C,
+        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+        const std::array<index_t, NDimSpatial + 3>& input_strides,
+        const std::array<index_t, NDimSpatial + 3>& weights_strides,
+        const std::array<index_t, NDimSpatial + 3>& output_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<index_t, NDimSpatial>& input_left_pads,
+        const std::array<index_t, NDimSpatial>& input_right_pads,
+        const index_t batch_k)
+    {
+        using namespace ck;
+
+        const index_t Hi = input_spatial_lengths[0];
+        const index_t Wi = input_spatial_lengths[1];
+
+        const index_t Ho = output_spatial_lengths[0];
+        const index_t Wo = output_spatial_lengths[1];
+
+        const index_t Y = filter_spatial_lengths[0];
+        const index_t X = filter_spatial_lengths[1];
+
+        const index_t ConvStrideH = conv_filter_strides[0];
+        const index_t ConvStrideW = conv_filter_strides[1];
+
+        const index_t ConvDilationH = conv_filter_dilations[0];
+        const index_t ConvDilationW = conv_filter_dilations[1];
+
+        const index_t InLeftPadH = input_left_pads[0];
+        const index_t InLeftPadW = input_left_pads[1];
+
+        const index_t InRightPadH = input_right_pads[0];
+        const index_t InRightPadW = input_right_pads[1];
+
+        const index_t GemmKTotal = N * Ho * Wo;
+        const index_t GemmM      = K;
+        const index_t GemmN      = C * X * Y;
+
+        const auto PadGemmM = MPerBlock - GemmM % MPerBlock;
+        const auto PadGemmN = NPerBlock - GemmN % NPerBlock;
+
+        const index_t GemmKBatch = batch_k;
+        const index_t GemmK0 =
+            math::integer_divide_ceil(GemmKTotal, GemmK1Number * K0PerBlock * GemmKBatch) *
+            K0PerBlock;
+        const index_t GemmKPad = GemmKBatch * GemmK0 * GemmK1Number;
+
+        const auto out_grid_desc = make_out_grid_desc<NDim>(N, Ho, Wo, K, output_strides);
+        const auto in_grid_desc  = make_in_grid_desc<NDim>(N, Hi, Wi, C, input_strides);
+        const auto wei_grid_desc = make_wei_grid_desc<NDim>(K, Y, X, C, weights_strides);
+
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            // A: output tensor
+            const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
+                out_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+                out_gemmkpad_gemmm_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+
+            // B: input tensor
+            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
+                in_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+                in_gemmkpad_gemmn_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
+                              wei_grid_desc);
+        }
+        else
+        {
+            // A: output tensor
+            const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
+                out_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+                out_gemmkpad_gemmm_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+
+            // B: input tensor
+            const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
+                in_grid_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+            const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
+                in_n_hip_wip_c_grid_desc,
+                make_tuple(
+                    make_pass_through_transform(N),
+                    make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                    make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+
+            const auto in_gemmktotal_gemmn_grid_desc =
+                transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
+                                            make_tuple(make_merge_transform(make_tuple(Y, X, C)),
+                                                       make_merge_transform(make_tuple(N, Ho, Wo))),
+                                            make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}));
+
+            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
+                in_gemmktotal_gemmn_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+                in_gemmkpad_gemmn_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+
+            // Padd
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc =
+                transform_tensor_descriptor(
+                    out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                    make_tuple(make_pass_through_transform(GemmKBatch),
+                               make_pass_through_transform(GemmK0),
+                               make_right_pad_transform(GemmM, PadGemmM),
+                               make_pass_through_transform(GemmK1Number)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc =
+                transform_tensor_descriptor(
+                    in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
+                    make_tuple(make_pass_through_transform(GemmKBatch),
+                               make_pass_through_transform(GemmK0),
+                               make_right_pad_transform(GemmN, PadGemmN),
+                               make_pass_through_transform(GemmK1Number)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+            const auto wei_gemmm_gemmn_pad_grid_desc =
+                transform_tensor_descriptor(wei_grid_desc,
+                                            make_tuple(make_right_pad_transform(GemmM, PadGemmM),
+                                                       make_right_pad_transform(GemmN, PadGemmN)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc,
+                              wei_gemmm_gemmn_pad_grid_desc);
+        }
+    }
+
+    template <index_t NDim, typename enable_if<NDim == 3, bool>::type = false>
+    static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
+        const index_t N,
+        const index_t K,
+        const index_t C,
+        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+        const std::array<index_t, NDimSpatial + 3>& input_strides,
+        const std::array<index_t, NDimSpatial + 3>& weights_strides,
+        const std::array<index_t, NDimSpatial + 3>& output_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<index_t, NDimSpatial>& input_left_pads,
+        const std::array<index_t, NDimSpatial>& input_right_pads,
+        const index_t batch_k)
+    {
+        using namespace ck;
+
+        const index_t Di = input_spatial_lengths[0];
+        const index_t Hi = input_spatial_lengths[1];
+        const index_t Wi = input_spatial_lengths[2];
+
+        const index_t Do = output_spatial_lengths[0];
+        const index_t Ho = output_spatial_lengths[1];
+        const index_t Wo = output_spatial_lengths[2];
+
+        const index_t Z = filter_spatial_lengths[0];
+        const index_t Y = filter_spatial_lengths[1];
+        const index_t X = filter_spatial_lengths[2];
+
+        const index_t ConvStrideD = conv_filter_strides[0];
+        const index_t ConvStrideH = conv_filter_strides[1];
+        const index_t ConvStrideW = conv_filter_strides[2];
+
+        const index_t ConvDilationD = conv_filter_dilations[0];
+        const index_t ConvDilationH = conv_filter_dilations[1];
+        const index_t ConvDilationW = conv_filter_dilations[2];
+
+        const index_t InLeftPadD = input_left_pads[0];
+        const index_t InLeftPadH = input_left_pads[1];
+        const index_t InLeftPadW = input_left_pads[2];
+
+        const index_t InRightPadD = input_right_pads[0];
+        const index_t InRightPadH = input_right_pads[1];
+        const index_t InRightPadW = input_right_pads[2];
+
+        const index_t GemmKTotal = N * Do * Ho * Wo;
+        const index_t GemmM      = K;
+        const index_t GemmN      = C * Z * X * Y;
+
+        const auto PadGemmM = MPerBlock - GemmM % MPerBlock;
+        const auto PadGemmN = NPerBlock - GemmN % NPerBlock;
+
+        const index_t GemmKBatch = batch_k;
+        const index_t GemmK0 =
+            math::integer_divide_ceil(GemmKTotal, GemmK1Number * K0PerBlock * GemmKBatch) *
+            K0PerBlock;
+        const index_t GemmKPad = GemmKBatch * GemmK0 * GemmK1Number;
+
+        const auto out_grid_desc = make_out_grid_desc<NDim>(N, Do, Ho, Wo, K, output_strides);
+        const auto in_grid_desc  = make_in_grid_desc<NDim>(N, Di, Hi, Wi, C, input_strides);
+        const auto wei_grid_desc = make_wei_grid_desc<NDim>(K, Z, Y, X, C, weights_strides);
+
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            // A: output tensor
+            const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
+                out_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+                out_gemmkpad_gemmm_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+
+            // B: input tensor
+            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
+                in_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+                in_gemmkpad_gemmn_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
+                              wei_grid_desc);
+        }
+        else
+        {
+            // A: output tensor
+            const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
+                out_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+                out_gemmkpad_gemmm_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+
+            // B: input tensor
+            const auto in_n_dip_hip_wip_c_grid_desc = transform_tensor_descriptor(
+                in_grid_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_pad_transform(Di, InLeftPadD, InRightPadD),
+                           make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                           make_pass_through_transform(C)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
+
+            const auto in_n_z_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
+                in_n_dip_hip_wip_c_grid_desc,
+                make_tuple(
+                    make_pass_through_transform(N),
+                    make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)),
+                    make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                    make_pass_through_transform(C)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+                make_tuple(Sequence<0>{},
+                           Sequence<1, 2>{},
+                           Sequence<3, 4>{},
+                           Sequence<5, 6>{},
+                           Sequence<7>{}));
+
+            const auto in_gemmktotal_gemmn_grid_desc = transform_tensor_descriptor(
+                in_n_z_do_y_ho_x_wo_c_grid_desc,
+                make_tuple(make_merge_transform(make_tuple(Z, Y, X, C)),
+                           make_merge_transform(make_tuple(N, Do, Ho, Wo))),
+                make_tuple(Sequence<1, 3, 5, 7>{}, Sequence<0, 2, 4, 6>{}),
+                make_tuple(Sequence<1>{}, Sequence<0>{}));
+
+            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
+                in_gemmktotal_gemmn_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+                in_gemmkpad_gemmn_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+
+            // Padd
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc =
+                transform_tensor_descriptor(
+                    out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                    make_tuple(make_pass_through_transform(GemmKBatch),
+                               make_pass_through_transform(GemmK0),
+                               make_right_pad_transform(GemmM, PadGemmM),
+                               make_pass_through_transform(GemmK1Number)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc =
+                transform_tensor_descriptor(
+                    in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
+                    make_tuple(make_pass_through_transform(GemmKBatch),
+                               make_pass_through_transform(GemmK0),
+                               make_right_pad_transform(GemmN, PadGemmN),
+                               make_pass_through_transform(GemmK1Number)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+            const auto wei_gemmm_gemmn_pad_grid_desc =
+                transform_tensor_descriptor(wei_grid_desc,
+                                            make_tuple(make_right_pad_transform(GemmM, PadGemmM),
+                                                       make_right_pad_transform(GemmN, PadGemmN)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc,
+                              wei_gemmm_gemmn_pad_grid_desc);
+        }
+    } // function end
+};
+
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
 #include "data_type.hpp"
@@ -297,6 +297,17 @@ enum struct AmdBufferCoherenceEnum
    GLC              = 1,
    SLC              = 2,
    GLC_SLC          = 3,
+    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
+    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
+    WAVE_NT0   = 0,
+    WAVE_NT1   = 2,
+    GROUP_NT0  = 1,
+    GROUP_NT1  = 3,
+    DEVICE_NT0 = 8,
+    DEVICE_NT1 = 10,
+    SYSTEM_NT0 = 9,
+    SYSTEM_NT1 = 11,
 };

 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>

--- a/include/ck/utility/blkgemmpipe_scheduler.hpp
+++ b/include/ck/utility/blkgemmpipe_scheduler.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
+
+namespace ck {
+
+enum struct BlockGemmPipelineScheduler
+{
+    Intrawave,
+    Interwave,
+};
+
+enum struct TailNumber
+{
+    // Single / Double buffer pipeline
+    Odd,
+    Even,
+
+    // Long prefetch pipeline, up to 8
+    One,
+    Two,
+    Three,
+    Four,
+    Five,
+    Six,
+    Seven,
+
+    // Unroll stages > Prefetch stages, number of loop is multiple of unroll stages
+    Empty,
+    // Unroll stages <= Prefetch stages, number of loop is multiple of unroll stages add
+    // prefetchstages
+    Full,
+};
+template <index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t ABufferLoadWidth,
+          index_t BBufferLoadWidth,
+          index_t ALDSWriteWidth,
+          index_t BLDSWriteWidth,
+          index_t ALDSReadWidth,
+          index_t BLDSReadWidth,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t KPerXDL>
+struct BlockwiseGemmXdlops_pipeline_hotloop_inst
+{
+    static constexpr index_t WaveSize = 64;
+    static constexpr index_t WaveNumM = MPerBlock / (MRepeat * MPerXDL);
+    static constexpr index_t WaveNumN = NPerBlock / (NRepeat * NPerXDL);
+
+    static constexpr index_t A_LDS_Read_Width = ALDSReadWidth;
+    static constexpr index_t B_LDS_Read_Width = BLDSReadWidth;
+
+    static constexpr index_t A_Buffer_Load_Inst_Num =
+        MPerBlock * KPerBlock / (BlockSize * ABufferLoadWidth);
+    static constexpr index_t B_Buffer_Load_Inst_Num =
+        NPerBlock * KPerBlock / (BlockSize * BBufferLoadWidth);
+
+    static constexpr index_t A_LDS_Write_Inst_Num =
+        MPerBlock * KPerBlock / (BlockSize * ALDSWriteWidth);
+    static constexpr index_t B_LDS_Write_Inst_Num =
+        NPerBlock * KPerBlock / (BlockSize * BLDSWriteWidth);
+
+    static constexpr index_t A_LDS_Read_Inst_Num =
+        WaveNumN * MPerBlock * KPerBlock / (BlockSize * ALDSReadWidth);
+    static constexpr index_t B_LDS_Read_Inst_Num =
+        WaveNumM * MPerBlock * KPerBlock / (BlockSize * BLDSReadWidth);
+
+    static constexpr index_t C_MFMA_Inst_Num =
+        MPerBlock * NPerBlock * KPerBlock / (BlockSize / WaveSize) / (MPerXDL * NPerXDL * KPerXDL);
+
+    static constexpr auto Print()
+    {
+        printf(" Blk/Wave Size: %d, %d, M/N/K PerBlk: %d, %d, %d, M/N/K PerXdl: %d, %d, %d\n",
+               BlockSize,
+               WaveSize,
+               MPerBlock,
+               NPerBlock,
+               KPerBlock,
+               MPerXDL,
+               NPerXDL,
+               KPerXDL);
+
+        printf(" A/B buffer load inst: %d, %d\n A/B LDS write inst: %d, %d\n A/B LDS read inst: "
+               "%d, %d\n C MFMA inst: %d\n",
+               A_Buffer_Load_Inst_Num,
+               B_Buffer_Load_Inst_Num,
+               A_LDS_Write_Inst_Num,
+               B_LDS_Write_Inst_Num,
+               A_LDS_Read_Inst_Num,
+               B_LDS_Read_Inst_Num,
+               C_MFMA_Inst_Num);
+    }
+};
+
+} // namespace ck
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -163,6 +163,13 @@ struct scalar_type<bf8_t>
    static constexpr index_t vector_size = 1;
 };

+template <>
+struct scalar_type<bool>
+{
+    using type                           = bool;
+    static constexpr index_t vector_size = 1;
+};
+
 template <typename T>
 struct vector_type<T, 1>
 {

--- a/include/ck/utility/debug.hpp
+++ b/include/ck/utility/debug.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #ifndef UTILITY_DEBUG_HPP
 #define UTILITY_DEBUG_HPP
@@ -79,6 +79,13 @@ __device__ void print_shared(T const* p_shared, index_t num_elements)
    __syncthreads();
 }

+template <index_t... Ids>
+__device__ static bool is_thread_local_1d_id_idx()
+{
+    const auto tid = get_thread_local_1d_id();
+    return ((tid == Ids) || ...);
+}
+
 } // namespace debug
 } // namespace ck


--- a/include/ck/utility/env.hpp
+++ b/include/ck/utility/env.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <string_view>
+
+namespace ck {
+namespace internal {
+template <typename T>
+struct ParseEnvVal
+{
+};
+
+template <>
+struct ParseEnvVal<bool>
+{
+    static bool parse_env_var_value(const char* vp)
+    {
+        std::string value_env_str{vp};
+
+        for(auto& c : value_env_str)
+        {
+            if(std::isalpha(c) != 0)
+            {
+                c = std::tolower(static_cast<unsigned char>(c));
+            }
+        }
+
+        if(value_env_str == "disable" || value_env_str == "disabled" || value_env_str == "0" ||
+           value_env_str == "no" || value_env_str == "off" || value_env_str == "false")
+        {
+            return false;
+        }
+        else if(value_env_str == "enable" || value_env_str == "enabled" || value_env_str == "1" ||
+                value_env_str == "yes" || value_env_str == "on" || value_env_str == "true")
+        {
+            return true;
+        }
+        else
+        {
+            throw std::runtime_error("Invalid value for env variable");
+        }
+
+        return false; // shouldn't reach here
+    }
+};
+
+// Supports hexadecimals (with leading "0x"), octals (if prefix is "0") and decimals (default).
+// Returns 0 if environment variable is in wrong format (strtoull fails to parse the string).
+template <>
+struct ParseEnvVal<uint64_t>
+{
+    static uint64_t parse_env_var_value(const char* vp) { return std::strtoull(vp, nullptr, 0); }
+};
+
+template <>
+struct ParseEnvVal<std::string>
+{
+    static std::string parse_env_var_value(const char* vp) { return std::string{vp}; }
+};
+
+template <typename T>
+struct EnvVar
+{
+    private:
+    T value{};
+    bool is_unset = true;
+
+    public:
+    const T& GetValue() const { return value; }
+
+    bool IsUnset() const { return is_unset; }
+
+    void Unset() { is_unset = true; }
+
+    void UpdateValue(const T& val)
+    {
+        is_unset = false;
+        value    = val;
+    }
+
+    explicit EnvVar(const char* const name, const T& def_val)
+    {
+        // NOLINTNEXTLINE (concurrency-mt-unsafe)
+        const char* vp = std::getenv(name);
+        if(vp != nullptr) // a value was provided
+        {
+            is_unset = false;
+            value    = ParseEnvVal<T>::parse_env_var_value(vp);
+        }
+        else // no value provided, use default value
+        {
+            value = def_val;
+        }
+    }
+};
+} // end namespace internal
+
+// static inside function hides the variable and provides
+// thread-safety/locking
+// Used in global namespace
+#define CK_DECLARE_ENV_VAR(name, type, default_val)                            \
+    namespace ck::env {                                                        \
+    struct name                                                                \
+    {                                                                          \
+        static_assert(std::is_same_v<name, ::ck::env::name>,                   \
+                      "CK_DECLARE_ENV* must be used in the global namespace"); \
+        using value_type = type;                                               \
+        static ck::internal::EnvVar<type>& Ref()                               \
+        {                                                                      \
+            static ck::internal::EnvVar<type> var{#name, default_val};         \
+            return var;                                                        \
+        }                                                                      \
+    };                                                                         \
+    }
+
+#define CK_DECLARE_ENV_VAR_BOOL(name) CK_DECLARE_ENV_VAR(name, bool, false)
+
+#define CK_DECLARE_ENV_VAR_UINT64(name) CK_DECLARE_ENV_VAR(name, uint64_t, 0)
+
+#define CK_DECLARE_ENV_VAR_STR(name) CK_DECLARE_ENV_VAR(name, std::string, "")
+
+#define ENV(name) \
+    ck::env::name {}
+
+template <class EnvVar>
+inline const std::string& EnvGetString(EnvVar)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, std::string>);
+    return EnvVar::Ref().GetValue();
+}
+
+template <class EnvVar>
+inline bool EnvIsEnabled(EnvVar)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, bool>);
+    return !EnvVar::Ref().IsUnset() && EnvVar::Ref().GetValue();
+}
+
+template <class EnvVar>
+inline bool EnvIsDisabled(EnvVar)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, bool>);
+    return !EnvVar::Ref().IsUnset() && !EnvVar::Ref().GetValue();
+}
+
+template <class EnvVar>
+inline uint64_t EnvValue(EnvVar)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, uint64_t>);
+    return EnvVar::Ref().GetValue();
+}
+
+template <class EnvVar>
+inline bool EnvIsUnset(EnvVar)
+{
+    return EnvVar::Ref().IsUnset();
+}
+
+template <class EnvVar>
+void EnvUnset(EnvVar)
+{
+    EnvVar::Ref().Unset();
+}
+
+/// updates the cached value of an environment variable
+template <typename EnvVar, typename ValueType>
+void UpdateEnvVar(EnvVar, const ValueType& val)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, ValueType>);
+    EnvVar::Ref().UpdateValue(val);
+}
+
+template <typename EnvVar>
+void UpdateEnvVar(EnvVar, const std::string_view& val)
+{
+    EnvVar::Ref().UpdateValue(
+        ck::internal::ParseEnvVal<typename EnvVar::value_type>::parse_env_var_value(val.data()));
+}
+
+} // namespace ck
--- a/include/ck/utility/flush_icache.hpp
+++ b/include/ck/utility/flush_icache.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+namespace ck {
+static __global__ void flush_icache()
+{
+    asm __volatile__("s_icache_inv \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t" ::
+                         :);
+}
+} // namespace ck
--- a/include/ck/utility/loop_scheduler.hpp
+++ b/include/ck/utility/loop_scheduler.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <ostream>

 #pragma once

@@ -24,3 +25,14 @@ constexpr LoopScheduler make_default_loop_scheduler()
 }

 } // namespace ck
+
+inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
+{
+    switch(s)
+    {
+    case ck::LoopScheduler::Default: os << "Default"; break;
+    case ck::LoopScheduler::Interwave: os << "Interwave"; break;
+    default: os << "";
+    }
+    return os;
+}
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -14,6 +14,10 @@
 namespace ck {
 namespace math {

+#if CK_WORKAROUND_SWDEV_383542
+extern "C" __device__ float __ocml_native_recip_f32(float);
+#endif
+
 // math functions for the host,  some are implemented by calling C++ std functions

 static inline __host__ float abs(float x) { return std::abs(x); };
@@ -111,6 +115,276 @@ inline __host__ double tanh<double>(double x)
    return std::tanh(x);
 };

+template <typename T>
+inline __host__ T acos(T x)
+{
+    return ck::type_convert<T>(std::acosf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float acos<float>(float x)
+{
+    return std::acosf(x);
+};
+
+template <>
+inline __host__ double acos<double>(double x)
+{
+    return std::acos(x);
+};
+
+template <typename T>
+inline __host__ T neg(T x)
+{
+    return ck::type_convert<T>(-(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float neg<float>(float x)
+{
+    return -x;
+};
+
+template <>
+inline __host__ double neg<double>(double x)
+{
+    return -x;
+};
+
+template <>
+inline __host__ int32_t neg<int32_t>(int32_t x)
+{
+    return -x;
+};
+
+template <>
+inline __host__ int8_t neg<int8_t>(int8_t x)
+{
+    return -x;
+};
+
+template <typename T>
+inline __host__ T atan(T x)
+{
+    return ck::type_convert<T>(std::atanf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float atan<float>(float x)
+{
+    return std::atanf(x);
+};
+
+template <>
+inline __host__ double atan<double>(double x)
+{
+    return std::atan(x);
+};
+
+template <typename T>
+inline __host__ T sin(T x)
+{
+    return ck::type_convert<T>(std::sinf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float sin<float>(float x)
+{
+    return std::sinf(x);
+};
+
+template <>
+inline __host__ double sin<double>(double x)
+{
+    return std::sin(x);
+};
+
+template <typename T>
+inline __host__ T asin(T x)
+{
+    return ck::type_convert<T>(std::asinf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float asin<float>(float x)
+{
+    return std::asinf(x);
+};
+
+template <>
+inline __host__ double asin<double>(double x)
+{
+    return std::asin(x);
+};
+
+template <typename T>
+inline __host__ T asinh(T x)
+{
+    return ck::type_convert<T>(std::asinhf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float asinh<float>(float x)
+{
+    return std::asinhf(x);
+};
+
+template <>
+inline __host__ double asinh<double>(double x)
+{
+    return std::asinh(x);
+};
+
+template <typename T>
+inline __host__ T cos(T x)
+{
+    return ck::type_convert<T>(std::cosf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float cos<float>(float x)
+{
+    return std::cosf(x);
+};
+
+template <>
+inline __host__ double cos<double>(double x)
+{
+    return std::cos(x);
+};
+
+template <typename T>
+inline __host__ T acosh(T x)
+{
+    return ck::type_convert<T>(std::acoshf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float acosh<float>(float x)
+{
+    return std::acoshf(x);
+};
+
+template <>
+inline __host__ double acosh<double>(double x)
+{
+    return std::acosh(x);
+};
+
+template <typename T>
+inline __host__ T tan(T x)
+{
+    return ck::type_convert<T>(std::tanf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float tan<float>(float x)
+{
+    return std::tanf(x);
+};
+
+template <>
+inline __host__ double tan<double>(double x)
+{
+    return std::tan(x);
+};
+
+template <typename T>
+inline __host__ T atanh(T x)
+{
+    return ck::type_convert<T>(std::atanhf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float atanh<float>(float x)
+{
+    return std::atanhf(x);
+};
+
+template <>
+inline __host__ double atanh<double>(double x)
+{
+    return std::atanh(x);
+};
+
+template <typename T>
+inline __host__ T sinh(T x)
+{
+    return ck::type_convert<T>(std::sinhf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float sinh<float>(float x)
+{
+    return std::sinhf(x);
+};
+
+template <>
+inline __host__ double sinh<double>(double x)
+{
+    return std::sinh(x);
+};
+
+template <typename T>
+inline __host__ T ceil(T x)
+{
+    return ck::type_convert<T>(std::ceilf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float ceil<float>(float x)
+{
+    return std::ceilf(x);
+};
+
+template <>
+inline __host__ double ceil<double>(double x)
+{
+    return std::ceil(x);
+};
+
+template <typename T>
+inline __host__ T cosh(T x)
+{
+    return ck::type_convert<T>(std::coshf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float cosh<float>(float x)
+{
+    return std::coshf(x);
+};
+
+template <>
+inline __host__ double cosh<double>(double x)
+{
+    return std::cosh(x);
+};
+
+template <typename T>
+inline __host__ T floor(T x)
+{
+    return ck::type_convert<T>(std::floorf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __host__ float floor<float>(float x)
+{
+    return std::floorf(x);
+};
+
+template <>
+inline __host__ double floor<double>(double x)
+{
+    return std::floor(x);
+};
+
+template <typename T>
+inline __host__ T rcp(T x)
+{
+    return ck::type_convert<T>(1.f / ck::type_convert<float>(x));
+};
+
 template <typename T>
 inline __host__ T exp(T x)
 {
@@ -282,6 +556,286 @@ inline __device__ double tanh<double>(double x)
    return ::tanh(x);
 };

+template <typename T>
+inline __device__ T acos(T x)
+{
+    return ck::type_convert<T>(::acosf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float acos<float>(float x)
+{
+    return ::acosf(x);
+};
+
+template <>
+inline __device__ double acos<double>(double x)
+{
+    return ::acos(x);
+};
+
+template <typename T>
+inline __device__ T neg(T x)
+{
+    return ck::type_convert<T>(-(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float neg<float>(float x)
+{
+    return -x;
+};
+
+template <>
+inline __device__ double neg<double>(double x)
+{
+    return -x;
+};
+
+template <>
+inline __device__ int32_t neg<int32_t>(int32_t x)
+{
+    return -x;
+};
+
+template <>
+inline __device__ int8_t neg<int8_t>(int8_t x)
+{
+    return -x;
+};
+
+template <>
+inline __device__ half_t neg<half_t>(half_t x)
+{
+    return __hneg(x);
+};
+
+template <typename T>
+inline __device__ T atan(T x)
+{
+    return ck::type_convert<T>(::atanf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float atan<float>(float x)
+{
+    return ::atanf(x);
+};
+
+template <>
+inline __device__ double atan<double>(double x)
+{
+    return ::atan(x);
+};
+
+template <typename T>
+inline __device__ T sin(T x)
+{
+    return ck::type_convert<T>(::sinf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float sin<float>(float x)
+{
+    return ::sinf(x);
+};
+
+template <>
+inline __device__ double sin<double>(double x)
+{
+    return ::sin(x);
+};
+
+template <>
+inline __device__ half_t sin<half_t>(half_t x)
+{
+    return ::hsin(x);
+};
+
+template <typename T>
+inline __device__ T asin(T x)
+{
+    return ck::type_convert<T>(::asinf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float asin<float>(float x)
+{
+    return ::asinf(x);
+};
+
+template <>
+inline __device__ double asin<double>(double x)
+{
+    return ::asin(x);
+};
+
+template <typename T>
+inline __device__ T asinh(T x)
+{
+    return ck::type_convert<T>(::asinhf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float asinh<float>(float x)
+{
+    return ::asinhf(x);
+};
+
+template <>
+inline __device__ double asinh<double>(double x)
+{
+    return ::asinh(x);
+};
+
+template <typename T>
+inline __device__ T acosh(T x)
+{
+    return ck::type_convert<T>(::acoshf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float acosh<float>(float x)
+{
+    return ::acoshf(x);
+};
+
+template <>
+inline __device__ double acosh<double>(double x)
+{
+    return ::acosh(x);
+};
+
+template <typename T>
+inline __device__ T tan(T x)
+{
+    return ck::type_convert<T>(::tanf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float tan<float>(float x)
+{
+    return ::tanf(x);
+};
+
+template <>
+inline __device__ double tan<double>(double x)
+{
+    return ::tan(x);
+};
+
+template <typename T>
+inline __device__ T atanh(T x)
+{
+    return ck::type_convert<T>(::atanhf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float atanh<float>(float x)
+{
+    return ::atanhf(x);
+};
+
+template <>
+inline __device__ double atanh<double>(double x)
+{
+    return ::atanh(x);
+};
+
+template <typename T>
+inline __device__ T sinh(T x)
+{
+    return ck::type_convert<T>(::sinhf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float sinh<float>(float x)
+{
+    return ::sinhf(x);
+};
+
+template <>
+inline __device__ double sinh<double>(double x)
+{
+    return ::sinh(x);
+};
+
+template <typename T>
+inline __device__ T ceil(T x)
+{
+    return ck::type_convert<T>(::ceilf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float ceil<float>(float x)
+{
+    return ::ceilf(x);
+};
+
+template <>
+inline __device__ double ceil<double>(double x)
+{
+    return ::ceil(x);
+};
+
+template <>
+inline __device__ half_t ceil<half_t>(half_t x)
+{
+    return ::hceil(x);
+};
+
+template <typename T>
+inline __device__ T cosh(T x)
+{
+    return ck::type_convert<T>(::coshf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float cosh<float>(float x)
+{
+    return ::coshf(x);
+};
+
+template <>
+inline __device__ double cosh<double>(double x)
+{
+    return ::cosh(x);
+};
+
+template <typename T>
+inline __device__ T floor(T x)
+{
+    return ck::type_convert<T>(::floorf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float floor<float>(float x)
+{
+    return ::floorf(x);
+};
+
+template <>
+inline __device__ double floor<double>(double x)
+{
+    return ::floor(x);
+};
+
+template <>
+inline __device__ half_t floor<half_t>(half_t x)
+{
+    return ::hfloor(x);
+};
+
+template <typename T>
+inline __device__ T rcp(T x)
+{
+#if !CK_WORKAROUND_SWDEV_383542
+    return __frcp_rn(x);
+#else
+    return __ocml_native_recip_f32(x);
+#endif
+};
+
 template <typename T>
 inline __device__ T exp(T x)
 {

--- a/include/ck/utility/sequence.hpp
+++ b/include/ck/utility/sequence.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

+#include <ostream>
+
 #include "ck/utility/integral_constant.hpp"
 #include "ck/utility/type.hpp"
 #include "ck/utility/functional.hpp"
@@ -897,3 +899,14 @@ template <index_t NSize, index_t I>
 using uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type;

 } // namespace ck
+
+template <ck::index_t... Is>
+std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>)
+{
+    using S = ck::Sequence<Is...>;
+    os << "{";
+    ck::static_for<0, S::Size() - ck::Number<1>{}, 1>{}(
+        [&](auto i) { os << S::At(i).value << ", "; });
+    os << S::At(S::Size() - ck::Number<1>{}).value << "}";
+    return os;
+}
--- a/include/ck/utility/synchronization.hpp
+++ b/include/ck/utility/synchronization.hpp
@@ -17,10 +17,12 @@ __device__ void block_sync_lds()
    s_barrier_wait -1 \
    " ::);
 #else
-    asm volatile("\
-    s_waitcnt lgkmcnt(0) \n \
-    s_barrier \
-    " ::);
+    // asm volatile("\
+    // s_waitcnt lgkmcnt(0) \n \
+    // s_barrier \
+    // " ::);
+    __builtin_amdgcn_s_waitcnt(0xc07f);
+    __builtin_amdgcn_s_barrier();
 #endif
 #else
    __syncthreads();

--- a/include/ck/utility/transpose_vectors.hpp
+++ b/include/ck/utility/transpose_vectors.hpp
@@ -162,4 +162,83 @@ struct transpose_vectors<int8_t, NX, NY>
    }
 };

+// transpose f8 4x4
+__device__ void transpose_f8_4x4(const f8x4_t& x0,
+                                 const f8x4_t& x1,
+                                 const f8x4_t& x2,
+                                 const f8x4_t& x3,
+                                 f8x4_t& y0,
+                                 f8x4_t& y1,
+                                 f8x4_t& y2,
+                                 f8x4_t& y3)
+{
+    int32_t t0, t1;
+    int32_t z0, z1, z2, z3;
+    constexpr int32_t m0 = 0x05010400;
+    constexpr int32_t m1 = 0x05040100;
+    constexpr int32_t m2 = 0x07060302;
+    constexpr int32_t m3 = 0x07030602;
+
+    // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
+    //                   -- -- -- --     -- -- -- --      -  -  -  -
+    //             index  7  6  5  4      3  2  1  0     33 77 44 88
+    // index is reversed because of little endianness (least significant bits first)
+    t0 = __builtin_amdgcn_perm(bit_cast<int32_t>(x1), bit_cast<int32_t>(x0), m0);
+    t1 = __builtin_amdgcn_perm(bit_cast<int32_t>(x3), bit_cast<int32_t>(x2), m0);
+    z0 = __builtin_amdgcn_perm(bit_cast<int32_t>(t1), bit_cast<int32_t>(t0), m1);
+    z1 = __builtin_amdgcn_perm(bit_cast<int32_t>(t1), bit_cast<int32_t>(t0), m2);
+    t0 = __builtin_amdgcn_perm(bit_cast<int32_t>(x1), bit_cast<int32_t>(x0), m3);
+    t1 = __builtin_amdgcn_perm(bit_cast<int32_t>(x3), bit_cast<int32_t>(x2), m3);
+    z2 = __builtin_amdgcn_perm(bit_cast<int32_t>(t1), bit_cast<int32_t>(t0), m1);
+    z3 = __builtin_amdgcn_perm(bit_cast<int32_t>(t1), bit_cast<int32_t>(t0), m2);
+
+    y0 = bit_cast<f8x4_t>(z0);
+    y1 = bit_cast<f8x4_t>(z1);
+    y2 = bit_cast<f8x4_t>(z2);
+    y3 = bit_cast<f8x4_t>(z3);
+}
+
+template <index_t NX, index_t NY>
+struct transpose_vectors<f8_t, NX, NY>
+{
+    // we got [NY * NX] amount of S data to be transposed
+    static constexpr index_t s_per_x = NY;
+    static constexpr index_t s_per_y = NX;
+
+    using S  = f8_t;
+    using VX = vector_type<f8_t, s_per_x>;
+    using VY = vector_type<f8_t, s_per_y>;
+
+    __device__ void operator()(const StaticallyIndexedArray<const VX&, NX>& vx_tuple,
+                               StaticallyIndexedArray<VY&, NY>& vy_tuple)
+    {
+        static constexpr auto I1 = Number<1>{};
+        static constexpr auto I2 = Number<2>{};
+        static constexpr auto I3 = Number<3>{};
+        static constexpr auto I4 = Number<4>{};
+
+        static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");
+
+        // loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
+        static_for<0, NY, 4>{}([&](auto iy) {
+            static_for<0, NX, 4>{}([&](auto ix) {
+                // reference to 4 f8 data from vx_tuple
+                const auto& x_s4_0 = vx_tuple[ix].template AsType<f8x4_t>()[iy / I4];
+                const auto& x_s4_1 = vx_tuple[ix + I1].template AsType<f8x4_t>()[iy / I4];
+                const auto& x_s4_2 = vx_tuple[ix + I2].template AsType<f8x4_t>()[iy / I4];
+                const auto& x_s4_3 = vx_tuple[ix + I3].template AsType<f8x4_t>()[iy / I4];
+
+                // reference to 4 f8 data from vy_tuple
+                auto& y_s4_0 = vy_tuple(iy).template AsType<f8x4_t>()(ix / I4);
+                auto& y_s4_1 = vy_tuple(iy + I1).template AsType<f8x4_t>()(ix / I4);
+                auto& y_s4_2 = vy_tuple(iy + I2).template AsType<f8x4_t>()(ix / I4);
+                auto& y_s4_3 = vy_tuple(iy + I3).template AsType<f8x4_t>()(ix / I4);
+
+                // transpose
+                transpose_f8_4x4(x_s4_0, x_s4_1, x_s4_2, x_s4_3, y_s4_0, y_s4_1, y_s4_2, y_s4_3);
+            });
+        });
+    }
+};
+
 } // namespace ck
--- a/include/ck/utility/type.hpp
+++ b/include/ck/utility/type.hpp
@@ -40,21 +40,10 @@ inline constexpr bool is_pointer_v = std::is_pointer<T>::value;
 template <typename Y, typename X, typename enable_if<sizeof(X) == sizeof(Y), bool>::type = false>
 __host__ __device__ constexpr Y bit_cast(const X& x)
 {
-#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST
-    Y y;
+    static_assert(__has_builtin(__builtin_bit_cast), "");
+    static_assert(sizeof(X) == sizeof(Y), "Do not support cast between different size of type");

-    __builtin_memcpy(&y, &x, sizeof(X));
-
-    return y;
-#else
-    union AsType
-    {
-        X x;
-        Y y;
-    };
-
-    return AsType{x}.y;
-#endif
+    return __builtin_bit_cast(Y, x);
 }

 } // namespace ck
--- a/include/ck_tile/README.md
+++ b/include/ck_tile/README.md
+# ck_tile
+## concept
+`ck_tile` provides a programming model with templated abstractions to enable users to implement performance-critical kernels for machine learning workloads. introduces following basic concepts to help users building your own operator
+ - tensor coordinate transformation, this is the core concept of layout/index transform abstraction in both compiler time and run time.
+ - tile-based programming model, including tile-level api and the concept of distributed tensor.
+
+`ck_tile` is independently from the old ck, located under [/include/ck_tile](/include/ck_tile). You don't need to include anything from old CK, `ck_tile` has similiar (indeed almost the same) implementations for users to build operators. We will have a transition period to pull everything from old ck into `ck_tile`, stay tuned.
+
+## component
+`ck_tile` is splitted into several componenets including `core`, `host`, `ops/gemm`, `ops/fmha`... each component you only need to include a single header (e.g `#include "ck_tile/core.hpp"`, `#include "ck_tile/ops/fmha.hpp"`) then you are able to use the function/structure inside (different from old `ck`)  
+
+**[core]**  
+`ck_tile/core` contains all the basic data structure and function to build the kernel, you can only include this header and build your own operators that utilizing all the basic building blocks introduced in ck.
+
+`core/container`
+ - array, store runtime variables with fixed length (tensor index, register buffer, etc...)
+ - tuple, same as std::tuple, hold different type of data, and one of the solution to achieve multiple buffer. 
+ - sequence, compile time integer sequence used to build various internal structures, or to describe tile size
+ - other convenient structure build on top of above 3
+
+`core/numeric`
+ - gpu data type like `fp16_t`, `bf16_t`, `fp8_t`... and the conversion between each other
+ - constexpr integer similiar to std::integral_constant to be used as compile time integer.
+ - math functions and numeric utilities
+
+`core/algorithm`
+ - coordinate transformation system, used to build tensor transform and compile time indexing. This is the core idea introduced in old `ck` to describe how a tensor is build by several basic transform primitives like `merge`/`unmerge`/`embed` etc... and how we indexing into a ND tensor that finally mapped to 1D memory offset.
+
+`core/tensor`
+ - tensor descriptor, to describe how a ND tensor 
+ - distributed tensor, describe the storage of this tensor, and the distribution of how a collection of threads collaborately work for this tensor.
+ - tile level API, including `load_tile`, `store_tile`, `shuffle_tile`, `slice_tile`, etc...
+
+**[host]**  
+`ck_tile/host` contains all the host side utilities to launch a kernel, create the device buffer, and some reference implementations. This can be used to create examples (like that under ck_tile example folder) and simple executable to invoke this kernel, so if you only need `ck_tile` to build your own device library then it's OK to not include this. Based on this, it is recommended to include the specific header you needed under this folder to avoid including unwanted headers (e.g, only include `ck_tile/host/kernel_launch.hpp`), unless you are writing a host executable.
+
+**[ops/gemm, ops/fmha, ops/reduce...]**  
+our implementation of different device operators. 
+ - warp, warp tile level operator
+ - block, block tile level operator
+ - pipeline, pipeline that can achieve a customized tile level mainloop (or epilogue). By switching different pipeline to the kernel template you can have different kind of pipeline optimizations.
+ - kernel, template interface for users to instantiate a particular kernel
+
+**[ops/epilogue]**  
+epilogue part of our kernel. We may extend this epilogue part to let users to build their own cutomized epilogues.
+
+## examples
+currently we put all ck_tile related example under [/example/ck_tile](/example/ck_tile/) folder. Please check each example's subfolder.
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/algorithm/cluster_descriptor.hpp"
+#include "ck_tile/core/algorithm/coordinate_transform.hpp"
+#include "ck_tile/core/algorithm/space_filling_curve.hpp"
+#include "ck_tile/core/arch/amd_buffer_addressing.hpp"
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/core/arch/utility.hpp"
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/container/array.hpp"
+#include "ck_tile/core/container/container_helper.hpp"
+#include "ck_tile/core/container/map.hpp"
+#include "ck_tile/core/container/meta_data_buffer.hpp"
+#include "ck_tile/core/container/multi_index.hpp"
+#include "ck_tile/core/container/sequence.hpp"
+#include "ck_tile/core/container/span.hpp"
+#include "ck_tile/core/container/statically_indexed_array.hpp"
+#include "ck_tile/core/container/thread_buffer.hpp"
+#include "ck_tile/core/container/tuple.hpp"
+#include "ck_tile/core/numeric/bfloat16.hpp"
+#include "ck_tile/core/numeric/float8.hpp"
+#include "ck_tile/core/numeric/half.hpp"
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/integral_constant.hpp"
+#include "ck_tile/core/numeric/math.hpp"
+#include "ck_tile/core/numeric/numeric.hpp"
+#include "ck_tile/core/numeric/type_convert.hpp"
+#include "ck_tile/core/numeric/vector_type.hpp"
+#include "ck_tile/core/tensor/buffer_view.hpp"
+#include "ck_tile/core/tensor/load_tile.hpp"
+#include "ck_tile/core/tensor/null_tensor.hpp"
+#include "ck_tile/core/tensor/null_tile_window.hpp"
+#include "ck_tile/core/tensor/shuffle_tile.hpp"
+#include "ck_tile/core/tensor/slice_tile.hpp"
+#include "ck_tile/core/tensor/static_distributed_tensor.hpp"
+#include "ck_tile/core/tensor/store_tile.hpp"
+#include "ck_tile/core/tensor/sweep_tile.hpp"
+#include "ck_tile/core/tensor/tensor_adaptor.hpp"
+#include "ck_tile/core/tensor/tensor_adaptor_coordinate.hpp"
+#include "ck_tile/core/tensor/tensor_coordinate.hpp"
+#include "ck_tile/core/tensor/tensor_descriptor.hpp"
+#include "ck_tile/core/tensor/tensor_view.hpp"
+#include "ck_tile/core/tensor/tile_distribution.hpp"
+#include "ck_tile/core/tensor/tile_distribution_encoding.hpp"
+#include "ck_tile/core/tensor/tile_elementwise.hpp"
+#include "ck_tile/core/tensor/tile_window.hpp"
+#include "ck_tile/core/utility/bit_cast.hpp"
+#include "ck_tile/core/utility/functional.hpp"
+#include "ck_tile/core/utility/ignore.hpp"
+#include "ck_tile/core/utility/magic_div.hpp"
+#include "ck_tile/core/utility/random.hpp"
+#include "ck_tile/core/utility/to_sequence.hpp"
+#include "ck_tile/core/utility/transpose_vectors.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
+#include "ck_tile/core/utility/unary_element_function.hpp"
--- a/include/ck_tile/core/README.md
+++ b/include/ck_tile/core/README.md
+# ck_tile/core #
+
+`ck_tile/core` contains every basic functions and structures to create a GPU kernel using `ck_tile`. User should only include `ck_tile/core.hpp` this single header to use all the functionality. Everything is under `ck_tile` namespace. The coding style under this folder should be similar to `std` (`snake_case` for structure/function, Camel for template types...)
+
+```
+algorithm/
+    coordinate transform and some other reusable algorithm
+arch/
+    contains some basic device building block like mma, buffer addressing, etc...
+container/
+    contains basic container data structure, array/sequence/tuple/...
+numeric/
+    data type, and data type related math
+tensor/
+    tensor descriptors and tile level API
+utility/
+    other utility function for both host/device
+```
--- a/include/ck_tile/core/algorithm/cluster_descriptor.hpp
+++ b/include/ck_tile/core/algorithm/cluster_descriptor.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/algorithm/coordinate_transform.hpp"
+#include "ck_tile/core/tensor/tensor_adaptor.hpp"
+#include "ck_tile/core/container/container_helper.hpp"
+#include "ck_tile/core/utility/functional.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
+
+namespace ck_tile {
+
+template <typename Lengths,
+          typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type>
+CK_TILE_HOST_DEVICE constexpr auto make_cluster_descriptor(
+    const Lengths& lengths,
+    ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type{})
+{
+    constexpr index_t ndim_low = Lengths::size();
+
+    const auto reordered_lengths = container_reorder_given_new2old(lengths, order);
+
+    const auto low_lengths = generate_tuple(
+        [&](auto idim_low) { return reordered_lengths[idim_low]; }, number<ndim_low>{});
+
+    const auto transform = make_merge_transform(low_lengths);
+
+    constexpr auto low_dim_old_top_ids = ArrangeOrder{};
+
+    constexpr auto up_dim_new_top_ids = sequence<0>{};
+
+    return make_single_stage_tensor_adaptor(
+        make_tuple(transform), make_tuple(low_dim_old_top_ids), make_tuple(up_dim_new_top_ids));
+}
+
+} // namespace ck_tile