Unverified commit ae8b307a authored by Po Yen Chen, committed by GitHub

Merge branch 'develop' into feature/support-readfirstlane-for-object-types

parents ad8bc60b ac9e01e2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "static_tensor.hpp"
namespace ck {
namespace detail {
// TODO: How to fix this? It uses a struct instead of a lambda because a lambda
// doesn't have a constructor
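// Returns the per-dimension access granularity: lcm(SrcScalarPerVector, DstScalarPerVector)
// when i is both the src and dst vector dimension, the respective ScalarPerVector when it
// is only one of them, and 1 otherwise.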
template <index_t SrcVectorDim,
index_t SrcScalarPerVector,
index_t DstVectorDim,
index_t DstScalarPerVector>
struct lambda_scalar_per_access_for_src_and_dst
{
__host__ __device__ constexpr auto operator()(index_t i) const
{
if(i == SrcVectorDim && i == DstVectorDim)
{
return math::lcm(SrcScalarPerVector, DstScalarPerVector);
}
else if(i == SrcVectorDim)
{
return SrcScalarPerVector;
}
else if(i == DstVectorDim)
{
return DstScalarPerVector;
}
else
{
return 1;
}
}
};
} // namespace detail
// Assume:
// 1. src_desc and dst_desc are not known at compile-time
// 2. SrcBuffer and DstBuffer are DynamicBuffer
// 3. src_slice_origin and dst_slice_origin are not known at compile-time
// 4. a thread-private (VGPR) scratch buffer is used to stage data between RunRead() and
//    RunWrite()
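//
// Typical usage (illustrative sketch only; the descriptor, buffer and step names below
// are placeholders, not defined in this file):
//
//   auto transfer = ThreadwiseTensorSliceTransfer_v3r3<...>{
//       src_desc, src_origin, src_op, dst_desc, dst0_desc, dst1_desc, dst_origin, dst_op};
//
//   for(index_t iter = 0; iter < num_iterations; ++iter)
//   {
//       transfer.RunRead(src_desc, src_buf);
//       transfer.RunWrite(dst_desc, dst_buf, dst0_desc, dst0_buf, dst1_desc, dst1_buf);
//       transfer.MoveSrcSliceWindow(src_desc, src_window_step);
//       transfer.MoveDstSliceWindow(dst_desc, dst0_desc, dst1_desc, dst_window_step);
//   }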
template <typename SliceLengths,
typename SrcElementwiseOperation,
typename DstElementwiseOperation,
InMemoryDataOperationEnum DstInMemOp,
typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename Dst0Desc,
typename Dst1Desc,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorDim,
index_t DstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector,
bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
// RunRead(), will be fused with MoveSrcSliceWindow to
// save addr computation
bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
// RunWrite(), will be fused with MoveDstSliceWindow to
// save addr computation
struct ThreadwiseTensorSliceTransfer_v3r3
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
using Dst0Coord = decltype(make_tensor_coordinate(Dst0Desc{}, Index{}));
using Dst1Coord = decltype(make_tensor_coordinate(Dst1Desc{}, Index{}));
using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
using Dst0CoordStep = decltype(make_tensor_coordinate_step(Dst0Desc{}, Index{}));
using Dst1CoordStep = decltype(make_tensor_coordinate_step(Dst1Desc{}, Index{}));
__device__ constexpr ThreadwiseTensorSliceTransfer_v3r3(
const SrcDesc& src_desc,
const Index& src_slice_origin,
const SrcElementwiseOperation& src_element_op,
const DstDesc& dst_desc,
const Dst0Desc& dst0_desc,
const Dst1Desc& dst1_desc,
const Index& dst_slice_origin,
const DstElementwiseOperation& dst_element_op)
: src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
dst0_coord_(make_tensor_coordinate(dst0_desc, dst_slice_origin)),
dst1_coord_(make_tensor_coordinate(dst1_desc, dst_slice_origin)),
src_element_op_(src_element_op),
dst_element_op_(dst_element_op)
{
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc,
const Dst0Desc& dst0_desc,
const Dst1Desc& dst1_desc,
const Index& dst_slice_origin_idx)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
dst0_coord_ = make_tensor_coordinate(dst0_desc, dst_slice_origin_idx);
dst1_coord_ = make_tensor_coordinate(dst1_desc, dst_slice_origin_idx);
}
template <typename SrcBuffer>
__device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
{
static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
"wrong!");
static_assert(
is_same<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>::value,
"wrong! SrcBuffer and SrcData data type are inconsistent");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto src_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto src_dim_access_order = SrcDimAccessOrder{};
constexpr auto ordered_src_access_lengths =
container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
// make forward steps
const auto src_forward_steps = generate_tuple(
[&](auto i) {
Index forward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(src_desc, forward_step_idx);
},
Number<nDim>{});
// make backward steps
const auto src_backward_steps = generate_tuple(
[&](auto i) {
Index backward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(src_desc, backward_step_idx);
},
Number<nDim>{});
// loop over tensor and copy
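// Each iteration reads SrcScalarPerVector contiguous scalars along SrcVectorDim as one
// vector, applies src_element_op_ to every scalar, and stores the result into the VGPR
// thread scratch. The traversal alternates forward/backward sweeps (zig-zag) so that
// consecutive accesses differ by one pre-built coordinate step.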
static_ford<decltype(ordered_src_access_lengths)>{}([&](auto ordered_src_access_idx) {
// judge move forward or move backward
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_idx[I0];
static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate src data index
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
: ordered_src_access_lengths[i] - 1 -
ordered_src_access_idx[i];
});
return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
src_scalar_per_access;
}();
constexpr auto src_data_idx_seq = generate_sequence_v2(
[&](auto i) { return Number<src_data_idx[i]>{}; }, Number<src_data_idx.Size()>{});
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
using src_vector_type = vector_type_maker_t<SrcData, SrcScalarPerVector>;
using src_vector_t = typename src_vector_type::type;
// copy data from src_buf into src_vector_container
auto src_vector_container = src_vector_type{
src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
// apply SrcElementwiseOperation on src_vector_container
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
src_vector_container.template AsType<SrcData>()(i) =
src_element_op_(src_vector_container.template AsType<SrcData>()[i]);
});
// copy data from src_vector_container into src_thread_scratch_
src_thread_scratch_.template SetAsType<src_vector_t>(
src_data_idx_seq, src_vector_container.template AsType<src_vector_t>()[I0]);
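// move_on_dim[i] is true when dimension i should advance after this access: i has not
// reached its last index and every faster-varying dimension after it (in access order)
// is already at its last index.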
constexpr auto move_on_dim = [&]() constexpr
{
StaticallyIndexedArray<bool, nDim> move_on_dim_;
static_for<0, nDim, 1>{}([&](auto i) {
move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
static_for<i + 1, nDim, 1>{}([&](auto j) {
move_on_dim_(i) &=
ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
});
});
return move_on_dim_;
}
();
// move src coord
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(move_on_dim[i])
{
if constexpr(forward_sweep[i])
{
move_tensor_coordinate(
src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
}
else
{
move_tensor_coordinate(
src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
}
}
});
});
// move src coordinate back to slice origin (or not)
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_reset_step =
make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep());
move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
}
}
__device__ void TransferDataFromSrcThreadScratchToDstThreadScratch()
{
#if !CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE
static_ford<SliceLengths>{}([&](auto idx) {
// convert from SrcData to DstData here
dst_thread_scratch_(idx) = type_convert<DstData>(src_thread_scratch_[idx]);
});
#else
// sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
// TODO make this logic more generic for more sub-dword datatype
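// When SrcVectorDim != DstVectorDim and both sides hold an even number of half_t
// elements, the data is rearranged with an in-register transpose: each step turns
// DstScalarPerVector source vectors into SrcScalarPerVector destination vectors via
// transpose_vectors. Otherwise it falls back to an element-wise copy with type_convert.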
if constexpr(SrcVectorDim != DstVectorDim &&
is_same<half_t, remove_cvref_t<SrcData>>::value &&
is_same<half_t, remove_cvref_t<DstData>>::value &&
SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0)
{
// each transpose does
// DstScalarPerVector # of src vectors in src_thread_scratch_
// SrcScalarPerVector # of dst vectors in dst_thread_scratch_
constexpr index_t num_src_vector = Number<DstScalarPerVector>{};
constexpr index_t num_dst_vector = Number<SrcScalarPerVector>{};
// Assume SrcVectorDim is not the same as DstVectorDim, so we do transpose
// TODO: make this logic generic for all scenario
static_assert(SrcVectorDim != DstVectorDim, "wrong");
constexpr auto src_scalar_step_in_vector = generate_sequence(
detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
constexpr auto dst_scalar_step_in_vector = generate_sequence(
detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
constexpr auto scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access_for_src_and_dst<SrcVectorDim,
SrcScalarPerVector,
DstVectorDim,
DstScalarPerVector>{},
Number<nDim>{});
constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
static_ford<decltype(access_lengths)>{}([&](auto access_idx) {
constexpr auto data_idx = access_idx * scalar_per_access;
constexpr auto data_idx_seq = generate_sequence_v2(
[&](auto i) { return Number<data_idx[i]>{}; }, Number<nDim>{});
// TODO type_convert is not used yet!!!!!
using src_vector_t = vector_type_maker_t<SrcData, SrcScalarPerVector>;
using dst_vector_t = vector_type_maker_t<DstData, DstScalarPerVector>;
// get DstScalarPerVector # of read-only references to src vectors from
// src_thread_scratch_
const auto src_vector_refs = generate_tie(
[&](auto i) -> const src_vector_t& {
// i increment corresponds to movement in DstVectorDim
return src_thread_scratch_.GetVectorTypeReference(
data_idx_seq + i * dst_scalar_step_in_vector);
},
Number<num_src_vector>{});
// get SrcScalarPerVector # of references to dst vectors from dst_thread_scratch_
auto dst_vector_refs = generate_tie(
[&](auto i) -> dst_vector_t& {
// i increment corresponds to movement in SrcVectorDim
return dst_thread_scratch_.GetVectorTypeReference(
data_idx_seq + i * src_scalar_step_in_vector);
},
Number<num_dst_vector>{});
// do data transpose
// TODO type_convert is not used yet!!!!!
transpose_vectors<SrcData, DstScalarPerVector, SrcScalarPerVector>{}(
src_vector_refs, dst_vector_refs);
});
}
else
{
static_ford<SliceLengths>{}([&](auto idx) {
// convert from SrcData to DstData here
dst_thread_scratch_(idx) = type_convert<DstData>(src_thread_scratch_[idx]);
});
}
#endif
}
template <typename DstBuffer, typename Dst0Buffer, typename Dst1Buffer>
__device__ void RunWrite(const DstDesc& dst_desc,
DstBuffer& dst_buf,
const Dst0Desc& dst0_desc,
const Dst0Buffer& dst0_buf,
const Dst1Desc& dst1_desc,
const Dst1Buffer& dst1_buf)
{
// if there is transpose, it's done here
// TODO move this elsewhere
TransferDataFromSrcThreadScratchToDstThreadScratch();
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
"wrong!");
static_assert(
is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
"wrong! DstBuffer and DstData data type are inconsistent");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
// dst scalar per access on each dim
// TODO: don't use this
constexpr auto dst_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
constexpr auto dst_dim_access_order = DstDimAccessOrder{};
constexpr auto ordered_dst_access_lengths =
container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
// make forward steps
const auto dst_forward_steps = generate_tuple(
[&](auto i) {
Index forward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(dst_desc, forward_step_idx);
},
Number<nDim>{});
// make forward steps: dst0
// WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
// DstScalarPerVector
// TODO: fix this
const auto dst0_forward_steps = generate_tuple(
[&](auto i) {
Index forward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(dst0_desc, forward_step_idx);
},
Number<nDim>{});
// make forward steps: dst1
// WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
// DstScalarPerVector
// TODO: fix this
const auto dst1_forward_steps = generate_tuple(
[&](auto i) {
Index forward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(dst1_desc, forward_step_idx);
},
Number<nDim>{});
// make backward steps
const auto dst_backward_steps = generate_tuple(
[&](auto i) {
Index backward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(dst_desc, backward_step_idx);
},
Number<nDim>{});
// make backward steps: dst0
// WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
// DstScalarPerVector
// TODO: fix this
const auto dst0_backward_steps = generate_tuple(
[&](auto i) {
Index backward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(dst0_desc, backward_step_idx);
},
Number<nDim>{});
// make backward steps: dst1
// WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
// DstScalarPerVector
// TODO: fix this
const auto dst1_backward_steps = generate_tuple(
[&](auto i) {
Index backward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
});
return make_tensor_coordinate_step(dst1_desc, backward_step_idx);
},
Number<nDim>{});
// loop over tensor and copy
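// Each iteration loads one DstScalarPerVector-wide vector from the thread scratch,
// applies dst_element_op_ to every scalar, and writes it to dst_buf, using the same
// zig-zag traversal as RunRead(). Note that dst0_buf/dst1_buf are not written in this
// version.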
static_ford<decltype(ordered_dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
// judge move forward or move backward
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_idx[I0];
static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate dst data index
constexpr auto dst_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
: ordered_dst_access_lengths[i] - 1 -
ordered_dst_access_idx[i];
});
return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
dst_scalar_per_access;
}();
constexpr auto dst_data_idx_seq = generate_sequence_v2(
[&](auto i) { return Number<dst_data_idx[i]>{}; }, Number<dst_data_idx.Size()>{});
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
using dst_vector_type = vector_type_maker_t<DstData, DstScalarPerVector>;
using dst_vector_t = typename dst_vector_type::type;
// copy data from dst_thread_scratch_ into dst_vector_container
auto dst_vector_container = dst_vector_type{
dst_thread_scratch_.template GetAsType<dst_vector_t>(dst_data_idx_seq)};
// apply DstElementwiseOperation on dst_vector_container
static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
dst_vector_container.template AsType<DstData>()(i) =
dst_element_op_(dst_vector_container.template AsType<DstData>()[i]);
});
// copy data from dst_vector_container to dst_buf
dst_buf.template Set<dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>()[I0]);
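// Same rule as in RunRead(): advance dimension i only when it has not reached its last
// index and every faster-varying dimension after it is already at its last index.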
constexpr auto move_on_dim = [&]() constexpr
{
StaticallyIndexedArray<bool, nDim> move_on_dim_;
static_for<0, nDim, 1>{}([&](auto i) {
move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
static_for<i + 1, nDim, 1>{}([&](auto j) {
move_on_dim_(i) &=
ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
});
});
return move_on_dim_;
}
();
// move dst coord
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(move_on_dim[i])
{
if constexpr(forward_sweep[i])
{
move_tensor_coordinate(
dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
}
else
{
move_tensor_coordinate(
dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
}
}
});
});
// move dst coordinate back to slice origin (or not)
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
}
}
__device__ static constexpr auto GetSrcCoordinateResetStep()
{
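// Returns the step that moves src_coord_ from its position after the final RunRead()
// access back to the slice origin (the negative of the last visited data index).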
constexpr auto I0 = Number<0>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto src_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto src_dim_access_order = SrcDimAccessOrder{};
constexpr auto ordered_src_access_lengths =
container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
// judge move forward or move backward during the last iteration
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
// dim 0 always sweeps forward; compute the sweep direction of the remaining dims
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_lengths[I0] - 1;
static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate src data index after the last iteration in RunRead(), assuming it has not
// been reset by RunRead()
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
});
return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
src_scalar_per_access;
}();
//
constexpr auto reset_src_data_step = [&]() {
Index reset_src_data_step_;
static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; });
return reset_src_data_step_;
}();
return reset_src_data_step;
}
__device__ static constexpr auto GetDstCoordinateResetStep()
{
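// Returns the step that moves dst_coord_ from its position after the final RunWrite()
// access back to the slice origin.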
constexpr auto I0 = Number<0>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto dst_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
constexpr auto dst_dim_access_order = DstDimAccessOrder{};
constexpr auto ordered_dst_access_lengths =
container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
// judge move forward or move backward during the last iteration
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_lengths[I0] - 1;
static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
// calculate dst data index after the last iteration in RunWrite(), assuming it has not
// been reset by RunWrite()
constexpr auto dst_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
});
return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
dst_scalar_per_access;
}();
//
constexpr auto reset_dst_data_step = [&]() {
Index reset_dst_data_step_;
static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
return reset_dst_data_step_;
}();
return reset_dst_data_step;
}
// src_slice_origin_step_idx needs to be known at compile-time for performance reasons
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx =
SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetSrcCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
// dst_slice_origin_step_idx needs to be known at compile-time for performance reasons
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Dst0Desc& dst0_desc,
const Dst1Desc& dst1_desc,
const Index& dst_slice_origin_step_idx)
{
// if dst coord was not reset by RunWrite(), then need to adjust the step here
const auto adjusted_step_idx =
DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetDstCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
move_tensor_coordinate(dst0_desc, dst0_coord_, adjusted_step);
move_tensor_coordinate(dst1_desc, dst1_coord_, adjusted_step);
}
__device__ static constexpr auto GetSrcThreadScratchDescriptor()
{
constexpr auto src_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto src_access_lengths_and_vector_length = container_push_back(
sequence_to_tuple_of_number(src_access_lengths), Number<SrcScalarPerVector>{});
// 1st stage of transforms
constexpr auto desc0 =
make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length);
// 2nd stage of transforms
constexpr auto transforms = generate_tuple(
[&](auto i) {
if constexpr(i == SrcVectorDim)
{
return make_merge_transform_v3_division_mod(
make_tuple(src_access_lengths_and_vector_length[i],
src_access_lengths_and_vector_length[Number<nDim>{}]));
}
else
{
return make_pass_through_transform(src_access_lengths_and_vector_length[i]);
}
},
Number<nDim>{});
constexpr auto low_dim_idss = generate_tuple(
[&](auto i) {
if constexpr(i == SrcVectorDim)
{
return Sequence<i.value, nDim>{};
}
else
{
return Sequence<i.value>{};
}
},
Number<nDim>{});
constexpr auto up_dim_idss =
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
}
__device__ static constexpr auto GetDstThreadScratchDescriptor()
{
// 1st stage of transforms
constexpr auto dst_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
constexpr auto dst_access_lengths_and_vector_length = container_push_back(
sequence_to_tuple_of_number(dst_access_lengths), Number<DstScalarPerVector>{});
constexpr auto desc0 =
make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length);
// 2nd stage of transforms
constexpr auto transforms = generate_tuple(
[&](auto i) {
if constexpr(i == DstVectorDim)
{
return make_merge_transform_v3_division_mod(
make_tuple(dst_access_lengths_and_vector_length[i],
dst_access_lengths_and_vector_length[Number<nDim>{}]));
}
else
{
return make_pass_through_transform(dst_access_lengths_and_vector_length[i]);
}
},
Number<nDim>{});
constexpr auto low_dim_idss = generate_tuple(
[&](auto i) {
if constexpr(i == DstVectorDim)
{
return Sequence<i.value, nDim>{};
}
else
{
return Sequence<i.value>{};
}
},
Number<nDim>{});
constexpr auto up_dim_idss =
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
}
private:
static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){};
static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){};
StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
SrcData,
SrcScalarPerVector,
decltype(src_thread_scratch_desc_),
true>
src_thread_scratch_;
StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
DstData,
DstScalarPerVector,
decltype(dst_thread_scratch_desc_),
true>
dst_thread_scratch_;
SrcCoord src_coord_;
DstCoord dst_coord_;
Dst0Coord dst0_coord_;
Dst1Coord dst1_coord_;
const SrcElementwiseOperation src_element_op_;
const DstElementwiseOperation dst_element_op_;
};
} // namespace ck
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_AMD_LLVM_INTRINSIC_HPP
#define CK_AMD_LLVM_INTRINSIC_HPP
#include "data_type.hpp"
namespace ck {
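// llvm.amdgcn.readfirstlane returns the value of the argument held by the first active
// lane of the wavefront, broadcast to all lanes as a wave-uniform (scalar) value.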
__device__ int32_t llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane");
} // namespace ck
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_PRINT_HPP
#define CK_PRINT_HPP
#include "array.hpp"
#include "statically_indexed_array.hpp"
#include "container_helper.hpp"
#include "sequence.hpp"
namespace ck {
template <typename T>
__host__ __device__ void print_array(const char* s, T a)
{
constexpr index_t nsize = a.Size();
printf("%s size %d, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", int32_t{a[i]}); });
printf("}\n");
}
} // namespace ck
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <memory>
#include <sstream>
#include <thread>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/utility/host_tensor.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
template <typename ADataType,
typename BDataType,
typename C0DataType,
typename CDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
struct ReferenceGemmBias2D : public device::BaseOperator
{
// Argument
struct Argument : public device::BaseArgument
{
Argument(const Tensor<ADataType>& a_m_k,
const Tensor<BDataType>& b_k_n,
const Tensor<C0DataType>& c0_m_n,
Tensor<CDataType>& c_m_n,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
: a_m_k_{a_m_k},
b_k_n_{b_k_n},
c0_m_n_{c0_m_n},
c_m_n_{c_m_n},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op}
{
}
const Tensor<ADataType>& a_m_k_;
const Tensor<BDataType>& b_k_n_;
const Tensor<C0DataType>& c0_m_n_;
Tensor<CDataType>& c_m_n_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
};
// Invoker
struct Invoker : public device::BaseInvoker
{
using Argument = ReferenceGemmBias2D::Argument;
float Run(const Argument& arg)
{
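// Reference GEMM with 2-D bias: acc(m, n) = sum_k a_op(A(m, k)) * b_op(B(k, n)),
// then c_element_op_ combines acc(m, n) with the bias C0(m, n) into C(m, n).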
auto f_mk_kn_mn = [&](auto m, auto n) {
const int K = arg.a_m_k_.mDesc.GetLengths()[1];
AccDataType a = 0;
AccDataType b = 0;
AccDataType acc = 0;
for(int k = 0; k < K; ++k)
{
arg.a_element_op_(a, ck::type_convert<AccDataType>(arg.a_m_k_(m, k)));
arg.b_element_op_(b, ck::type_convert<AccDataType>(arg.b_k_n_(k, n)));
acc += a * b;
}
CDataType cast_acc = static_cast<CDataType>(acc);
arg.c_element_op_(arg.c_m_n_(m, n), cast_acc, arg.c0_m_n_(m, n));
};
make_ParallelTensorFunctor(
f_mk_kn_mn, arg.c_m_n_.mDesc.GetLengths()[0], arg.c_m_n_.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
return 0;
}
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
static auto MakeArgument(const Tensor<ADataType>& a_m_k,
const Tensor<BDataType>& b_k_n,
const Tensor<C0DataType>& c0_m_n,
Tensor<CDataType>& c_m_n,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
return Argument{a_m_k, b_k_n, c0_m_n, c_m_n, a_element_op, b_element_op, c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceGemmBias2D"
<< std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <memory>
#include <sstream>
#include <thread>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/utility/host_tensor.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
template <typename ADataType,
typename BDataType,
typename CDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
struct ReferenceGemmBiasActivation : public device::BaseOperator
{
// Argument
struct Argument : public device::BaseArgument
{
Argument(const Tensor<ADataType>& a_m_k,
const Tensor<BDataType>& b_k_n,
Tensor<CDataType>& c_m_n,
const Tensor<CDataType>& c0_n,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
: a_m_k_{a_m_k},
b_k_n_{b_k_n},
c_m_n_{c_m_n},
c0_n_{c0_n},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op}
{
}
const Tensor<ADataType>& a_m_k_;
const Tensor<BDataType>& b_k_n_;
Tensor<CDataType>& c_m_n_;
const Tensor<CDataType>& c0_n_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
};
// Invoker
struct Invoker : public device::BaseInvoker
{
using Argument = ReferenceGemmBiasActivation::Argument;
float Run(const Argument& arg)
{
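// Reference GEMM with per-column bias: the fused c_element_op_ computes C(m, n) from
// acc(m, n) = sum_k a_op(A(m, k)) * b_op(B(k, n)) and the bias C0(n).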
auto f_mk_kn_mn = [&](auto m, auto n) {
const int K = arg.a_m_k_.mDesc.GetLengths()[1];
float v_acc = 0;
for(int k = 0; k < K; ++k)
{
float v_a;
float v_b;
arg.a_element_op_(v_a, static_cast<float>(arg.a_m_k_(m, k)));
arg.b_element_op_(v_b, static_cast<float>(arg.b_k_n_(k, n)));
v_acc += v_a * v_b;
}
float v_c;
arg.c_element_op_(v_c, v_acc, static_cast<float>(arg.c0_n_(n)));
arg.c_m_n_(m, n) = v_c;
};
make_ParallelTensorFunctor(
f_mk_kn_mn, arg.c_m_n_.mDesc.GetLengths()[0], arg.c_m_n_.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
return 0;
}
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
static auto MakeArgument(const Tensor<ADataType>& a_m_k,
const Tensor<BDataType>& b_k_n,
Tensor<CDataType>& c_m_n,
const Tensor<CDataType>& c0_n,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
return Argument{a_m_k, b_k_n, c_m_n, c0_n, a_element_op, b_element_op, c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceGemmBiasActivation"
<< std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <memory>
#include <sstream>
#include <thread>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/utility/host_tensor.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
template <typename ADataType,
typename BDataType,
typename CDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
struct ReferenceGemmBiasActivationAdd : public device::BaseOperator
{
// Argument
struct Argument : public device::BaseArgument
{
Argument(const Tensor<ADataType>& a_m_k,
const Tensor<BDataType>& b_k_n,
Tensor<CDataType>& c_m_n,
const Tensor<CDataType>& c0_n,
const Tensor<CDataType>& c1_m_n,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
: a_m_k_{a_m_k},
b_k_n_{b_k_n},
c_m_n_{c_m_n},
c0_n_{c0_n},
c1_m_n_{c1_m_n},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op}
{
}
const Tensor<ADataType>& a_m_k_;
const Tensor<BDataType>& b_k_n_;
Tensor<CDataType>& c_m_n_;
const Tensor<CDataType>& c0_n_;
const Tensor<CDataType>& c1_m_n_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
};
// Invoker
struct Invoker : public device::BaseInvoker
{
using Argument = ReferenceGemmBiasActivationAdd::Argument;
float Run(const Argument& arg)
{
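// Reference GEMM with per-column bias and residual add: c_element_op_ computes C(m, n)
// from acc(m, n), the bias C0(n), and the residual C1(m, n).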
auto f_mk_kn_mn = [&](auto m, auto n) {
const int K = arg.a_m_k_.mDesc.GetLengths()[1];
float v_acc = 0;
for(int k = 0; k < K; ++k)
{
float v_a;
float v_b;
arg.a_element_op_(v_a, static_cast<float>(arg.a_m_k_(m, k)));
arg.b_element_op_(v_b, static_cast<float>(arg.b_k_n_(k, n)));
v_acc += v_a * v_b;
}
float v_c;
arg.c_element_op_(v_c,
v_acc,
static_cast<float>(arg.c0_n_(n)),
static_cast<float>(arg.c1_m_n_(m, n)));
arg.c_m_n_(m, n) = v_c;
};
make_ParallelTensorFunctor(
f_mk_kn_mn, arg.c_m_n_.mDesc.GetLengths()[0], arg.c_m_n_.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
return 0;
}
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
static auto MakeArgument(const Tensor<ADataType>& a_m_k,
const Tensor<BDataType>& b_k_n,
Tensor<CDataType>& c_m_n,
const Tensor<CDataType>& c0_n,
const Tensor<CDataType>& c1_m_n,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
return Argument{
a_m_k, b_k_n, c_m_n, c0_n, c1_m_n, a_element_op, b_element_op, c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceGemmBiasActivationAdd"
<< std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <thread>
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
template <index_t InOutRank,
index_t WindowRank,
typename InDataType,
typename OutDataType,
typename ComputeDataType,
typename IndexDataType,
ck::ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool OutputIndex>
struct ReferencePoolingFwd : public device::BaseOperator
{
using ReduceOperation = typename ck::reduce_binary_operator<ReduceOpId>::opType;
// Argument
struct Argument : public device::BaseArgument
{
Argument(const Tensor<InDataType>& in,
Tensor<OutDataType>& out,
Tensor<IndexDataType>& out_indices,
const std::vector<ck::index_t>& window_spatial_lengths,
const std::vector<ck::index_t>& window_strides,
const std::vector<ck::index_t>& in_left_pads,
const std::vector<ck::index_t>& /*in_right_pads*/)
: in_(in),
out_(out),
out_indices_(out_indices),
window_spatial_lengths_(window_spatial_lengths),
window_strides_(window_strides),
in_left_pads_(in_left_pads),
reduceLength_(1)
{
static_for<0, WindowRank, 1>{}(
[&](auto I) { reduceLength_ *= window_spatial_lengths[I]; });
}
const Tensor<InDataType>& in_;
Tensor<OutDataType>& out_;
Tensor<IndexDataType>& out_indices_;
const std::vector<ck::index_t>& window_spatial_lengths_;
const std::vector<ck::index_t>& window_strides_;
const std::vector<ck::index_t>& in_left_pads_;
int reduceLength_;
};
// Invoker
struct Invoker : public device::BaseInvoker
{
float RunPooling3dFwd(const Argument& arg)
{
auto elementwise_ops =
ck::reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
arg.reduceLength_);
auto in_elementwise_op = std::get<0>(elementwise_ops);
auto acc_elementwise_op = std::get<1>(elementwise_ops);
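// Window tap (z, y, x) of output element (n, c, do, ho, wo) maps to input coordinate
// di = do * stride_d + z - left_pad_d (and likewise for hi, wi); taps that fall outside
// the input are skipped, i.e. they contribute the identity value of the reduction.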
if constexpr(!OutputIndex)
{
using Accumulation = ck::detail::
AccumulateWithNanCheck<PropagateNan, ReduceOperation, ComputeDataType>;
auto f_ncdhw = [&](auto n, auto c, auto do_, auto ho, auto wo) {
auto accuVal = ReduceOperation::template GetIdentityValue<ComputeDataType>();
for(ck::index_t z = 0; z < arg.window_spatial_lengths_[0]; ++z)
{
ck::index_t di = do_ * arg.window_strides_[0] + z - arg.in_left_pads_[0];
for(ck::index_t y = 0; y < arg.window_spatial_lengths_[1]; ++y)
{
ck::index_t hi = ho * arg.window_strides_[1] + y - arg.in_left_pads_[1];
for(ck::index_t x = 0; x < arg.window_spatial_lengths_[2]; ++x)
{
ck::index_t wi =
wo * arg.window_strides_[2] + x - arg.in_left_pads_[2];
if(di >= 0 &&
di < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[2]) &&
hi >= 0 &&
hi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[3]) &&
wi >= 0 &&
wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[4]))
{
ComputeDataType currVal =
static_cast<ComputeDataType>(arg.in_(n, c, di, hi, wi));
in_elementwise_op(currVal, currVal);
Accumulation::Calculate(accuVal, currVal);
}
}
}
}
acc_elementwise_op(accuVal, accuVal);
arg.out_(n, c, do_, ho, wo) = accuVal;
};
make_ParallelTensorFunctor(f_ncdhw,
arg.out_.mDesc.GetLengths()[0],
arg.out_.mDesc.GetLengths()[1],
arg.out_.mDesc.GetLengths()[2],
arg.out_.mDesc.GetLengths()[3],
arg.out_.mDesc.GetLengths()[4])(
std::thread::hardware_concurrency());
}
else
{
using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
ReduceOperation,
ComputeDataType,
IndexDataType>;
auto f_ncdhw = [&](auto n, auto c, auto do_, auto ho, auto wo) {
auto accuVal = ReduceOperation::template GetIdentityValue<ComputeDataType>();
IndexDataType accuIndex = 0;
for(ck::index_t z = 0; z < arg.window_spatial_lengths_[0]; ++z)
{
ck::index_t di = do_ * arg.window_strides_[0] + z - arg.in_left_pads_[0];
for(ck::index_t y = 0; y < arg.window_spatial_lengths_[1]; ++y)
{
ck::index_t hi = ho * arg.window_strides_[1] + y - arg.in_left_pads_[1];
for(ck::index_t x = 0; x < arg.window_spatial_lengths_[2]; ++x)
{
ck::index_t wi =
wo * arg.window_strides_[2] + x - arg.in_left_pads_[2];
if(di >= 0 &&
di < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[2]) &&
hi >= 0 &&
hi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[3]) &&
wi >= 0 &&
wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[4]))
{
ComputeDataType currVal =
static_cast<ComputeDataType>(arg.in_(n, c, di, hi, wi));
IndexDataType currIndex =
arg.in_.GetOffsetFromMultiIndex(n, c, di, hi, wi);
in_elementwise_op(currVal, currVal);
Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
}
}
}
}
acc_elementwise_op(accuVal, accuVal);
arg.out_(n, c, do_, ho, wo) = accuVal;
arg.out_indices_(n, c, do_, ho, wo) = accuIndex;
};
make_ParallelTensorFunctor(f_ncdhw,
arg.out_.mDesc.GetLengths()[0],
arg.out_.mDesc.GetLengths()[1],
arg.out_.mDesc.GetLengths()[2],
arg.out_.mDesc.GetLengths()[3],
arg.out_.mDesc.GetLengths()[4])(
std::thread::hardware_concurrency());
}
return 0;
}
float RunPooling2dFwd(const Argument& arg)
{
auto elementwise_ops =
ck::reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
arg.reduceLength_);
auto in_elementwise_op = std::get<0>(elementwise_ops);
auto acc_elementwise_op = std::get<1>(elementwise_ops);
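// Same windowing as the 3-D case above, restricted to the H and W dimensions.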
if constexpr(!OutputIndex)
{
using Accumulation = ck::detail::
AccumulateWithNanCheck<PropagateNan, ReduceOperation, ComputeDataType>;
auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
auto accuVal = ReduceOperation::template GetIdentityValue<ComputeDataType>();
for(ck::index_t y = 0; y < arg.window_spatial_lengths_[0]; ++y)
{
ck::index_t hi = ho * arg.window_strides_[0] + y - arg.in_left_pads_[0];
for(ck::index_t x = 0; x < arg.window_spatial_lengths_[1]; ++x)
{
ck::index_t wi = wo * arg.window_strides_[1] + x - arg.in_left_pads_[1];
if(hi >= 0 &&
hi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[2]) &&
wi >= 0 &&
wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[3]))
{
ComputeDataType currVal =
static_cast<ComputeDataType>(arg.in_(n, c, hi, wi));
in_elementwise_op(currVal, currVal);
Accumulation::Calculate(accuVal, currVal);
}
}
}
acc_elementwise_op(accuVal, accuVal);
arg.out_(n, c, ho, wo) = accuVal;
};
make_ParallelTensorFunctor(f_nchw,
arg.out_.mDesc.GetLengths()[0],
arg.out_.mDesc.GetLengths()[1],
arg.out_.mDesc.GetLengths()[2],
arg.out_.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
}
else
{
using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
ReduceOperation,
ComputeDataType,
IndexDataType>;
auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
auto accuVal = ReduceOperation::template GetIdentityValue<ComputeDataType>();
IndexDataType accuIndex = 0;
for(ck::index_t y = 0; y < arg.window_spatial_lengths_[0]; ++y)
{
ck::index_t hi = ho * arg.window_strides_[0] + y - arg.in_left_pads_[0];
for(ck::index_t x = 0; x < arg.window_spatial_lengths_[1]; ++x)
{
ck::index_t wi = wo * arg.window_strides_[1] + x - arg.in_left_pads_[1];
if(hi >= 0 &&
hi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[2]) &&
wi >= 0 &&
wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[3]))
{
ComputeDataType currVal =
static_cast<ComputeDataType>(arg.in_(n, c, hi, wi));
IndexDataType currIndex =
arg.in_.GetOffsetFromMultiIndex(n, c, hi, wi);
in_elementwise_op(currVal, currVal);
Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
}
}
}
acc_elementwise_op(accuVal, accuVal);
arg.out_(n, c, ho, wo) = accuVal;
arg.out_indices_(n, c, ho, wo) = accuIndex;
};
make_ParallelTensorFunctor(f_nchw,
arg.out_.mDesc.GetLengths()[0],
arg.out_.mDesc.GetLengths()[1],
arg.out_.mDesc.GetLengths()[2],
arg.out_.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
}
return 0;
}
float Run(const Argument& arg)
{
// TODO - support generic pooling
if constexpr(InOutRank == 5 && WindowRank == 3)
return RunPooling3dFwd(arg);
else if constexpr(InOutRank == 4 && WindowRank == 2)
return RunPooling2dFwd(arg);
else
throw std::runtime_error("Only support pooling3d or pooling2d so far");
}
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
static auto MakeArgument(const Tensor<InDataType>& in,
Tensor<OutDataType>& out,
Tensor<IndexDataType>& out_indices,
const std::vector<ck::index_t>& window_spatial_lengths,
const std::vector<ck::index_t>& window_strides,
const std::vector<ck::index_t>& in_left_pads,
const std::vector<ck::index_t>& in_right_pads)
{
return Argument{in,
out,
out_indices,
window_spatial_lengths,
window_strides,
in_left_pads,
in_right_pads};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferencePoolingFwd"
<< std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
@@ -3,8 +3,8 @@
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
......
@@ -3,8 +3,8 @@
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp"
......
@@ -3,8 +3,8 @@
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
......
@@ -3,8 +3,8 @@
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp"
......
@@ -3,8 +3,8 @@
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
......
@@ -3,10 +3,8 @@
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
......
@@ -3,10 +3,8 @@
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
......
@@ -3,8 +3,8 @@
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
......
@@ -3,8 +3,8 @@
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
......
@@ -3,8 +3,7 @@
#pragma once
#include <cstdlib>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
......
@@ -4,7 +4,7 @@
#pragma once
#include <cstdlib>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
......
@@ -3,10 +3,8 @@
#pragma once
#include <cstdlib>
#include <memory>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
......
@@ -3,10 +3,8 @@
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
......