onnxruntime

78e355fd · gaoqiong · fae08684 · 78e355fd · 78e355fd · 78e355fd
Commit 78e355fd authored Dec 20, 2022 by gaoqiong
20 changed files
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp"
+
+namespace ck {
+
+// this version does following things to avoid scratch memory issue
+// 1. Use StaticallyIndexedArray instead of C array for thread buffer
+// 2. It does not keep reference to tensor descriptor
+// 3. Run() does not construct new tensor coordinate
+template <typename ThreadGroup,
+          typename ElementwiseOperation,
+          InMemoryDataOperationEnum DstInMemOp,
+          typename SliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename Src0Data,
+          typename Src1Data,
+          typename DstData,
+          typename Src0Desc,
+          typename Src1Desc,
+          typename DstDesc,
+          typename DimAccessOrder,
+          index_t VectorDim,
+          index_t ScalarPerVector,
+          bool ThreadTransferSrc0ResetCoordinateAfterRun,
+          bool ThreadTransferSrc1ResetCoordinateAfterRun,
+          bool ThreadTransferDstResetCoordinateAfterRun>
+struct ThreadGroupTensorSliceTransfer_v6r2
+{
+    static constexpr index_t nDim = remove_reference_t<Src0Desc>::GetNumOfDimension();
+
+    static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{};
+
+    using Index = MultiIndex<nDim>;
+
+    __device__ constexpr ThreadGroupTensorSliceTransfer_v6r2(const Src0Desc& src0_desc,
+                                                             const Index& src0_block_slice_origin,
+                                                             const Src1Desc& src1_desc,
+                                                             const Index& src1_block_slice_origin,
+                                                             const DstDesc& dst_desc,
+                                                             const Index& dst_block_slice_origin,
+                                                             const ElementwiseOperation& element_op)
+        : threadwise_transfer_(src0_desc,
+                               make_zero_multi_index<nDim>(),
+                               src1_desc,
+                               make_zero_multi_index<nDim>(),
+                               dst_desc,
+                               make_zero_multi_index<nDim>(),
+                               element_op)
+
+    {
+        static_assert(nDim == remove_cvref_t<Src0Desc>::GetNumOfDimension() &&
+                          nDim == remove_cvref_t<Src1Desc>::GetNumOfDimension() &&
+                          nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
+                          nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == DimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        static_assert(
+            is_same<SliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
+            "wrong! threads should be mapped to cover entire slicing window");
+
+        static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
+                      "wrong! ThreadGroup::GetNumOfThread() too small");
+
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
+                make_multi_index(ThreadGroup::GetThreadId()));
+
+            const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
+
+            threadwise_transfer_.SetSrc0SliceOrigin(
+                src0_desc, src0_block_slice_origin + thread_data_idx_begin);
+            threadwise_transfer_.SetSrc1SliceOrigin(
+                src1_desc, src1_block_slice_origin + thread_data_idx_begin);
+            threadwise_transfer_.SetDstSliceOrigin(dst_desc,
+                                                   dst_block_slice_origin + thread_data_idx_begin);
+        }
+    }
+
+    template <typename Src0Buffer, typename Src1Buffer, typename DstBuffer>
+    __device__ void Run(const Src0Desc& src0_desc,
+                        const Src0Buffer& src0_buf,
+                        const Src1Desc& src1_desc,
+                        const Src1Buffer& src1_buf,
+                        const DstDesc& dst_desc,
+                        DstBuffer& dst_buf)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.Run(src0_desc, src0_buf, src1_desc, src1_buf, dst_desc, dst_buf);
+        }
+    }
+
+    __device__ void MoveSrc0SliceWindow(const Src0Desc& src0_desc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrc0SliceWindow(src0_desc, step);
+        }
+    }
+
+    __device__ void MoveSrc1SliceWindow(const Src1Desc& src1_desc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrc1SliceWindow(src1_desc, step);
+        }
+    }
+
+    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveDstSliceWindow(dst_desc, step);
+        }
+    }
+
+    private:
+    static constexpr auto thread_cluster_desc_ =
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+
+    using ThreadwiseTransfer =
+        ThreadwiseTensorSliceTransfer_v6r2<Src0Data,
+                                           Src1Data,
+                                           DstData,
+                                           Src0Desc,
+                                           Src1Desc,
+                                           DstDesc,
+                                           ElementwiseOperation,
+                                           decltype(thread_slice_lengths),
+                                           DimAccessOrder,
+                                           VectorDim,
+                                           ScalarPerVector,
+                                           DstInMemOp,
+                                           ThreadTransferSrc0ResetCoordinateAfterRun,
+                                           ThreadTransferSrc1ResetCoordinateAfterRun,
+                                           ThreadTransferDstResetCoordinateAfterRun>;
+
+    ThreadwiseTransfer threadwise_transfer_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp"
+
+namespace ck {
+
+// this version does following things to avoid scratch memory issue
+// 1. Use StaticallyIndexedArray instead of C array for thread buffer
+// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
+// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
+template <typename ThreadGroup,
+          typename ElementwiseOperation,
+          InMemoryDataOperationEnum DstInMemOp,
+          typename SliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename Src0Data,
+          typename Src1Data,
+          typename Src2Data,
+          typename DstData,
+          typename Src0Desc,
+          typename Src1Desc,
+          typename Src2Desc,
+          typename DstDesc,
+          typename DimAccessOrder,
+          index_t VectorDim,
+          index_t ScalarPerVector,
+          bool ThreadTransferSrc0ResetCoordinateAfterRun,
+          bool ThreadTransferSrc1ResetCoordinateAfterRun,
+          bool ThreadTransferSrc2ResetCoordinateAfterRun,
+          bool ThreadTransferDstResetCoordinateAfterRun>
+struct ThreadGroupTensorSliceTransfer_v6r3
+{
+    static constexpr index_t nDim = remove_reference_t<Src0Desc>::GetNumOfDimension();
+
+    static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{};
+
+    using Index = MultiIndex<nDim>;
+
+    __device__ constexpr ThreadGroupTensorSliceTransfer_v6r3(const Src0Desc& src0_desc,
+                                                             const Index& src0_block_slice_origin,
+                                                             const Src1Desc& src1_desc,
+                                                             const Index& src1_block_slice_origin,
+                                                             const Src2Desc& src2_desc,
+                                                             const Index& src2_block_slice_origin,
+                                                             const DstDesc& dst_desc,
+                                                             const Index& dst_block_slice_origin,
+                                                             const ElementwiseOperation& element_op)
+        : threadwise_transfer_(src0_desc,
+                               make_zero_multi_index<nDim>(),
+                               src1_desc,
+                               make_zero_multi_index<nDim>(),
+                               src2_desc,
+                               make_zero_multi_index<nDim>(),
+                               dst_desc,
+                               make_zero_multi_index<nDim>(),
+                               element_op)
+
+    {
+        static_assert(nDim == remove_cvref_t<Src0Desc>::GetNumOfDimension() &&
+                          nDim == remove_cvref_t<Src1Desc>::GetNumOfDimension() &&
+                          nDim == remove_cvref_t<Src2Desc>::GetNumOfDimension() &&
+                          nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
+                          nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == DimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        static_assert(
+            is_same<SliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
+            "wrong! threads should be mapped to cover entire slicing window");
+
+        static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
+                      "wrong! ThreadGroup::GetNumOfThread() too small");
+
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
+                make_multi_index(get_thread_local_1d_id()));
+
+            const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
+
+            threadwise_transfer_.SetSrc0SliceOrigin(
+                src0_desc, src0_block_slice_origin + thread_data_idx_begin);
+            threadwise_transfer_.SetSrc1SliceOrigin(
+                src1_desc, src1_block_slice_origin + thread_data_idx_begin);
+            threadwise_transfer_.SetSrc2SliceOrigin(
+                src2_desc, src2_block_slice_origin + thread_data_idx_begin);
+            threadwise_transfer_.SetDstSliceOrigin(dst_desc,
+                                                   dst_block_slice_origin + thread_data_idx_begin);
+        }
+    }
+
+    template <typename Src0Buffer, typename Src1Buffer, typename Src2Buffer, typename DstBuffer>
+    __device__ void Run(const Src0Desc& src0_desc,
+                        const Src0Buffer& src0_buf,
+                        const Src1Desc& src1_desc,
+                        const Src1Buffer& src1_buf,
+                        const Src2Desc& src2_desc,
+                        const Src2Buffer& src2_buf,
+                        const DstDesc& dst_desc,
+                        DstBuffer& dst_buf)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.Run(
+                src0_desc, src0_buf, src1_desc, src1_buf, src2_desc, src2_buf, dst_desc, dst_buf);
+        }
+    }
+
+    __device__ void MoveSrc0SliceWindow(const Src0Desc& src0_desc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrc0SliceWindow(src0_desc, step);
+        }
+    }
+
+    __device__ void MoveSrc1SliceWindow(const Src1Desc& src1_desc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrc1SliceWindow(src1_desc, step);
+        }
+    }
+
+    __device__ void MoveSrc2SliceWindow(const Src2Desc& src2_desc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrc2SliceWindow(src2_desc, step);
+        }
+    }
+
+    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveDstSliceWindow(dst_desc, step);
+        }
+    }
+
+    private:
+    static constexpr auto thread_cluster_desc_ =
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+
+    using ThreadwiseTransfer =
+        ThreadwiseTensorSliceTransfer_v6r3<Src0Data,
+                                           Src1Data,
+                                           Src2Data,
+                                           DstData,
+                                           Src0Desc,
+                                           Src1Desc,
+                                           Src2Desc,
+                                           DstDesc,
+                                           ElementwiseOperation,
+                                           decltype(thread_slice_lengths),
+                                           DimAccessOrder,
+                                           VectorDim,
+                                           ScalarPerVector,
+                                           DstInMemOp,
+                                           ThreadTransferSrc0ResetCoordinateAfterRun,
+                                           ThreadTransferSrc1ResetCoordinateAfterRun,
+                                           ThreadTransferSrc2ResetCoordinateAfterRun,
+                                           ThreadTransferDstResetCoordinateAfterRun>;
+
+    ThreadwiseTransfer threadwise_transfer_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp"
+
+namespace ck {
+
+// Thread-group level multi-source, multi-destination tensor slice data movement
+// Assume:
+//   1. All sources and destinations are DynamicBuffer
+//   2. Same VectorDim and ScalerPerVector for all sources and destinations
+//   3. DstInMemOps are per destination tensor
+//   4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor
+//   5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor
+//
+// Does following things to avoid scratch memory issue
+//   1. Pass tensor descritpors by reference (or tuple of references)
+//   2. Does not keep reference to tensor descriptor
+//   3. Does not construct new tensor coordinate when call Run()
+template <typename ThreadGroup,
+          typename SrcDatas,
+          typename DstDatas,
+          typename SrcDescs,
+          typename DstDescs,
+          typename ElementwiseOperation,
+          typename DstInMemOps, // Sequence<InMemoryDataOperationEnum ...>
+          typename SliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename DimAccessOrder,
+          index_t VectorDim,
+          index_t ScalarPerVector,
+          typename ThreadTransferSrcResetCoordinateAfterRunFlags,
+          typename ThreadTransferDstResetCoordinateAfterRunFlags>
+struct ThreadGroupTensorSliceTransfer_v7
+{
+    static constexpr index_t nDim =
+        remove_cvref_t<tuple_element_t<0, SrcDescs>>::GetNumOfDimension();
+
+    static constexpr index_t nSrc = remove_cvref_t<SrcDescs>::Size();
+    static constexpr index_t nDst = remove_cvref_t<DstDescs>::Size();
+
+    using Index = MultiIndex<nDim>;
+
+    static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{};
+
+    __device__ constexpr ThreadGroupTensorSliceTransfer_v7(
+        const SrcDescs& src_descs,
+        const StaticallyIndexedArray<Index, nSrc>& src_block_slice_origins,
+        const DstDescs& dst_descs,
+        const StaticallyIndexedArray<Index, nDst>& dst_block_slice_origins,
+        const ElementwiseOperation& element_op)
+        : threadwise_transfer_(src_descs,
+                               StaticallyIndexedArray<Index, nSrc>{},
+                               dst_descs,
+                               StaticallyIndexedArray<Index, nDst>{},
+                               element_op)
+    {
+        static_assert(nSrc == SrcDatas::Size() && nSrc == SrcDescs::Size() &&
+                          nSrc == ThreadTransferSrcResetCoordinateAfterRunFlags::Size() &&
+                          nDst == DstDatas::Size() && nDst == DstDescs::Size() &&
+                          nDst == ThreadTransferDstResetCoordinateAfterRunFlags::Size(),
+                      "wrong!");
+
+        static_for<0, nSrc, 1>{}([&](auto i) {
+            static_assert(
+                nDim == remove_cvref_t<tuple_element_t<i.value, SrcDescs>>::GetNumOfDimension(),
+                "wrong!");
+        });
+
+        static_for<0, nDst, 1>{}([&](auto i) {
+            static_assert(
+                nDim == remove_cvref_t<tuple_element_t<i.value, DstDescs>>::GetNumOfDimension(),
+                "wrong!");
+        });
+
+        static_assert(nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == DimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        static_assert(
+            is_same<SliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
+            "wrong! threads should be mapped to cover entire slicing window");
+
+        static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
+                      "wrong! ThreadGroup::GetNumOfThread() too small");
+
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
+                make_multi_index(get_thread_local_1d_id()));
+
+            const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
+
+            const auto src_thread_slice_origins = generate_tuple(
+                [&](auto i) { return src_block_slice_origins[i] + thread_data_idx_begin; },
+                Number<nSrc>{});
+
+            const auto dst_thread_slice_origins = generate_tuple(
+                [&](auto i) { return dst_block_slice_origins[i] + thread_data_idx_begin; },
+                Number<nDst>{});
+
+            threadwise_transfer_.SetSrcSliceOrigins(src_descs, src_thread_slice_origins);
+            threadwise_transfer_.SetDstSliceOrigins(dst_descs, dst_thread_slice_origins);
+        }
+    }
+
+    template <typename SrcBuffers, typename DstBuffers>
+    __device__ void Run(const SrcDescs& src_descs,
+                        const SrcBuffers& src_bufs,
+                        const DstDescs& dst_descs,
+                        DstBuffers dst_bufs)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.Run(src_descs, src_bufs, dst_descs, dst_bufs);
+        }
+    }
+
+    template <index_t ISrc>
+    __device__ void
+    MoveSrcSliceWindow(const SrcDescs& src_descs, Number<ISrc> iSrc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrcSliceWindow(src_descs, iSrc, step);
+        }
+    }
+
+    template <index_t IDst>
+    __device__ void
+    MoveDstSliceWindow(const DstDescs& dst_descs, Number<IDst> iDst, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveDstSliceWindow(dst_descs, iDst, step);
+        }
+    }
+
+    private:
+    static constexpr auto thread_cluster_desc_ =
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+
+    using ThreadwiseTransfer =
+        ThreadwiseTensorSliceTransfer_v7<SrcDatas,
+                                         DstDatas,
+                                         SrcDescs,
+                                         DstDescs,
+                                         ElementwiseOperation,
+                                         DstInMemOps,
+                                         decltype(thread_slice_lengths),
+                                         DimAccessOrder,
+                                         VectorDim,
+                                         ScalarPerVector,
+                                         ThreadTransferSrcResetCoordinateAfterRunFlags,
+                                         ThreadTransferDstResetCoordinateAfterRunFlags>;
+
+    ThreadwiseTransfer threadwise_transfer_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+enum struct ConvolutionBackwardDataSpecialization
+{
+    Default,
+    Filter1x1Stride1Pad0,
+};
+
+inline std::string
+getConvBackwardDataSpecializationString(const ConvolutionBackwardDataSpecialization& s)
+{
+    switch(s)
+    {
+    case ConvolutionBackwardDataSpecialization::Default: return "Default";
+    case ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0:
+        return "FFilter1x1Stride1Pad0";
+    default: return "Unrecognized specialization!";
+    }
+}
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+enum struct ConvolutionBackwardWeightSpecialization
+{
+    Default,
+    Filter1x1Stride1Pad0,
+    Filter1x1Pad0,
+    OddC,
+};
+
+inline std::string
+getConvBackwardWeightSpecializationString(const ConvolutionBackwardWeightSpecialization& s)
+{
+    switch(s)
+    {
+    case ConvolutionBackwardWeightSpecialization::Default: return "Default";
+    case ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0:
+        return "Filter1x1Stride1Pad0";
+    case ConvolutionBackwardWeightSpecialization::Filter1x1Pad0: return "Filter1x1Pad0";
+    case ConvolutionBackwardWeightSpecialization::OddC: return "OddC";
+    default: return "Unrecognized specialization!";
+    }
+}
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+enum struct ConvolutionForwardSpecialization
+{
+    Default,
+    Filter1x1Pad0,
+    Filter1x1Stride1Pad0,
+    OddC,
+};
+
+inline std::string getConvForwardSpecializationString(const ConvolutionForwardSpecialization& s)
+{
+    switch(s)
+    {
+    case ConvolutionForwardSpecialization::Default: return "Default";
+    case ConvolutionForwardSpecialization::Filter1x1Pad0: return "Filter1x1Pad0";
+    case ConvolutionForwardSpecialization::Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0";
+    case ConvolutionForwardSpecialization::OddC: return "OddC";
+    default: return "Unrecognized specialization!";
+    }
+}
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cmath>
+#include <string>
+#include <sstream>
+
+#include "ck/stream_config.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+struct BaseArgument
+{
+    BaseArgument()                    = default;
+    BaseArgument(const BaseArgument&) = default;
+    BaseArgument& operator=(const BaseArgument&) = default;
+
+    virtual ~BaseArgument() {}
+
+    void* p_workspace_ = nullptr;
+};
+
+struct BaseInvoker
+{
+    BaseInvoker()                   = default;
+    BaseInvoker(const BaseInvoker&) = default;
+    BaseInvoker& operator=(const BaseInvoker&) = default;
+
+    virtual float Run(const BaseArgument*, const StreamConfig& = StreamConfig{})
+    {
+        return float{0};
+    }
+
+    virtual ~BaseInvoker() {}
+};
+
+struct BaseOperator
+{
+    BaseOperator()                    = default;
+    BaseOperator(const BaseOperator&) = default;
+    BaseOperator& operator=(const BaseOperator&) = default;
+
+    virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
+    virtual std::string GetTypeString() const { return ""; }
+
+    virtual std::string GetTypeIdName() const { return typeid(*this).name(); }
+
+    virtual std::string GetTypeIdHashCode() const
+    {
+        std::ostringstream oss;
+
+        oss << std::hex << typeid(*this).hash_code();
+
+        return oss.str();
+    };
+
+    virtual size_t GetWorkSpaceSize(const BaseArgument*) const { return 0; }
+
+    virtual void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace) const
+    {
+        assert(p_arg);
+        p_arg->p_workspace_ = p_workspace;
+    }
+
+    virtual ~BaseOperator() {}
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Tensor Contraction:
+//   input : A
+//   input : B
+//   input : D0, D1, ...
+//   output : E
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...]
+//   B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...]
+//   D[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...]
+//   E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...]
+template <index_t NumDimG,
+          index_t NumDimM,
+          index_t NumDimN,
+          index_t NumDimK,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceBatchedContractionMultipleD : public BaseOperator
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        const std::vector<index_t>& a_gs_ms_ns_lengths,
+                        const std::vector<index_t>& a_gs_ms_ks_strides,
+                        const std::vector<index_t>& b_gs_ns_ks_lengths,
+                        const std::vector<index_t>& b_gs_ns_ks_strides,
+                        const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_lengths,
+                        const std::array<std::vector<index_t>, NumDTensor>& ds_gs_ms_ns_strides,
+                        const std::vector<index_t>& e_gs_ms_ns_lengths,
+                        const std::vector<index_t>& e_gs_ms_ns_strides,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceBatchedGemm : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        void* p_c,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB,
+                        ck::index_t StrideC,
+                        ck::index_t BatchStrideA,
+                        ck::index_t BatchStrideB,
+                        ck::index_t BatchStrideC,
+                        ck::index_t Batch,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CElementwiseOperation c_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+using DeviceBatchedGemmPtr = std::unique_ptr<DeviceBatchedGemm<ALayout,
+                                                               BLayout,
+                                                               CLayout,
+                                                               ADataType,
+                                                               BDataType,
+                                                               CDataType,
+                                                               AElementwiseOperation,
+                                                               BElementwiseOperation,
+                                                               CElementwiseOperation>>;
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp
+#pragma once
+#include <iostream>
+#include <vector>
+
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+struct BatchedGemmEPermuteDesc
+{
+    ck::index_t G0_, G1_, M_, N_;
+    ck::index_t stride_G0_, stride_G1_, stride_M_, stride_N_;
+};
+
+template <typename ALayout,
+          typename BLayout,
+          typename DELayout,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceBatchedGemmEPermute : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        void* p_e,
+                        index_t M,
+                        index_t N,
+                        index_t K,
+                        index_t stride_A,
+                        index_t stride_B,
+                        index_t batch_stride_A,
+                        index_t batch_stride_B,
+                        BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
+                        index_t BatchCount,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename B0Layout,
+          typename B1Layout,
+          typename CLayout,
+          typename ADataType,
+          typename B0DataType,
+          typename B1DataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename B0ElementwiseOperation,
+          typename Acc0ElementwiseOperation,
+          typename B1ElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceBatchedGemmGemm : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b0,
+                        const void* p_b1,
+                        void* p_c,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t O,
+                        ck::index_t Batch,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB0,
+                        ck::index_t StrideB1,
+                        ck::index_t StrideC,
+                        ck::index_t BatchStrideA,
+                        ck::index_t BatchStrideB0,
+                        ck::index_t BatchStrideB1,
+                        ck::index_t BatchStrideC,
+                        AElementwiseOperation a_element_op,
+                        B0ElementwiseOperation b0_element_op,
+                        Acc0ElementwiseOperation acc0_element_op,
+                        B1ElementwiseOperation b1_element_op,
+                        CElementwiseOperation c_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceBatchedGemmMultiD : public BaseOperator
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    static_assert(DsLayout::Size() == DsDataType::Size(), "wrong! inconsisiten NumDTensor");
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        const std::array<const void*, NumDTensor>& p_ds,
+                        void* p_e,
+                        index_t M,
+                        index_t N,
+                        index_t K,
+                        index_t Batch,
+                        index_t StrideA,
+                        index_t StrideB,
+                        const std::array<ck::index_t, NumDTensor>& StrideDs,
+                        index_t StrideE,
+                        index_t BatchStrideA,
+                        index_t BatchStrideB,
+                        const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
+                        index_t BatchStrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename A0Layout,
+          typename B0Layout,
+          typename D0sLayout,
+          typename B1Layout,
+          typename D1sLayout,
+          typename E1Layout,
+          typename A0DataType,
+          typename B0DataType,
+          typename D0sDataType,
+          typename B1DataType,
+          typename D1sDataType,
+          typename E1DataType,
+          typename A0ElementwiseOperation,
+          typename B0ElementwiseOperation,
+          typename CDE0ElementwiseOperation,
+          typename B1ElementwiseOperation,
+          typename CDE1ElementwiseOperation>
+struct DeviceBatchedGemmMultipleDGemmMultipleD : public BaseOperator
+{
+    static constexpr index_t NumD0Tensor = D0sDataType::Size();
+    static constexpr index_t NumD1Tensor = D1sDataType::Size();
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a0,
+                        const void* p_b0,
+                        std::array<const void*, NumD0Tensor> p_d0s,
+                        const void* p_b1,
+                        std::array<const void*, NumD1Tensor> p_d1s,
+                        void* p_e1,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t O,
+                        ck::index_t Batch,
+                        ck::index_t StrideA0,
+                        ck::index_t StrideB0,
+                        std::array<ck::index_t, NumD0Tensor> StrideD0s,
+                        ck::index_t StrideB1,
+                        std::array<ck::index_t, NumD1Tensor> StrideD1s,
+                        ck::index_t StrideE1,
+                        ck::index_t BatchStrideA0,
+                        ck::index_t BatchStrideB0,
+                        std::array<ck::index_t, NumD0Tensor> BatchStrideD0s,
+                        ck::index_t BatchStrideB1,
+                        std::array<ck::index_t, NumD1Tensor> BatchStrideD1s,
+                        ck::index_t BatchStrideE1,
+                        A0ElementwiseOperation a0_element_op,
+                        B0ElementwiseOperation b0_element_op,
+                        CDE0ElementwiseOperation cde0_element_op,
+                        B1ElementwiseOperation b1_element_op,
+                        CDE1ElementwiseOperation cde1_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename B0Layout,
+          typename B1Layout,
+          typename CLayout,
+          typename ADataType,
+          typename B0DataType,
+          typename B1DataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename B0ElementwiseOperation,
+          typename Acc0ElementwiseOperation,
+          typename B1ElementwiseOperation,
+          typename CElementwiseOperation,
+          bool MaskOutUpperTriangle> // TODO: enum for mask type
+struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b0,
+                        const void* p_b1,
+                        void* p_c,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t O,
+                        ck::index_t Batch,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB0,
+                        ck::index_t StrideB1,
+                        ck::index_t StrideC,
+                        ck::index_t BatchStrideA,
+                        ck::index_t BatchStrideB0,
+                        ck::index_t BatchStrideB1,
+                        ck::index_t BatchStrideC,
+                        AElementwiseOperation a_element_op,
+                        B0ElementwiseOperation b0_element_op,
+                        Acc0ElementwiseOperation acc0_element_op,
+                        B1ElementwiseOperation b1_element_op,
+                        CElementwiseOperation c_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "device_base.hpp"
+#include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <index_t NumDimG,
+          index_t NumDimM,
+          index_t NumDimN,
+          index_t NumDimK,
+          index_t NumDimO,
+          typename ADataType,
+          typename B0DataType,
+          typename B1DataType,
+          typename CDataType,
+          typename Acc0BiasDataType,
+          typename Acc1BiasDataType,
+          typename AElementwiseOperation,
+          typename B0ElementwiseOperation,
+          typename Acc0ElementwiseOperation,
+          typename B1ElementwiseOperation,
+          typename CElementwiseOperation,
+          MaskingSpecialization MaskingSpec>
+struct DeviceBatchedGemmSoftmaxGemmPermute : public BaseOperator
+{
+    static constexpr index_t NumAcc0Bias = Acc0BiasDataType::Size();
+    static constexpr index_t NumAcc1Bias = Acc1BiasDataType::Size();
+
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
+        const void* p_a,
+        const void* p_b0,
+        const void* p_b1,
+        void* p_c,
+        const std::array<void*, NumAcc0Bias> p_acc0_biases,
+        const std::array<void*, NumAcc1Bias> p_acc1_biases,
+        const std::vector<index_t>& a_gs_ms_ks_lengths,
+        const std::vector<index_t>& a_gs_ms_ks_strides,
+        const std::vector<index_t>& b_gs_ns_ks_lengths,
+        const std::vector<index_t>& b_gs_ns_ks_strides,
+        const std::vector<index_t>& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths
+        const std::vector<index_t>& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides
+        const std::vector<index_t>& c_gs_ms_gemm1ns_lengths,       // c_gs_ms_os_lengths
+        const std::vector<index_t>& c_gs_ms_gemm1ns_strides,       // c_gs_ms_os_strides
+        const std::array<std::vector<index_t>, NumAcc0Bias> acc0_biases_gs_ms_ns_lengths,
+        const std::array<std::vector<index_t>, NumAcc0Bias> acc0_biases_gs_ms_ns_strides,
+        const std::array<std::vector<index_t>, NumAcc1Bias>
+            acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths
+        const std::array<std::vector<index_t>, NumAcc1Bias>
+            acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides
+        AElementwiseOperation a_element_op,
+        B0ElementwiseOperation b0_element_op,
+        Acc0ElementwiseOperation acc0_element_op,
+        B1ElementwiseOperation b1_element_op,
+        CElementwiseOperation c_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+#include <memory>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename XDataType,
+          typename DxDataType,
+          typename DyDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename DscaleDbiasDataType,
+          typename MeanVarDataType,
+          typename DyElementwiseOp,
+          index_t Rank,
+          index_t NumBatchNormReduceDim>
+struct DeviceBatchNormBwd : public BaseOperator
+{
+    static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim;
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const std::array<index_t, Rank> xyLengths,
+                        const std::array<index_t, Rank> xStrides,
+                        const std::array<index_t, Rank> dyStrides,
+                        const std::array<index_t, Rank> dxStrides,
+                        const std::array<int, NumBatchNormReduceDim> reduceDims,
+                        const std::array<ck::index_t, NumInvariantDim> bnScaleBiasMeanVarLengths,
+                        const std::array<ck::index_t, NumInvariantDim> bnScaleStrides,
+                        const std::array<ck::index_t, NumInvariantDim> bnDscaleDbiasStrides,
+                        const std::array<ck::index_t, NumInvariantDim> bnMeanVarStrides,
+                        const void* p_x,
+                        const void* p_dy,
+                        const void* p_scale,
+                        const void* p_savedMean,
+                        const void* p_savedInvVar,
+                        double epsilon,
+                        const DyElementwiseOp dy_elementwise_op,
+                        void* p_dx,
+                        void* p_dscale,
+                        void* p_dbias) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+template <typename XDataType,
+          typename DxDataType,
+          typename DyDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename DscaleDbiasDataType,
+          typename MeanVarDataType,
+          typename DyElementwiseOp,
+          index_t Rank,
+          index_t NumBatchNormReduceDim>
+using DeviceBatchNormBwdPtr = std::unique_ptr<DeviceBatchNormBwd<XDataType,
+                                                                 DxDataType,
+                                                                 DyDataType,
+                                                                 AccDataType,
+                                                                 ScaleDataType,
+                                                                 DscaleDbiasDataType,
+                                                                 MeanVarDataType,
+                                                                 DyElementwiseOp,
+                                                                 Rank,
+                                                                 NumBatchNormReduceDim>>;
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+#include <memory>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename YElementwiseOp,
+          index_t Rank,
+          index_t NumBatchNormReduceDim>
+struct DeviceBatchNormFwd : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
+        const std::array<index_t, Rank> xyLengths,
+        const std::array<index_t, Rank> xStrides,
+        const std::array<index_t, Rank> yStrides,
+        const std::array<int, NumBatchNormReduceDim> reduceDims,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnBiasStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides,
+        const void* p_x,
+        const void* bnScale,
+        const void* bnBias,
+        double epsilon,
+        const YElementwiseOp y_elementwise_op,
+        void* p_y,
+        void* resultSaveMean,
+        void* resultSaveInvVariance,
+        double exponentialAverageFactor,
+        void* resultRunningMean,
+        void* resultRunningVariance) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename YElementwiseOp,
+          index_t Rank,
+          index_t NumBatchNormReduceDim>
+using DeviceBatchNormFwdPtr = std::unique_ptr<DeviceBatchNormFwd<XDataType,
+                                                                 YDataType,
+                                                                 AccDataType,
+                                                                 ScaleDataType,
+                                                                 BiasDataType,
+                                                                 MeanVarDataType,
+                                                                 YElementwiseOp,
+                                                                 Rank,
+                                                                 NumBatchNormReduceDim>>;
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+#include <memory>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename YElementwiseOp,
+          index_t Rank,
+          index_t NumBatchNormReduceDim>
+struct DeviceBatchNormInfer : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
+        const std::array<index_t, Rank> xyLengths,
+        const std::array<index_t, Rank> xStrides,
+        const std::array<index_t, Rank> yStrides,
+        const std::array<int, NumBatchNormReduceDim> reduceDims,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnBiasStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides,
+        const void* p_x,
+        const void* bnScale,
+        const void* bnBias,
+        double epsilon,
+        const YElementwiseOp y_elementwise_op,
+        const void* estimatedMean,
+        const void* estimatedInvVariance,
+        void* p_y) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename YElementwiseOp,
+          index_t Rank,
+          index_t NumBatchNormReduceDim>
+using DeviceBatchNormInferPtr = std::unique_ptr<DeviceBatchNormInfer<XDataType,
+                                                                     YDataType,
+                                                                     AccDataType,
+                                                                     ScaleDataType,
+                                                                     BiasDataType,
+                                                                     MeanVarDataType,
+                                                                     YElementwiseOp,
+                                                                     Rank,
+                                                                     NumBatchNormReduceDim>>;
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceCGemm : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a_real,
+                                                              const void* p_a_imag,
+                                                              const void* p_b_real,
+                                                              const void* p_b_imag,
+                                                              void* p_c_real,
+                                                              void* p_c_imag,
+                                                              void* p_workspace,
+                                                              ck::index_t M,
+                                                              ck::index_t N,
+                                                              ck::index_t K,
+                                                              ck::index_t StrideA,
+                                                              ck::index_t StrideB,
+                                                              ck::index_t StrideC,
+                                                              AElementwiseOperation a_element_op,
+                                                              BElementwiseOperation b_element_op,
+                                                              CElementwiseOperation c_element_op,
+                                                              ck::index_t KBatch = 1) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+    virtual std::size_t GetWorkspaceSize(index_t MRaw,
+                                         index_t NRaw,
+                                         index_t KRaw,
+                                         index_t StrideA,
+                                         index_t StrideB,
+                                         index_t StrideC)     = 0;
+};
+
+template <typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+using DeviceCGemmPtr = std::unique_ptr<
+    DeviceCGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Tensor Contraction:
+//   input : A
+//   input : B
+//   input : D0, D1, ...
+//   output : E
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   A[M0, M1, M2, ..., K0, K1, K2, ...]
+//   B[N0, N1, N2, ..., K0, K1, K2, ...]
+//   D[M0, M1, M2, ..., N0, N1, N2, ...]
+//   E[M0, M1, M2, ..., N0, N1, N2, ...]
+template <index_t NumDimM,
+          index_t NumDimN,
+          index_t NumDimK,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceContractionMultipleD : public BaseOperator
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        const std::vector<index_t>& a_ms_ns_lengths,
+                        const std::vector<index_t>& a_ms_ks_strides,
+                        const std::vector<index_t>& b_ns_ks_lengths,
+                        const std::vector<index_t>& b_ns_ks_strides,
+                        const std::array<std::vector<index_t>, NumDTensor>& ds_ms_ns_lengths,
+                        const std::array<std::vector<index_t>, NumDTensor>& ds_ms_ns_strides,
+                        const std::vector<index_t>& e_ms_ns_lengths,
+                        const std::vector<index_t>& e_ms_ns_strides,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck