Merge branch 'develop' of...

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into add_fp16_wmma_conv_instance

Merge branch 'develop' of...
Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into add_fp16_wmma_conv_instance
212b9299 · aska-0096 · 06903279 · d3adc665 · 212b9299 · 212b9299
Commit 212b9299 authored Jun 29, 2023 by aska-0096
20 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -39,7 +39,7 @@ __global__ void
                                const CDEElementwiseOperation c_element_op)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id = get_block_1d_id();
@@ -548,11 +548,12 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                }
            }

-            hipGetErrorString(
-                hipMemcpy(arg.p_workspace_,
-                          arg.gemm_desc_kernel_arg_.data(),
-                          arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
-                          hipMemcpyHostToDevice));
+            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
+                                                  arg.gemm_desc_kernel_arg_.data(),
+                                                  arg.gemm_desc_kernel_arg_.size() *
+                                                      sizeof(GemmBiasTransKernelArg),
+                                                  hipMemcpyHostToDevice,
+                                                  stream_config.stream_id_));

            float ave_time = 0;


--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -35,7 +35,7 @@ __global__ void
                                       const index_t group_count)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
    __shared__ uint8_t p_shared[shared_size];

@@ -406,10 +406,12 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                }
            }

-            hip_check_error(hipMemcpy(arg.p_workspace_,
-                                      arg.gemm_kernel_args_.data(),
-                                      arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
-                                      hipMemcpyHostToDevice));
+            hip_check_error(
+                hipMemcpyWithStream(arg.p_workspace_,
+                                    arg.gemm_kernel_args_.data(),
+                                    arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                                    hipMemcpyHostToDevice,
+                                    stream_config.stream_id_));

            float ave_time = 0;


--- a/include/ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/stream_utility.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// output[indices] = input
+template <typename DOutDataType,
+          typename IndexDataType,
+          typename DInDataType,
+          ck::index_t InOutVectorSize>
+struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDataType, DInDataType>
+{
+    using DInDataType_AutomicAddPreCast =
+        conditional_t<is_same_v<DInDataType, float> || is_same_v<DInDataType, double>,
+                      DInDataType,
+                      float>;
+
+    using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+    using UnaryConvert = ck::tensor_operation::element_wise::UnaryConvert;
+
+    static constexpr auto I0 = Number<0>{};
+
+    template <typename Desc_M>
+    static auto PadDescriptor_M_1d(Desc_M desc_m, index_t loop_step)
+    {
+        const auto m   = desc_m.GetLength(I0);
+        const auto pad = math::integer_least_multiple(m, loop_step) - m;
+        const auto desc_m_pad =
+            transform_tensor_descriptor(desc_m,
+                                        make_tuple(make_right_pad_transform(m, pad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+        return desc_m_pad;
+    }
+
+    static auto MakeDescriptor_M(index_t length, index_t loop_step)
+    {
+        const auto desc_m = make_naive_tensor_descriptor_packed(make_tuple(length));
+        return PadDescriptor_M_1d(desc_m, loop_step);
+    }
+
+    using InOutGrid1dDesc = decltype(MakeDescriptor_M(1, 1));
+
+    using GridwisePutElementSet = GridwisePutElement_1D<InOutGrid1dDesc,
+                                                        DOutDataType,
+                                                        IndexDataType,
+                                                        DInDataType,
+                                                        PassThrough,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        InOutVectorSize>;
+
+    using GridwisePutElementAtomicAdd = GridwisePutElement_1D<InOutGrid1dDesc,
+                                                              DOutDataType,
+                                                              IndexDataType,
+                                                              DInDataType_AutomicAddPreCast,
+                                                              PassThrough,
+                                                              InMemoryDataOperationEnum::AtomicAdd,
+                                                              InOutVectorSize>;
+
+    using GridwiseCasting = GridwiseElementwise_1D<Tuple<InOutGrid1dDesc>,
+                                                   Tuple<InOutGrid1dDesc>,
+                                                   Tuple<const DInDataType_AutomicAddPreCast*>,
+                                                   Tuple<DInDataType*>,
+                                                   UnaryConvert,
+                                                   InOutVectorSize,
+                                                   Sequence<InOutVectorSize>,
+                                                   Sequence<InOutVectorSize>>;
+
+    struct Argument : public BaseArgument
+    {
+        Argument(const DOutDataType* p_dout,
+                 const IndexDataType* p_indices,
+                 DInDataType* p_din,
+                 index_t dout_length,
+                 index_t din_length,
+                 const std::vector<ck::index_t>& window_lengths,
+                 const std::vector<ck::index_t>& window_strides)
+            : p_dout_{p_dout},
+              p_indices_{p_indices},
+              p_din_{p_din},
+              dout_length_raw_{dout_length},
+              din_length_raw_{din_length},
+              blockSize_{256},
+              windowOverlap_{false}
+        {
+            for(size_t i = 0; i < window_lengths.size(); ++i)
+            {
+                windowOverlap_ |= window_lengths.at(i) > window_strides.at(i);
+            }
+        }
+
+        const DOutDataType* p_dout_;
+        const IndexDataType* p_indices_;
+        DInDataType* p_din_;
+        index_t dout_length_raw_;
+        index_t din_length_raw_;
+        index_t blockSize_;
+        bool windowOverlap_;
+    };
+
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            index_t gridSize               = getAvailableComputeUnitCount(stream_config);
+            index_t loop_step              = gridSize * arg.blockSize_ * InOutVectorSize;
+            InOutGrid1dDesc din_grid_desc  = MakeDescriptor_M(arg.din_length_raw_, loop_step);
+            InOutGrid1dDesc dout_grid_desc = MakeDescriptor_M(arg.dout_length_raw_, loop_step);
+
+            if constexpr(is_same_v<DInDataType, float> || is_same_v<DInDataType, double>)
+            {
+                hip_check_error(hipMemsetAsync(arg.p_din_,
+                                               0,
+                                               arg.din_length_raw_ * sizeof(DInDataType),
+                                               stream_config.stream_id_));
+
+                if(arg.windowOverlap_)
+                {
+                    const auto put_kernel = kernel_put_element_1d<GridwisePutElementAtomicAdd,
+                                                                  InOutGrid1dDesc,
+                                                                  DOutDataType,
+                                                                  IndexDataType,
+                                                                  DInDataType,
+                                                                  PassThrough>;
+
+                    return launch_and_time_kernel(stream_config,
+                                                  put_kernel,
+                                                  dim3(gridSize),
+                                                  dim3(arg.blockSize_),
+                                                  0,
+                                                  dout_grid_desc,
+                                                  arg.p_dout_,
+                                                  arg.p_indices_,
+                                                  arg.p_din_,
+                                                  PassThrough{});
+                }
+                else
+                {
+                    const auto put_kernel = kernel_put_element_1d<GridwisePutElementSet,
+                                                                  InOutGrid1dDesc,
+                                                                  DOutDataType,
+                                                                  IndexDataType,
+                                                                  DInDataType,
+                                                                  PassThrough>;
+
+                    return launch_and_time_kernel(stream_config,
+                                                  put_kernel,
+                                                  dim3(gridSize),
+                                                  dim3(arg.blockSize_),
+                                                  0,
+                                                  dout_grid_desc,
+                                                  arg.p_dout_,
+                                                  arg.p_indices_,
+                                                  arg.p_din_,
+                                                  PassThrough{});
+                }
+            }
+            else
+            {
+                if(arg.windowOverlap_)
+                {
+                    if(arg.p_workspace_ == nullptr)
+                        throw std::runtime_error("wrong! WorkSpace pointer has not been set");
+
+                    hip_check_error(
+                        hipMemsetAsync(arg.p_workspace_,
+                                       0,
+                                       arg.din_length_raw_ * sizeof(DInDataType_AutomicAddPreCast),
+                                       stream_config.stream_id_));
+
+                    const auto put_kernel = kernel_put_element_1d<GridwisePutElementAtomicAdd,
+                                                                  InOutGrid1dDesc,
+                                                                  DOutDataType,
+                                                                  IndexDataType,
+                                                                  DInDataType_AutomicAddPreCast,
+                                                                  PassThrough>;
+
+                    const auto cast_kernel =
+                        kernel_elementwise_1d<GridwiseCasting,
+                                              Tuple<InOutGrid1dDesc>,
+                                              Tuple<InOutGrid1dDesc>,
+                                              Tuple<const DInDataType_AutomicAddPreCast*>,
+                                              Tuple<DInDataType*>,
+                                              UnaryConvert>;
+
+                    float elapsed_time = launch_and_time_kernel(
+                        stream_config,
+                        put_kernel,
+                        dim3(gridSize),
+                        dim3(arg.blockSize_),
+                        0,
+                        dout_grid_desc,
+                        arg.p_dout_,
+                        arg.p_indices_,
+                        static_cast<DInDataType_AutomicAddPreCast*>(arg.p_workspace_),
+                        PassThrough{});
+
+                    elapsed_time += launch_and_time_kernel(
+                        stream_config,
+                        cast_kernel,
+                        dim3(gridSize),
+                        dim3(arg.blockSize_),
+                        0,
+                        ck::make_tuple(din_grid_desc),
+                        ck::make_tuple(din_grid_desc),
+                        static_cast<DInDataType_AutomicAddPreCast*>(arg.p_workspace_),
+                        arg.p_din_,
+                        UnaryConvert{});
+
+                    return elapsed_time;
+                }
+                else
+                {
+                    const auto put_kernel = kernel_put_element_1d<GridwisePutElementSet,
+                                                                  InOutGrid1dDesc,
+                                                                  DOutDataType,
+                                                                  IndexDataType,
+                                                                  DInDataType,
+                                                                  PassThrough>;
+
+                    hip_check_error(hipMemsetAsync(arg.p_din_,
+                                                   0,
+                                                   arg.din_length_raw_ * sizeof(DInDataType),
+                                                   stream_config.stream_id_));
+
+                    return launch_and_time_kernel(stream_config,
+                                                  put_kernel,
+                                                  dim3(gridSize),
+                                                  dim3(arg.blockSize_),
+                                                  0,
+                                                  dout_grid_desc,
+                                                  arg.p_dout_,
+                                                  arg.p_indices_,
+                                                  arg.p_din_,
+                                                  PassThrough{});
+                }
+            }
+        }
+
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
+    {
+        const Argument* pArg_ = dynamic_cast<const Argument*>(pArg);
+
+        bool needCast = pArg_->windowOverlap_ &&
+                        !(is_same_v<DInDataType, float> || is_same_v<DInDataType, double>);
+
+        if(!needCast)
+            return 0;
+        else
+            return pArg_->din_length_raw_ * sizeof(DInDataType_AutomicAddPreCast);
+    };
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
+        if(pArg->din_length_raw_ % InOutVectorSize != 0 ||
+           pArg->dout_length_raw_ % InOutVectorSize != 0)
+        {
+            return false;
+        }
+        return true;
+    }
+
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_dout,
+                        const void* p_indices,
+                        void* p_din,
+                        index_t dout_length,
+                        index_t din_length,
+                        std::vector<ck::index_t> window_lengths,
+                        std::vector<ck::index_t> window_strides) override
+    {
+        // Assume p_dout, p_indices, p_din are packed memory space, dout_length and din_length are
+        // physical size of the packed tensor
+        return std::make_unique<Argument>(static_cast<const DOutDataType*>(p_dout),
+                                          static_cast<const IndexDataType*>(p_indices),
+                                          static_cast<DInDataType*>(p_din),
+                                          dout_length,
+                                          din_length,
+                                          window_lengths,
+                                          window_strides);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_put_element.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/stream_utility.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// output[indices] = input
+template <typename InDataType,
+          typename IndexDataType,
+          typename OutDataType,
+          typename ElementwiseOperation,
+          InMemoryDataOperationEnum MemOp,
+          ck::index_t InVectorSize>
+struct DevicePutElementImpl
+    : public DevicePutElement<InDataType, IndexDataType, OutDataType, ElementwiseOperation, MemOp>
+{
+    template <typename Desc_M>
+    static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
+    {
+        constexpr auto I0 = Number<0>{};
+
+        const auto m            = desc_m.GetLength(I0);
+        const index_t loop_step = gridSize * blockSize * InVectorSize;
+        const auto pad          = math::integer_least_multiple(m, loop_step) - m;
+        const auto desc_m_pad =
+            transform_tensor_descriptor(desc_m,
+                                        make_tuple(make_right_pad_transform(m, pad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+        return desc_m_pad;
+    }
+
+    static auto MakeDescriptor_M(index_t length, index_t gridSize, index_t blockSize)
+    {
+        const auto desc_m = make_naive_tensor_descriptor_packed(make_tuple(length));
+        return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
+    }
+
+    using InGrid1dDesc = decltype(MakeDescriptor_M(1, 1, 1));
+
+    using GridwisePutElement = GridwisePutElement_1D<InGrid1dDesc,
+                                                     InDataType,
+                                                     IndexDataType,
+                                                     OutDataType,
+                                                     ElementwiseOperation,
+                                                     MemOp,
+                                                     InVectorSize>;
+
+    struct Argument : public BaseArgument
+    {
+        Argument(const InDataType* p_input,
+                 const IndexDataType* p_indices,
+                 OutDataType* p_output,
+                 index_t input_length,
+                 ElementwiseOperation elementwise_op)
+            : p_input_{p_input},
+              p_indices_{p_indices},
+              p_output_{p_output},
+              input_length_raw_{input_length},
+              elementwise_op_{elementwise_op},
+              blockSize_{256}
+        {
+        }
+
+        const InDataType* p_input_;
+        const IndexDataType* p_indices_;
+        OutDataType* p_output_;
+        index_t input_length_raw_;
+        ElementwiseOperation elementwise_op_;
+        index_t blockSize_;
+    };
+
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            index_t gridSize = getAvailableComputeUnitCount(stream_config);
+            InGrid1dDesc in_grid_desc =
+                MakeDescriptor_M(arg.input_length_raw_, gridSize, arg.blockSize_);
+
+            const auto kernel = kernel_put_element_1d<GridwisePutElement,
+                                                      InGrid1dDesc,
+                                                      InDataType,
+                                                      IndexDataType,
+                                                      OutDataType,
+                                                      ElementwiseOperation>;
+
+            float elapsed_time = launch_and_time_kernel(stream_config,
+                                                        kernel,
+                                                        dim3(gridSize),
+                                                        dim3(arg.blockSize_),
+                                                        0,
+                                                        in_grid_desc,
+                                                        arg.p_input_,
+                                                        arg.p_indices_,
+                                                        arg.p_output_,
+                                                        arg.elementwise_op_);
+            return elapsed_time;
+        }
+
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
+
+        if(pArg->input_length_raw_ % InVectorSize != 0)
+        {
+            return false;
+        }
+        return true;
+    }
+
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_input,
+                                                      const void* p_indices,
+                                                      void* p_output,
+                                                      index_t input_length,
+                                                      index_t,
+                                                      ElementwiseOperation elementwise_op) override
+    {
+        return std::make_unique<Argument>(static_cast<const InDataType*>(p_input),
+                                          static_cast<const IndexDataType*>(p_indices),
+                                          static_cast<OutDataType*>(p_output),
+                                          input_length,
+                                          elementwise_op);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -38,16 +38,9 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
                                                OutDataType,
                                                InElementwiseOp,
                                                AccElementwiseOp,
-                                                Rank>
+                                                Rank,
+                                                NumReduceDim>
 {
-    static constexpr index_t kRank            = Rank;
-    static constexpr index_t kNumReduceDim    = NumReduceDim;
-    static constexpr index_t kNumInvariantDim = Rank - NumReduceDim;
-
-    virtual index_t GetRank() const override { return kRank; }
-
-    virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }
-
    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;

    static constexpr index_t NumSrcDim = Rank;
@@ -287,13 +280,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
    {
        if constexpr(InSrcVectorDim == 0)
        {
-            if constexpr(kNumInvariantDim == 0)
+            if constexpr(NumInvariantDim == 0)
            {
                return false;
            }
            else
            {
-                if(arg.inStrides_[kNumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
+                if(arg.inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
                {
                    return false;
                }
@@ -316,7 +309,7 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
        }

        // To improve
-        if(kNumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
+        if(NumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/matrix_padder.hpp
+++ b/include/ck/tensor_operation/gpu/device/matrix_padder.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
+++ b/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once