Commit defa2071 authored by Adam Osewski

Merge branch 'develop' into aosewski/ggemm_multi_d2

parents 28a68428 f2398f61
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
void print_helper_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
bool run_convnd_fwd_example(int argc, char* argv[])
{
print_helper_msg();
bool do_verification = true;
// Use floats for SoftRelu by default to avoid overflow after e^x.
int init_method =
std::is_same_v<OutElementOp, ck::tensor_operation::element_wise::SoftRelu> ? 2 : 1;
bool time_kernel = false;
// Following shapes are selected to avoid overflow. Expect inf in case of
// size increase for some elementwise ops.
ck::utils::conv::ConvParam conv_param{
3, 1, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
const auto run = [&]() {
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
conv_param);
const auto wei_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
conv_param);
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
conv_param);
return run_grouped_conv_fwd<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdActivInstance>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
};
if(conv_param.num_dim_spatial_ == 3)
{
return run();
}
return false;
}
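For orientation, my reading of the default problem above; the field order of ck::utils::conv::ConvParam is an assumption here, its header is authoritative:
// Assumed mapping of the initializer list to ConvParam fields:
//   num_dim_spatial = 3, G = 1, N = 16, K = 128, C = 8,
//   filter = {3, 3, 3}, input spatial = {17, 17, 17}, conv strides = {2, 2, 2},
//   dilations = {1, 1, 1}, left pads = {1, 1, 1}, right pads = {1, 1, 1}.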
add_example_executable(example_layernorm4d_fwd_fp16 layernorm4d_fwd_fp16.cpp)
add_example_executable(example_layernorm4d_fwd_splitk_fp16 layernorm4d_fwd_splitk_fp16.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_splitk_impl.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using SaveMeanInvStdDataType = float;
using ComputeDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
#define SAVE_MEAN_INV_STD
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
using DeviceInstance =
ck::tensor_operation::device::DeviceNormalizationFwdImpl<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
SaveMeanInvStdDataType,
PassThrough,
Rank,
NumReduceDim,
256, // BlockSize
8, // ClusterM
32, // ClusterK
1, // SliceM
8, // SliceK
1, // XYVectorDim (0=M, 1=K)
8, // SrcScalarPerVector
1, // GammaVecDim (0=M, 1=K)
8, // GammaScalarPerVector
1, // BetaVecDim (0=M, 1=K)
8, // BetaScalarPerVector
8, // YScalarPerVector
1>; // SaveMeanInvStdScalarPerVector
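A rough orientation note on how the example problem maps onto these tuning parameters (my assumption, not text from the original file):
// Orientation note (assumption): the problem built in run_layernorm4d_fwd_example.inc is
// [N, H, W, C] = [256, 16, 16, 8] reduced over {H, W, C}, so the reduced length is
// 16 * 16 * 8 = 2048, a multiple of the vector width 8 used above for the K-direction
// (XYVectorDim = 1) loads and stores.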
#include "run_layernorm4d_fwd_example.inc"
int main() { return run_layernorm4d_fwd_example<DeviceInstance>(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using SaveMeanInvStdDataType = float;
using ComputeDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
#define SAVE_MEAN_INV_STD
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationFwdSplitKImpl<
XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
SaveMeanInvStdDataType,
PassThrough,
Rank,
NumReduceDim,
256, // BlockSize
8, // ClusterM
32, // ClusterK
1, // SliceM
8, // SliceK
1, // XYVectorDim (0=M, 1=K)
8, // XScalarPerVector
1, // GammaVecDim (0=M, 1=K)
8, // GammaScalarPerVector
1, // BetaVecDim (0=M, 1=K)
8, // BetaScalarPerVector
8, // YScalarPerVector
1>; // SaveMeanInvStdScalarPerVector
#include "run_layernorm4d_fwd_example.inc"
int main() { return run_layernorm4d_fwd_example<DeviceInstance>(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
template <typename DeviceInstance>
int run_layernorm4d_fwd_example()
{
bool time_kernel = false;
ck::index_t N = 256;
ck::index_t H = 16;
ck::index_t W = 16;
ck::index_t C = 8;
Tensor<XDataType> x({N, H, W, C});
Tensor<GammaDataType> gamma({H, W, C});
Tensor<BetaDataType> beta({H, W, C});
Tensor<YDataType> y({N, H, W, C});
Tensor<SaveMeanInvStdDataType> save_mean({N});
Tensor<SaveMeanInvStdDataType> save_inv_std({N});
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0.0, 1.0});
gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
#ifdef SAVE_MEAN_INV_STD
DeviceMem save_mean_dev(sizeof(SaveMeanInvStdDataType) * save_mean.mDesc.GetElementSpaceSize());
DeviceMem save_inv_std_dev(sizeof(SaveMeanInvStdDataType) *
save_inv_std.mDesc.GetElementSpaceSize());
#endif
x_dev.ToDevice(x.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data());
auto device_instance = DeviceInstance{};
auto argument_ptr = device_instance.MakeArgumentPointer(
{N, H, W, C},
std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
{0, W * C, C, 1},
{0, W * C, C, 1},
std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
save_mean.mDesc.GetStrides().end()},
std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
save_mean.mDesc.GetStrides().end()},
{1, 2, 3},
1e-4,
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
#ifdef SAVE_MEAN_INV_STD
save_mean_dev.GetDeviceBuffer(),
save_inv_std_dev.GetDeviceBuffer(),
#else
nullptr,
nullptr,
#endif
PassThrough{});
if(!device_instance.IsSupportedArgument(argument_ptr.get()))
{
std::cout << "The runtime parameters are not supported" << std::endl;
return 1;
};
size_t workspace_sz = device_instance.GetWorkSpaceSize(argument_ptr.get());
DeviceMem workspace_dev(workspace_sz);
device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
auto invoker_ptr = device_instance.MakeInvokerPointer();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
bool pass = true;
{
Tensor<YDataType> host_y({N, H, W, C});
Tensor<SaveMeanInvStdDataType> host_save_mean({N});
Tensor<SaveMeanInvStdDataType> host_save_inv_std({N});
using ReferenceInstance =
ck::tensor_operation::host::ReferenceLayernorm<XDataType,
GammaDataType,
BetaDataType,
YDataType,
SaveMeanInvStdDataType,
ComputeDataType,
PassThrough,
Rank,
NumReduceDim>;
ReferenceInstance ref;
auto ref_argument = ref.MakeArgument(x,
gamma,
beta,
host_y,
host_save_mean,
host_save_inv_std,
PassThrough{},
{N, H, W, C},
{1, 2, 3},
1e-4);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
y_dev.FromDevice(y.mData.data());
pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results (y)", 1e-3, 1e-3);
#ifdef SAVE_MEAN_INV_STD
save_mean_dev.FromDevice(save_mean.mData.data());
save_inv_std_dev.FromDevice(save_inv_std.mData.data());
pass &= ck::utils::check_err(
save_mean, host_save_mean, "Error: Incorrect results (mean)", 1e-3, 1e-3);
pass &= ck::utils::check_err(
save_inv_std, host_save_inv_std, "Error: Incorrect results (inv_std)", 1e-3, 1e-3);
#endif
}
return (pass ? 0 : 1);
}
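The device result above is checked against ReferenceLayernorm; as a plain-C++ sketch of the same math for one sample of length K = H * W * C (identity epilogue, illustrative names only, assumes <cmath>):

#include <cmath>

// Minimal layernorm sketch for one sample (illustration only): mean and inverse standard
// deviation over the K reduced elements, then scale/shift with gamma and beta.
void layernorm_one_sample(const float* x, const float* gamma, const float* beta,
                          float* y, int K, float epsilon)
{
    double mean = 0.0, mean_sq = 0.0;
    for(int k = 0; k < K; ++k)
    {
        mean += x[k];
        mean_sq += static_cast<double>(x[k]) * x[k];
    }
    mean /= K;
    mean_sq /= K;
    const double inv_std = 1.0 / std::sqrt(mean_sq - mean * mean + epsilon);
    for(int k = 0; k < K; ++k)
        y[k] = static_cast<float>((x[k] - mean) * inv_std) * gamma[k] + beta[k];
}

The save_mean and save_inv_std tensors in the example store exactly the per-sample mean and inv_std computed this way.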
@@ -11,31 +11,27 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
   if(DEFINED DTYPES)
     foreach(source IN LISTS FILE_NAME)
       set(test 0)
-      foreach(type IN LISTS DTYPES)
-        if(type MATCHES "fp16")
-          set(type1 "_f16")
-        elseif(type MATCHES "fp32")
-          set(type1 "_f32")
-        elseif(type MATCHES "fp8")
-          set(type1 "_f8")
-        elseif(type MATCHES "bf16")
-          set(type1 "_b16")
-        elseif(type MATCHES "fp64")
-          set(type1 "_f64")
-        elseif(type MATCHES "int8")
-          set(type1 "_i8")
-        endif()
-        if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}")
-          #if filename matches any selected type, exit type loop and do no exclude the file from the list
-          set(test 0)
-          break()
-        elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
-                source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
-               NOT(source MATCHES type OR source MATCHES type1))
-          #if filename contains a type which doesn't match any selected type, mark it for removal
-          set(test 1)
-        endif()
-      endforeach()
+      if((source MATCHES "_fp16" OR source MATCHES "_f16") AND NOT "fp16" IN_LIST DTYPES)
+        set(test 1)
+      endif()
+      if((source MATCHES "_fp32" OR source MATCHES "_f32") AND NOT "fp32" IN_LIST DTYPES)
+        set(test 1)
+      endif()
+      if((source MATCHES "_fp64" OR source MATCHES "_f64") AND NOT "fp64" IN_LIST DTYPES)
+        set(test 1)
+      endif()
+      if((source MATCHES "_fp8" OR source MATCHES "_f8") AND NOT "fp8" IN_LIST DTYPES)
+        set(test 1)
+      endif()
+      if((source MATCHES "_bf8" OR source MATCHES "_bf8") AND NOT "bf8" IN_LIST DTYPES)
+        set(test 1)
+      endif()
+      if((source MATCHES "_bf16" OR source MATCHES "_b16") AND NOT "bf16" IN_LIST DTYPES)
+        set(test 1)
+      endif()
+      if((source MATCHES "_int8" OR source MATCHES "_i8") AND NOT "int8" IN_LIST DTYPES)
+        set(test 1)
+      endif()
       if(test EQUAL 1)
         message("removing example source file ${source} ")
         list(REMOVE_ITEM FILE_NAME "${source}")
@@ -74,31 +70,27 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
   if(DEFINED DTYPES)
     foreach(source IN LISTS FILE_NAME)
       set(test 0)
-      foreach(type IN LISTS DTYPES)
-        if(type MATCHES "fp16")
-          set(type1 "_f16")
-        elseif(type MATCHES "fp32")
-          set(type1 "_f32")
-        elseif(type MATCHES "fp8")
-          set(type1 "_f8")
-        elseif(type MATCHES "bf16")
-          set(type1 "_b16")
-        elseif(type MATCHES "fp64")
-          set(type1 "_f64")
-        elseif(type MATCHES "int8")
-          set(type1 "_i8")
-        endif()
-        if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}")
-          #if filename matches any selected type, exit type loop and do no exclude the file from the list
-          set(test 0)
-          break()
-        elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
-                source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
-               NOT(source MATCHES type OR source MATCHES type1))
-          #if filename contains a type which doesn't match any selected type, mark it for removal
-          set(test 1)
-        endif()
-      endforeach()
+      if((source MATCHES "_fp16" OR source MATCHES "_f16") AND NOT "fp16" IN_LIST DTYPES)
+        set(test 1)
+      endif()
+      if((source MATCHES "_fp32" OR source MATCHES "_f32") AND NOT "fp32" IN_LIST DTYPES)
+        set(test 1)
+      endif()
+      if((source MATCHES "_fp64" OR source MATCHES "_f64") AND NOT "fp64" IN_LIST DTYPES)
+        set(test 1)
+      endif()
+      if((source MATCHES "_fp8" OR source MATCHES "_f8") AND NOT "fp8" IN_LIST DTYPES)
+        set(test 1)
+      endif()
+      if((source MATCHES "_bf8" OR source MATCHES "_bf8") AND NOT "bf8" IN_LIST DTYPES)
+        set(test 1)
+      endif()
+      if((source MATCHES "_bf16" OR source MATCHES "_b16") AND NOT "bf16" IN_LIST DTYPES)
+        set(test 1)
+      endif()
+      if((source MATCHES "_int8" OR source MATCHES "_i8") AND NOT "int8" IN_LIST DTYPES)
+        set(test 1)
+      endif()
       if(test EQUAL 1)
         message("removing example ${source} ")
         list(REMOVE_ITEM FILE_NAME "${source}")
...
@@ -66,6 +66,10 @@
 #define CK_USE_AMD_V_FMAC_F32
 #define CK_USE_AMD_V_DOT2_F32_F16
 #define CK_USE_AMD_V_DOT4_I32_I8
+#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
+#define CK_USE_AMD_V_FMAC_F32
+#define CK_USE_AMD_V_DOT2_F32_F16
+#define CK_USE_AMD_V_DOT4_I32_I8_GFX11
 #endif
 
 // MFMA instruction
...
@@ -3,8 +3,10 @@
 #pragma once
 
+#include <sstream>
+
 #include <hip/hip_runtime.h>
 
+// To be removed, which really does not tell the location of failed HIP functional call
 inline void hip_check_error(hipError_t x)
 {
     if(x != hipSuccess)
@@ -15,3 +17,16 @@ inline void hip_check_error(hipError_t x)
         throw std::runtime_error(ss.str());
     }
 }
+
+#define HIP_CHECK_ERROR(retval_or_funcall)                                         \
+    do                                                                             \
+    {                                                                              \
+        hipError_t _tmpVal = retval_or_funcall;                                    \
+        if(_tmpVal != hipSuccess)                                                  \
+        {                                                                          \
+            std::ostringstream ostr;                                               \
+            ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
+                 << hipGetErrorString(_tmpVal);                                    \
+            throw std::runtime_error(ostr.str());                                  \
+        }                                                                          \
+    } while(0)
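A hedged usage sketch for the new macro; any HIP runtime call that returns hipError_t can be wrapped this way (buffer size arbitrary):

float* p_dev = nullptr;
HIP_CHECK_ERROR(hipMalloc(&p_dev, 1024 * sizeof(float))); // throws with file/line on failure
HIP_CHECK_ERROR(hipMemset(p_dev, 0, 1024 * sizeof(float)));
HIP_CHECK_ERROR(hipFree(p_dev));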
@@ -33,7 +33,8 @@ template <index_t NumDimM,
           typename EDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
-          typename CDEElementwiseOperation>
+          typename CDEElementwiseOperation,
+          typename ComputeDataType = ADataType>
 struct DeviceContractionMultipleD : public BaseOperator
 {
     static constexpr index_t NumDTensor = DsDataType::Size();
...
@@ -14,11 +14,12 @@ namespace device {
 /**
  * \brief Convolution Tensor Rearrange.
  *
- * This Device operator supports conversion image ([G, N, Di, Hi, Wi, C]) to
- * the gemm problem([N * Do * Ho * Wo, Z * Y * X * C]) (Image to Column) and
- * conversion gemm form to the image (Column to Image).
- *
- * Note that G must be equal to 1.
+ * This Device operator supports converting an image to
+ * the GEMM representation (Image to Column) and
+ * converting a GEMM form to the image (Column to Image).
+ * Supported layouts:
+ * [G, N, Di, Hi, Wi, C] <-> [G, N * Do * Ho * Wo, Z * Y * X * C]
+ * [N, Di, Hi, Wi, G, C] <-> [N * Do * Ho * Wo, G, Z * Y * X * C]
  *
  * \tparam NDimSpatial Number of spatial dimensions.
  * \tparam ImageLayout Input Layout.
@@ -39,13 +40,14 @@ struct DeviceConvTensorRearrange : public BaseOperator
      *
      * \param p_in A pointer to the device memory of the input image.
      * \param p_out A pointer to the device memory of the output.
+     * \param G Convolution number of groups.
      * \param N Convolution batch size.
      * \param C Convolution number of channels.
      * \param input_spatial_lengths Input spatial lengths.
      * \param filter_spatial_lengths Filter spatial lengths.
      * \param output_spatial_lengths Output spatial lengths.
      * \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W].
-     * \param gemm_m_k_strides Gemm form strides.
+     * \param gemm_g_m_k_strides Gemm form strides.
      * \param conv_filter_strides Convolution filter strides.
      * \param conv_filter_dilations Convolution filter dilations.
      * \param input_left_pads Convolution left pads.
@@ -55,13 +57,14 @@ struct DeviceConvTensorRearrange : public BaseOperator
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_in,
                         void* p_out,
+                        const ck::index_t G,
                         const ck::index_t N,
                         const ck::index_t C,
                         const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                         const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                        const std::array<index_t, 2>& gemm_m_k_strides,
+                        const std::array<index_t, 3>& gemm_g_m_k_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                         const std::array<index_t, NDimSpatial>& input_left_pads,
...
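For orientation, a hedged sketch of how the widened gemm_g_m_k_strides argument could be filled for a packed [G, M, K] GEMM form, following the layout note above (all numbers illustrative):

// Hypothetical 2D case: N = 2, Ho = Wo = 8, Y = X = 3, C = 4, packed row-major [G, M, K].
const ck::index_t M = 2 * 8 * 8; // N * Ho * Wo
const ck::index_t K = 3 * 3 * 4; // Y * X * C
const std::array<ck::index_t, 3> gemm_g_m_k_strides = {M * K, K, 1};
// For the [N * Do * Ho * Wo, G, Z * Y * X * C] layout the group stride would be K instead.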
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <memory>
#include <array>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename InDataTypeTuple,
typename OutDataTypeTuple,
typename ElementwiseOperation,
typename UnaryOperation,
typename Scale,
index_t NumDim>
struct DeviceElementwise : public BaseOperator
{
static constexpr int NumInput = InDataTypeTuple::Size();
static constexpr int NumOutput = OutDataTypeTuple::Size();
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
const std::array<const void*, NumInput> in_dev_buffers,
const std::array<void*, NumOutput> out_dev_buffers,
ElementwiseOperation elementwise_op,
UnaryOperation unary_op,
Scale scale_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename InDataTypeTuple,
typename OutDataTypeTuple,
typename ElementwiseOperation,
typename UnaryOperation,
typename Scale,
index_t NumDim>
using DeviceElementwisePtr = std::unique_ptr<DeviceElementwise<InDataTypeTuple,
OutDataTypeTuple,
ElementwiseOperation,
UnaryOperation,
Scale,
NumDim>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
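A hedged sketch of the call shape implied by this interface (illustration only; DeviceElementwise is abstract, so a concrete derived implementation would be instantiated, and the names below are hypothetical):

// Two inputs, one output, NumDim = 2, packed row-major [M, N] tensors.
// std::array<ck::index_t, 2> lengths = {M, N};
// std::array<std::array<ck::index_t, 2>, 2> in_strides  = {{{N, 1}, {N, 1}}};
// std::array<std::array<ck::index_t, 2>, 1> out_strides = {{{N, 1}}};
// auto argument_ptr = device_op.MakeArgumentPointer(lengths, in_strides, out_strides,
//                                                   {p_a, p_b}, {p_c},
//                                                   elementwise_op, unary_op, scale_op);
// device_op.MakeInvokerPointer()->Run(argument_ptr.get(), StreamConfig{});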
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <array>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
#include "ck/utility/is_detected.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple());
/**
* \brief Grouped Convolution Forward
*
* \details
* input : input image A[G, N, C, Hi, Wi], A1[G, N, C, Hi, Wi]...
* input : weight B[G, K, C, Y, X], B1[G, K, C, Y, X]...
* input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ...
* output : output image E[G, N, K, Ho, Wo]
*
* C = a_op(A, A1...) * b_op(B, B1...)
* E = cde_op(C, D0, D1, ...)
*
* \tparam NDimSpatial Number of spatial dimensions.
* \tparam ALayout Input layout (also for a1, a2...).
* \tparam BLayout Weight layout (also for b1, b2...).
* \tparam DsLayout Ds layouts.
* \tparam ELayout Output layout.
 * \tparam ADataType Input data type. Pass a tuple if there are multiple A tensors.
 * \tparam BDataType Weight data type. Pass a tuple if there are multiple B tensors.
* \tparam DsDataType D data types.
* \tparam EDataType Output data type.
* \tparam AElementwiseOperation A elementwise operation.
* \tparam BElementwiseOperation B elementwise operation.
* \tparam CDEElementwiseOperation CDE elementwise operation.
* \tparam ComputeType Compute data type (default: ADataType, first if tuple passed).
*/
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
typename ADataType,
typename BDataType,
typename DsDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
typename ComputeType =
decltype(UnpackDataType<is_detected<is_tuple, ADataType>::value,
Number<0>,
ADataType>())> // ComputeType is InputType by default (first
// in tuple for MultiAB), unpack if tuple was
// passed
struct DeviceGroupedConvFwdMultipleABD : public BaseOperator
{
static constexpr bool isMultiA = is_detected<is_tuple, ADataType>::value;
static constexpr bool isMultiB = is_detected<is_tuple, BDataType>::value;
static constexpr index_t NumATensor = GetNumABTensors<isMultiA, ADataType>();
static constexpr index_t NumBTensor = GetNumABTensors<isMultiB, BDataType>();
static constexpr index_t NumDTensor = DsDataType::Size();
static_assert(NumDTensor == DsLayout::Size(), "wrong! Inconsistent NumDTensor");
// If DataType is tuple, user has to pass std::array with pointers.
using APointers =
std::conditional_t<isMultiA, std::array<const void*, NumATensor>&, const void*>;
using BPointers =
std::conditional_t<isMultiB, std::array<const void*, NumBTensor>&, const void*>;
/**
* \brief Make argument pointer for grouped conv fwd.
*
 * \param p_a A pointer to the input (std::array<const void*, NumATensor> with
 *            pointers for multiple A).
 * \param p_b A pointer to the weight (std::array<const void*, NumBTensor> with
 *            pointers for multiple B).
 * \param p_ds Pointers to the Ds.
 * \param p_e A pointer to the output.
* \param a_g_n_c_wis_lengths Input lengths [G, N, C, Spatial...] (for 3d).
* \param a_g_n_c_wis_strides Input strides [G, N, C, Spatial...] (for 3d).
* \param b_g_k_c_xs_lengths Weight lengths [G, K, C, Spatial...] (for 3d).
* \param b_g_k_c_xs_strides Weight strides [G, K, C, Spatial...] (for 3d).
* \param ds_g_n_k_wos_lengths Ds lengths [G, N, K, Spatial...] (for 3d).
* \param ds_g_n_k_wos_strides Ds strides [G, N, K, Spatial...] (for 3d).
* \param e_g_n_k_wos_lengths Output lengths [G, N, K, Spatial...] (for 3d).
* \param e_g_n_k_wos_strides Output strides [G, N, K, Spatial...] (for 3d).
* \param conv_filter_strides Convolution filter strides.
* \param conv_filter_dilations Convolution filter dilations.
* \param input_left_pads Input left paddings.
* \param input_right_pads Input right paddings.
* \param a_element_op A elementwise operation object.
* \param b_element_op B elementwise operation object.
* \param cde_element_op CDE elementwise operation object.
* \return Pointer to the argument.
*/
virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
APointers p_a,
BPointers p_b,
const std::array<const void*, NumDTensor>& p_ds,
void* p_e,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths,
const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides,
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads,
const AElementwiseOperation& a_element_op,
const BElementwiseOperation& b_element_op,
const CDEElementwiseOperation& cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
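To make the E = cde_op(C, D0, D1, ...) contract concrete, a toy scalar epilogue in the usual CK element-op call shape (an illustration, not part of this interface; a bias add followed by ReLU is just one possible CDEElementwiseOperation):

// e = max(c + d0, 0): one D tensor acting as a bias, followed by a ReLU.
struct AddBiasRelu
{
    template <typename E, typename C, typename D0>
    void operator()(E& e, const C& c, const D0& d0) const
    {
        const float tmp = static_cast<float>(c) + static_cast<float>(d0);
        e = static_cast<E>(tmp > 0.f ? tmp : 0.f);
    }
};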
@@ -3,21 +3,33 @@
 #pragma once
 
-#include <array>
-
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 
-// Convolution Forward:
-//   input : input image A[G, N, C, Hi, Wi],
-//   input : weight B[G, K, C, Y, X],
-//   input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ...
-//   output : output image E[G, N, K, Ho, Wo]
-// C = a_op(A) * b_op(B)
-// E = cde_op(C, D0, D1, ...)
+/**
+ * \brief Grouped Convolution Forward
+ *
+ * \note This structure is deprecated (left for backwards compatibility). Please use
+ *       DeviceGroupedConvFwdMultipleABD.
+ *
+ * \tparam NDimSpatial Number of spatial dimensions.
+ * \tparam ALayout Input layout (also for a1, a2...).
+ * \tparam BLayout Weight layout (also for b1, b2...).
+ * \tparam DsLayout Ds layouts.
+ * \tparam ELayout Output layout.
+ * \tparam ADataType Input data type. Pass tuple if there is multiple A.
+ * \tparam BDataType Weight data type. Pass tuple if there is multiple B.
+ * \tparam DsDataType D data types.
+ * \tparam EDataType Output data type.
+ * \tparam AElementwiseOperation A elementwise operation.
+ * \tparam BElementwiseOperation B elementwise operation.
+ * \tparam CDEElementwiseOperation CDE elementwise operation.
+ * \tparam ComputeType Compute data type (default: ADataType, first if tuple passed).
+ */
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -30,36 +42,25 @@ template <index_t NDimSpatial,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CDEElementwiseOperation,
-          typename ComputeType = ADataType>
-struct DeviceGroupedConvFwdMultipleD : public BaseOperator
-{
-    static constexpr index_t NumDTensor = DsDataType::Size();
-
-    static_assert(NumDTensor == DsLayout::Size(), "wrong! Inconsistent NumDTensor");
-
-    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
-        const void* p_a, // input image
-        const void* p_b, // weight
-        const std::array<const void*, NumDTensor>& p_ds,
-        void* p_e, // output image
-        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
-        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
-        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
-        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
-        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths,
-        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides,
-        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
-        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
-        const std::array<index_t, NDimSpatial>& conv_filter_strides,
-        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
-        const std::array<index_t, NDimSpatial>& input_left_pads,
-        const std::array<index_t, NDimSpatial>& input_right_pads,
-        const AElementwiseOperation& a_element_op,
-        const BElementwiseOperation& b_element_op,
-        const CDEElementwiseOperation& cde_element_op) = 0;
-
-    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
-};
+          typename ComputeType =
+              decltype(UnpackDataType<is_detected<is_tuple, ADataType>::value,
                                      Number<0>,
+                                      ADataType>())> // ComputeType is InputType by default (first
+                                                     // in tuple for MultiAB), unpack if tuple was
+                                                     // passed
+using DeviceGroupedConvFwdMultipleD = DeviceGroupedConvFwdMultipleABD<NDimSpatial,
+                                                                      ALayout,
+                                                                      BLayout,
+                                                                      DsLayout,
+                                                                      ELayout,
+                                                                      ADataType,
+                                                                      BDataType,
+                                                                      DsDataType,
+                                                                      EDataType,
+                                                                      AElementwiseOperation,
+                                                                      BElementwiseOperation,
+                                                                      CDEElementwiseOperation,
+                                                                      ComputeType>;
 
 } // namespace device
 } // namespace tensor_operation
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename DYDataType,
typename XDataType,
typename MeanInvStdDataType,
typename DGammaDataType,
typename DBetaDataType,
index_t Rank,
index_t NumReduceDim>
struct DeviceNormalizationBwdGammaBeta : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<index_t> inLengths,
const std::vector<index_t> dyStrides,
const std::vector<index_t> xStrides,
const std::vector<index_t> meanStrides,
const std::vector<index_t> invStdStrides,
const std::vector<index_t> outLengths,
const std::vector<index_t> dgammaStrides,
const std::vector<index_t> dbetaStrides,
const std::vector<index_t> reduceDims,
const void* p_dy,
const void* p_x,
const void* p_mean,
const void* p_invStd,
void* p_dgamma,
void* p_dbeta) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename DYDataType,
typename XDataType,
typename MeanInvStdDataType,
typename DGammaDataType,
typename DBetaDataType,
index_t Rank,
index_t NumReduceDim>
using DeviceNormalizationBwdGammaBetaPtr =
std::unique_ptr<DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
Rank,
NumReduceDim>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -19,7 +19,7 @@ template <typename XDataType,
           typename YElementwiseOperation,
           index_t Rank,
           index_t NumReduceDim>
-struct DeviceNormalization : public BaseOperator
+struct DeviceNormalizationFwd : public BaseOperator
 {
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const std::vector<index_t> lengths,
@@ -50,14 +50,14 @@ template <typename XDataType,
           typename YElementwiseOperation,
           index_t Rank,
           index_t NumReduceDim>
-using DeviceNormalizationPtr = std::unique_ptr<DeviceNormalization<XDataType,
-                                                                   GammaDataType,
-                                                                   BetaDataType,
-                                                                   YDataType,
-                                                                   SaveMeanInvStdDataType,
-                                                                   YElementwiseOperation,
-                                                                   Rank,
-                                                                   NumReduceDim>>;
+using DeviceNormalizationFwdPtr = std::unique_ptr<DeviceNormalizationFwd<XDataType,
+                                                                         GammaDataType,
+                                                                         BetaDataType,
+                                                                         YDataType,
+                                                                         SaveMeanInvStdDataType,
+                                                                         YElementwiseOperation,
+                                                                         Rank,
+                                                                         NumReduceDim>>;
 
 } // namespace device
 } // namespace tensor_operation
...
@@ -17,15 +17,18 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/host_utility/io.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 
-// Image to column for input layout NDHWC:
-//   input : image converted to the gemm problem [N * Do * Ho * Wo, Z * Y * X * C]
-//   output : image [N, Di, Hi, Wi, C]
+// Column to Image:
+//   input : gemm form [G, N * Do * Ho * Wo, Z * Y * X * C]
+//   output : input image [G, N, Di, Hi, Wi, C]
+//   input : gemm form [N * Do * Ho * Wo, G, Z * Y * X * C]
+//   output : input image [N, Di, Hi, Wi, G, C]
 template <index_t NDimSpatial,
           typename ImageLayout,
           typename InputDataType,
@@ -43,6 +46,14 @@ struct DeviceColumnToImageImpl
                                     OutputDataType,
                                     conv_tensor_rearrange_op::ColumnToImage>
 {
+    static constexpr bool is_NSpatialGC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
+    static constexpr bool is_GNSpatialC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -90,7 +101,7 @@ struct DeviceColumnToImageImpl
                          const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                          const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                          const std::array<index_t, NDimSpatial>& conv_filter_strides,
-                         const std::array<index_t, 2>& gemm_m_k_strides,
+                         const std::array<index_t, 3>& gemm_g_m_k_strides,
                          const std::array<index_t, NDimSpatial>& independent_filters,
                          const std::array<index_t, NDimSpatial>& effs)
     {
@@ -100,23 +111,23 @@ struct DeviceColumnToImageImpl
             C * ck::accumulate_n<index_t>(
                     filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
-        const index_t NStride = DoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1];
+        const index_t NStride = DoHoWo * gemm_g_m_k_strides[I1] * gemm_g_m_k_strides[I2];
         // Calculate the appropriate stride for each set of independent filters
         // in each dimension
-        const index_t WStride =
-            math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) * gemm_m_k_strides[I0];
-        const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) *
-                                output_spatial_lengths[XIdx] * gemm_m_k_strides[I0];
-        const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) *
-                                output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] *
-                                gemm_m_k_strides[I0];
+        const index_t WStride = math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) *
+                                gemm_g_m_k_strides[I1];
+        const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) *
+                                output_spatial_lengths[XIdx] * gemm_g_m_k_strides[I1];
+        const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) *
+                                output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] *
+                                gemm_g_m_k_strides[I1];
         // Create descriptor for independent filters in each dimension and
         // then merge them into column form
         if constexpr(NDimSpatial == 1)
        {
            const auto desc_gemm_form =
                make_naive_tensor_descriptor(make_tuple(N, independent_filters[XIdx], CZYX),
-                                             make_tuple(NStride, WStride, gemm_m_k_strides[I1]));
+                                             make_tuple(NStride, WStride, gemm_g_m_k_strides[I2]));
            const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
                desc_gemm_form,
                make_tuple(make_merge_transform(make_tuple(N, independent_filters[XIdx])),
@@ -130,7 +141,7 @@ struct DeviceColumnToImageImpl
        {
            const auto desc_gemm_form = make_naive_tensor_descriptor(
                make_tuple(N, independent_filters[YIdx], independent_filters[XIdx], CZYX),
-                make_tuple(NStride, HStride, WStride, gemm_m_k_strides[I1]));
+                make_tuple(NStride, HStride, WStride, gemm_g_m_k_strides[I2]));
            const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
                desc_gemm_form,
                make_tuple(make_merge_transform(
@@ -149,7 +160,7 @@ struct DeviceColumnToImageImpl
                           independent_filters[YIdx],
                           independent_filters[XIdx],
                           CZYX),
-                make_tuple(NStride, DStride, HStride, WStride, gemm_m_k_strides[I1]));
+                make_tuple(NStride, DStride, HStride, WStride, gemm_g_m_k_strides[I2]));
            const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
                desc_gemm_form,
                make_tuple(make_merge_transform(make_tuple(N,
@@ -262,24 +273,27 @@ struct DeviceColumnToImageImpl
                                      ThreadClusterLengths,
                                      ScalarPerVector,
                                      InMemoryDataOperationEnum::Add,
-                                     Block2ETileMap>;
+                                     Block2ETileMap,
+                                     ComputePtrOffsetOfStridedBatch<>>;
 
    struct Argument : public BaseArgument
    {
        Argument(const void* p_in, // input image
                 void* p_out,      // output image
+                const ck::index_t G,
                 const ck::index_t N,
                 const ck::index_t C,
                 const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                 const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                 const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                 const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                const std::array<index_t, 2>& gemm_m_k_strides,
+                const std::array<index_t, 3>& gemm_g_m_k_strides,
                 const std::array<index_t, NDimSpatial>& conv_filter_strides,
                 const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                 const std::array<index_t, NDimSpatial>& input_left_pads,
                 const std::array<index_t, NDimSpatial>& input_right_pads)
-            : C_(C),
+            : G_(G),
+              C_(C),
              X_(filter_spatial_lengths[NDimSpatial - I1]),
              p_in_{static_cast<const InputDataType*>(p_in)},
              p_out_{static_cast<OutputDataType*>(p_out)},
@@ -289,6 +303,9 @@ struct DeviceColumnToImageImpl
              input_left_pads_{input_left_pads},
              input_right_pads_{input_right_pads}
        {
+            compute_ptr_offset_of_batch_.BatchStrideA_ = gemm_g_m_k_strides[I0];
+            compute_ptr_offset_of_batch_.BatchStrideC_ = image_g_n_c_wis_strides[I0];
+
            const index_t x_eff =
                (filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1;
            const index_t y_eff =
@@ -354,7 +371,7 @@ struct DeviceColumnToImageImpl
                                                filter_spatial_lengths,
                                                output_spatial_lengths,
                                                conv_filter_strides,
-                                                gemm_m_k_strides,
+                                                gemm_g_m_k_strides,
                                                independent_filters,
                                                effs);
                        const auto out_grid_desc_m_k =
@@ -387,10 +404,9 @@ struct DeviceColumnToImageImpl
                        // Memory offsets to next set of independent filters,
                        // move to independent filters in each dimension
                        const index_t in_offset =
-                            x_idx * gemm_m_k_strides[0] +
-                            y_idx * gemm_m_k_strides[0] * output_spatial_lengths[XIdx] +
-                            z_idx * gemm_m_k_strides[0] * output_spatial_lengths[YIdx] *
-                                output_spatial_lengths[XIdx];
+                            (x_idx + y_idx * output_spatial_lengths[XIdx] +
+                             z_idx * output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx]) *
+                            gemm_g_m_k_strides[I1];
                        // Move to independent filters in appropriate dimensions
                        const index_t out_offset =
                            x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] +
@@ -417,6 +433,7 @@ struct DeviceColumnToImageImpl
            }
        }
 
+        const ck::index_t G_;
        const ck::index_t C_;
        const ck::index_t X_;
@@ -434,6 +451,8 @@ struct DeviceColumnToImageImpl
        std::vector<const InputDataType*> p_in_container_;
        std::vector<OutputDataType*> p_out_container_;
+
+        ComputePtrOffsetOfStridedBatch<> compute_ptr_offset_of_batch_;
    };
 
    struct Invoker : public BaseInvoker
@@ -451,6 +470,7 @@ struct DeviceColumnToImageImpl
                                               OutputGridDesc,
                                               OutputDataType,
                                               Block2ETileMap,
+                                               ComputePtrOffsetOfStridedBatch<>,
                                               GridwiseTensorRearrangeKernel>;
 
            // Execute each set of independent filters
@@ -460,7 +480,7 @@ struct DeviceColumnToImageImpl
                    BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
                        arg.out_grid_desc_m_k_container_[i]);
                const index_t grid_size =
-                    block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]);
+                    block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]) * arg.G_;
                elapsed_time += launch_and_time_kernel(stream_config,
                                                       kernel,
                                                       dim3(grid_size),
@@ -470,7 +490,9 @@ struct DeviceColumnToImageImpl
                                                       arg.p_in_container_[i],
                                                       arg.out_grid_desc_m_k_container_[i],
                                                       arg.p_out_container_[i],
-                                                       block_2_tile_map);
+                                                       arg.G_,
+                                                       block_2_tile_map,
+                                                       arg.compute_ptr_offset_of_batch_);
            }
            return elapsed_time;
        }
@@ -485,8 +507,7 @@ struct DeviceColumnToImageImpl
    bool IsSupportedArgument(const Argument& arg)
    {
        using namespace tensor_layout::convolution;
-        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
-                       std::is_same_v<ImageLayout, GNDHWC>))
+        if constexpr(!(is_NSpatialGC || is_GNSpatialC))
        {
            return false;
        }
@@ -534,13 +555,14 @@ struct DeviceColumnToImageImpl
    static auto MakeArgument(const void* p_in, // input image
                             void* p_out,      // output image
+                             const ck::index_t G,
                             const ck::index_t N,
                             const ck::index_t C,
                             const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                             const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                             const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                             const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                             const std::array<index_t, 2>& gemm_m_k_strides,
+                             const std::array<index_t, 3>& gemm_g_m_k_strides,
                             const std::array<index_t, NDimSpatial>& conv_filter_strides,
                             const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                             const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -548,13 +570,14 @@ struct DeviceColumnToImageImpl
    {
        return Argument{static_cast<const InputDataType*>(p_in),
                        static_cast<OutputDataType*>(p_out),
+                        G,
                        N,
                        C,
                        input_spatial_lengths,
                        filter_spatial_lengths,
                        output_spatial_lengths,
                        image_g_n_c_wis_strides,
-                        gemm_m_k_strides,
+                        gemm_g_m_k_strides,
                        conv_filter_strides,
                        conv_filter_dilations,
                        input_left_pads,
@@ -566,13 +589,14 @@ struct DeviceColumnToImageImpl
    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_in, // input image
                        void* p_out,      // output image
+                        const ck::index_t G,
                        const ck::index_t N,
                        const ck::index_t C,
                        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                        const std::array<index_t, 2>& gemm_m_k_strides,
+                        const std::array<index_t, 3>& gemm_g_m_k_strides,
                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                        const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -580,13 +604,14 @@ struct DeviceColumnToImageImpl
    {
        return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
                                          static_cast<OutputDataType*>(p_out),
+                                          G,
                                          N,
                                          C,
                                          input_spatial_lengths,
                                          filter_spatial_lengths,
                                          output_spatial_lengths,
                                          image_g_n_c_wis_strides,
-                                          gemm_m_k_strides,
+                                          gemm_g_m_k_strides,
                                          conv_filter_strides,
                                          conv_filter_dilations,
                                          input_left_pads,
...
@@ -385,9 +385,11 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
     // desc for blockwise copy
     using AsGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeAsGridDescriptor_AK0_M_AK1(AsGridDesc_M_K{}))>;
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAsGridDescriptor_AK0_M_AK1(
+            AsGridDesc_M_K{}))>;
     using BsGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeBsGridDescriptor_BK0_N_BK1(BsGridDesc_N_K{}))>;
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBsGridDescriptor_BK0_N_BK1(
+            BsGridDesc_N_K{}))>;
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
         decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             DsGridDesc_M_N{}))>;
@@ -397,7 +399,7 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
     // block-to-e-tile map
     using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemm::MakeBlock2ETileMap(EGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -429,7 +431,7 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
               bs_grid_desc_bk0_n_bk1_{},
               ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
               e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeBlock2ETileMap(e_grid_desc_m_n_)},
+              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op}
@@ -481,10 +483,10 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
                               block_2_etile_map_))
         {
             as_grid_desc_ak0_m_ak1_ =
-                GridwiseGemm::MakeAsGridDescriptor_AK0_M_AK1(as_grid_desc_m_k_);
+                GridwiseGemm::MakeDefaultAsGridDescriptor_AK0_M_AK1(as_grid_desc_m_k_);
 
             bs_grid_desc_bk0_n_bk1_ =
-                GridwiseGemm::MakeBsGridDescriptor_BK0_N_BK1(bs_grid_desc_n_k_);
+                GridwiseGemm::MakeDefaultBsGridDescriptor_BK0_N_BK1(bs_grid_desc_n_k_);
 
             ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
                 GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
...
@@ -145,7 +145,8 @@ template <index_t NumDimM,
           index_t CShuffleNXdlPerWavePerShuffle,
           typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CDEBlockTransferScalarPerVector_NPerBlock,
-          LoopScheduler LoopSched = make_default_loop_scheduler()>
+          typename ComputeDataType = ADataType,
+          LoopScheduler LoopSched = make_default_loop_scheduler()>
 struct DeviceContractionMultipleD_Xdl_CShuffle
     : public DeviceContractionMultipleD<NumDimM,
                                         NumDimN,
@@ -156,7 +157,8 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                                         EDataType,
                                         AElementwiseOperation,
                                         BElementwiseOperation,
-                                        CDEElementwiseOperation>
+                                        CDEElementwiseOperation,
+                                        ComputeDataType>
 {
     using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle;
@@ -310,8 +312,6 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
     using DsGridDesc_M_N = remove_cvref_t<decltype(MakeDsGridDescriptor_M_N({{}}, {{}}))>;
     using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N({}, {}));
 
-    using ComputeDataType = ADataType;
-
     // GridwiseGemm
     using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
         ADataType, // TODO: distinguish A/B datatype
@@ -595,7 +595,9 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
             return false;
         }
 
-        if(ck::get_device_name() != "gfx90a" && std::is_same<ADataType, double>::value)
+        if(ck::get_device_name() != "gfx90a" && ck::get_device_name() != "gfx940" &&
+           ck::get_device_name() != "gfx941" && ck::get_device_name() != "gfx942" &&
+           std::is_same<ADataType, double>::value)
         {
             return false;
         }
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/math.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
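// DeviceElementwise3dImpl applies an elementwise operation to tensors whose NumDim
// dimensions are grouped into three merged axes (M, N, K): the first NumDim_m dims form
// M, the next NumDim_n form N, and the last NumDim_k form K. The merged 3-d view is
// right-padded to multiples of the per-thread tile (MPerThread, NPerThread, KPerThread)
// and processed by GridwiseElementwise_3D.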
template <typename InDataTypeTuple,
typename OutDataTypeTuple,
typename ElementwiseOperation,
index_t NumDim_m, // leading dims, merged into the M axis
index_t NumDim_n, // middle dims, merged into the N axis
index_t NumDim_k, // trailing dims, merged into the K axis
index_t MPerThread,
index_t NPerThread,
index_t KPerThread,
typename InScalarPerVectorSeq,
typename OutScalarPerVectorSeq>
struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
OutDataTypeTuple,
ElementwiseOperation,
NumDim_m + NumDim_n + NumDim_k>
{
static constexpr index_t NumDim = NumDim_m + NumDim_n + NumDim_k;
static constexpr int NumInput = InDataTypeTuple::Size();
static constexpr int NumOutput = OutDataTypeTuple::Size();
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static_assert(NumInput == InScalarPerVectorSeq::Size() &&
NumOutput == OutScalarPerVectorSeq::Size(),
"Tuple size is inconsistent with the number of in/out!");
static auto GenerateInDataTypePointerTuple()
{
return generate_tuple(
[&](auto I) {
using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
return static_cast<const DataType*>(nullptr);
},
Number<NumInput>{});
}
static auto GenerateOutDataTypePointerTuple()
{
return generate_tuple(
[&](auto I) {
using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
return static_cast<DataType*>(nullptr);
},
Number<NumOutput>{});
}
using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple());
using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple());
template <typename Desc_MNK>
static auto PadDescriptor_MNK(Desc_MNK desc_mnk,
index_t gridSize,
index_t blockSize,
index_t num_threads_m,
index_t num_threads_n,
index_t num_threads_k)
{
std::ignore = blockSize;
std::ignore = gridSize;
const auto m = desc_mnk.GetLength(I0);
const auto n = desc_mnk.GetLength(I1);
const auto k = desc_mnk.GetLength(I2);
const index_t loop_step_m = num_threads_m * MPerThread;
const index_t loop_step_n = num_threads_n * NPerThread;
const index_t loop_step_k = num_threads_k * KPerThread;
const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m;
const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n;
const auto pad_k = math::integer_least_multiple(k, loop_step_k) - k;
const auto desc_mnk_pad =
transform_tensor_descriptor(desc_mnk,
make_tuple(make_right_pad_transform(m, pad_m),
make_right_pad_transform(n, pad_n),
make_right_pad_transform(k, pad_k)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
return desc_mnk_pad;
}
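// Example of the padding arithmetic above (illustrative numbers): with m = 1000,
// num_threads_m = 4 and MPerThread = 8, loop_step_m = 32 and
// integer_least_multiple(1000, 32) = 1024, so pad_m = 24; each padded extent becomes a
// multiple of its per-dimension loop step.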
static auto MakeDescriptor_MNK(const std::array<index_t, NumDim>& lengths,
const std::array<index_t, NumDim>& stride,
index_t gridSize,
index_t blockSize,
index_t num_threads_m,
index_t num_threads_n,
index_t num_threads_k)
{
auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number<NumDim>{});
auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<NumDim>{});
// nd desc - [s0, s1, s2, ...]
const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDim_m, 1>::type();
constexpr auto nDimIds =
typename arithmetic_sequence_gen<NumDim_m, NumDim_m + NumDim_n, 1>::type();
constexpr auto kDimIds =
typename arithmetic_sequence_gen<NumDim_m + NumDim_n, NumDim, 1>::type();
const auto mLengths = get_container_subset(tupleOfShape, mDimIds);
const auto nLengths = get_container_subset(tupleOfShape, nDimIds);
const auto kLengths = get_container_subset(tupleOfShape, kDimIds);
// merge nd to 3d desc - [s0 * s1 * ...]
if constexpr(NumDim > 3)
{
const auto desc_mnk = transform_tensor_descriptor(
desc,
make_tuple(make_merge_transform(mLengths),
make_merge_transform(nLengths),
make_merge_transform(kLengths)),
make_tuple(mDimIds, nDimIds, kDimIds),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
return PadDescriptor_MNK(
desc_mnk, gridSize, blockSize, num_threads_m, num_threads_n, num_threads_k);
}
else
return PadDescriptor_MNK(
desc, gridSize, blockSize, num_threads_m, num_threads_n, num_threads_k);
}
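// Example (illustrative): with NumDim_m = 2, NumDim_n = 1, NumDim_k = 1 and lengths
// {8, 16, 32, 64}, the first two dims merge into M = 8 * 16 = 128 and the 4-d
// descriptor is transformed into a 3-d [128, 32, 64] view before padding.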
template <index_t TupleSize>
static auto GenerateInOutGrid3dDescTuple(Number<TupleSize>)
{
return generate_tuple(
[&](auto) {
if constexpr(NumDim > 3)
{
return MakeDescriptor_MNK({1, 1, 1}, {1, 1, 1}, 1, 1, 1, 1, 1);
}
else
{
return MakeDescriptor_MNK({1}, {1}, 1, 1, 1, 1, 1);
}
},
Number<TupleSize>{});
}
using OutGrid3dDescTuple = decltype(GenerateInOutGrid3dDescTuple(Number<NumOutput>{}));
using InGrid3dDescTuple = decltype(GenerateInOutGrid3dDescTuple(Number<NumInput>{}));
using GridwiseElementwise = GridwiseElementwise_3D<InGrid3dDescTuple,
OutGrid3dDescTuple,
InDataTypePointerTuple,
OutDataTypePointerTuple,
ElementwiseOperation,
MPerThread,
NPerThread,
KPerThread,
InScalarPerVectorSeq,
OutScalarPerVectorSeq>;
struct Argument : public BaseArgument
{
Argument(const std::array<index_t, NumDim> lengths,
const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
const std::array<const void*, NumInput> in_dev_buffers,
const std::array<void*, NumOutput> out_dev_buffers,
ElementwiseOperation elementwise_op)
: lengths_(lengths),
inStridesArray_(inStridesArray),
outStridesArray_(outStridesArray),
elementwise_op_(elementwise_op),
blockSize_(256)
{
static_assert(NumDim_m > 0, "");
static_assert(NumDim_n > 0, "");
static_assert(NumDim_k > 0, "");
in_dev_buffers_ = generate_tuple(
[&](auto I) {
using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
return static_cast<const DataType*>(in_dev_buffers[I.value]);
},
Number<NumInput>{});
out_dev_buffers_ = generate_tuple(
[&](auto I) {
using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
return static_cast<DataType*>(out_dev_buffers[I.value]);
},
Number<NumOutput>{});
}
InDataTypePointerTuple in_dev_buffers_;
OutDataTypePointerTuple out_dev_buffers_;
std::array<index_t, NumDim> lengths_;
std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray_;
ElementwiseOperation elementwise_op_;
index_t blockSize_;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
index_t gridSize = getAvailableComputeUnitCount(stream_config) * arg.blockSize_;
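// Per-dimension worker counts used to stride the 3-d descriptors: 16 along N,
// 16 along K, and gridSize / (16 * 16) along M, so their product equals gridSize.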
index_t num_threads_m = gridSize / (16 * 16);
index_t num_threads_n = 16;
index_t num_threads_k = 16;
auto in_grid_3d_desc_tuple = generate_tuple(
[&](auto I) {
return MakeDescriptor_MNK(arg.lengths_,
arg.inStridesArray_[I.value],
gridSize,
arg.blockSize_,
num_threads_m,
num_threads_n,
num_threads_k);
},
Number<NumInput>{});
auto out_grid_3d_desc_tuple = generate_tuple(
[&](auto I) {
return MakeDescriptor_MNK(arg.lengths_,
arg.outStridesArray_[I.value],
gridSize,
arg.blockSize_,
num_threads_m,
num_threads_n,
num_threads_k);
},
Number<NumOutput>{});
const auto kernel = kernel_elementwise_3d<GridwiseElementwise,
InGrid3dDescTuple,
OutGrid3dDescTuple,
InDataTypePointerTuple,
OutDataTypePointerTuple,
ElementwiseOperation>;
float elapsed_time = launch_and_time_kernel(stream_config,
kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
in_grid_3d_desc_tuple,
out_grid_3d_desc_tuple,
arg.in_dev_buffers_,
arg.out_dev_buffers_,
arg.elementwise_op_,
num_threads_m,
num_threads_n,
num_threads_k);
return elapsed_time;
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
if(pArg == nullptr)
return false;
if(pArg->lengths_.back() % MPerThread != 0)
return false;
auto IsScalarPerVectorValid = [&](const std::array<index_t, NumDim>& lengths,
const std::array<index_t, NumDim>& strides,
index_t scalarPerVector,
index_t vectorDim) {
if(strides[vectorDim] == 1 &&
(lengths[vectorDim] % scalarPerVector == 0 ||
lengths[vectorDim] % scalarPerVector == lengths[vectorDim]))
{
return true;
}
if(strides[vectorDim] >= scalarPerVector)
{
return true;
}
return false;
};
bool valid = true;
static_for<0, NumInput, 1>{}([&](auto I) {
valid = valid && IsScalarPerVectorValid(pArg->lengths_,
pArg->inStridesArray_[I.value],
InScalarPerVectorSeq::At(I),
NumDim_m - 1);
});
static_for<0, NumOutput, 1>{}([&](auto I) {
valid = valid && IsScalarPerVectorValid(pArg->lengths_,
pArg->outStridesArray_[I.value],
OutScalarPerVectorSeq::At(I),
NumDim - 1);
});
return valid;
}
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
const std::array<const void*, NumInput> in_dev_buffers,
const std::array<void*, NumOutput> out_dev_buffers,
ElementwiseOperation elementwise_op) override
{
return std::make_unique<Argument>(lengths,
inStridesArray,
outStridesArray,
in_dev_buffers,
out_dev_buffers,
elementwise_op);
}
static auto MakeInvoker() { return Invoker{}; }
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>();
}
}; // struct DeviceElementwise3dImpl
} // namespace device
} // namespace tensor_operation
} // namespace ck
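For context, a minimal usage sketch of this device op (illustrative, not part of the commit): the data-type tuples, the 2/1/1 dimension split, the tile sizes, and the packed strides below are assumptions, PassThrough stands in for an arbitrary elementwise functor, and DeviceMem is the example-side helper used for device allocations.

// Illustrative sketch: one fp16 input, one fp16 output, a 4-d tensor viewed as
// M = dims {0, 1}, N = dim {2}, K = dim {3}; all concrete values are assumptions.
#include <array>
#include <cstddef>
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"

inline void run_elementwise_3d_example()
{
    using Pass         = ck::tensor_operation::element_wise::PassThrough;
    using DeviceCopy3d = ck::tensor_operation::device::DeviceElementwise3dImpl<
        ck::Tuple<ck::half_t>, // input data types
        ck::Tuple<ck::half_t>, // output data types
        Pass,                  // elementwise functor
        2, 1, 1,               // NumDim_m, NumDim_n, NumDim_k
        8, 8, 8,               // MPerThread, NPerThread, KPerThread
        ck::Sequence<8>,       // InScalarPerVectorSeq
        ck::Sequence<8>>;      // OutScalarPerVectorSeq

    const std::array<ck::index_t, 4> lengths{8, 16, 32, 64};
    const std::array<ck::index_t, 4> strides{16 * 32 * 64, 32 * 64, 64, 1}; // packed, row-major
    const std::size_t num_elements = 8 * 16 * 32 * 64;

    DeviceMem in_buf(sizeof(ck::half_t) * num_elements); // assumed filled elsewhere
    DeviceMem out_buf(sizeof(ck::half_t) * num_elements);

    auto device_op = DeviceCopy3d{};
    auto argument  = device_op.MakeArgumentPointer(lengths,
                                                   {strides},
                                                   {strides},
                                                   {in_buf.GetDeviceBuffer()},
                                                   {out_buf.GetDeviceBuffer()},
                                                   Pass{});
    auto invoker   = device_op.MakeInvokerPointer();

    if(device_op.IsSupportedArgument(argument.get()))
    {
        invoker->Run(argument.get(), StreamConfig{});
    }
}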