Support multi AB for grouped conv fwd xdl (#1027)

* Support multi AB for grouped conv fwd xdl * Add instances * Add client example * Add example * Add interface test * Minor fixes Minor fixes Minor fixes * Comment fixes * Fixes * Reference fix * Test xdl fixes * Improve multi_ab interface test

Support multi AB for grouped conv fwd xdl (#1027)
* Support multi AB for grouped conv fwd xdl * Add instances * Add client example * Add example * Add interface test * Minor fixes Minor fixes Minor fixes * Comment fixes * Fixes * Reference fix * Test xdl fixes * Improve multi_ab interface test
49e52bb3 · Bartłomiej Kocot · GitHub · 1db75603 · 49e52bb3 · 49e52bb3
Unverified Commit 49e52bb3 authored Nov 10, 2023 by Bartłomiej Kocot Committed by GitHub Nov 10, 2023
20 changed files
--- a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc
+++ b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc
@@ -63,7 +63,7 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
        K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
    std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
    std::array<ck::index_t, 6> out_strides{
-        C, Do * Ho * Wo * G * C, 1, Ho * Wo * G * C, Wo * G * C, G * C};
+        K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};

--- a/client_example/24_grouped_convnd_fwd_scaleadd_ab/CMakeLists.txt
+++ b/client_example/24_grouped_convnd_fwd_scaleadd_ab/CMakeLists.txt
+add_executable(client_grouped_convnd_fwd_scaleadd_ab_fp32 grouped_conv_fwd_scaleadd_ab_fp32.cpp) # fp32 client example
+target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_fp32 PRIVATE composable_kernel::device_operations)
+add_executable(client_grouped_convnd_fwd_scaleadd_ab_fp16 grouped_conv_fwd_scaleadd_ab_fp16.cpp) # fp16 client example
+target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_fp16 PRIVATE composable_kernel::device_operations)
+add_executable(client_grouped_convnd_fwd_scaleadd_ab_bf16 grouped_conv_fwd_scaleadd_ab_bf16.cpp) # bf16 client example
+target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_bf16 PRIVATE composable_kernel::device_operations)
+add_executable(client_grouped_convnd_fwd_scaleadd_ab_int8 grouped_conv_fwd_scaleadd_ab_int8.cpp) # int8 client example
+target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_int8 PRIVATE composable_kernel::device_operations)
--- a/client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab.inc
+++ b/client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_ab.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+using InLayout    = ck::tensor_layout::convolution::NDHWGC; // layout tag of the input tensors
+using WeiLayout   = ck::tensor_layout::convolution::GKZYXC; // layout tag of the weight tensors
+using OutLayout   = ck::tensor_layout::convolution::NDHWGK; // layout tag of the output tensor
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // used as the output element-wise op
+using ScaleAdd    = ck::tensor_operation::element_wise::ScaleAdd; // used as the A and B element-wise op
+static constexpr ck::index_t NumDimSpatial = 3; // spatial dimensions: D, H, W
+static constexpr ck::index_t G             = 32; // number of groups
+static constexpr ck::index_t N             = 64; // batch size
+static constexpr ck::index_t K             = 64; // output channel (per group, see wei_lengths {G, K, C, ...})
+static constexpr ck::index_t C             = 32; // input channel (per group)
+static constexpr ck::index_t Z             = 3;  // filter D
+static constexpr ck::index_t Y             = 3;  // filter H
+static constexpr ck::index_t X             = 3;  // filter W
+static constexpr ck::index_t Di            = 14; // input D
+static constexpr ck::index_t Hi            = 14; // input H
+static constexpr ck::index_t Wi            = 14; // input W
+static constexpr ck::index_t Do            = 14; // output D
+static constexpr ck::index_t Ho            = 14; // output H
+static constexpr ck::index_t Wo            = 14; // output W
+// Minimal owner of one device allocation: allocates with hipMalloc on construction and
+// releases with hipFree on destruction.
+//
+// The object is non-copyable: a copy would hold the same pointer and free it twice.
+// The hipMalloc / hipFree status codes are deliberately discarded (best effort, as in the
+// other client examples); p_mem_ is value-initialized to null before the allocation.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    // mem_size: requested allocation size in bytes.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    // Sole ownership of p_mem_: forbid copies to avoid a double hipFree.
+    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    // Returns the raw device pointer; ownership stays with this object.
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+// Profiles every available grouped 3D forward-convolution instance that takes two A (input)
+// tensors and two B (weight) tensors with ScaleAdd as the A and B element-wise operation,
+// prints the fastest instance, and runs that instance once more without timing.
+//
+// Requires InDataType / WeiDataType (two-element ck::Tuple) and OutDataType to be defined by
+// the including translation unit before this file is included.
+//
+// Returns 0 on success, or EXIT_FAILURE when no instance supports the problem.
+int execute_conv_fwd_scaleadd_ab()
+{
+    constexpr ck::index_t NumAs = 2;
+    constexpr ck::index_t NumBs = 2;
+    constexpr float scale       = 1.5f;
+    // We have NDHWGC/GKZYXC/NDHWGK (x, weight, y) in memory space.
+    // However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
+    // Hence, we need to adjust the order of strides.
+    std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
+    std::array<ck::index_t, 6> in_strides{
+        C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
+    std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
+    std::array<ck::index_t, 6> wei_strides{
+        K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
+    std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
+    std::array<ck::index_t, 6> out_strides{
+        K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
+    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
+    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
+    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
+    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
+    using InputDtype      = ck::tuple_element_t<0, InDataType>;
+    using InputBiasDtype  = ck::tuple_element_t<1, InDataType>;
+    using WeightDtype     = ck::tuple_element_t<0, WeiDataType>;
+    using WeightBiasDtype = ck::tuple_element_t<1, WeiDataType>;
+    // Element counts, computed once in std::size_t and shared by the allocations and the
+    // performance statistics below.
+    constexpr std::size_t in_elems  = std::size_t(N) * Di * Hi * Wi * G * C;
+    constexpr std::size_t wei_elems = std::size_t(G) * K * Z * Y * X * C;
+    constexpr std::size_t out_elems = std::size_t(N) * Do * Ho * Wo * G * K;
+    SimpleDeviceMem in(sizeof(InputDtype) * in_elems);
+    SimpleDeviceMem in_bias(sizeof(InputBiasDtype) * in_elems);
+    SimpleDeviceMem wei(sizeof(WeightDtype) * wei_elems);
+    SimpleDeviceMem wei_bias(sizeof(WeightBiasDtype) * wei_elems);
+    SimpleDeviceMem out(sizeof(OutDataType) * out_elems);
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 ck::Tuple<>,
+                                                                                 OutLayout,
+                                                                                 InDataType,
+                                                                                 WeiDataType,
+                                                                                 ck::Tuple<>,
+                                                                                 OutDataType,
+                                                                                 ScaleAdd,
+                                                                                 ScaleAdd,
+                                                                                 PassThrough>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    std::array<const void*, NumAs> as = {in.GetDeviceBuffer(), in_bias.GetDeviceBuffer()};
+    std::array<const void*, NumBs> bs = {wei.GetDeviceBuffer(), wei_bias.GetDeviceBuffer()};
+    std::array<const void*, 0> ds{};
+    // Builds the argument object for one instance; shared by the profiling loop and the
+    // final untimed run so both always describe the same problem.
+    const auto make_argument = [&](const auto& op) {
+        return op->MakeArgumentPointer(as,
+                                       bs,
+                                       ds,
+                                       out.GetDeviceBuffer(),
+                                       in_lengths,
+                                       in_strides,
+                                       wei_lengths,
+                                       wei_strides,
+                                       {},
+                                       {},
+                                       out_lengths,
+                                       out_strides,
+                                       filter_strides,
+                                       filter_dilations,
+                                       input_left_pads,
+                                       input_right_pads,
+                                       ScaleAdd{scale},
+                                       ScaleAdd{scale},
+                                       PassThrough{});
+    };
+    // best_op_id is an int (with -1 meaning "none"), so iterate with a signed bound.
+    const int num_ops = static_cast<int>(op_ptrs.size());
+    for(int i = 0; i < num_ops; ++i)
+    {
+        auto& op_ptr        = op_ptrs[i];
+        auto argument_ptr   = make_argument(op_ptr);
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            // Convolution multiply-adds plus one operation per A and per B element.
+            std::size_t flop =
+                std::size_t(2) * G * N * K * C * Do * Ho * Wo * Z * Y * X + in_elems + wei_elems;
+            // InDataType / WeiDataType are tuples, so sum the sizes of their elements
+            // instead of taking sizeof of the tuple itself.
+            std::size_t num_bytes =
+                (sizeof(InputDtype) + sizeof(InputBiasDtype)) * in_elems +
+                (sizeof(WeightDtype) + sizeof(WeightBiasDtype)) * wei_elems +
+                sizeof(OutDataType) * out_elems;
+            // avg_time is used as milliseconds here (see the " ms" label below).
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cerr << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    if(best_op_id < 0)
+    {
+        std::cerr << "no suitable instance" << std::endl;
+        return EXIT_FAILURE;
+    }
+    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+    // run the best instance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = make_argument(op_ptr);
+        auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
--- a/client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_bf16.cpp
+++ b/client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/tuple.hpp"
+using InDataType  = ck::Tuple<ck::bhalf_t, ck::bhalf_t>; // element types of the two A (input) tensors
+using WeiDataType = ck::Tuple<ck::bhalf_t, ck::bhalf_t>; // element types of the two B (weight) tensors
+using OutDataType = ck::bhalf_t; // element type of the output tensor
+#include "grouped_conv_fwd_scaleadd_ab.inc" // included after the aliases above, which the .inc refers to
+int main() { return execute_conv_fwd_scaleadd_ab(); } // process exit code is the example's return value
--- a/client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp16.cpp
+++ b/client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/tuple.hpp"
+using InDataType  = ck::Tuple<ck::half_t, ck::half_t>; // element types of the two A (input) tensors
+using WeiDataType = ck::Tuple<ck::half_t, ck::half_t>; // element types of the two B (weight) tensors
+using OutDataType = ck::half_t; // element type of the output tensor
+#include "grouped_conv_fwd_scaleadd_ab.inc" // included after the aliases above, which the .inc refers to
+int main() { return execute_conv_fwd_scaleadd_ab(); } // process exit code is the example's return value
--- a/client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp32.cpp
+++ b/client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/tuple.hpp"
+using InDataType  = ck::Tuple<float, float>; // element types of the two A (input) tensors
+using WeiDataType = ck::Tuple<float, float>; // element types of the two B (weight) tensors
+using OutDataType = float; // element type of the output tensor
+#include "grouped_conv_fwd_scaleadd_ab.inc" // included after the aliases above, which the .inc refers to
+int main() { return execute_conv_fwd_scaleadd_ab(); } // process exit code is the example's return value
--- a/client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_int8.cpp
+++ b/client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/tuple.hpp"
+using InDataType  = ck::Tuple<int8_t, int8_t>; // element types of the two A (input) tensors
+using WeiDataType = ck::Tuple<int8_t, int8_t>; // element types of the two B (weight) tensors
+using OutDataType = int8_t; // element type of the output tensor
+#include "grouped_conv_fwd_scaleadd_ab.inc" // included after the aliases above, which the .inc refers to
+int main() { return execute_conv_fwd_scaleadd_ab(); } // process exit code is the example's return value
--- a/example/62_conv_fwd_activ/CMakeLists.txt
+++ b/example/62_conv_fwd_activ/CMakeLists.txt
@@ -30,6 +30,15 @@ foreach(gpu IN LISTS GPU_TARGETS)
      # Elu
      add_example_executable(example_convnd_fwd_xdl_elu_fp16 convnd_fwd_xdl_elu_fp16.cpp)
      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_elu_fp16)
+      # ScaleAdd on A and B
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp16 multi_AB/conv_fwd_xdl_scaleadd_ab_fp16.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_fp16)
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp32 multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_fp32)
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_bf16 multi_AB/conv_fwd_xdl_scaleadd_ab_bf16.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_bf16)
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_int8 multi_AB/conv_fwd_xdl_scaleadd_ab_int8.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_int8)
      # ScaleAdd ScaleAdd Relu
      add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp)
      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16)

--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp
@@ -226,13 +226,16 @@ bool run_grouped_conv_fwd(bool do_verification,
    if(do_verification)
    {
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+        auto ref_conv =
+            ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                         InDataType,
                                                         WeiDataType,
                                                         OutDataType,
                                                         InElementOp,
                                                         WeiElementOp,
                                                         OutElementOp,
+                                                         0, /*Num A Elementwise Tensors*/
+                                                         0, /*Num B Elementwise Tensors*/
                                                         NumDs>();
        auto ref_invoker  = ref_conv.MakeInvoker();
@@ -246,6 +249,8 @@ bool run_grouped_conv_fwd(bool do_verification,
                                                  in_element_op,
                                                  wei_element_op,
                                                  out_element_op,
+                                                  {},
+                                                  {},
                                                  d_tensors);
        ref_invoker.Run(ref_argument);

--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_bf16.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_multi_ab_common.hpp"
+using DataType    = ck::bhalf_t; // element type shared by input, weight and output in this example
+using AccDataType = float; // accumulator type handed to the device instance
+using InDataType  = DataType;
+using WeiDataType = DataType;
+using OutDataType = DataType;
+using ADataTypes  = ck::Tuple<DataType, DataType>; // one entry per A tensor (two A tensors)
+using BDataTypes  = ck::Tuple<DataType, DataType>; // one entry per B tensor (two B tensors)
+using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd; // element-wise op applied on the A side
+using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd; // element-wise op applied on the B side
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                                  AccDataType,
+                                                                                  ADataTypes,
+                                                                                  BDataTypes,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp>;
+#include "../run_convnd_fwd_activ_example.inc" // NOTE(review): presumably consumes the aliases above — confirm
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); } // negated: a truthy result becomes exit code 0
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp16.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_multi_ab_common.hpp"
+using DataType    = ck::half_t; // element type shared by input, weight and output in this example
+using AccDataType = float; // accumulator type handed to the device instance
+using InDataType  = DataType;
+using WeiDataType = DataType;
+using OutDataType = DataType;
+using ADataTypes  = ck::Tuple<DataType, DataType>; // one entry per A tensor (two A tensors)
+using BDataTypes  = ck::Tuple<DataType, DataType>; // one entry per B tensor (two B tensors)
+using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd; // element-wise op applied on the A side
+using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd; // element-wise op applied on the B side
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                                  AccDataType,
+                                                                                  ADataTypes,
+                                                                                  BDataTypes,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp>;
+#include "../run_convnd_fwd_activ_example.inc" // NOTE(review): presumably consumes the aliases above — confirm
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); } // negated: a truthy result becomes exit code 0
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_multi_ab_common.hpp"
+using DataType    = float; // element type shared by input, weight and output in this example
+using AccDataType = float; // accumulator type handed to the device instance
+using InDataType  = DataType;
+using WeiDataType = DataType;
+using OutDataType = DataType;
+using ADataTypes  = ck::Tuple<DataType, DataType>; // one entry per A tensor (two A tensors)
+using BDataTypes  = ck::Tuple<DataType, DataType>; // one entry per B tensor (two B tensors)
+using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd; // element-wise op applied on the A side
+using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd; // element-wise op applied on the B side
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                                  AccDataType,
+                                                                                  ADataTypes,
+                                                                                  BDataTypes,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp>;
+#include "../run_convnd_fwd_activ_example.inc" // NOTE(review): presumably consumes the aliases above — confirm
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); } // negated: a truthy result becomes exit code 0
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_int8.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_multi_ab_common.hpp"
+using DataType    = int8_t; // element type shared by input, weight and output in this example
+using AccDataType = int32_t; // accumulator type handed to the device instance
+using InDataType  = DataType;
+using WeiDataType = DataType;
+using OutDataType = DataType;
+using ADataTypes  = ck::Tuple<DataType, DataType>; // one entry per A tensor (two A tensors)
+using BDataTypes  = ck::Tuple<DataType, DataType>; // one entry per B tensor (two B tensors)
+using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd; // element-wise op applied on the A side
+using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd; // element-wise op applied on the B side
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                                  AccDataType,
+                                                                                  ADataTypes,
+                                                                                  BDataTypes,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp>;
+#include "../run_convnd_fwd_activ_example.inc" // NOTE(review): presumably consumes the aliases above — confirm
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); } // negated: a truthy result becomes exit code 0
--- a/example/62_conv_fwd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp
+++ b/example/62_conv_fwd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+constexpr ck::index_t NDimSpatial = 3;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using InLayout  = ck::tensor_layout::convolution::GNDHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::GNDHWK;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <typename DataType, // element type forwarded in two positions below
+          typename AccDataType, // accumulator type
+          typename InDataTypes, // tuple of A (input) tensor element types
+          typename WeiDataTypes, // tuple of B (weight) tensor element types
+          typename InElementOp, // element-wise op for the A side
+          typename WeiElementOp> // element-wise op for the B side
+using DeviceGroupedConvNDMultiABFwdInstance = // fixes layouts, specializations and tuning values for the XDL CShuffle device op
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>, // NOTE(review): presumably the (empty) D tensor layouts — confirm
+        OutLayout,
+        InDataTypes,
+        WeiDataTypes,
+        AccDataType,
+        DataType, // NOTE(review): presumably the CShuffle data type — confirm
+        ck::Tuple<>, // NOTE(review): presumably the (empty) D tensor data types — confirm
+        DataType, // NOTE(review): presumably the output (E) data type — confirm
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           // NOTE(review): presumably NumGemmKPrefetchStage — confirm against the device op header
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1, // NOTE(review): presumably CShuffleMXdlPerWavePerShuffle — confirm
+        1, // NOTE(review): presumably CShuffleNXdlPerWavePerShuffle — confirm
+        S<1, 32, 1, 8>, // NOTE(review): presumably the CDE block-transfer cluster lengths — confirm
+        8>; // NOTE(review): presumably the CDE block-transfer scalar-per-vector — confirm
+namespace {
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd(bool do_verification,
+                          int init_method,
+                          bool time_kernel,
+                          const ck::utils::conv::ConvParam& conv_param,
+                          const HostTensorDescriptor& in_g_n_c_wis_desc,
+                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                          const HostTensorDescriptor& out_g_n_k_wos_desc,
+                          const InElementOp& in_element_op,
+                          const WeiElementOp& wei_element_op,
+                          const OutElementOp& out_element_op)
+{
+    constexpr ck::index_t NumAs = 2;
+    constexpr ck::index_t NumBs = 2;
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<InDataType> in_bias(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<WeiDataType> wei_bias(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        in_bias.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+        wei_bias.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+        in_bias.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
+        wei_bias.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1.0, 1.0});
+    }
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem in_bias_device_buf(sizeof(InDataType) * in_bias.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem wei_bias_device_buf(sizeof(WeiDataType) * wei_bias.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+    in_device_buf.ToDevice(in.mData.data());
+    in_bias_device_buf.ToDevice(in_bias.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    wei_bias_device_buf.ToDevice(wei_bias.mData.data());
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+    std::array<const void*, NumAs> as{in_device_buf.GetDeviceBuffer(),
+                                      in_bias_device_buf.GetDeviceBuffer()};
+    std::array<const void*, NumBs> bs{wei_device_buf.GetDeviceBuffer(),
+                                      wei_bias_device_buf.GetDeviceBuffer()};
+    std::array<const void*, 0> ds{};
+    // do Conv
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(as,
+                                      bs,
+                                      ds,
+                                      out_device_buf.GetDeviceBuffer(),
+                                      a_g_n_c_wis_lengths,
+                                      a_g_n_c_wis_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      {},
+                                      {},
+                                      e_g_n_k_wos_lengths,
+                                      e_g_n_k_wos_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      in_element_op,
+                                      wei_element_op,
+                                      out_element_op);
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    std::size_t flop = conv_param.GetFlops() +
+                       2 * conv_param.GetOutputByte<InDataType>() / sizeof(InDataType) +
+                       2 * conv_param.GetOutputByte<WeiDataType>() / sizeof(WeiDataType);
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
+                            conv_param.GetInputByte<InDataType>() +
+                            conv_param.GetWeightByte<WeiDataType>();
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+    if(do_verification)
+    {
+        const std::array<Tensor<InDataType>, NumAs - 1> elementwise_a_tensors  = {in_bias};
+        const std::array<Tensor<WeiDataType>, NumBs - 1> elementwise_b_tensors = {wei_bias};
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     OutElementOp,
+                                                                     NumAs - 1,
+                                                                     NumBs - 1>();
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  out_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op,
+                                                  elementwise_a_tensors,
+                                                  elementwise_b_tensors);
+        ref_invoker.Run(ref_argument);
+        out_device_buf.FromDevice(out_device.mData.data());
+        return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
+    }
+    return true;
+}
+} // namespace
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
@@ -6,18 +6,42 @@
 #include <array>
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
+#include "ck/utility/is_detected.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-// Convolution Forward:
+template <typename T>
-//   input : input image A[G, N, C, Hi, Wi],
+using is_tuple = decltype(std::declval<T&>().IsTuple());
-//   input : weight B[G, K, C, Y, X],
-//   input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ...
+/**
-//   output : output image E[G, N, K, Ho, Wo]
+ * \brief Grouped Convolution Forward
-//   C = a_op(A) * b_op(B)
+ *
-//   E = cde_op(C, D0, D1, ...)
+ * \details
+ * input : input image A[G, N, C, Hi, Wi], A1[G, N, C, Hi, Wi]...
+ * input : weight B[G, K, C, Y, X], B1[G, K, C, Y, X]...
+ * input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ...
+ * output : output image E[G, N, K, Ho, Wo]
+ *
+ * C = a_op(A, A1...) * b_op(B, B1...)
+ * E = cde_op(C, D0, D1, ...)
+ *
+ * \tparam NDimSpatial Number of spatial dimensions.
+ * \tparam ALayout Input layout (also for a1, a2...).
+ * \tparam BLayout Weight layout (also for b1, b2...).
+ * \tparam DsLayout Ds layouts.
+ * \tparam ELayout Output layout.
+ * \tparam ADataType Input data type. Pass a tuple if there are multiple As.
+ * \tparam BDataType Weight data type. Pass a tuple if there are multiple Bs.
+ * \tparam DsDataType D data types.
+ * \tparam EDataType Output data type.
+ * \tparam AElementwiseOperation A elementwise operation.
+ * \tparam BElementwiseOperation B elementwise operation.
+ * \tparam CDEElementwiseOperation CDE elementwise operation.
+ * \tparam ComputeType Compute data type (default: ADataType, first if tuple passed).
+ */
 template <index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
@@ -30,18 +54,60 @@ template <index_t NDimSpatial,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CDEElementwiseOperation,
-          typename ComputeType = ADataType>
+          typename ComputeType =
+              decltype(UnpackDataType<is_detected<is_tuple, ADataType>::value,
+                                      Number<0>,
+                                      ADataType>())> // ComputeType is InputType by default (first
+                                                     // in tuple for MultiAB), unpack if tuple was
+                                                     // passed
 struct DeviceGroupedConvFwdMultipleD : public BaseOperator
 {
+    static constexpr bool isMultiA = is_detected<is_tuple, ADataType>::value;
+    static constexpr bool isMultiB = is_detected<is_tuple, BDataType>::value;
+    static constexpr index_t NumATensor = GetNumABTensors<isMultiA, ADataType>();
+    static constexpr index_t NumBTensor = GetNumABTensors<isMultiB, BDataType>();
    static constexpr index_t NumDTensor = DsDataType::Size();
    static_assert(NumDTensor == DsLayout::Size(), "wrong! Inconsistent NumDTensor");
+    // If DataType is tuple, user has to pass std::array with pointers.
+    using APointers =
+        std::conditional_t<isMultiA, std::array<const void*, NumATensor>&, const void*>;
+    using BPointers =
+        std::conditional_t<isMultiB, std::array<const void*, NumBTensor>&, const void*>;
+    /**
+     * \brief Make argument pointer for grouped conv fwd.
+     *
+     * \param p_a A pointer to the input (std::array<const void*, NumA> with
+                  pointers for multiple A).
+     * \param p_b A pointer to the weight (std::array<const void*, NumB> with
+                  pointers for multiple B).
+     * \param p_ds Pointers to the Ds.
+     * \param p_e A pointer to the output.
+     * \param a_g_n_c_wis_lengths Input lengths [G, N, C, Spatial...] (for 3d).
+     * \param a_g_n_c_wis_strides Input strides [G, N, C, Spatial...] (for 3d).
+     * \param b_g_k_c_xs_lengths Weight lengths [G, K, C, Spatial...] (for 3d).
+     * \param b_g_k_c_xs_strides Weight strides [G, K, C, Spatial...] (for 3d).
+     * \param ds_g_n_k_wos_lengths Ds lengths [G, N, K, Spatial...] (for 3d).
+     * \param ds_g_n_k_wos_strides Ds strides [G, N, K, Spatial...] (for 3d).
+     * \param e_g_n_k_wos_lengths Output lengths [G, N, K, Spatial...] (for 3d).
+     * \param e_g_n_k_wos_strides Output strides [G, N, K, Spatial...] (for 3d).
+     * \param conv_filter_strides Convolution filter strides.
+     * \param conv_filter_dilations Convolution filter dilations.
+     * \param input_left_pads Input left paddings.
+     * \param input_right_pads Input right paddings.
+     * \param a_element_op A elementwise operation object.
+     * \param b_element_op B elementwise operation object.
+     * \param cde_element_op CDE elementwise operation object.
+     * \return Pointer to the argument.
+     */
    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
-        const void* p_a, // input image
+        APointers p_a,
-        const void* p_b, // weight
+        BPointers p_b,
        const std::array<const void*, NumDTensor>& p_ds,
-        void* p_e, // output image
+        void* p_e,
        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,

--- a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
@@ -263,8 +263,7 @@ struct DeviceColumnToImageImpl
        decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
            InputGridDesc{}))>;
-    using GridwiseTensorRearrangeKernel =
+    using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc,
-        GridwiseTensorRearrange<InputGridDesc,
                                                                  InputDataType,
                                                                  OutputGridDesc,
                                                                  OutputDataType,
@@ -275,7 +274,7 @@ struct DeviceColumnToImageImpl
                                                                  ScalarPerVector,
                                                                  InMemoryDataOperationEnum::Add,
                                                                  Block2ETileMap,
-                                ComputePtrOffsetOfStridedBatch<I0>>;
+                                                                  ComputePtrOffsetOfStridedBatch<>>;
    struct Argument : public BaseArgument
    {
@@ -453,7 +452,7 @@ struct DeviceColumnToImageImpl
        std::vector<const InputDataType*> p_in_container_;
        std::vector<OutputDataType*> p_out_container_;
-        ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
+        ComputePtrOffsetOfStridedBatch<> compute_ptr_offset_of_batch_;
    };
    struct Invoker : public BaseInvoker
@@ -471,7 +470,7 @@ struct DeviceColumnToImageImpl
                                                        OutputGridDesc,
                                                        OutputDataType,
                                                        Block2ETileMap,
-                                                        ComputePtrOffsetOfStridedBatch<I0>,
+                                                        ComputePtrOffsetOfStridedBatch<>,
                                                        GridwiseTensorRearrangeKernel>;
            // Execute each set of independent filters

--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
@@ -385,9 +385,11 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
    // desc for blockwise copy
    using AsGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeAsGridDescriptor_AK0_M_AK1(AsGridDesc_M_K{}))>;
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAsGridDescriptor_AK0_M_AK1(
+            AsGridDesc_M_K{}))>;
    using BsGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeBsGridDescriptor_BK0_N_BK1(BsGridDesc_N_K{}))>;
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBsGridDescriptor_BK0_N_BK1(
+            BsGridDesc_N_K{}))>;
    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
            DsGridDesc_M_N{}))>;
@@ -397,7 +399,7 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
    // block-to-e-tile map
    using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemm::MakeBlock2ETileMap(EGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
    // Argument
    struct Argument : public BaseArgument
@@ -429,7 +431,7 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
              bs_grid_desc_bk0_n_bk1_{},
              ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeBlock2ETileMap(e_grid_desc_m_n_)},
+              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
              a_element_op_{a_element_op},
              b_element_op_{b_element_op},
              cde_element_op_{cde_element_op}
@@ -481,10 +483,10 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
                                           block_2_etile_map_))
            {
                as_grid_desc_ak0_m_ak1_ =
-                    GridwiseGemm::MakeAsGridDescriptor_AK0_M_AK1(as_grid_desc_m_k_);
+                    GridwiseGemm::MakeDefaultAsGridDescriptor_AK0_M_AK1(as_grid_desc_m_k_);
                bs_grid_desc_bk0_n_bk1_ =
-                    GridwiseGemm::MakeBsGridDescriptor_BK0_N_BK1(bs_grid_desc_n_k_);
+                    GridwiseGemm::MakeDefaultBsGridDescriptor_BK0_N_BK1(bs_grid_desc_n_k_);
                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp
@@ -305,9 +305,11 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
    // desc for blockwise copy
    using AsGridDesc_AK0_M_AK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeAsGridDescriptor_AK0_M_AK1(AsGridDesc_M_K{}))>;
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAsGridDescriptor_AK0_M_AK1(
+            AsGridDesc_M_K{}))>;
    using BsGridDesc_BK0_N_BK1 =
-        remove_cvref_t<decltype(GridwiseGemm::MakeBsGridDescriptor_BK0_N_BK1(BsGridDesc_N_K{}))>;
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBsGridDescriptor_BK0_N_BK1(
+            BsGridDesc_N_K{}))>;
    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
            DsGridDesc_M_N{}))>;
@@ -317,7 +319,7 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
    // block-to-e-tile map
    using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemm::MakeBlock2ETileMap(EGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
    // Argument
    struct Argument : public BaseArgument
@@ -349,7 +351,7 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
              bs_grid_desc_bk0_n_bk1_{},
              ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeBlock2ETileMap(e_grid_desc_m_n_)},
+              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
              a_element_op_{a_element_op},
              b_element_op_{b_element_op},
              cde_element_op_{cde_element_op},
@@ -407,10 +409,10 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
                                           block_2_etile_map_))
            {
                as_grid_desc_ak0_m_ak1_ =
-                    GridwiseGemm::MakeAsGridDescriptor_AK0_M_AK1(as_grid_desc_m_k_);
+                    GridwiseGemm::MakeDefaultAsGridDescriptor_AK0_M_AK1(as_grid_desc_m_k_);
                bs_grid_desc_bk0_n_bk1_ =
-                    GridwiseGemm::MakeBsGridDescriptor_BK0_N_BK1(bs_grid_desc_n_k_);
+                    GridwiseGemm::MakeDefaultBsGridDescriptor_BK0_N_BK1(bs_grid_desc_n_k_);
                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
@@ -517,7 +517,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
        std::vector<typename GridwiseGemm::DefaultBlock2CTileMap> block_2_ctile_map_container_;
        // for computing batch offset
-        ComputePtrOffsetOfStridedBatch<NumDTensor> compute_ptr_offset_of_batch_;
+        ComputePtrOffsetOfStridedBatch<I1, I1, NumDTensor> compute_ptr_offset_of_batch_;
        // element-wise op
        AElementwiseOp a_element_op_;
@@ -579,7 +579,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                        typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
                        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
                        remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                        ComputePtrOffsetOfStridedBatch<NumDTensor>,
+                        ComputePtrOffsetOfStridedBatch<I1, I1, NumDTensor>,
                        has_main_loop>;
                    return launch_and_time_kernel(

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -677,7 +677,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
        std::vector<Block2ETileMap> block_2_etile_map_container_;
        // for computing batch offset
-        ComputePtrOffsetOfStridedBatch<NumDTensor> compute_ptr_offset_of_batch_;
+        ComputePtrOffsetOfStridedBatch<I1, I1, NumDTensor> compute_ptr_offset_of_batch_;
        // element-wise op
        AElementwiseOp a_element_op_;
@@ -746,7 +746,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                        DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
                        DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
                        Block2ETileMap,
-                        ComputePtrOffsetOfStridedBatch<NumDTensor>,
+                        ComputePtrOffsetOfStridedBatch<I1, I1, NumDTensor>,
                        has_main_loop>;
                    return launch_and_time_kernel(