Merge branch 'develop' into add_contraction_example_fp64

19f1c759 · zjing14 · GitHub · 473617e9 · e9fd1228 · 19f1c759
Unverified Commit 19f1c759 authored Feb 15, 2023 by zjing14 Committed by GitHub Feb 15, 2023
20 changed files
--- a/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
+++ b/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
-add_executable(client_grouped_conv2d_bwd_weight grouped_conv2d_bwd_weight.cpp)
-target_link_libraries(client_grouped_conv2d_bwd_weight PRIVATE composable_kernel::device_operations)
+add_executable(client_grouped_conv2d_bwd_weight_fp16 grouped_conv2d_bwd_weight_fp16.cpp)
+add_executable(client_grouped_conv3d_bwd_weight_fp16 grouped_conv3d_bwd_weight_fp16.cpp)
+add_executable(client_grouped_conv3d_bwd_weight_fp32 grouped_conv3d_bwd_weight_fp32.cpp)
+
+target_link_libraries(client_grouped_conv2d_bwd_weight_fp16 PRIVATE composable_kernel::device_operations)
+target_link_libraries(client_grouped_conv3d_bwd_weight_fp16 PRIVATE composable_kernel::device_operations)
+target_link_libraries(client_grouped_conv3d_bwd_weight_fp32 PRIVATE composable_kernel::device_operations)
--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight.cpp
@@ -13,27 +13,8 @@
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

-using InDataType  = ck::half_t;
-using WeiDataType = ck::half_t;
-using OutDataType = ck::half_t;
-
-using InLayout    = ck::tensor_layout::convolution::GNHWC;
-using WeiLayout   = ck::tensor_layout::convolution::GKYXC;
-using OutLayout   = ck::tensor_layout::convolution::GNHWK;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

-static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 32;
-static constexpr ck::index_t N             = 256;
-static constexpr ck::index_t K             = 192;
-static constexpr ck::index_t C             = 192;
-static constexpr ck::index_t Y             = 3;
-static constexpr ck::index_t X             = 3;
-static constexpr ck::index_t Hi            = 28;
-static constexpr ck::index_t Wi            = 28;
-static constexpr ck::index_t Ho            = 28;
-static constexpr ck::index_t Wo            = 28;
-
 struct SimpleDeviceMem
 {
    SimpleDeviceMem() = delete;
@@ -50,22 +31,93 @@ struct SimpleDeviceMem
    void* p_mem_;
 };

-int main()
+template <ck::index_t NumDimSpatial> // NumDimSpatial: entry count of both spatial-length arrays
+std::size_t GetFlops(ck::index_t G,
+                     ck::index_t N,
+                     ck::index_t K,
+                     ck::index_t C,
+                     const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths,
+                     const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths)
 {
-    std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Hi, Wi};
-    std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Y, X};
-    std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Ho, Wo};
+    // Returns 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>.
+    return static_cast<std::size_t>(2) * G * N * K * C * // std::size_t operand first: the chain is evaluated in std::size_t
+           std::accumulate(std::begin(output_spatial_lengths),
+                           std::end(output_spatial_lengths),
+                           static_cast<std::size_t>(1), // accumulator type is std::size_t
+                           std::multiplies<>()) *
+           std::accumulate(std::begin(filter_spatial_lengths),
+                           std::end(filter_spatial_lengths),
+                           static_cast<std::size_t>(1), // accumulator type is std::size_t
+                           std::multiplies<>());
+}

-    std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1};
-    std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1};
-    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
-    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
+template <typename InDataType, ck::index_t NumDimSpatial>
+std::size_t GetInputByte(ck::index_t G,
+                         ck::index_t N,
+                         ck::index_t C,
+                         const std::array<ck::index_t, NumDimSpatial>& input_spatial_lengths)
+{
+    // Returns sizeof(InDataType) * G * N * C * <input spatial lengths product>.
+    // sizeof() leads the chain so G * N * C is never multiplied in ck::index_t, where it could overflow.
+    return sizeof(InDataType) * G * N * C *
+           std::accumulate(std::begin(input_spatial_lengths), std::end(input_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<>());
+}

-    ck::index_t split_k = 2;
+template <typename WeiDataType, ck::index_t NumDimSpatial>
+std::size_t GetWeightByte(ck::index_t G,
+                          ck::index_t K,
+                          ck::index_t C,
+                          const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths)
+{
+    // Returns sizeof(WeiDataType) * G * K * C * <filter spatial lengths product>.
+    // sizeof() leads the chain so G * K * C is never multiplied in ck::index_t, where it could overflow.
+    return sizeof(WeiDataType) * G * K * C *
+           std::accumulate(std::begin(filter_spatial_lengths), std::end(filter_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<>());
+}

-    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
+template <typename OutDataType, ck::index_t NumDimSpatial>
+std::size_t GetOutputByte(ck::index_t G,
+                          ck::index_t N,
+                          ck::index_t K,
+                          const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths)
+{
+    // Returns sizeof(OutDataType) * G * N * K * <output spatial lengths product>.
+    // sizeof() leads the chain so G * N * K is never multiplied in ck::index_t, where it could overflow.
+    return sizeof(OutDataType) * G * N * K *
+           std::accumulate(std::begin(output_spatial_lengths), std::end(output_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<std::size_t>());
+}
+
+template <ck::index_t NumDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+bool run_grouped_conv_bwd_weight(
+    ck::index_t G,
+    ck::index_t N,
+    ck::index_t K,
+    ck::index_t C,
+    const std::array<ck::index_t, NumDimSpatial>& input_spatial_lengths,
+    const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths,
+    const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths,
+    const std::array<ck::index_t, NumDimSpatial>& conv_filter_strides,
+    const std::array<ck::index_t, NumDimSpatial>& conv_filter_dilations,
+    const std::array<ck::index_t, NumDimSpatial>& input_left_pads,
+    const std::array<ck::index_t, NumDimSpatial>& input_right_pads)
+{
+
+    ck::index_t split_k = 2;
+    SimpleDeviceMem in(GetInputByte<InDataType, NumDimSpatial>(G, N, C, input_spatial_lengths));
+    SimpleDeviceMem wei(GetWeightByte<WeiDataType, NumDimSpatial>(G, K, C, filter_spatial_lengths));
+    SimpleDeviceMem out(GetOutputByte<OutDataType, NumDimSpatial>(G, N, K, output_spatial_lengths));

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NumDimSpatial,
                                                                              InLayout,
@@ -120,10 +172,12 @@ int main()
        {
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

-            std::size_t flop      = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
-            std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
-                                    sizeof(WeiDataType) * G * K * Y * X * C +
-                                    sizeof(OutDataType) * G * N * Ho * Wo * K;
+            std::size_t flop =
+                GetFlops<NumDimSpatial>(G, N, K, C, output_spatial_lengths, filter_spatial_lengths);
+            std::size_t num_bytes =
+                GetInputByte<InDataType, NumDimSpatial>(G, N, C, input_spatial_lengths) +
+                GetWeightByte<WeiDataType, NumDimSpatial>(G, K, C, filter_spatial_lengths) +
+                GetOutputByte<OutDataType, NumDimSpatial>(G, N, K, output_spatial_lengths);

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;
@@ -149,7 +203,7 @@ int main()
    if(best_op_id < 0)
    {
        std::cerr << "no suitable instance" << std::endl;
-        return EXIT_FAILURE;
+        return false;
    }

    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
@@ -187,4 +241,6 @@ int main()

        std::cout << "Done" << std::endl;
    }
+
+    return true;
 }
--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = ck::half_t; // InDataType template argument of the run call in main()
+using WeiDataType = ck::half_t; // WeiDataType template argument of the run call in main()
+using OutDataType = ck::half_t; // OutDataType template argument of the run call in main()
+
+using InLayout  = ck::tensor_layout::convolution::GNHWC; // InLayout template argument in main()
+using WeiLayout = ck::tensor_layout::convolution::GKYXC; // WeiLayout template argument in main()
+using OutLayout = ck::tensor_layout::convolution::GNHWK; // OutLayout template argument in main()
+
+static constexpr ck::index_t NumDimSpatial = 2; // entry count of every braced list passed in main()
+static constexpr ck::index_t G             = 32; // forwarded as argument G
+static constexpr ck::index_t N             = 256; // forwarded as argument N
+static constexpr ck::index_t K             = 192; // forwarded as argument K
+static constexpr ck::index_t C             = 192; // forwarded as argument C
+static constexpr ck::index_t Y             = 3; // {Y, X} is passed as the filter spatial lengths
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Hi            = 28; // {Hi, Wi} is passed as the input spatial lengths
+static constexpr ck::index_t Wi            = 28;
+static constexpr ck::index_t Ho            = 28; // {Ho, Wo} is passed as the output spatial lengths
+static constexpr ck::index_t Wo            = 28;
+
+// Runs the grouped 2D backward-weight example with the file-level problem constants and maps
+// the boolean result of run_grouped_conv_bwd_weight to the process exit status.
+int main()
+{
+    const bool passed = run_grouped_conv_bwd_weight<NumDimSpatial,
+                                                    InDataType,
+                                                    WeiDataType,
+                                                    OutDataType,
+                                                    InLayout,
+                                                    WeiLayout,
+                                                    OutLayout>(G,
+                                                               N,
+                                                               K,
+                                                               C,
+                                                               {Hi, Wi}, // input spatial lengths
+                                                               {Y, X},   // filter spatial lengths
+                                                               {Ho, Wo}, // output spatial lengths
+                                                               {1, 1},   // filter strides
+                                                               {1, 1},   // filter dilations
+                                                               {1, 1},   // input left pads
+                                                               {1, 1});  // input right pads
+
+    if(!passed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = ck::half_t; // InDataType template argument of the run call in main()
+using WeiDataType = ck::half_t; // WeiDataType template argument of the run call in main()
+using OutDataType = ck::half_t; // OutDataType template argument of the run call in main()
+
+using InLayout  = ck::tensor_layout::convolution::GNDHWC; // InLayout template argument in main()
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC; // WeiLayout template argument in main()
+using OutLayout = ck::tensor_layout::convolution::GNDHWK; // OutLayout template argument in main()
+
+static constexpr ck::index_t NumDimSpatial = 3; // entry count of every braced list passed in main()
+static constexpr ck::index_t G             = 8; // forwarded as argument G
+static constexpr ck::index_t N             = 64; // forwarded as argument N
+static constexpr ck::index_t K             = 128; // forwarded as argument K
+static constexpr ck::index_t C             = 128; // forwarded as argument C
+static constexpr ck::index_t Z             = 3; // {Z, Y, X} is passed as the filter spatial lengths
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28; // {Di, Hi, Wi} is passed as the input spatial lengths
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 3;
+static constexpr ck::index_t Do            = 28; // {Do, Ho, Wo} is passed as the output spatial lengths
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 3;
+
+// Runs the grouped 3D backward-weight example (half-precision aliases declared in this file)
+// and maps the boolean result of run_grouped_conv_bwd_weight to the process exit status.
+int main()
+{
+    const bool passed =
+        run_grouped_conv_bwd_weight<NumDimSpatial,
+                                    InDataType,
+                                    WeiDataType,
+                                    OutDataType,
+                                    InLayout,
+                                    WeiLayout,
+                                    OutLayout>(G,
+                                               N,
+                                               K,
+                                               C,
+                                               {Di, Hi, Wi}, // input spatial lengths
+                                               {Z, Y, X},    // filter spatial lengths
+                                               {Do, Ho, Wo}, // output spatial lengths
+                                               {1, 1, 1},    // filter strides
+                                               {1, 1, 1},    // filter dilations
+                                               {1, 1, 1},    // input left pads
+                                               {1, 1, 1});   // input right pads
+
+    if(!passed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = float; // InDataType template argument of the run call in main()
+using WeiDataType = float; // WeiDataType template argument of the run call in main()
+using OutDataType = float; // OutDataType template argument of the run call in main()
+
+using InLayout  = ck::tensor_layout::convolution::GNDHWC; // InLayout template argument in main()
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC; // WeiLayout template argument in main()
+using OutLayout = ck::tensor_layout::convolution::GNDHWK; // OutLayout template argument in main()
+
+static constexpr ck::index_t NumDimSpatial = 3; // entry count of every braced list passed in main()
+static constexpr ck::index_t G             = 8; // forwarded as argument G
+static constexpr ck::index_t N             = 64; // forwarded as argument N
+static constexpr ck::index_t K             = 128; // forwarded as argument K
+static constexpr ck::index_t C             = 128; // forwarded as argument C
+static constexpr ck::index_t Z             = 3; // {Z, Y, X} is passed as the filter spatial lengths
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28; // {Di, Hi, Wi} is passed as the input spatial lengths
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 3;
+static constexpr ck::index_t Do            = 28; // {Do, Ho, Wo} is passed as the output spatial lengths
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 3;
+
+// Runs the grouped 3D backward-weight example (float aliases declared in this file) and maps
+// the boolean result of run_grouped_conv_bwd_weight to the process exit status.
+int main()
+{
+    const bool passed =
+        run_grouped_conv_bwd_weight<NumDimSpatial,
+                                    InDataType,
+                                    WeiDataType,
+                                    OutDataType,
+                                    InLayout,
+                                    WeiLayout,
+                                    OutLayout>(G,
+                                               N,
+                                               K,
+                                               C,
+                                               {Di, Hi, Wi}, // input spatial lengths
+                                               {Z, Y, X},    // filter spatial lengths
+                                               {Do, Ho, Wo}, // output spatial lengths
+                                               {1, 1, 1},    // filter strides
+                                               {1, 1, 1},    // filter dilations
+                                               {1, 1, 1},    // input left pads
+                                               {1, 1, 1});   // input right pads
+
+    if(!passed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
--- a/client_example/15_convnd_bwd_data/CMakeLists.txt
+++ b/client_example/15_convnd_bwd_data/CMakeLists.txt
+add_executable(client_conv3d_bwd_data_fp16 conv3d_bwd_data_fp16.cpp)
+add_executable(client_conv3d_bwd_data_fp32 conv3d_bwd_data_fp32.cpp)
+
+target_link_libraries(client_conv3d_bwd_data_fp16 PRIVATE composable_kernel::device_operations)
+target_link_libraries(client_conv3d_bwd_data_fp32 PRIVATE composable_kernel::device_operations)
--- a/client_example/15_convnd_bwd_data/common.hpp
+++ b/client_example/15_convnd_bwd_data/common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// Holds one buffer requested with hipMalloc in the constructor and handed to hipFree in the
+// destructor. The return codes of both HIP calls are discarded, as in the original example code;
+// p_mem_ is value-initialized to nullptr before hipMalloc is called.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Requests mem_size bytes from hipMalloc and stores the resulting pointer in p_mem_.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    // A copy would make two objects pass the same pointer to hipFree, so copying is disabled.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Returns the stored pointer; the object keeps responsibility for freeing it.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Returns 2 * N * K * C * <output spatial lengths product> * <filter spatial lengths product>,
+// the operation count used for the TFlops figure printed by run_conv_bwd_data.
+//
+// Marked inline: this is a non-template function defined in a header, and without inline a
+// second translation unit including common.hpp would produce a duplicate definition.
+inline std::size_t GetFlops(ck::index_t N,
+                            ck::index_t K,
+                            ck::index_t C,
+                            const std::vector<ck::index_t>& output_spatial_lengths,
+                            const std::vector<ck::index_t>& weights_spatial_lengths)
+{
+    // The std::size_t factor comes first so the whole chain is multiplied in std::size_t.
+    return static_cast<std::size_t>(2) * N * K * C *
+           std::accumulate(std::begin(output_spatial_lengths),
+                           std::end(output_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<>()) *
+           std::accumulate(std::begin(weights_spatial_lengths),
+                           std::end(weights_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<>());
+}
+
+// Byte size of the input tensor: sizeof(InDataType) * N * C * <input spatial lengths product>.
+template <typename InDataType>
+std::size_t
+GetInputByte(ck::index_t N, ck::index_t C, const std::vector<ck::index_t>& input_spatial_lengths)
+{
+    // Fold the spatial lengths into a std::size_t product.
+    std::size_t spatial_product = 1;
+    for(const ck::index_t length : input_spatial_lengths)
+    {
+        spatial_product *= length;
+    }
+
+    return sizeof(InDataType) * N * C * spatial_product;
+}
+
+// Byte size of the weight tensor: sizeof(WeiDataType) * K * C * <filter spatial lengths product>.
+template <typename WeiDataType>
+std::size_t
+GetWeightByte(ck::index_t K, ck::index_t C, const std::vector<ck::index_t>& weights_spatial_lengths)
+{
+    // Fold the spatial lengths into a std::size_t product.
+    std::size_t spatial_product = 1;
+    for(const ck::index_t length : weights_spatial_lengths)
+    {
+        spatial_product *= length;
+    }
+
+    return sizeof(WeiDataType) * K * C * spatial_product;
+}
+
+// Byte size of the output tensor: sizeof(OutDataType) * N * K * <output spatial lengths product>.
+template <typename OutDataType>
+std::size_t
+GetOutputByte(ck::index_t N, ck::index_t K, const std::vector<ck::index_t>& output_spatial_lengths)
+{
+    // Fold the spatial lengths into a std::size_t product.
+    std::size_t spatial_product = 1;
+    for(const ck::index_t length : output_spatial_lengths)
+    {
+        spatial_product *= static_cast<std::size_t>(length);
+    }
+
+    return sizeof(OutDataType) * N * K * spatial_product;
+}
+
+// Times every DeviceConvBwdData instance returned by DeviceOperationInstanceFactory for the
+// given problem, prints one "Perf:" line per supported instance, then runs the instance with
+// the highest TFlops figure once more without timing.
+//
+// N, K, C and the three spatial-length vectors are forwarded unchanged to MakeArgumentPointer.
+// Filter strides, filter dilations and both input paddings are fixed to 1 for each of the
+// NumDimSpatial dimensions.
+//
+// Returns false when no instance accepts the argument, true otherwise.
+template <ck::index_t NumDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+bool run_conv_bwd_data(ck::index_t N,
+                       ck::index_t K,
+                       ck::index_t C,
+                       const std::vector<ck::index_t>& in_spatial_lengths,
+                       const std::vector<ck::index_t>& wei_spatial_lengths,
+                       const std::vector<ck::index_t>& out_spatial_lengths)
+{
+    // Buffer sizes are computed once and reused for the bandwidth figure below.
+    std::size_t in_mem_size  = GetInputByte<InDataType>(N, C, in_spatial_lengths);
+    std::size_t wei_mem_size = GetWeightByte<WeiDataType>(K, C, wei_spatial_lengths);
+    std::size_t out_mem_size = GetOutputByte<OutDataType>(N, K, out_spatial_lengths);
+
+    SimpleDeviceMem in(in_mem_size);
+    SimpleDeviceMem wei(wei_mem_size);
+    SimpleDeviceMem out(out_mem_size);
+
+    std::vector<ck::index_t> filter_strides(NumDimSpatial, 1);
+    std::vector<ck::index_t> filter_dilations(NumDimSpatial, 1);
+    std::vector<ck::index_t> input_left_pads(NumDimSpatial, 1);
+    std::vector<ck::index_t> input_right_pads(NumDimSpatial, 1);
+
+    using DeviceOp = ck::tensor_operation::device::DeviceConvBwdData<NumDimSpatial,
+                                                                     InLayout,
+                                                                     WeiLayout,
+                                                                     OutLayout,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     PassThrough>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    int best_op_id        = -1; // stays -1 until some instance accepts the argument
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+
+    std::size_t flop      = GetFlops(N, K, C, out_spatial_lengths, wei_spatial_lengths);
+    std::size_t num_bytes = in_mem_size + wei_mem_size + out_mem_size;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    // The index stays an int because it is stored in best_op_id; the size is converted once so
+    // the loop condition does not compare a signed value with an unsigned one.
+    const int num_instances = static_cast<int>(op_ptrs.size());
+
+    for(int i = 0; i < num_instances; ++i)
+    {
+        auto& op_ptr        = op_ptrs[i];
+        auto argument_ptr   = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        out.GetDeviceBuffer(),
+                                                        N,
+                                                        K,
+                                                        C,
+                                                        in_spatial_lengths,
+                                                        wei_spatial_lengths,
+                                                        out_spatial_lengths,
+                                                        filter_strides,
+                                                        filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        PassThrough{});
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            // avg_time is reported as milliseconds in the output below.
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cerr << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    if(best_op_id < 0)
+    {
+        std::cerr << "no suitable instance" << std::endl;
+        return false;
+    }
+
+    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best instance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        out.GetDeviceBuffer(),
+                                                        N,
+                                                        K,
+                                                        C,
+                                                        in_spatial_lengths,
+                                                        wei_spatial_lengths,
+                                                        out_spatial_lengths,
+                                                        filter_strides,
+                                                        filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        PassThrough{});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+    return true;
+}
--- a/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp
+++ b/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = ck::half_t; // InDataType template argument of run_conv_bwd_data in main()
+using WeiDataType = ck::half_t; // WeiDataType template argument of run_conv_bwd_data in main()
+using OutDataType = ck::half_t; // OutDataType template argument of run_conv_bwd_data in main()
+
+using InLayout  = ck::tensor_layout::convolution::NDHWC; // InLayout template argument in main()
+using WeiLayout = ck::tensor_layout::convolution::KZYXC; // WeiLayout template argument in main()
+using OutLayout = ck::tensor_layout::convolution::NDHWK; // OutLayout template argument in main()
+
+static constexpr ck::index_t NumDimSpatial = 3; // entry count of every braced list passed in main()
+static constexpr ck::index_t N             = 64; // forwarded as argument N
+static constexpr ck::index_t K             = 128; // forwarded as argument K
+static constexpr ck::index_t C             = 64; // forwarded as argument C
+static constexpr ck::index_t Z             = 3; // {Z, Y, X} is passed as wei_spatial_lengths
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28; // {Di, Hi, Wi} is passed as in_spatial_lengths
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 28;
+static constexpr ck::index_t Do            = 28; // {Do, Ho, Wo} is passed as out_spatial_lengths
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 28;
+
+// Runs the 3D backward-data example (half-precision aliases declared in this file) and maps
+// the boolean result of run_conv_bwd_data to the process exit status.
+int main()
+{
+    const bool passed = run_conv_bwd_data<NumDimSpatial,
+                                          InDataType,
+                                          WeiDataType,
+                                          OutDataType,
+                                          InLayout,
+                                          WeiLayout,
+                                          OutLayout>(N,
+                                                     K,
+                                                     C,
+                                                     {Di, Hi, Wi},  // in_spatial_lengths
+                                                     {Z, Y, X},     // wei_spatial_lengths
+                                                     {Do, Ho, Wo}); // out_spatial_lengths
+
+    if(!passed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
--- a/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp32.cpp
+++ b/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = float; // InDataType template argument of run_conv_bwd_data in main()
+using WeiDataType = float; // WeiDataType template argument of run_conv_bwd_data in main()
+using OutDataType = float; // OutDataType template argument of run_conv_bwd_data in main()
+
+using InLayout  = ck::tensor_layout::convolution::NDHWC; // InLayout template argument in main()
+using WeiLayout = ck::tensor_layout::convolution::KZYXC; // WeiLayout template argument in main()
+using OutLayout = ck::tensor_layout::convolution::NDHWK; // OutLayout template argument in main()
+
+static constexpr ck::index_t NumDimSpatial = 3; // entry count of every braced list passed in main()
+static constexpr ck::index_t N             = 64; // forwarded as argument N
+static constexpr ck::index_t K             = 128; // forwarded as argument K
+static constexpr ck::index_t C             = 64; // forwarded as argument C
+static constexpr ck::index_t Z             = 3; // {Z, Y, X} is passed as wei_spatial_lengths
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28; // {Di, Hi, Wi} is passed as in_spatial_lengths
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 28;
+static constexpr ck::index_t Do            = 28; // {Do, Ho, Wo} is passed as out_spatial_lengths
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 28;
+
+// Runs the 3D backward-data example (float aliases declared in this file) and maps the
+// boolean result of run_conv_bwd_data to the process exit status.
+int main()
+{
+    const bool passed = run_conv_bwd_data<NumDimSpatial,
+                                          InDataType,
+                                          WeiDataType,
+                                          OutDataType,
+                                          InLayout,
+                                          WeiLayout,
+                                          OutLayout>(N,
+                                                     K,
+                                                     C,
+                                                     {Di, Hi, Wi},  // in_spatial_lengths
+                                                     {Z, Y, X},     // wei_spatial_lengths
+                                                     {Do, Ho, Wo}); // out_spatial_lengths
+
+    if(!passed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
--- a/client_example/16_convnd_fwd/CMakeLists.txt
+++ b/client_example/16_convnd_fwd/CMakeLists.txt
+add_executable(client_conv3d_fwd_fp16 conv3d_fwd_fp16.cpp)
+add_executable(client_conv3d_fwd_fp32 conv3d_fwd_fp32.cpp)
+
+target_link_libraries(client_conv3d_fwd_fp16 PRIVATE composable_kernel::device_operations)
+target_link_libraries(client_conv3d_fwd_fp32 PRIVATE composable_kernel::device_operations)
--- a/client_example/16_convnd_fwd/common.hpp
+++ b/client_example/16_convnd_fwd/common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// Holds one buffer requested with hipMalloc in the constructor and handed to hipFree in the
+// destructor. The return codes of both HIP calls are discarded, as in the original example code;
+// p_mem_ is value-initialized to nullptr before hipMalloc is called.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Requests mem_size bytes from hipMalloc and stores the resulting pointer in p_mem_.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    // A copy would make two objects pass the same pointer to hipFree, so copying is disabled.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Returns the stored pointer; the object keeps responsibility for freeing it.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Returns 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>.
+// G, K and C are read from weights_lengths[0..2] and N from output_lengths[1]; the spatial
+// entries are those from index NumNonSpatialDim to the end of each array.
+template <ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
+std::size_t
+GetFlops(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths,
+         const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths)
+{
+    // Start from the std::size_t factor 2 so every later multiplication is done in std::size_t.
+    std::size_t flop_count = 2;
+    flop_count *= weights_lengths[0]; // G
+    flop_count *= output_lengths[1];  // N
+    flop_count *= weights_lengths[1]; // K
+    flop_count *= weights_lengths[2]; // C
+
+    // Spatial part of the output lengths.
+    for(auto it = std::next(std::begin(output_lengths), NumNonSpatialDim);
+        it != std::end(output_lengths);
+        ++it)
+    {
+        flop_count *= *it;
+    }
+
+    // Spatial part of the weight lengths.
+    for(auto it = std::next(std::begin(weights_lengths), NumNonSpatialDim);
+        it != std::end(weights_lengths);
+        ++it)
+    {
+        flop_count *= *it;
+    }
+
+    return flop_count;
+}
+
+// Size in bytes of the input tensor:
+//   sizeof(InDataType) * <product of all entries of input_lengths>
+template <typename InDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
+std::size_t
+GetInputByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& input_lengths)
+{
+    std::size_t num_elements = 1;
+    for(const ck::index_t length : input_lengths)
+    {
+        num_elements *= static_cast<std::size_t>(length);
+    }
+
+    return sizeof(InDataType) * num_elements;
+}
+
+// Size in bytes of the weight tensor:
+//   sizeof(WeiDataType) * <product of all entries of weights_lengths>
+template <typename WeiDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
+std::size_t
+GetWeightByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths)
+{
+    std::size_t num_elements = 1;
+    for(const ck::index_t length : weights_lengths)
+    {
+        num_elements *= static_cast<std::size_t>(length);
+    }
+
+    return sizeof(WeiDataType) * num_elements;
+}
+
+// Size in bytes of the output tensor:
+//   sizeof(OutDataType) * <product of all entries of output_lengths>
+template <typename OutDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
+std::size_t
+GetOutputByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths)
+{
+    std::size_t num_elements = 1;
+    for(const ck::index_t length : output_lengths)
+    {
+        num_elements *= static_cast<std::size_t>(length);
+    }
+
+    return sizeof(OutDataType) * num_elements;
+}
+
+// Profiles every DeviceGroupedConvFwdMultipleD instance returned by the instance factory for the
+// given data types and layouts, prints the timing of each supported instance, then runs the
+// fastest one once more without timing.
+//
+// in_lengths / wei_lengths / out_lengths are given in NDHWGC / KZYXGC / NDHWGK order. They are
+// taken by value because they are reordered in place before being handed to the device op.
+// The convolution always uses stride 1, dilation 1 and left/right padding 1 in every spatial dim.
+// The device buffers are only allocated: this function never fills them or checks the results.
+//
+// Returns false when no instance supports the problem, true otherwise.
+template <ck::index_t NumDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          ck::index_t NumNonSpatialDim = 3>
+bool run_grouped_conv_fwd(std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> in_lengths,
+                          std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> wei_lengths,
+                          std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> out_lengths)
+{
+    using Dims = std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>;
+
+    // NumNonSpatialDim is forwarded explicitly: the helpers cannot deduce it, and relying on
+    // their default would break compilation for any non-default value.
+    const std::size_t in_mem_size =
+        GetInputByte<InDataType, NumDimSpatial, NumNonSpatialDim>(in_lengths);
+    const std::size_t wei_mem_size =
+        GetWeightByte<WeiDataType, NumDimSpatial, NumNonSpatialDim>(wei_lengths);
+    const std::size_t out_mem_size =
+        GetOutputByte<OutDataType, NumDimSpatial, NumNonSpatialDim>(out_lengths);
+
+    SimpleDeviceMem in(in_mem_size);
+    SimpleDeviceMem wei(wei_mem_size);
+    SimpleDeviceMem out(out_mem_size);
+
+    // Packed strides for lengths in their original order: the last stride is 1 and every
+    // other stride is the product of all lengths to its right.
+    const auto packed_strides = [](const Dims& lengths) {
+        Dims strides;
+        strides.fill(0);
+        strides.back() = 1;
+        std::partial_sum(rbegin(lengths),
+                         std::prev(rend(lengths)),
+                         std::next(rbegin(strides)),
+                         std::multiplies<>{});
+        return strides;
+    };
+
+    // Reorders NDHWGC/KZYXGC/NDHWGK to GNDHWC/GKZYXC/GNDHWK (second-to-last entry moved to the
+    // front) and then to GNCDHW/GKCZYX/GNKDHW (last entry moved in front of the spatial ones).
+    const auto to_g_first_order = [](Dims& dims) {
+        std::rotate(std::next(rbegin(dims)), std::next(rbegin(dims), 2), rend(dims));
+        std::rotate(
+            rbegin(dims), std::next(rbegin(dims)), std::next(rbegin(dims), NumDimSpatial + 1));
+    };
+
+    // Strides must be computed before the lengths are reordered.
+    Dims in_strides  = packed_strides(in_lengths);
+    Dims wei_strides = packed_strides(wei_lengths);
+    Dims out_strides = packed_strides(out_lengths);
+
+    to_g_first_order(in_lengths);
+    to_g_first_order(in_strides);
+    to_g_first_order(wei_lengths);
+    to_g_first_order(wei_strides);
+    to_g_first_order(out_lengths);
+    to_g_first_order(out_strides);
+
+    std::array<ck::index_t, NumDimSpatial> conv_filter_strides;
+    std::array<ck::index_t, NumDimSpatial> conv_filter_dilations;
+    std::array<ck::index_t, NumDimSpatial> input_left_pads;
+    std::array<ck::index_t, NumDimSpatial> input_right_pads;
+    conv_filter_strides.fill(1);
+    conv_filter_dilations.fill(1);
+    input_left_pads.fill(1);
+    input_right_pads.fill(1);
+
+    // GetFlops expects the reordered (G-first) lengths.
+    const std::size_t flop = GetFlops<NumDimSpatial, NumNonSpatialDim>(out_lengths, wei_lengths);
+    const std::size_t num_bytes = in_mem_size + wei_mem_size + out_mem_size;
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 ck::Tuple<>,
+                                                                                 OutLayout,
+                                                                                 InDataType,
+                                                                                 WeiDataType,
+                                                                                 ck::Tuple<>,
+                                                                                 OutDataType,
+                                                                                 PassThrough,
+                                                                                 PassThrough,
+                                                                                 PassThrough>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // Builds the argument object for one instance; shared by the timing loop and the final run.
+    // There are no extra D tensors, hence the empty arrays.
+    const auto make_argument = [&](const auto& op) {
+        return op->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                       wei.GetDeviceBuffer(),
+                                       std::array<const void*, 0>{},
+                                       out.GetDeviceBuffer(),
+                                       in_lengths,
+                                       in_strides,
+                                       wei_lengths,
+                                       wei_strides,
+                                       std::array<Dims, 0>{{}},
+                                       std::array<Dims, 0>{{}},
+                                       out_lengths,
+                                       out_strides,
+                                       conv_filter_strides,
+                                       conv_filter_dilations,
+                                       input_left_pads,
+                                       input_right_pads,
+                                       PassThrough{},
+                                       PassThrough{},
+                                       PassThrough{});
+    };
+
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
+    {
+        const auto& op_ptr        = op_ptrs[i];
+        auto argument_ptr         = make_argument(op_ptr);
+        auto invoker_ptr          = op_ptr->MakeInvokerPointer();
+        const std::string op_name = op_ptr->GetTypeString();
+
+        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            std::cerr << op_name << " does not support this problem" << std::endl;
+            continue;
+        }
+
+        // avg_time is printed as milliseconds, so flop / 1e9 / ms gives TFlops and
+        // bytes / 1e6 / ms gives GB/s.
+        const float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+        const float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+        const float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                  << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+        if(tflops > best_tflops)
+        {
+            best_op_id      = static_cast<int>(i);
+            best_op_name    = op_name;
+            best_avg_time   = avg_time;
+            best_gb_per_sec = gb_per_sec;
+            best_tflops     = tflops;
+        }
+    }
+
+    if(best_op_id < 0)
+    {
+        std::cerr << "no suitable instance" << std::endl;
+        return false;
+    }
+
+    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best instance
+    {
+        const auto& op_ptr = op_ptrs[static_cast<std::size_t>(best_op_id)];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = make_argument(op_ptr);
+        auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+    return true;
+}
--- a/client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp
+++ b/client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = ck::half_t;
+using WeiDataType = ck::half_t;
+using OutDataType = ck::half_t;
+
+using InLayout  = ck::tensor_layout::convolution::NDHWGC;
+using WeiLayout = ck::tensor_layout::convolution::KZYXGC;
+using OutLayout = ck::tensor_layout::convolution::NDHWGK;
+
+static constexpr ck::index_t NumDimSpatial = 3;
+static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t N             = 64;
+static constexpr ck::index_t K             = 128;
+static constexpr ck::index_t C             = 64;
+static constexpr ck::index_t Z             = 3;  // Z, Y, X: filter spatial lengths
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28; // Di, Hi, Wi: input spatial lengths
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 3;
+static constexpr ck::index_t Do            = 28; // Do, Ho, Wo: output spatial lengths; equal to the
+static constexpr ck::index_t Ho            = 28; // input ones because run_grouped_conv_fwd uses
+static constexpr ck::index_t Wo            = 3;  // stride 1, dilation 1 and pad 1 with a 3x3x3 filter
+
+// Runs the fp16 example and maps the helper's boolean result to a process exit code.
+int main()
+{
+    // Lengths are listed in NDHWGC (input), KZYXGC (weight) and NDHWGK (output) order.
+    const bool succeeded = run_grouped_conv_fwd<NumDimSpatial,
+                                                InDataType,
+                                                WeiDataType,
+                                                OutDataType,
+                                                InLayout,
+                                                WeiLayout,
+                                                OutLayout>(
+        {N, Di, Hi, Wi, G, C}, {K, Z, Y, X, G, C}, {N, Do, Ho, Wo, G, K});
+
+    if(!succeeded)
+    {
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
--- a/client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp
+++ b/client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = float;
+using WeiDataType = float;
+using OutDataType = float;
+
+using InLayout  = ck::tensor_layout::convolution::NDHWGC;
+using WeiLayout = ck::tensor_layout::convolution::KZYXGC;
+using OutLayout = ck::tensor_layout::convolution::NDHWGK;
+
+static constexpr ck::index_t NumDimSpatial = 3;
+static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t N             = 64;
+static constexpr ck::index_t K             = 128;
+static constexpr ck::index_t C             = 64;
+static constexpr ck::index_t Z             = 3;  // Z, Y, X: filter spatial lengths
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28; // Di, Hi, Wi: input spatial lengths
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 3;
+static constexpr ck::index_t Do            = 28; // Do, Ho, Wo: output spatial lengths; equal to the
+static constexpr ck::index_t Ho            = 28; // input ones because run_grouped_conv_fwd uses
+static constexpr ck::index_t Wo            = 3;  // stride 1, dilation 1 and pad 1 with a 3x3x3 filter
+
+// Runs the fp32 example and maps the helper's boolean result to a process exit code.
+int main()
+{
+    // Lengths are listed in NDHWGC (input), KZYXGC (weight) and NDHWGK (output) order.
+    const bool succeeded = run_grouped_conv_fwd<NumDimSpatial,
+                                                InDataType,
+                                                WeiDataType,
+                                                OutDataType,
+                                                InLayout,
+                                                WeiLayout,
+                                                OutLayout>(
+        {N, Di, Hi, Wi, G, C}, {K, Z, Y, X, G, C}, {N, Do, Ho, Wo, G, K});
+
+    if(!succeeded)
+    {
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
--- a/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
+++ b/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
@@ -53,7 +53,6 @@ bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfi
    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());

-
    a_device_buf.ToDevice(a_m_k.mData.data());
    b_device_buf.ToDevice(b_k_n.mData.data());
    d0_device_buf.ToDevice(d0_m_n.mData.data());
@@ -84,8 +83,8 @@ bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfi

    if(!device_op.IsSupportedArgument(argument))
    {
-	    std::cout << "wrong! this device_op instance does not support this problem" << std::endl;
-	    return true;
+        std::cout << "wrong! this device_op instance does not support this problem" << std::endl;
+        return true;
    }

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});

--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -168,13 +168,6 @@
 // tuning parameter
 #define CK_WORKAROUND_SWDEV_325164 0

-// workaround: a BF16 attention kernel for gfx908 is likely affected by a compiler issue
-#ifdef __gfx908__
-#define CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE 1
-#else // __gfx90a__, ...
-#define CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE 0
-#endif // __gfx908__
-
 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0


--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -1077,14 +1077,6 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
                }
            } // end gemm1

-            // workaround compiler issue; see ck/ck.hpp
-            if constexpr(CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE == 1 &&
-                         is_same_v<FloatAB, bhalf_t> && MPerBlock == 256 && NPerBlock == 128 &&
-                         Gemm1NPerBlock == 128)
-            {
-                __builtin_amdgcn_sched_barrier(0);
-            }
-
            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 =
                gemm1_blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4();
            constexpr auto cm0 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I0);

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -879,14 +879,6 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
                }
            } // end gemm1

-            // workaround compiler issue; see ck/ck.hpp
-            if constexpr(CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE == 1 &&
-                         is_same_v<FloatAB, bhalf_t> && MPerBlock == 256 && NPerBlock == 128 &&
-                         Gemm1NPerBlock == 128)
-            {
-                __builtin_amdgcn_sched_barrier(0);
-            }
-
            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 =
                gemm1_blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4();
            constexpr auto cm0 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I0);

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -244,6 +244,63 @@ void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);

+// grouped conv3d forward, NDHWGC/KZYXGC/NDHWGK
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+                                                              NDHWGC,
+                                                              KZYXGC,
+                                                              Empty_Tuple,
+                                                              NDHWGK,
+                                                              BF16,
+                                                              BF16,
+                                                              Empty_Tuple,
+                                                              BF16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+                                                              NDHWGC,
+                                                              KZYXGC,
+                                                              Empty_Tuple,
+                                                              NDHWGK,
+                                                              F16,
+                                                              F16,
+                                                              Empty_Tuple,
+                                                              F16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+                                                              NDHWGC,
+                                                              KZYXGC,
+                                                              Empty_Tuple,
+                                                              NDHWGK,
+                                                              F32,
+                                                              F32,
+                                                              Empty_Tuple,
+                                                              F32,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+                                                              NDHWGC,
+                                                              KZYXGC,
+                                                              Empty_Tuple,
+                                                              NDHWGK,
+                                                              int8_t,
+                                                              int8_t,
+                                                              Empty_Tuple,
+                                                              int8_t,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+
 template <ck::index_t NumDimSpatial,
          typename InLayout,
          typename WeiLayout,
@@ -385,6 +442,31 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances(op_ptrs);
            }
        }
+        else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
+                          is_same_v<WeiLayout, KZYXGC> && is_same_v<OutLayout, NDHWGK>)
+        {
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                              is_same_v<OutDataType, half_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                              is_same_v<WeiDataType, ck::bhalf_t> &&
+                              is_same_v<OutDataType, ck::bhalf_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                              is_same_v<OutDataType, int8_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instances(op_ptrs);
+            }
+        }

        return op_ptrs;
    }

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
@@ -3,4 +3,9 @@ add_instance_library(device_grouped_conv3d_fwd_instance
   device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
   device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
   device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp
+
+   device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp
+   device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp
+   device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp
+   device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp
 )
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Shorthands for the scalar types named in the instance table.
+using F32  = float;
+using BF16 = ck::bhalf_t;
+
+// Compile-time integer list, e.g. S<4, 64, 1>.
+template <ck::index_t... Dims>
+using S = ck::Sequence<Dims...>;
+
+// Placeholder for the (absent) extra D tensors.
+using Empty_Tuple = ck::Tuple<>;
+
+// Element-wise operation applied to A, B and CDE: identity.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// Tensor layout tags.
+using NDHWGK = ck::tensor_layout::convolution::NDHWGK;
+using KZYXGC = ck::tensor_layout::convolution::KZYXGC;
+using NDHWGC = ck::tensor_layout::convolution::NDHWGC;
+
+// GEMM specialization shared by every instance in this file.
+constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Forward-convolution specializations covered by the instance table.
+constexpr auto ConvFwd1x1S1P0 =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
+constexpr auto ConvFwd1x1P0 =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
+constexpr auto ConvFwdDefault =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+// in[n, di, hi, wi, g, c] * wei[k, z, y, x, g, c] = out[n, do, ho, wo, g, k]
+//
+// Tuning table of bf16 grouped 3D forward-convolution kernels (fp32 accumulation).
+// Every row is instantiated for the forward-convolution specialization given by
+// `ConvSpec`, so the same set of tile configurations is shared by all
+// specializations instead of being maintained as separate copies.
+template <ck::tensor_operation::device::ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances =
+    std::tuple<
+        // clang-format off
+        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+        // clang-format on
+        >;
+
+// Registers the bf16 NDHWGC/KZYXGC/NDHWGK grouped 3D forward-convolution
+// instances into `instances`.
+//
+// The tuning table is instantiated once per forward-convolution specialization
+// and registered in the order Default, Filter1x1Pad0, Filter1x1Stride1Pad0.
+// NOTE(review): this reproduces the previous element order provided
+// add_device_operation_instances appends in tuple order - confirm against
+// add_device_operation_instance.hpp.
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+                                                              NDHWGC,
+                                                              KZYXGC,
+                                                              Empty_Tuple,
+                                                              NDHWGK,
+                                                              BF16,
+                                                              BF16,
+                                                              Empty_Tuple,
+                                                              BF16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances<ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances<ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances<ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck