Commit 0e8d7ed3 authored by Chao Liu

update example

parent 61510f0a
......@@ -23,8 +23,8 @@ void print_helper_msg()
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
<< " G, N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
......@@ -35,7 +35,7 @@ void print_helper_msg()
<< std::endl;
}
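For concreteness, the argv sequence documented above maps onto a ConvParam. The 2D default used throughout this commit corresponds to the argv tail "1 128 256 192  3 3  71 71  2 2  1 1  1 1  1 1" (G N K C, then filter, input, strides, dilations, left pads, right pads):

```cpp
// 2 spatial dims, G=1, N=128, K=256, C=192, 3x3 filter, 71x71 input,
// stride 2, dilation 1, pad 1 on each side -- the commit's default problem.
ck::utils::conv::ConvParam conv_param{
    2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
```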
ck::utils::conv::ConvParam parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
ck::utils::conv::ConvParam parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[])
{
const ck::index_t G = std::stoi(argv[arg_idx++]);
const ck::index_t N = std::stoi(argv[arg_idx++]);
......@@ -92,11 +92,7 @@ ck::utils::conv::ConvParam parse_conv_params(int num_dim_spatial, int arg_idx, c
input_right_pads};
}
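The hunk above elides the spatial parsing between the scalar reads and the returned ConvParam; a hypothetical sketch of that pattern (not taken from the commit) would read the remaining integers in the order the helper message lists them:

```cpp
// Hypothetical sketch only: one integer per spatial dim for each property.
std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
for(int d = 0; d < num_dim_spatial; ++d)
    filter_spatial_lengths[d] = std::stoi(argv[arg_idx++]);
for(int d = 0; d < num_dim_spatial; ++d)
    input_spatial_lengths[d] = std::stoi(argv[arg_idx++]);
// ...and likewise for strides, dilations, left pads and right pads, which
// the visible return statement aggregates into the ConvParam.
```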
// FIXME: current implementation only supports NCHW/NHWC layout
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType,
......@@ -104,32 +100,24 @@ template <ck::index_t NDimSpatial,
typename WeiElementOp,
typename OutElementOp,
typename DeviceConvNDFwdInstance>
int run_conv_fwd(bool do_verification,
int init_method,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
int run_grouped_conv_fwd(bool do_verification,
int init_method,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const HostTensorDescriptor& in_g_n_c_wis_desc,
const HostTensorDescriptor& wei_g_k_c_xs_desc,
const HostTensorDescriptor& out_g_n_k_wos_desc,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
{
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_packed<WeiLayout>(conv_param);
const auto bias_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_packed<OutLayout>(conv_param);
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_packed<OutLayout>(conv_param);
Tensor<InDataType> in(in_g_n_c_wis_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<OutDataType> bias(bias_g_n_k_wos_desc);
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
std::cout << "in: " << in.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "bias: " << bias.mDesc << std::endl;
std::cout << "out: " << out_host.mDesc << std::endl;
switch(init_method)
......@@ -138,29 +126,23 @@ int run_conv_fwd(bool do_verification,
case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
}
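// Note: init_method 1 fills in/wei with small random integers; any other
// non-zero value falls through to uniform decimal initialization, matching
// the "1=integer value, 2=decimal value" helper text (0 skips init).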
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
bias_device_buf.ToDevice(bias.mData.data());
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
......@@ -174,8 +156,6 @@ int run_conv_fwd(bool do_verification,
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(bias_g_n_k_wos_desc.GetLengths(), d_g_n_k_wos_lengths);
copy(bias_g_n_k_wos_desc.GetStrides(), d_g_n_k_wos_strides);
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
......@@ -183,29 +163,28 @@ int run_conv_fwd(bool do_verification,
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
// do GEMM
// do Conv
auto conv = DeviceConvNDFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(
in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{bias_device_buf.GetDeviceBuffer()},
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_lengths}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_strides}},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
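// Note: with ck::Tuple<> for the D tensors, the grouped instance takes
// zero-sized D arrays here (no bias pointer, no D lengths/strides), where
// the deleted variant above passed a one-element array for the fused bias.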
if(!conv.IsSupportedArgument(argument))
{
......@@ -226,36 +205,28 @@ int run_conv_fwd(bool do_verification,
if(do_verification)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
Tensor<OutDataType> c_host(out_g_n_k_wos_desc);
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
PassThrough>();
OutElementOp>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in,
wei,
c_host,
out_host,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
in_element_op,
wei_element_op,
PassThrough{});
out_element_op);
ref_invoker.Run(ref_argument);
// TODO: implement elementwise operation for host
out_host.ForEach(
[&](auto&, auto idx) { out_element_op(out_host(idx), c_host(idx), bias(idx)); });
out_device_buf.FromDevice(out_device.mData.data());
return ck::utils::check_err(
......
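Taken together, run_grouped_conv_fwd follows CK's usual invoker flow; a condensed sketch (buffer and descriptor plumbing elided, StreamConfig usage as in the surrounding examples):

```cpp
auto conv     = DeviceConvNDFwdInstance{};
auto invoker  = conv.MakeInvoker();
auto argument = conv.MakeArgument(/* buffers, lengths/strides, conv params, element ops */);

// reject unsupported tile/vector combinations before launching
if(!conv.IsSupportedArgument(argument))
{
    throw std::runtime_error("wrong! device op does not support this problem");
}

// time_kernel=true makes Run report an averaged kernel time
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
```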
......@@ -3,12 +3,15 @@
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
using InDataType = ck::bhalf_t;
using WeiDataType = ck::bhalf_t;
using OutDataType = ck::bhalf_t;
using AccDataType = float;
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::bhalf_t;
using WeiDataType = ck::bhalf_t;
using AccDataType = float;
using CShuffleDataType = float;
using OutDataType = ck::bhalf_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
......@@ -17,57 +20,72 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, //
InDataType, //
WeiDataType, //
OutDataType, //
AccDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
4, // K0PerBlock
8, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsAddExtraN
7, // CThreadTransferSrcDstVectorDim
1>; // CThreadTransferDstScalarPerVector
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,              // CShuffle: MXdlPerWave per shuffle
1,              // CShuffle: NXdlPerWave per shuffle
S<1, 32, 1, 8>, // CDE block-transfer cluster lengths (MBlock, MPerBlock, NBlock, NPerBlock)
8>;             // CDE block-transfer scalar-per-vector along NPerBlock
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParam params{
2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
ck::utils::conv::ConvParam conv_param{
2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
......@@ -81,77 +99,119 @@ int main(int argc, char* argv[])
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv);
conv_param = parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(num_dim_spatial == 1)
if(conv_param.num_dim_spatial_ == 1)
{
return run_conv_fwd<1,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<1>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNWC;
using WeiLayout = ctc::GKXC;
using OutLayout = ctc::GNWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
1,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(num_dim_spatial == 2)
else if(conv_param.num_dim_spatial_ == 2)
{
return run_conv_fwd<2,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<2>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNHWC;
using WeiLayout = ctc::GKYXC;
using OutLayout = ctc::GNHWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
2,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(num_dim_spatial == 3)
else if(conv_param.num_dim_spatial_ == 3)
{
return run_conv_fwd<3,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<3>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNDHWC;
using WeiLayout = ctc::GKZYXC;
using OutLayout = ctc::GNDHWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
3,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
......
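One note on GemmSpec: MNKPadding is used because a convolution mapped to GEMM rarely divides the 128x256x32 block tile evenly. Under the standard implicit-GEMM view of forward convolution (stated here as background, not taken from the commit), the default problem's GEMM shape works out to:

```cpp
// N=128, K=256, C=192, 3x3 filter, 71x71 input, stride 2, pad 1 => Ho=Wo=36.
const ck::index_t GemmM = 128 * 36 * 36; // N * Ho * Wo = 165888 output pixels
const ck::index_t GemmN = 256;           // K output channels
const ck::index_t GemmK = 192 * 3 * 3;   // C * Y * X = 1728 reduction length
```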
......@@ -3,160 +3,88 @@
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using OutDataType = ck::half_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
// using OutElementOp = ck::tensor_operation::element_wise::UnaryConvert;
using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
#if 0
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, //
InDataType, //
WeiDataType, //
OutDataType, //
AccDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
4, // K0PerBlock
8, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsExtraN
7, // CThreadTransferSrcDstVectorDim
1>; // CThreadTransferDstScalarPerVector
#else
using CShuffleDataType = ck::half_t;
using DDataType = ck::half_t;
using OutElementOp = ck::tensor_operation::element_wise::UnaryConvert;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
#if 0
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::G_NW_C,
ck::tensor_layout::convolution::G_NHW_C,
ck::tensor_layout::convolution::G_NDHW_C>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::G_K_X_C,
ck::tensor_layout::convolution::G_K_YX_C,
ck::tensor_layout::convolution::G_K_ZYX_C>>,
ck::Tuple<ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::G_NW_K,
ck::tensor_layout::convolution::G_NHW_K,
ck::tensor_layout::convolution::G_NDHW_K>>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::G_NW_K,
ck::tensor_layout::convolution::G_NHW_K,
ck::tensor_layout::convolution::G_NDHW_K>>,
#else
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::NWGC,
ck::tensor_layout::convolution::NHWGC,
ck::tensor_layout::convolution::NDHWGC>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::KXGC,
ck::tensor_layout::convolution::KYXGC,
ck::tensor_layout::convolution::KZYXGC>>,
ck::Tuple<ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::NWGK,
ck::tensor_layout::convolution::NHWGK,
ck::tensor_layout::convolution::NDHWGK>>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::NWGK,
ck::tensor_layout::convolution::NHWGK,
ck::tensor_layout::convolution::NDHWGK>>,
#endif
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<DDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1
1, // BBlockLdsExtraN
1,
1,
S<1, 32, 1, 8>,
8>;
#endif
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,              // CShuffle: MXdlPerWave per shuffle
1,              // CShuffle: NXdlPerWave per shuffle
S<1, 32, 1, 8>, // CDE block-transfer cluster lengths (MBlock, MPerBlock, NBlock, NPerBlock)
8>;             // CDE block-transfer scalar-per-vector along NPerBlock
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParam params{
ck::utils::conv::ConvParam conv_param{
2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
......@@ -171,77 +99,119 @@ int main(int argc, char* argv[])
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv);
conv_param = parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(num_dim_spatial == 1)
if(conv_param.num_dim_spatial_ == 1)
{
return run_conv_fwd<1,
ck::tensor_layout::convolution::NWGC,
ck::tensor_layout::convolution::KXGC,
ck::tensor_layout::convolution::NWGK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<1>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNWC;
using WeiLayout = ctc::GKXC;
using OutLayout = ctc::GNWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
1,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(num_dim_spatial == 2)
else if(conv_param.num_dim_spatial_ == 2)
{
return run_conv_fwd<2,
ck::tensor_layout::convolution::NHWGC,
ck::tensor_layout::convolution::KYXGC,
ck::tensor_layout::convolution::NHWGK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<2>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNHWC;
using WeiLayout = ctc::GKYXC;
using OutLayout = ctc::GNHWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
2,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(num_dim_spatial == 3)
else if(conv_param.num_dim_spatial_ == 3)
{
return run_conv_fwd<3,
ck::tensor_layout::convolution::NDHWGC,
ck::tensor_layout::convolution::KZYXGC,
ck::tensor_layout::convolution::NDHWGK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<3>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNDHWC;
using WeiLayout = ctc::GKZYXC;
using OutLayout = ctc::GNDHWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
3,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
......
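The per-example descriptor helpers replace the layout template parameters that run_conv_fwd used to take. What they produce is a packed host descriptor in the grouped dimension order; a minimal sketch for the default problem, assuming HostTensorDescriptor accepts a plain lengths vector and derives contiguous strides:

```cpp
// GNHWC input, default problem: G=1, N=128, Hi=Wi=71, C=192 (packed layout).
HostTensorDescriptor in_desc(std::vector<std::size_t>{1, 128, 71, 71, 192});
```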
......@@ -3,12 +3,15 @@
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
using InDataType = float;
using WeiDataType = float;
using OutDataType = float;
using AccDataType = float;
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = float;
using WeiDataType = float;
using AccDataType = float;
using CShuffleDataType = float;
using OutDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
......@@ -17,57 +20,72 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // NDimSpatial
InDataType, //
WeiDataType, //
OutDataType, //
AccDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
4, // K0PerBlock
4, // K1
32, // MPerXDL
32, // NPerXDL
4, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector
4, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_K1
true, // BBlockTransferAddExtraN
7, // CThreadTransferSrcDstVectorDim
1>; // CThreadTransferDstScalarPerVector
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
16, // KPerBlock
4, // AK1
4, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector
4, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,               // CShuffle: MXdlPerWave per shuffle
1,               // CShuffle: NXdlPerWave per shuffle
S<1, 16, 1, 16>, // CDE block-transfer cluster lengths (MBlock, MPerBlock, NBlock, NPerBlock)
4>;              // CDE block-transfer scalar-per-vector along NPerBlock
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParam params{
2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
ck::utils::conv::ConvParam conv_param{
2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
......@@ -81,77 +99,119 @@ int main(int argc, char* argv[])
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv);
conv_param = parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(num_dim_spatial == 1)
if(conv_param.num_dim_spatial_ == 1)
{
return run_conv_fwd<1,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<1>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNWC;
using WeiLayout = ctc::GKXC;
using OutLayout = ctc::GNWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
1,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(num_dim_spatial == 2)
else if(conv_param.num_dim_spatial_ == 2)
{
return run_conv_fwd<2,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<2>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNHWC;
using WeiLayout = ctc::GKYXC;
using OutLayout = ctc::GNHWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
2,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(num_dim_spatial == 3)
else if(conv_param.num_dim_spatial_ == 3)
{
return run_conv_fwd<3,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<3>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNDHWC;
using WeiLayout = ctc::GKZYXC;
using OutLayout = ctc::GNDHWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
3,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
......
......@@ -3,12 +3,15 @@
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
using InDataType = double;
using WeiDataType = double;
using OutDataType = double;
using AccDataType = double;
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = double;
using WeiDataType = double;
using AccDataType = double;
using CShuffleDataType = double;
using OutDataType = double;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
......@@ -17,57 +20,72 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // NDimSpatial
InDataType, //
WeiDataType, //
OutDataType, //
AccDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
256, // BlockSize
128, // MPerBlock
128, // NPerBlock
4, // K0PerBlock
2, // K1
16, // MPerXDL
16, // NPerXDL
4, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
2, // ABlockTransferSrcScalarPerVector
2, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
2, // BBlockTransferSrcScalarPerVector
2, // BBlockTransferDstScalarPerVector_K1
true, // BBlockTransferAddExtraN
7, // CThreadTransferSrcDstVectorDim
1>; // CThreadTransferDstScalarPerVector
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
128, // NPerBlock
8, // KPerBlock
2, // AK1
2, // BK1
16, // MPerXdl
16, // NPerXdl
4, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
2, // ABlockTransferSrcScalarPerVector
2, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
2, // BBlockTransferSrcScalarPerVector
2, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,               // CShuffle: MXdlPerWave per shuffle
1,               // CShuffle: NXdlPerWave per shuffle
S<1, 16, 1, 16>, // CDE block-transfer cluster lengths (MBlock, MPerBlock, NBlock, NPerBlock)
1>;              // CDE block-transfer scalar-per-vector along NPerBlock
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParam params{
2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
ck::utils::conv::ConvParam conv_param{
2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
......@@ -81,77 +99,119 @@ int main(int argc, char* argv[])
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv);
conv_param = parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(num_dim_spatial == 1)
if(conv_param.num_dim_spatial_ == 1)
{
return run_conv_fwd<1,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<1>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNWC;
using WeiLayout = ctc::GKXC;
using OutLayout = ctc::GNWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
1,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(num_dim_spatial == 2)
else if(conv_param.num_dim_spatial_ == 2)
{
return run_conv_fwd<2,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<2>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNHWC;
using WeiLayout = ctc::GKYXC;
using OutLayout = ctc::GNHWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
2,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(num_dim_spatial == 3)
else if(conv_param.num_dim_spatial_ == 3)
{
return run_conv_fwd<3,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<3>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNDHWC;
using WeiLayout = ctc::GKZYXC;
using OutLayout = ctc::GNDHWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
3,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
......
......@@ -3,12 +3,15 @@
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
using InDataType = int8_t;
using WeiDataType = int8_t;
using OutDataType = int8_t;
using AccDataType = int32_t;
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = int8_t;
using WeiDataType = int8_t;
using AccDataType = int32_t;
using CShuffleDataType = int8_t;
using OutDataType = int8_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
......@@ -17,57 +20,72 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // NDimSpatial
InDataType, //
WeiDataType, //
OutDataType, //
AccDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
4, // K0PerBlock
16, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
16, // ABlockTransferSrcScalarPerVector
16, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
16, // BBlockTransferSrcScalarPerVector
16, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsAddExtraN
7, // CThreadTransferSrcDstVectorDim
1>; // CThreadTransferDstScalarPerVector
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
64, // KPerBlock
16, // AK1
16, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
16, // ABlockTransferSrcScalarPerVector
16, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
16, // BBlockTransferSrcScalarPerVector
16, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,              // CShuffle: MXdlPerWave per shuffle
1,              // CShuffle: NXdlPerWave per shuffle
S<1, 64, 1, 4>, // CDE block-transfer cluster lengths (MBlock, MPerBlock, NBlock, NPerBlock)
16>;            // CDE block-transfer scalar-per-vector along NPerBlock
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParam params{
2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
ck::utils::conv::ConvParam conv_param{
2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
......@@ -81,77 +99,119 @@ int main(int argc, char* argv[])
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv);
conv_param = parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(num_dim_spatial == 1)
if(conv_param.num_dim_spatial_ == 1)
{
return run_conv_fwd<1,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<1>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNWC;
using WeiLayout = ctc::GKXC;
using OutLayout = ctc::GNWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
1,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(num_dim_spatial == 2)
else if(conv_param.num_dim_spatial_ == 2)
{
return run_conv_fwd<2,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<2>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNHWC;
using WeiLayout = ctc::GKYXC;
using OutLayout = ctc::GNHWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
2,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(num_dim_spatial == 3)
else if(conv_param.num_dim_spatial_ == 3)
{
return run_conv_fwd<3,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<3>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
using InLayout = ctc::GNDHWC;
using WeiLayout = ctc::GKZYXC;
using OutLayout = ctc::GNDHWK;
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
3,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
......
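Across the five data-type examples, the Src/DstScalarPerVector choices differ (8 for fp16/bf16, 4 for fp32, 2 for fp64, 16 for int8) but all correspond to 16-byte vectorized accesses; an observation about the chosen configs, not a stated rule:

```cpp
static_assert(sizeof(ck::half_t) * 8 == 16, "fp16/bf16: 8 elems per 16B vector");
static_assert(sizeof(float) * 4 == 16, "fp32: 4 elems per 16B vector");
static_assert(sizeof(double) * 2 == 16, "fp64: 2 elems per 16B vector");
static_assert(sizeof(int8_t) * 16 == 16, "int8: 16 elems per 16B vector");
```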
......@@ -15,7 +15,6 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
void print_helper_msg()
......
......@@ -5,6 +5,8 @@
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using AccDataType = float;
......@@ -29,7 +31,7 @@ template <ck::index_t NDimSpatial,
typename WeiLayout,
typename BiasLayout,
typename OutLayout>
using DeviceGroupledConvNDFwdInstance =
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
......@@ -52,24 +54,25 @@ using DeviceGroupledConvNDFwdInstance =
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // K1
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffle: MXdlPerWave per shuffle
1, // CShuffle: NXdlPerWave per shuffle
......@@ -164,7 +167,7 @@ int main(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupledConvNDFwdInstance<1, InLayout, WeiLayout, BiasLayout, OutLayout>>(
DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, BiasLayout, OutLayout>>(
do_verification,
init_method,
time_kernel,
......@@ -251,7 +254,7 @@ int main(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupledConvNDFwdInstance<2, InLayout, WeiLayout, BiasLayout, OutLayout>>(
DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, BiasLayout, OutLayout>>(
do_verification,
init_method,
time_kernel,
......@@ -349,7 +352,7 @@ int main(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupledConvNDFwdInstance<3, InLayout, WeiLayout, BiasLayout, OutLayout>>(
DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, BiasLayout, OutLayout>>(
do_verification,
init_method,
time_kernel,
......
......@@ -248,7 +248,8 @@ template <index_t NDimSpatial,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t K1,
index_t AK1,
index_t BK1,
index_t MPerXDL,
index_t NPerXDL,
index_t MXdlPerWave,
......@@ -295,9 +296,6 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto K1Number = Number<K1>{};
static constexpr auto GemmK1Number = K1Number;
static constexpr auto matrix_padder =
MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
......@@ -1256,8 +1254,8 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
MPerBlock,
NPerBlock,
KPerBlock,
K1,
K1,
AK1,
BK1,
MPerXDL,
NPerXDL,
MXdlPerWave,
......
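The device-header change splits the single K1 tuning parameter into independent AK1 and BK1 and drops the now-unused K1Number/GemmK1Number constants; the gridwise GEMM previously received K1 twice and now receives each side's value. Downstream, every instance in this commit supplies the two explicitly; the changed slice of the fp16 instance's argument list, for example:

```cpp
32, // KPerBlock
8,  // AK1 (was the single K1)
8,  // BK1 (new; all examples here keep AK1 == BK1)
```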
......@@ -39,11 +39,23 @@ struct PassThrough
y = x;
}
template <>
__host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
{
y = type_convert<bhalf_t>(x);
}
template <>
__host__ __device__ void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const
{
y = x;
}
template <>
__host__ __device__ void operator()<int8_t, int32_t>(int8_t& y, const int32_t& x) const
{
y = type_convert<int8_t>(x);
}
};
struct UnaryConvert
......
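These specializations let PassThrough double as a narrowing store when source and destination types differ: the float-to-bhalf_t one backs the bf16 example (CShuffleDataType=float, OutDataType=bhalf_t), and the int32-to-int8 one covers the int8 accumulator path. A minimal usage sketch, given the CK headers:

```cpp
ck::tensor_operation::element_wise::PassThrough op{};

ck::bhalf_t y_bf16;
op(y_bf16, 0.5f); // picks the new <bhalf_t, float> overload: type_convert

int8_t y_i8;
op(y_i8, int32_t{42}); // picks the new <int8_t, int32_t> overload
```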
......@@ -156,9 +156,9 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck::utils::conv::ConvPa
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKXC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKYXC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKZYXC>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_),
......@@ -183,6 +183,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck::utils::conv::ConvPa
else
{
printf("%s\n", __func__);
printf("%s\n", WeiLayout::name);
throw std::runtime_error("wrong! unsupported layout");
}
......@@ -238,6 +239,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck::utils::conv::ConvP
else
{
printf("%s\n", __func__);
printf("%s\n", OutLayout::name);
throw std::runtime_error("wrong! unsupported layout");
}
......
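Net effect of the helper change: weight descriptors now match the grouped layout tags (GKXC/GKYXC/GKZYXC) rather than the ungrouped names, and when a layout is still unsupported the failure branches print the function name (via __func__) and the layout's name string before throwing "wrong! unsupported layout", so the rejected layout is visible in the output.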