gaoqiong / composable_kernel_ROCM · Commits

Commit 5683ea4e, authored Aug 08, 2023 by Jun Liu

    Merge branch 'amd-develop' into amd-master

Parents: f0831350, dddc2115
Changes: 139 files in the commit; this page shows 20 changed files with 340 additions and 339 deletions (+340 -339).
Changed files on this page:

example/40_conv2d_fwd_quantization/CMakeLists.txt  +14 -16
example/41_grouped_conv_conv_fwd/CMakeLists.txt  +12 -4
example/42_groupnorm/CMakeLists.txt  +5 -3
example/43_splitk_gemm_bias_e_permute/CMakeLists.txt  +6 -2
example/44_elementwise_permute/CMakeLists.txt  +4 -2
example/46_gemm_add_multiply/CMakeLists.txt  +6 -2
example/48_pool3d_fwd/CMakeLists.txt  +3 -2
example/49_maxpool2d_bwd/CMakeLists.txt  +9 -3
example/50_put_element/CMakeLists.txt  +3 -1
include/ck/ck.hpp  +1 -1
include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp  +6 -9
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp  +78 -81
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp  +147 -187
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp  +22 -13
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp  +2 -1
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp  +2 -1
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp  +8 -4
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp  +2 -1
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp  +8 -5
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp  +2 -1
example/40_conv2d_fwd_quantization/CMakeLists.txt

@@ -10,21 +10,19 @@ foreach(gpu IN LISTS GPU_TARGETS)
         set(target 1)
     endif()
 endforeach()
-# Conv perlayer quantization
-add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp)
-# Conv perchannel quantization
-add_example_executable(example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp)
-# Conv + bias + relu perlayer quantization
-add_example_executable(example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp)
-# Conv + bias + relu perchannel quantization
-add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
-# Conv + bias + tanh perlayer quantization
-add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
-# Conv + bias + tanh perchannel quantization
-add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
+    if(DL_KERNELS)
+        # Conv perlayer quantization
+        add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp)
+        # Conv perchannel quantization
+        add_example_executable(example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp)
+        # Conv + bias + relu perlayer quantization
+        add_example_executable(example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp)
+        # Conv + bias + relu perchannel quantization
+        add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
+        # Conv + bias + tanh perlayer quantization
+        add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
+        # Conv + bias + tanh perchannel quantization
+        add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
+    endif()
+endif()
\ No newline at end of file
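Across this file and the example CMakeLists below, the change follows one pattern: every add_example_executable call is now wrapped in a guard such as if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) or if(DL_KERNELS), so a build configured for a subset of data types (or without the DL kernels) no longer compiles unrelated examples. Leaving DTYPES undefined preserves the old default of building everything.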
example/41_grouped_conv_conv_fwd/CMakeLists.txt

@@ -3,9 +3,15 @@ list(APPEND gpu_list2 gfx908 gfx90a)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
     if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
-        add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp)
-        add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp)
-        add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp)
+        if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+            add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp)
+        endif()
+        if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+            add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp)
+        endif()
+        if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+            add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp)
+        endif()
         if(USE_BITINT_EXTENSION_INT4)
             add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp)
         endif(USE_BITINT_EXTENSION_INT4)

@@ -14,5 +20,7 @@ foreach(gpu IN LISTS GPU_TARGETS)
 endforeach()
 if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
-    add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
+    if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
+        add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
+    endif()
 endif()
example/42_groupnorm/CMakeLists.txt

-add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp)
-add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp)
-add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp)
+    add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp)
+    add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp)
+endif()
example/43_splitk_gemm_bias_e_permute/CMakeLists.txt

-add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp)
-add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp)
+endif()
+if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+    add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp)
+endif()
example/44_elementwise_permute/CMakeLists.txt

-add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
-add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
+    add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
+endif()
example/46_gemm_add_multiply/CMakeLists.txt

-add_example_executable(example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp)
-add_example_executable(example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    if(DL_KERNELS)
+        add_example_executable(example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp)
+    endif()
+    add_example_executable(example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp)
+endif()
example/48_pool3d_fwd/CMakeLists.txt

-add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp)
+endif()
example/49_maxpool2d_bwd/CMakeLists.txt

-add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp)
-add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp)
-add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp)
+if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp)
+endif()
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp)
+endif()
+if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+    add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp)
+endif()
example/50_put_element/CMakeLists.txt

-add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
+endif()
include/ck/ck.hpp

@@ -198,7 +198,7 @@
 #define CK_WORKAROUND_SWDEV_388832 1
 // workaround: Grouped Conv2d_bwd_data fails for already implemented instance
-#define CK_WORKAROUND_SWDEV_3318619 0
+#define CK_WORKAROUND_GITHUB_ISSUE_824 1

 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0
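The workaround flag is renamed to reference the public GitHub issue and is now enabled by default. The code it gates is outside this hunk; as a sketch, such flags are typically consumed like this (branch bodies here are placeholders, not CK code):

    // hypothetical consumer of the flag
    #if CK_WORKAROUND_GITHUB_ISSUE_824
        // conservative path that sidesteps the failing grouped Conv2d bwd-data instance
    #else
        // original path
    #endif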
include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp

@@ -27,15 +27,12 @@ struct DeviceGroupedConvBwdWeight : public BaseOperator
     MakeArgumentPointer(const void* p_in,
                         void* p_wei,
                         const void* p_out,
-                        const ck::index_t G,
-                        const ck::index_t N,
-                        const ck::index_t K,
-                        const ck::index_t C,
-                        const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
-                        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
-                        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
-                        const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
-                        const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
+                        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
+                        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
+                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
+                        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
                         const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
                         const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
                         const std::array<ck::index_t, NDimSpatial>& input_left_pads,
    ...
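The abstract interface drops the scalar G/N/K/C arguments and the separate NDimSpatial-sized extent arrays in favor of full length/stride arrays ordered G/N/C/<spatial> for the input, G/K/C/<filter> for the weight, and G/N/K/<spatial> for the output. A hedged migration sketch for a 2-D caller — the stride arrays, element ops, and trailing parameters are assumed to be prepared as before, and `op` stands for any concrete DeviceGroupedConvBwdWeight instance:

    // hypothetical 2-D caller after this change (values illustrative)
    std::array<ck::index_t, 5> in_lengths {G, N, C, Hi, Wi}; // a_g_n_c_wis_lengths
    std::array<ck::index_t, 5> wei_lengths{G, K, C, Y,  X }; // b_g_k_c_xs_lengths
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; // e_g_n_k_wos_lengths
    auto arg_ptr = op.MakeArgumentPointer(p_in, p_wei, p_out,
                                          in_lengths,  in_strides,
                                          wei_lengths, wei_strides,
                                          out_lengths, out_strides,
                                          conv_filter_strides, conv_filter_dilations,
                                          input_left_pads, input_right_pads,
                                          in_element_op, wei_element_op, out_element_op,
                                          split_k);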
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp

@@ -784,15 +784,12 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
     Argument(const InDataType* p_in_grid,
              WeiDataType* p_wei_grid,
              const OutDataType* p_out_grid,
-             const ck::index_t G,
-             const ck::index_t N,
-             const ck::index_t K,
-             const ck::index_t C,
-             const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
-             const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
-             const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
-             const std::array<ck::index_t, NDimSpatial + 3>& /*input_strides*/,
-             const std::array<ck::index_t, NDimSpatial + 3>& /*output_strides*/,
+             const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
+             const std::array<index_t, NDimSpatial + 3>& /*a_g_n_c_wis_strides*/,
+             const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
+             const std::array<index_t, NDimSpatial + 3>& /*b_g_k_c_xs_strides*/,
+             const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
+             const std::array<index_t, NDimSpatial + 3>& /*e_g_n_k_wos_strides*/,
              const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
              const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
              const std::array<ck::index_t, NDimSpatial>& input_left_pads,
    ...

@@ -812,27 +809,38 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
           a_element_op_{out_element_op},
           b_element_op_{wei_element_op},
           c_element_op_{in_element_op},
-          Conv_G_{G},
-          Conv_N_{N},
-          Conv_K_{K},
-          Conv_C_{C},
-          input_spatial_lengths_{input_spatial_lengths},
-          filter_spatial_lengths_{filter_spatial_lengths},
-          output_spatial_lengths_{output_spatial_lengths},
+          Conv_G_{a_g_n_c_wis_lengths[0]},
+          Conv_N_{a_g_n_c_wis_lengths[1]},
+          Conv_K_{b_g_k_c_xs_lengths[1]},
+          Conv_C_{a_g_n_c_wis_lengths[2]},
+          input_spatial_lengths_{},
+          filter_spatial_lengths_{},
+          output_spatial_lengths_{},
           conv_filter_strides_{conv_filter_strides},
           conv_filter_dilations_{conv_filter_dilations},
           input_left_pads_{input_left_pads},
           input_right_pads_{input_right_pads},
           k_batch_{split_k}
     {
+        constexpr index_t spatial_offset = 3;
+        std::copy(begin(a_g_n_c_wis_lengths) + spatial_offset,
+                  end(a_g_n_c_wis_lengths),
+                  begin(input_spatial_lengths_));
+        std::copy(begin(b_g_k_c_xs_lengths) + spatial_offset,
+                  end(b_g_k_c_xs_lengths),
+                  begin(filter_spatial_lengths_));
+        std::copy(begin(e_g_n_k_wos_lengths) + spatial_offset,
+                  end(e_g_n_k_wos_lengths),
+                  begin(output_spatial_lengths_));
+
         const auto descs =
             DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<NDimSpatial>(
-                N,
-                K,
-                C,
-                input_spatial_lengths,
-                filter_spatial_lengths,
-                output_spatial_lengths,
+                Conv_N_,
+                Conv_K_,
+                Conv_C_,
+                input_spatial_lengths_,
+                filter_spatial_lengths_,
+                output_spatial_lengths_,
                 conv_filter_strides,
                 conv_filter_dilations,
                 input_left_pads,
    ...

@@ -856,21 +864,21 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
         // A/B/C Batch Stride
         compute_ptr_offset_of_batch_.BatchStrideA_ =
-            N * K * std::accumulate(begin(output_spatial_lengths),
-                                    end(output_spatial_lengths),
+            Conv_N_ * Conv_K_ * std::accumulate(begin(output_spatial_lengths_),
+                                                end(output_spatial_lengths_),
                                                 index_t{1},
                                                 std::multiplies<>{});
         compute_ptr_offset_of_batch_.BatchStrideB_ =
-            N * C * std::accumulate(begin(input_spatial_lengths),
-                                    end(input_spatial_lengths),
+            Conv_N_ * Conv_C_ * std::accumulate(begin(input_spatial_lengths_),
+                                                end(input_spatial_lengths_),
                                                 index_t{1},
                                                 std::multiplies<>{});
         compute_ptr_offset_of_batch_.BatchStrideC_ =
-            K * C * std::accumulate(begin(filter_spatial_lengths),
-                                    end(filter_spatial_lengths),
+            Conv_K_ * Conv_C_ * std::accumulate(begin(filter_spatial_lengths_),
+                                                end(filter_spatial_lengths_),
                                                 index_t{1},
                                                 std::multiplies<>{});
     }
    ...

@@ -904,9 +912,9 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
     const index_t Conv_K_;
     const index_t Conv_C_;
-    const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths_;
-    const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths_;
-    const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths_;
+    std::array<ck::index_t, NDimSpatial> input_spatial_lengths_;
+    std::array<ck::index_t, NDimSpatial> filter_spatial_lengths_;
+    std::array<ck::index_t, NDimSpatial> output_spatial_lengths_;
     const std::array<ck::index_t, NDimSpatial>& conv_filter_strides_;
     const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations_;
     const std::array<ck::index_t, NDimSpatial>& input_left_pads_;
    ...

@@ -1110,39 +1118,34 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
         return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
     }

-    static auto MakeArgument(const InDataType* p_in_grid,
-                             WeiDataType* p_wei_grid,
-                             const OutDataType* p_out_grid,
-                             const ck::index_t G,
-                             const ck::index_t N,
-                             const ck::index_t K,
-                             const ck::index_t C,
-                             const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
-                             const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
-                             const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
-                             const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
-                             const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
-                             const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
-                             const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
-                             const std::array<ck::index_t, NDimSpatial>& input_left_pads,
-                             const std::array<ck::index_t, NDimSpatial>& input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op,
-                             ck::index_t split_k)
+    static auto MakeArgument(const InDataType* p_in_grid,
+                             WeiDataType* p_wei_grid,
+                             const OutDataType* p_out_grid,
+                             const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
+                             const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                             const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
+                             const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                             const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
+                             const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
+                             const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+                             const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+                             const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+                             const std::array<ck::index_t, NDimSpatial>& input_right_pads,
+                             InElementwiseOperation in_element_op,
+                             WeiElementwiseOperation wei_element_op,
+                             OutElementwiseOperation out_element_op,
+                             ck::index_t split_k)
     {
         return Argument{p_in_grid,
                         p_wei_grid,
                         p_out_grid,
-                        G,
-                        N,
-                        K,
-                        C,
-                        input_spatial_lengths,
-                        filter_spatial_lengths,
-                        output_spatial_lengths,
-                        input_strides,
-                        output_strides,
+                        a_g_n_c_wis_lengths, // input
+                        a_g_n_c_wis_strides,
+                        b_g_k_c_xs_lengths, // weight
+                        b_g_k_c_xs_strides,
+                        e_g_n_k_wos_lengths, // output
+                        e_g_n_k_wos_strides,
                         conv_filter_strides,
                         conv_filter_dilations,
                         input_left_pads,
    ...

@@ -1159,15 +1162,12 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
     MakeArgumentPointer(const void* p_in_grid,
                         void* p_wei_grid,
                         const void* p_out_grid,
-                        const ck::index_t G,
-                        const ck::index_t N,
-                        const ck::index_t K,
-                        const ck::index_t C,
-                        const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
-                        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
-                        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
-                        const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
-                        const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
+                        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
+                        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
+                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
+                        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
                         const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
                         const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
                         const std::array<ck::index_t, NDimSpatial>& input_left_pads,
    ...

@@ -1180,15 +1180,12 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
         return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_grid),
                                           static_cast<WeiDataType*>(p_wei_grid),
                                           static_cast<const OutDataType*>(p_out_grid),
-                                          G,
-                                          N,
-                                          K,
-                                          C,
-                                          input_spatial_lengths,
-                                          filter_spatial_lengths,
-                                          output_spatial_lengths,
-                                          input_strides,
-                                          output_strides,
+                                          a_g_n_c_wis_lengths, // input
+                                          a_g_n_c_wis_strides,
+                                          b_g_k_c_xs_lengths, // weight
+                                          b_g_k_c_xs_strides,
+                                          e_g_n_k_wos_lengths, // output
+                                          e_g_n_k_wos_strides,
                                           conv_filter_strides,
                                           conv_filter_dilations,
                                           input_left_pads,
    ...
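Both device ops now own their spatial-length arrays by value (previously dangling-prone const references) and fill them from the packed [G, N, C, spatial...] layout, whose spatial extents start at index 3 — hence spatial_offset == 3. A self-contained sketch of that extraction (standalone toy, not CK code):

    #include <algorithm>
    #include <array>
    using index_t = int; // stand-in for ck::index_t
    int main()
    {
        constexpr index_t spatial_offset = 3;
        // a_g_n_c_wis_lengths for a 2-D conv: {G, N, C, Hi, Wi}
        std::array<index_t, 5> a_g_n_c_wis_lengths{2, 8, 64, 28, 28};
        std::array<index_t, 2> input_spatial_lengths_{};
        std::copy(begin(a_g_n_c_wis_lengths) + spatial_offset,
                  end(a_g_n_c_wis_lengths),
                  begin(input_spatial_lengths_)); // -> {28, 28}
    }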
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp

@@ -245,21 +245,10 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                        const ck::index_t K,
                        const std::array<ck::index_t, NDimSpatial + 3>& output_strides)
     {
-        if constexpr(is_GNHWK_GKYXC_GNHWC)
-        {
-            return make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K));
-        }
-        else if constexpr(is_NHWGK_GKYXC_NHWGC)
-        {
-            const index_t WoStride = output_strides[4];
-            const auto KStride     = Number<1>{};
-            return make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, K),
-                                                make_tuple(WoStride, KStride));
-        }
-        else
-        {
-            throw std::runtime_error("wrong! unsupported layout: " + OutLayout::name());
-        }
+        const index_t WoStride = output_strides[4];
+        const auto KStride     = Number<1>{};
+        return make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, K),
+                                            make_tuple(WoStride, KStride));
     }

     template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>

@@ -270,42 +259,36 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                        const ck::index_t C,
                        const std::array<ck::index_t, NDimSpatial + 3>& input_strides)
     {
-        if constexpr(is_GNHWK_GKYXC_GNHWC)
-        {
-            if constexpr(ConvBackwardWeightSpecialization ==
-                         ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
-            {
-                return make_naive_tensor_descriptor_packed(make_tuple(N * Hi * Wi, C));
-            }
-            else
-            {
-                return make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C));
-            }
-        }
-        else if constexpr(is_NHWGK_GKYXC_NHWGC)
-        {
-            const index_t NStride  = input_strides[1];
-            const index_t HiStride = input_strides[3];
-            const index_t WiStride = input_strides[4];
-            const auto CStride     = input_strides[2];
-            if constexpr(ConvBackwardWeightSpecialization ==
-                         ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
-            {
-                return make_naive_tensor_descriptor(make_tuple(N * Hi * Wi, C),
-                                                    make_tuple(WiStride, CStride));
-            }
-            else
-            {
-                return make_naive_tensor_descriptor(
-                    make_tuple(N, Hi, Wi, C), make_tuple(NStride, HiStride, WiStride, CStride));
-            }
-        }
-        else
-        {
-            throw std::runtime_error("wrong! unsupported layout: " + InLayout::name());
-        }
+        const index_t NStride  = input_strides[1];
+        const index_t HiStride = input_strides[3];
+        const index_t WiStride = input_strides[4];
+        const auto CStride     = input_strides[2];
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            return make_naive_tensor_descriptor(make_tuple(N * Hi * Wi, C),
+                                                make_tuple(WiStride, CStride));
+        }
+        else
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(N, Hi, Wi, C), make_tuple(NStride, HiStride, WiStride, CStride));
+        }
     }

+    template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
+    constexpr static auto
+    make_wei_grid_desc(const ck::index_t K,
+                       const ck::index_t Y,
+                       const ck::index_t X,
+                       const ck::index_t C,
+                       const std::array<ck::index_t, NDimSpatial + 3>& weights_strides)
+    {
+        const auto CStride = Number<1>{};
+        const auto KStride = weights_strides[1];
+        return make_naive_tensor_descriptor(make_tuple(K, Y * X * C),
+                                            make_tuple(KStride, CStride));
+    }
+
     template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>
     constexpr static auto make_out_grid_desc(const ck::index_t N,
    ...

@@ -315,21 +298,10 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                        const ck::index_t K,
                        const std::array<ck::index_t, NDimSpatial + 3>& output_strides)
     {
-        if constexpr(is_GNDHWK_GKZYXC_GNDHWC)
-        {
-            return make_naive_tensor_descriptor_packed(make_tuple(N * Do * Ho * Wo, K));
-        }
-        else if constexpr(is_NDHWGK_GKZYXC_NDHWGC)
-        {
-            const index_t WoStride = output_strides[5];
-            const auto KStride     = Number<1>{};
-            return make_naive_tensor_descriptor(make_tuple(N * Do * Ho * Wo, K),
-                                                make_tuple(WoStride, KStride));
-        }
-        else
-        {
-            throw std::runtime_error("wrong! unsupported layout: " + OutLayout::name());
-        }
+        const index_t WoStride = output_strides[5];
+        const auto KStride     = Number<1>{};
+        return make_naive_tensor_descriptor(make_tuple(N * Do * Ho * Wo, K),
+                                            make_tuple(WoStride, KStride));
     }

     template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>

@@ -341,44 +313,40 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                        const ck::index_t C,
                        const std::array<ck::index_t, NDimSpatial + 3>& input_strides)
     {
-        if constexpr(is_GNDHWK_GKZYXC_GNDHWC)
-        {
-            if constexpr(ConvBackwardWeightSpecialization ==
-                         ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
-            {
-                return make_naive_tensor_descriptor_packed(make_tuple(N * Di * Hi * Wi, C));
-            }
-            else
-            {
-                return make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
-            }
-        }
-        else if constexpr(is_NDHWGK_GKZYXC_NDHWGC)
-        {
-            const index_t NStride  = input_strides[1];
-            const index_t DiStride = input_strides[3];
-            const index_t HiStride = input_strides[4];
-            const index_t WiStride = input_strides[5];
-            const auto CStride     = input_strides[2];
-            if constexpr(ConvBackwardWeightSpecialization ==
-                         ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
-            {
-                return make_naive_tensor_descriptor(make_tuple(N * Di * Hi * Wi, C),
-                                                    make_tuple(WiStride, CStride));
-            }
-            else
-            {
-                return make_naive_tensor_descriptor(
-                    make_tuple(N, Di, Hi, Wi, C),
-                    make_tuple(NStride, DiStride, HiStride, WiStride, CStride));
-            }
-        }
-        else
-        {
-            throw std::runtime_error("wrong! unsupported layout: " + InLayout::name());
-        }
+        const index_t NStride  = input_strides[1];
+        const index_t DiStride = input_strides[3];
+        const index_t HiStride = input_strides[4];
+        const index_t WiStride = input_strides[5];
+        const auto CStride     = input_strides[2];
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            return make_naive_tensor_descriptor(make_tuple(N * Di * Hi * Wi, C),
+                                                make_tuple(WiStride, CStride));
+        }
+        else
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(N, Di, Hi, Wi, C),
+                make_tuple(NStride, DiStride, HiStride, WiStride, CStride));
+        }
     }

+    template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>
+    constexpr static auto
+    make_wei_grid_desc(const ck::index_t K,
+                       const ck::index_t Z,
+                       const ck::index_t Y,
+                       const ck::index_t X,
+                       const ck::index_t C,
+                       const std::array<ck::index_t, NDimSpatial + 3>& weights_strides)
+    {
+        const auto CStride = Number<1>{};
+        const auto KStride = weights_strides[1];
+        return make_naive_tensor_descriptor(make_tuple(K, Z * Y * X * C),
+                                            make_tuple(KStride, CStride));
+    }
+
     template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false>
     static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(const ck::index_t N,
    ...

@@ -388,6 +356,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
         const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
         const std::array<ck::index_t, NDimSpatial + 3>& /* input_strides */,
+        const std::array<ck::index_t, NDimSpatial + 3>& /* weights_strides */,
         const std::array<ck::index_t, NDimSpatial + 3>& /* output_strides */,
         const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
         const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
    ...

@@ -542,6 +511,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
         const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
         const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
+        const std::array<ck::index_t, NDimSpatial + 3>& weights_strides,
         const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
         const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
         const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
    ...

@@ -584,6 +554,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         const auto out_grid_desc = make_out_grid_desc<NDim>(N, Ho, Wo, K, output_strides);
         const auto in_grid_desc  = make_in_grid_desc<NDim>(N, Hi, Wi, C, input_strides);
+        const auto wei_grid_desc = make_wei_grid_desc<NDim>(K, Y, X, C, weights_strides);

         if constexpr(ConvBackwardWeightSpecialization ==
                      ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
    ...

@@ -618,13 +589,9 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));

-            // C: weight tensor
-            const auto wei_gemmm_gemmn_grid_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C));
-
             return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
                               in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                              wei_gemmm_gemmn_grid_desc);
+                              wei_grid_desc);
         }
         else
         {
    ...

@@ -684,13 +651,9 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));

-            // C: weight tensor
-            const auto wei_gemmm_gemmn_grid_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C));
-
             return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
                               in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                              wei_gemmm_gemmn_grid_desc);
+                              wei_grid_desc);
         }
     }
    ...

@@ -703,6 +666,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
         const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
         const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
+        const std::array<ck::index_t, NDimSpatial + 3>& weights_strides,
         const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
         const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
         const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
    ...

@@ -752,6 +716,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         const auto out_grid_desc = make_out_grid_desc<NDim>(N, Do, Ho, Wo, K, output_strides);
         const auto in_grid_desc  = make_in_grid_desc<NDim>(N, Di, Hi, Wi, C, input_strides);
+        const auto wei_grid_desc = make_wei_grid_desc<NDim>(K, Z, Y, X, C, weights_strides);

         if constexpr(ConvBackwardWeightSpecialization ==
                      ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
    ...

@@ -786,13 +751,9 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));

-            // C: weight tensor
-            const auto wei_gemmm_gemmn_grid_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(K, Z * Y * X * C));
-
             return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
                               in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                              wei_gemmm_gemmn_grid_desc);
+                              wei_grid_desc);
         }
         else
         {
    ...

@@ -861,13 +822,9 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));

-            // C: weight tensor
-            const auto wei_gemmm_gemmn_grid_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(K, Z * Y * X * C));
-
             return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
                               in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                              wei_gemmm_gemmn_grid_desc);
+                              wei_grid_desc);
         }
     }
     // function end
    ...

@@ -887,6 +844,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             lengths,
             strides,
+            strides,
             strides,
             params,
             params,
             params,
    ...

@@ -910,6 +868,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             lengths,
             strides,
+            strides,
             strides,
             params,
             params,
             params,
    ...

@@ -933,6 +892,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             lengths,
             strides,
+            strides,
             strides,
             params,
             params,
             params,
    ...

@@ -1051,15 +1011,12 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
     Argument(const InDataType* p_in_grid,
              WeiDataType* p_wei_grid,
              const OutDataType* p_out_grid,
-             const ck::index_t G,
-             const ck::index_t N,
-             const ck::index_t K,
-             const ck::index_t C,
-             const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
-             const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
-             const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
-             const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
-             const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
+             const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
+             const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+             const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
+             const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+             const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
+             const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
              const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
              const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
              const std::array<ck::index_t, NDimSpatial>& input_left_pads,
    ...

@@ -1084,27 +1041,40 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
           a_element_op_{out_element_op},
           b_element_op_{in_element_op},
           c_element_op_{wei_element_op},
-          Conv_G_{G},
-          Conv_N_{N},
-          Conv_K_{K},
-          Conv_C_{C},
-          output_spatial_lengths_{output_spatial_lengths},
-          filter_spatial_lengths_{filter_spatial_lengths},
+          Conv_G_{a_g_n_c_wis_lengths[0]},
+          Conv_N_{a_g_n_c_wis_lengths[1]},
+          Conv_K_{b_g_k_c_xs_lengths[1]},
+          Conv_C_{a_g_n_c_wis_lengths[2]},
+          input_spatial_lengths_{},
+          filter_spatial_lengths_{},
+          output_spatial_lengths_{},
           conv_filter_strides_{conv_filter_strides},
          input_left_pads_{input_left_pads},
           input_right_pads_{input_right_pads},
           k_batch_{split_k}
     {
+        constexpr index_t spatial_offset = 3;
+        std::copy(begin(a_g_n_c_wis_lengths) + spatial_offset,
+                  end(a_g_n_c_wis_lengths),
+                  begin(input_spatial_lengths_));
+        std::copy(begin(b_g_k_c_xs_lengths) + spatial_offset,
+                  end(b_g_k_c_xs_lengths),
+                  begin(filter_spatial_lengths_));
+        std::copy(begin(e_g_n_k_wos_lengths) + spatial_offset,
+                  end(e_g_n_k_wos_lengths),
+                  begin(output_spatial_lengths_));
+
         const auto descs = DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<NDimSpatial>(
-            N,
-            K,
-            C,
-            input_spatial_lengths,
-            filter_spatial_lengths,
-            output_spatial_lengths,
-            input_strides,
-            output_strides,
+            Conv_N_,
+            Conv_K_,
+            Conv_C_,
+            input_spatial_lengths_,
+            filter_spatial_lengths_,
+            output_spatial_lengths_,
+            a_g_n_c_wis_strides,
+            b_g_k_c_xs_strides,
+            e_g_n_k_wos_strides,
             conv_filter_strides,
             conv_filter_dilations,
             input_left_pads,
    ...

@@ -1119,12 +1089,12 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_);

         // A/B/C Batch Stride
-        compute_ptr_offset_of_batch_.BatchStrideA_ = output_strides[0];
-        compute_ptr_offset_of_batch_.BatchStrideB_ = input_strides[0];
+        compute_ptr_offset_of_batch_.BatchStrideA_ = e_g_n_k_wos_strides[0];
+        compute_ptr_offset_of_batch_.BatchStrideB_ = a_g_n_c_wis_strides[0];
         compute_ptr_offset_of_batch_.BatchStrideC_ =
-            K * C * std::accumulate(begin(filter_spatial_lengths),
-                                    end(filter_spatial_lengths),
+            Conv_K_ * Conv_C_ * std::accumulate(begin(filter_spatial_lengths_),
+                                                end(filter_spatial_lengths_),
                                                 index_t{1},
                                                 std::multiplies<>{});
    ...

@@ -1163,8 +1133,9 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         const index_t Conv_N_;
         const index_t Conv_K_;
         const index_t Conv_C_;
-        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths_;
-        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths_;
+        std::array<ck::index_t, NDimSpatial> input_spatial_lengths_;
+        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths_;
+        std::array<ck::index_t, NDimSpatial> output_spatial_lengths_;
         const std::array<ck::index_t, NDimSpatial>& conv_filter_strides_;
         const std::array<ck::index_t, NDimSpatial>& input_left_pads_;
         const std::array<ck::index_t, NDimSpatial>& input_right_pads_;
    ...

@@ -1339,39 +1310,34 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
     }

-    static auto MakeArgument(const InDataType* p_in_grid,
-                             WeiDataType* p_wei_grid,
-                             const OutDataType* p_out_grid,
-                             const ck::index_t G,
-                             const ck::index_t N,
-                             const ck::index_t K,
-                             const ck::index_t C,
-                             const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
-                             const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
-                             const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
-                             const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
-                             const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
-                             const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
-                             const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
-                             const std::array<ck::index_t, NDimSpatial>& input_left_pads,
-                             const std::array<ck::index_t, NDimSpatial>& input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op,
-                             const ck::index_t split_k)
+    static auto MakeArgument(const InDataType* p_in_grid,
+                             WeiDataType* p_wei_grid,
+                             const OutDataType* p_out_grid,
+                             const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
+                             const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                             const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
+                             const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                             const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
+                             const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
+                             const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+                             const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+                             const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+                             const std::array<ck::index_t, NDimSpatial>& input_right_pads,
+                             InElementwiseOperation in_element_op,
+                             WeiElementwiseOperation wei_element_op,
+                             OutElementwiseOperation out_element_op,
+                             const ck::index_t split_k)
     {
         return Argument{p_in_grid,
                         p_wei_grid,
                         p_out_grid,
-                        G,
-                        N,
-                        K,
-                        C,
-                        input_spatial_lengths,
-                        filter_spatial_lengths,
-                        output_spatial_lengths,
-                        input_strides,
-                        output_strides,
+                        a_g_n_c_wis_lengths, // input
+                        a_g_n_c_wis_strides,
+                        b_g_k_c_xs_lengths, // weight
+                        b_g_k_c_xs_strides,
+                        e_g_n_k_wos_lengths, // output
+                        e_g_n_k_wos_strides,
                         conv_filter_strides,
                         conv_filter_dilations,
                         input_left_pads,
    ...

@@ -1390,15 +1356,12 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
     MakeArgumentPointer(const void* p_in_grid,
                         void* p_wei_grid,
                         const void* p_out_grid,
-                        const ck::index_t G,
-                        const ck::index_t N,
-                        const ck::index_t K,
-                        const ck::index_t C,
-                        const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
-                        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
-                        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
-                        const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
-                        const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
+                        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, // input
+                        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths, // weight
+                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths, // output
+                        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
                         const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
                         const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
                         const std::array<ck::index_t, NDimSpatial>& input_left_pads,
    ...

@@ -1411,15 +1374,12 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_grid),
                                           static_cast<WeiDataType*>(p_wei_grid),
                                           static_cast<const OutDataType*>(p_out_grid),
-                                          G,
-                                          N,
-                                          K,
-                                          C,
-                                          input_spatial_lengths,
-                                          filter_spatial_lengths,
-                                          output_spatial_lengths,
-                                          input_strides,
-                                          output_strides,
+                                          a_g_n_c_wis_lengths, // input
+                                          a_g_n_c_wis_strides,
+                                          b_g_k_c_xs_lengths, // weight
+                                          b_g_k_c_xs_strides,
+                                          e_g_n_k_wos_lengths, // output
+                                          e_g_n_k_wos_strides,
                                           conv_filter_strides,
                                           conv_filter_dilations,
                                           input_left_pads,
    ...
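The recurring replacement of wei_gemmm_gemmn_grid_desc with wei_grid_desc is the core of this file's change: the weight descriptor used to be rebuilt as a packed one, discarding the caller's strides, while the new make_wei_grid_desc helper builds it once from weights_strides. A hedged sketch of the difference (fragment, CK helpers assumed in scope; not a complete translation unit):

    // before: packed descriptor, layout assumed contiguous
    const auto wei_packed = make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C));
    // after: strides come from the caller-provided weights_strides array,
    // so the descriptor honors the tensor's actual memory layout
    const auto wei_strided = make_naive_tensor_descriptor(
        make_tuple(K, Y * X * C), make_tuple(weights_strides[1], Number<1>{}));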
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp

@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

+#ifdef __bf16__
 void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemm<Col, Row, Row, BF16, BF16, BF16,
                                                   PassThrough, PassThrough, PassThrough>>>&
    ...

@@ -36,7 +36,8 @@ void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemm<Row, Col, Row, BF16, BF16, BF16,
                                                   PassThrough, PassThrough, PassThrough>>>& instances);
+#endif

+#ifdef __fp16__
 void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemm<Col, Row, Row, F16, F16, F16,
                                                   PassThrough, PassThrough, PassThrough>>>&
    ...

@@ -56,7 +57,8 @@ void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemm<Row, Col, Row, F16, F16, F16,
                                                   PassThrough, PassThrough, PassThrough>>>& instances);
+#endif

+#ifdef __fp32__
 void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemm<Col, Row, Row, F32, F32, F32,
                                                   PassThrough, PassThrough, PassThrough>>>&
    ...

@@ -76,7 +78,8 @@ void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemm<Row, Col, Row, F32, F32, F32,
                                                   PassThrough, PassThrough, PassThrough>>>& instances);
+#endif

+#ifdef __int8__
 void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemm<Col, Row,
    ...

@@ -120,7 +123,7 @@ void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances(
     PassThrough, PassThrough, PassThrough>>>& instances);
+#endif

 template <typename ALayout,
           typename BLayout,
           typename CLayout,
    ...

@@ -151,7 +154,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef __fp32__
         if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
                      is_same_v<CDataType, float>)
         {
    ...

@@ -176,8 +179,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                 add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances(op_ptrs);
             }
         }
-        else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
-                          is_same_v<CDataType, half_t>)
+#endif
+#ifdef __fp16__
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<CDataType, half_t>)
         {
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                          is_same_v<CLayout, Row>)
    ...

@@ -200,8 +205,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                 add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(op_ptrs);
             }
         }
-        else if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
-                          is_same_v<CDataType, bhalf_t>)
+#endif
+#ifdef __bf16__
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
+                     is_same_v<CDataType, bhalf_t>)
         {
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                          is_same_v<CLayout, Row>)
    ...

@@ -224,8 +231,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                 add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(op_ptrs);
             }
         }
-        else if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
-                          is_same_v<CDataType, int8_t>)
+#endif
+#ifdef __int8__
+        if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
+                     is_same_v<CDataType, int8_t>)
         {
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                          is_same_v<CLayout, Row>)
    ...

@@ -248,7 +257,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                 add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances(op_ptrs);
             }
         }
+#endif
         return op_ptrs;
     }
 };
    ...
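The same guard pattern recurs in the instance headers below: every add_device_* declaration and every if constexpr branch of the factory is fenced by a dtype macro, so a DTYPES-restricted build does not reference instance libraries that were never compiled. A minimal sketch of the shape of the pattern (names abbreviated, not the verbatim header):

    // sketch: this factory branch disappears entirely when fp16 support is compiled out
    std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
    #ifdef __fp16__
    if constexpr(is_same_v<ADataType, half_t>)
    {
        // add f16 instances to op_ptrs
    }
    #endif
    return op_ptrs;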
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp

@@ -14,7 +14,7 @@
 using CDE0ElementOp = ck::tensor_operation::element_wise::AddRelu;
 using CDE1ElementOp = ck::tensor_operation::element_wise::Add;

+#ifdef __fp16__
 namespace ck {
 namespace tensor_operation {
 namespace device {
    ...

@@ -137,3 +137,4 @@ struct DeviceOperationInstanceFactory<
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp

@@ -13,7 +13,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"

+#ifdef __fp16__
 namespace ck {
 namespace tensor_operation {
 namespace device {
    ...

@@ -91,3 +91,4 @@ struct DeviceOperationInstanceFactory<
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp

@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

+#ifdef __fp16__
 void add_device_batched_gemm_bias_masking_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemmSoftmaxGemmPermute<2,
    ...

@@ -58,7 +58,8 @@ void add_device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_
     PassThrough, MaskingSpecialization::MaskDisabled>>>& instances);
+#endif

+#ifdef __bf16__
 void add_device_batched_gemm_bias_masking_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemmSoftmaxGemmPermute<2,
    ...

@@ -100,7 +101,7 @@ void add_device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf
     PassThrough, MaskingSpecialization::MaskDisabled>>>& instances);
+#endif

 template <typename ADataType,
           typename B0DataType,
           typename B1DataType,
    ...

@@ -147,7 +148,7 @@ struct DeviceOperationInstanceFactory<
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef __fp16__
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<B0DataType, half_t> &&
                      is_same_v<B1DataType, half_t> && is_same_v<CDataType, half_t> &&
                      Acc0BiasDataType::Size() == 1 &&
    ...

@@ -164,6 +165,8 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
+#endif
+#ifdef __bf16__
         else if constexpr(is_same_v<ADataType, BF16> && is_same_v<B0DataType, BF16> &&
                           is_same_v<B1DataType, BF16> && is_same_v<CDataType, BF16> &&
                           Acc0BiasDataType::Size() == 1 &&
    ...

@@ -180,6 +183,7 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
+#endif
         return op_ptrs;
     }
 };
    ...
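For reference, factories like the ones above are consumed roughly as follows elsewhere in CK (a sketch; the exact template arguments depend on the operation, shown here for the plain batched GEMM header):

    // hypothetical client code: with a DTYPES-restricted build, dtypes that
    // were compiled out simply contribute no instances to the returned vector.
    using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemm<
        Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>;
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();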
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp

@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

+#ifdef __fp16__
 void add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
     std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row, Col,
    ...

@@ -111,3 +111,4 @@ struct DeviceOperationInstanceFactory<
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp

@@ -19,7 +19,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

+#ifdef __fp16__
 void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col, Row,
    ...

@@ -123,7 +123,8 @@ void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instan
     PassThrough, PassThrough, PassThrough>>>& instances);
+#endif

+#ifdef __int8__
 void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col, Row,
    ...

@@ -227,7 +228,7 @@ void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances
     PassThrough, PassThrough, PassThrough>>>& instances);
+#endif

 template <typename ALayout,
           typename BLayout,
           typename ELayout,
    ...

@@ -262,7 +263,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef __fp16__
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                      is_same_v<EDataType, half_t>)
         {
    ...

@@ -295,6 +296,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                     op_ptrs);
             }
         }
+#endif
+#ifdef __int8__
         else if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
                           is_same_v<EDataType, int8_t>)
         {
    ...

@@ -327,7 +330,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                     op_ptrs);
             }
         }
+#endif
         return op_ptrs;
     }
 };
    ...
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp

@@ -11,7 +11,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"

+#ifdef __fp16__
 namespace ck {
 namespace tensor_operation {
 namespace device {
    ...

@@ -119,3 +119,4 @@ struct DeviceOperationInstanceFactory<
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif