Merge pull request #9 from ROCmSoftwarePlatform/mergedown-from-public

Merge down from public repo

Merge pull request #9 from ROCmSoftwarePlatform/mergedown-from-public
Merge down from public repo
aaf4defa · Illia Silin · GitHub · e0c2a70f · e51806cc · aaf4defa
Unverified Commit aaf4defa authored Apr 19, 2023 by Illia Silin Committed by GitHub Apr 19, 2023
20 changed files
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd(bool do_verification,
+                          bool time_kernel,
+                          const ck::utils::conv::ConvParam& conv_param,
+                          const HostTensorDescriptor& in_g_n_c_wis_desc,
+                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                          const HostTensorDescriptor& requant_scale_g_k_desc,
+                          const HostTensorDescriptor& out_g_n_k_wos_desc,
+                          const InElementOp& in_element_op,
+                          const WeiElementOp& wei_element_op,
+                          const OutElementOp& out_element_op)
+{
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<RequantScaleDataType> requant_scale(requant_scale_g_k_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "requant_scale: " << requant_scale.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+    in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-128, 127});
+    wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-128, 127});
+    requant_scale.GenerateTensorValue(GeneratorTensor_2<RequantScaleDataType>{0, 1});
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem requant_scale_device_buf(sizeof(RequantScaleDataType) *
+                                       requant_scale.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    requant_scale_device_buf.ToDevice(requant_scale.mData.data());
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(requant_scale_g_k_desc.GetLengths(), d0_g_n_k_wos_lengths);
+    copy(requant_scale_g_k_desc.GetStrides(), d0_g_n_k_wos_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+    // do Conv
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                                      wei_device_buf.GetDeviceBuffer(),
+                                      {requant_scale_device_buf.GetDeviceBuffer()},
+                                      out_device_buf.GetDeviceBuffer(),
+                                      a_g_n_c_wis_lengths,
+                                      a_g_n_c_wis_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      {d0_g_n_k_wos_lengths},
+                                      {d0_g_n_k_wos_strides},
+                                      e_g_n_k_wos_lengths,
+                                      e_g_n_k_wos_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      in_element_op,
+                                      wei_element_op,
+                                      out_element_op);
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    std::size_t flop      = conv_param.GetFlops();
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+    bool pass = true;
+    if(do_verification)
+    {
+        Tensor<AccDataType> c_host(out_g_n_k_wos_desc);
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     AccDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     PassThrough>();
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  c_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  PassThrough{});
+        ref_invoker.Run(ref_argument);
+        // TODO: implement elementwise operation for host
+        out_host.ForEach([&](auto&, auto idx) {
+            out_element_op(out_host(idx), c_host(idx), requant_scale(idx));
+        });
+        out_device_buf.FromDevice(out_device.mData.data());
+        pass &=
+            ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
+    }
+    return (pass ? 0 : 1);
+}
+int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op)
+{
+    bool do_verification           = true;
+    bool time_kernel               = true;
+    const ck::index_t ndim_spatial = 2;
+    ck::utils::conv::ConvParam conv_param{
+        ndim_spatial, // n_dim
+        1,            // group
+        4,            // batch
+        64,           // output channels
+        192,          // input chanels
+        {3, 3},       // weight HW
+        {71, 71},     // x HW
+        {2, 2},       // strides
+        {1, 1},       // dilations
+        {1, 1},       // left_pads
+        {1, 1}        // right_pads
+    };
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    using InLayout           = ck::tensor_layout::convolution::GNHWC;
+    using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
+    using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
+    using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+    const auto requant_scale_g_k_desc =
+        HostTensorDescriptor({conv_param.G_,
+                              conv_param.N_,
+                              conv_param.K_,
+                              conv_param.output_spatial_lengths_[0],
+                              conv_param.output_spatial_lengths_[1]},
+                             {
+                                 conv_param.K_, // g
+                                 0,             // n
+                                 1,             // k
+                                 0,             // ho
+                                 0              // wo
+                             });
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+    using deviceOp = DeviceGroupedConvNDFwdInstance<ndim_spatial,
+                                                    InLayout,
+                                                    WeiLayout,
+                                                    RequantScaleLayout,
+                                                    OutLayout>;
+    return run_grouped_conv_fwd<ndim_spatial,
+                                InDataType,
+                                WeiDataType,
+                                OutDataType,
+                                InElementOp,
+                                WeiElementOp,
+                                OutElementOp,
+                                deviceOp>(do_verification,
+                                          time_kernel,
+                                          conv_param,
+                                          in_g_n_c_wis_desc,
+                                          wei_g_k_c_xs_desc,
+                                          requant_scale_g_k_desc,
+                                          out_g_n_k_wos_desc,
+                                          in_element_op,
+                                          wei_element_op,
+                                          out_element_op);
+}
--- a/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+++ b/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
+#pragma once
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/utility/algorithm.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
-using InDataType       = int8_t;
-using WeiDataType      = int8_t;
-using AccDataType      = int32_t;
-using CShuffleDataType = int32_t;
-using OutDataType      = int8_t;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
-using InElementOp  = PassThrough;
-using WeiElementOp = PassThrough;
-using ActivationOp = PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
-static constexpr auto ConvSpec =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
-using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
-        NDimSpatial,
-        InLayout,
-        WeiLayout,
-        ck::Tuple<>,
-        OutLayout,
-        InDataType,
-        WeiDataType,
-        AccDataType,
-        CShuffleDataType,
-        ck::Tuple<>,
-        OutDataType,
-        InElementOp,
-        WeiElementOp,
-        OutElementOp,
-        ConvSpec,    // ConvForwardSpecialization
-        GemmSpec,    // GemmSpecialization
-        1,           //
-        256,         // BlockSize
-        128,         // MPerBlock
-        256,         // NPerBlock
-        64,          // KPerBlock
-        16,          // AK1
-        16,          // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
-        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
-        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
-        2,           // ABlockTransferSrcVectorDim
-        16,          // ABlockTransferSrcScalarPerVector
-        16,          // ABlockTransferDstScalarPerVector_AK1
-        1,           // ABlockLdsExtraM
-        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
-        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
-        2,           // BBlockTransferSrcVectorDim
-        16,          // BBlockTransferSrcScalarPerVector
-        16,          // BBlockTransferDstScalarPerVector_BK1
-        1,           // BBlockLdsExtraN
-        1,
-        1,
-        S<1, 64, 1, 4>,
-        16>;
 template <ck::index_t NDimSpatial,
          typename InDataType,
@@ -221,10 +139,10 @@ bool run_grouped_conv_fwd(bool do_verification,
    return (pass ? 0 : 1);
 }
-int main()
+int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op)
 {
    bool do_verification           = true;
-    bool time_kernel               = true;
+    bool time_kernel               = false;
    const ck::index_t ndim_spatial = 2;
    ck::utils::conv::ConvParam conv_param{
@@ -232,7 +150,7 @@ int main()
        1,            // group
        4,            // batch
        64,           // output channels
-        32,           // input chanels
+        192,          // input chanels
        {3, 3},       // weight HW
        {71, 71},     // x HW
        {2, 2},       // strides
@@ -243,7 +161,6 @@ int main()
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
    using InLayout  = ck::tensor_layout::convolution::GNHWC;
    using WeiLayout = ck::tensor_layout::convolution::GKYXC;

--- a/example/44_conv2d_fwd_quantization/CMakeLists.txt
+++ b/example/44_conv2d_fwd_quantization/CMakeLists.txt
-add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -27,14 +27,6 @@
 #define CK_WAVELET_MIN_BLOCK_PER_CU 2
 #endif
-// check GPU target
-#ifdef __HIP_DEVICE_COMPILE__
-#if !(defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
-      defined(__gfx90a__) || defined(__gfx1030__) || defined(__gfx1100__) || defined(__gfx940__))
-#error Not supported target
-#endif
-#endif
 // buffer resource
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_BUFFER_RESOURCE_3RD_DWORD -1
@@ -43,8 +35,8 @@
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
 #elif defined(__gfx1030__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
-#elif defined(__gfx1100__) // for GPU code
+#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
-#define CK_BUFFER_RESOURCE_3RD_DWORD 0x10020000
+#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31004000
 #endif
 // FMA instruction
@@ -76,7 +68,7 @@
 // WMMA instruction
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_USE_AMD_WMMA
-#elif defined(__gfx1100__) // for GPU code
+#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
 #define CK_USE_AMD_WMMA
 #endif
@@ -172,6 +164,11 @@
 // tuning parameter
 #define CK_WORKAROUND_SWDEV_325164 0
+// workaround: compiler not emiting reciprocal instruction frm __frcp_rn()
+#define CK_WORKAROUND_SWDEV_383542 1
+// workaround: compiler issue on gfx908
+#define CK_WORKAROUND_SWDEV_388832 1
 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -622,11 +622,16 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
    }
 };
-// Blockwise gemm supporting
+/**
-// 1. regular XDL output M2_M3_M4_M2 and transposed XDL output M2_N2_N3_N4
+ * @brief Blockwise gemm
-// 2. decoupled input tile descriptor and mma tile descriptor in order to support both vgpr and LDS
+ *
-// source buffer
+ * Supports
-// 3. configurable k index starting position and step size after each FMA/XDL instruction
+ * 1. regular XDL output M2_M3_M4_M2 and transposed XDL output M2_N2_N3_N4
+ * 2. decoupled input tile descriptor and mma tile descriptor in order to support both vgpr and LDS
+ * source buffer
+ * 3. configurable k index starting position and step size after each FMA/XDL instruction
+ */
 template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,

--- a/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
@@ -12,6 +12,16 @@
 namespace ck {
+/**
+ * @brief Blockwise softmax
+ *
+ * @tparam BlockSize Block size
+ * @tparam AccDataType Accumulator data type
+ * @tparam ThreadMap_M_K Thread id to m_k
+ * @tparam ThreadClusterDesc_M_K Threadwise cluster descriptor
+ * @tparam ThreadSliceDesc_M_K Threadwise slices descriptor
+ * @tparam IgnoreNaN Flag to ignore NaN, false by default
+ */
 template <index_t BlockSize,
          typename AccDataType,
          typename ThreadMap_M_K, // thread_id to m_k

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
@@ -11,10 +11,15 @@
 namespace ck {
-// this version does following things to avoid scratch memory issue
+/**
-// 1. Use StaticallyIndexedArray instead of C array for thread buffer
+ * @brief Blockwise data transfer
-// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
+ *
-// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
+ * This version does following things to avoid scratch memory issue
+ * 1. Use StaticallyIndexedArray instead of C array for thread buffer
+ * 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
+ * 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
+ *
+ */
 template <typename ThreadGroup,
          typename SrcElementwiseOperation,
          typename DstElementwiseOperation,

--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
@@ -134,7 +134,8 @@ __global__ void
            const Block2CTileMap block_2_ctile_map,
            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
+    defined(__gfx90a__) || defined(__gfx908__))
    // offset base pointer for each work-group
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -314,9 +315,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
        const auto in_gemmm_gemmk_desc =
            matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc);
-        const auto M = in_gemmm_gemmk_desc.GetLength(I0);
+        const auto M   = in_gemmm_gemmk_desc.GetLength(I0);
-        const auto K = in_gemmm_gemmk_desc.GetLength(I1);
+        const auto K   = in_gemmm_gemmk_desc.GetLength(I1);
        const auto AK0 = K / K1;
        return transform_tensor_descriptor(
@@ -709,7 +709,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
        namespace ctc = tensor_layout::convolution;
        // check device
-        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030"))
+        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
+             ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908"))
        {
            return false;
        }
@@ -834,6 +835,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
        {
            return false;
        }
        // check Gridwise GEMM
        return GridwiseGemm::CheckValidity(
            arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.e_grid_desc_m_n_);
@@ -946,7 +948,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
            << MPerBlock << ", "
            << NPerBlock << ", "
            << K0PerBlock << ", "
-            << getConvForwardSpecializationString(ConvForwardSpecialization)
+            << getConvForwardSpecializationString(ConvForwardSpecialization) << ", "
+            << K1
            << ">";
        // clang-format on

--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
@@ -824,7 +824,19 @@ struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd<NDimS
            << MPerBlock << ", "
            << NPerBlock << ", "
            << K0PerBlock << ", "
-            << getConvForwardSpecializationString(ConvForwardSpecialization)
+            << getConvForwardSpecializationString(ConvForwardSpecialization) << ", "
+            << K1 << ", "
+            << MPerXDL << ", "
+            << NPerXDL << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << ABlockTransferDstScalarPerVector_K1 << ", "
+            << BBlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferDstScalarPerVector_K1 << ", "
+            << CShuffleMXdlPerWavePerShuffle << ", "
+            << CShuffleNXdlPerWavePerShuffle << ", "
+            << CBlockTransferScalarPerVector_NWaveNPerXdl
            << ">";
        // clang-format on

--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -866,7 +866,16 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
            << "B0Spec" << getTensorSpecializationString(BSpec) << ", "
            << "B1Spec" << getTensorSpecializationString(B1Spec) << ", "
            << "CSpec" << getTensorSpecializationString(CSpec) << ", "
-            << getMaskingSpecializationString(MaskingSpec) << ">";
+            << getMaskingSpecializationString(MaskingSpec) << ", "
+            << MPerXDL << ", "
+            << NPerXDL << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferSrcScalarPerVector << ", "
+            << CShuffleMXdlPerWavePerShuffle << ", "
+            << CShuffleNXdlPerWavePerShuffle
+            << ">";
        // clang-format on
        return str.str();

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
@@ -770,7 +770,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
    static bool IsSupportedArgument(const Argument& arg)
    {
-        if(ck::get_device_name() == "gfx1100")
+        if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
+           ck::get_device_name() == "gfx1102")
        {
            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
            {

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
@@ -789,6 +789,20 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
        return true;
    }
+    // check if DsLayout is supported
+    template <typename RefLayout, typename DsLayout, const index_t NumDTensor>
+    static bool CheckDLayout()
+    {
+        static bool valid = true;
+        // iterate over DLayout tuple
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+            // if RefLayout and DLayout are same, keep valid true, otherwise false
+            valid = valid && is_same_v<RefLayout, DLayout>;
+        });
+        return valid;
+    }
    static bool IsSupportedArgument(const Argument& arg)
    {
        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
@@ -797,6 +811,26 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
            return false;
        }
+        // Check supported layouts
+        // A0 - Row
+        // B0 - Col
+        // D0s - Rows
+        // B1 - Row or Col
+        // D1s - Rows
+        // E1 - Row
+        if(!(is_same_v<tensor_layout::gemm::RowMajor, A0Layout> &&
+             is_same_v<tensor_layout::gemm::ColumnMajor, B0Layout> &&
+             CheckDLayout<tensor_layout::gemm::RowMajor, D0sLayout, NumD0Tensor>() &&
+             (is_same_v<tensor_layout::gemm::RowMajor, B1Layout> ||
+              is_same_v<tensor_layout::gemm::ColumnMajor,
+                        B1Layout>)&&CheckDLayout<tensor_layout::gemm::RowMajor,
+                                                 D1sLayout,
+                                                 NumD1Tensor>() &&
+             is_same_v<tensor_layout::gemm::RowMajor, E1Layout>))
+        {
+            return false;
+        }
        return GridwiseGemm::CheckValidity(arg.a0_grid_desc_m_k_,
                                           arg.b0_grid_desc_n_k_,
                                           arg.b1_grid_desc_n_k_,

--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
@@ -588,6 +588,11 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
            return false;
        }
+        if(ck::get_device_name() != "gfx90a" && std::is_same<ADataType, double>::value)
+        {
+            return false;
+        }
        if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                        arg.b_grid_desc_n_k_,
                                        arg.ds_grid_desc_m_n_,

--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -777,7 +777,19 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
-            << K0PerBlock
+            << K0PerBlock << ", "
+            << K1 << ", "
+            << MPerXDL << ", "
+            << NPerXDL << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << ABlockTransferDstScalarPerVector_K1 << ", "
+            << BBlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferDstScalarPerVector_K1 << ", "
+            << CShuffleMXdlPerWavePerShuffle << ", "
+            << CShuffleNXdlPerWavePerShuffle << ", "
+            << CBlockTransferScalarPerVector_NWaveNPerXdl
            << ">";
        // clang-format on

--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
@@ -822,7 +822,14 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
-            << K0PerBlock
+            << K0PerBlock << ", "
+            << K1 << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << ABlockTransferDstScalarPerVector_K1 << ", "
+            << BBlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferDstScalarPerVector_K1
            << ">";
        // clang-format on

--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
@@ -956,7 +956,19 @@ struct
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
-            << K0PerBlock
+            << K0PerBlock << ", "
+            << K1 << ", "
+            << MPerXDL << ", "
+            << NPerXDL << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << ABlockTransferDstScalarPerVector_K1 << ", "
+            << BBlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferDstScalarPerVector_K1 << ", "
+            << CShuffleMXdlPerWavePerShuffle << ", "
+            << CShuffleNXdlPerWavePerShuffle << ", "
+            << CBlockTransferScalarPerVector_NWaveNPerXdl
            << ">";
        // clang-format on

--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
@@ -913,7 +913,19 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
-            << K0PerBlock
+            << K0PerBlock << ", "
+            << K1 << ", "
+            << MPerXDL << ", "
+            << NPerXDL << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << ABlockTransferDstScalarPerVector_K1 << ", "
+            << BBlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferDstScalarPerVector_K1 << ", "
+            << CShuffleMXdlPerWavePerShuffle << ", "
+            << CShuffleNXdlPerWavePerShuffle << ", "
+            << CBlockTransferScalarPerVector_NWaveNPerXdl
            << ">";
        // clang-format on

--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -880,7 +880,17 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
            << MPerBlock << ", "
            << NPerBlock << ", "
            << K0PerBlock << ", "
-            << getConvForwardSpecializationString(ConvForwardSpecialization)
+            << getConvForwardSpecializationString(ConvForwardSpecialization) << ", "
+            << K1 << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << ABlockTransferDstScalarPerVector_K1 << ", "
+            << BBlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferDstScalarPerVector_K1 << ", "
+            << CShuffleMXdlPerWavePerShuffle << ", "
+            << CShuffleNXdlPerWavePerShuffle << ", "
+            << CBlockTransferScalarPerVector_NWaveNPerXdl
            << ">";
        // clang-format on

--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -720,7 +720,16 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
            << MPerBlock << ", "
            << NPerBlock << ", "
            << K0PerBlock << ", "
-            << getConvForwardSpecializationString(ConvForwardSpecialization)
+            << getConvForwardSpecializationString(ConvForwardSpecialization) << ", "
+            << K1 << ", "
+            << MPerXDL << ", "
+            << NPerXDL << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << ABlockTransferDstScalarPerVector_K1 << ", "
+            << BBlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferDstScalarPerVector_K1
            << ">";
        // clang-format on