Merge branch 'develop' into transpose_5d

e1a5137e · arai713 · GitHub · eb57178d · 718065eb · e1a5137e
Unverified Commit e1a5137e authored Sep 19, 2023 by arai713 Committed by GitHub Sep 19, 2023
20 changed files
--- a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
@@ -30,6 +30,7 @@ using ADataType   = int8_t;
 using BDataType   = int8_t;
 using AccDataType = int32_t;
 using CDataType   = int32_t;
+using ComputeType = int8_t;
 using ALayout = Row;
 using BLayout = Col;
@@ -43,11 +44,11 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle
    // clang-format off
-//######|     AData|     BData|     CData|     AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer|  K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+//######|     AData|     BData|     CData|     AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer|  K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|     Compute|
-//######|      Type|      Type|      Type|        Type|        |        |        | Elementwise| Elementwise| Elementwise| Spacialization|  Size| Block| Block| Block|    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+//######|      Type|      Type|      Type|        Type|        |        |        | Elementwise| Elementwise| Elementwise| Spacialization|  Size| Block| Block| Block|    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|        Type|
-//######|          |          |          |            |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|          |          |          |            |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|            |
-//######|          |          |          |            |        |        |        |            |            |            |               |      |      |      |      |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+//######|          |          |          |            |        |        |        |            |            |            |               |      |      |      |      |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |            |
-        < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   256,   128,     4,  16,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,             16,             16,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,             16,             16,      true,           1,           1,                   S<1, 32, 1, 8>,               4>;
+        < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   256,   128,     4,  16,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,             16,             16,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,             16,             16,      true,           1,           1,                   S<1, 32, 1, 8>,               4, ComputeType>;
 // clang-format on
 #include "run_splitK_gemm_example.inc"

--- a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
@@ -14,18 +14,22 @@ using ComputeDataType = float;
 struct YElementOp
 {
-    template <typename T>
+    template <typename Y, typename X>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ void operator()(Y& y, const X& x) const
    {
-        static_assert(ck::is_same<T, float>::value || ck::is_same<T, double>::value ||
+        static_assert(ck::is_same<X, float>::value || ck::is_same<X, double>::value ||
-                          ck::is_same<T, ck::half_t>::value,
+                          ck::is_same<X, ck::half_t>::value,
                      "Data type is not supported by this operation!");
-        T a;
+        static_assert(ck::is_same<Y, float>::value || ck::is_same<Y, double>::value ||
+                          ck::is_same<Y, ck::half_t>::value,
+                      "Data type is not supported by this operation!");
+        X a;
        ck::tensor_operation::element_wise::Sigmoid{}(a, x);
-        y = x * a;
+        y = ck::type_convert<Y>(x * a);
    };
 };

--- a/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
@@ -8,7 +8,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -60,7 +60,7 @@ bool maxpool_bwd_test(bool do_verification,
                                                                1>; // InSrcOutDstVectorSize
    using DeviceMaxPoolBwdInstance = ck::tensor_operation::device::
-        DeviceIndexPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 4>;
+        DeviceMaxPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 4>;
    const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
    const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
@@ -155,7 +155,8 @@ bool maxpool_bwd_test(bool do_verification,
        dout_n_c_ho_wo.mDesc.GetElementSpaceSize(),
        din_n_c_hi_wi_device.mDesc.GetElementSpaceSize(),
        window_spatial_lengths,
-        window_strides);
+        window_strides,
+        window_dilations);
    if(!pool_bwd.IsSupportedArgument(pool_bwd_argument_ptr.get()))
    {

--- a/example/52_image_to_column/CMakeLists.txt
+++ b/example/52_image_to_column/CMakeLists.txt
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) # GPU architectures this example is built for
+set(target 0) # flag: set to 1 once the example targets have been created
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0) # first requested GPU that appears in gpu_list
+   add_custom_target(example_image_to_column)
+   add_example_executable(example_image_to_column_f32 image_to_column_f32.cpp)
+   add_dependencies(example_image_to_column example_image_to_column_f32)
+   set(target 1) # keeps the targets from being defined again for further matching GPUs
+ endif()
+endforeach()
--- a/example/52_image_to_column/common.hpp
+++ b/example/52_image_to_column/common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <cstdlib>
+#include <initializer_list>
+#include <iostream>
+#include <numeric>
+#include <stdexcept>
+#include <string>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+static inline constexpr ck::index_t NDimSpatial = 2;
+using FP32 = float;
+struct ExecutionConfig final // run-time options; parse_cmd_args may override them from argv[1..3]
+{
+    bool do_verification = true; // arg1: when true, the device output is checked against the host reference
+    int init_method      = 1; // arg2: 0 = leave input uninitialized, 1 = integer values, otherwise decimal values
+    bool time_kernel     = true; // arg3: forwarded to StreamConfig when the device op is run
+};
+#define DefaultConvParams                                                            \
+    ck::utils::conv::ConvParam                                                       \
+    {                                                                                \
+        NDimSpatial, 1, 32, 1, 1, {4, 4}, {64, 64}, {1, 1}, {1, 1}, {0, 0}, { 0, 0 } \
+    }
+inline void print_help_msg() // writes the positional-argument usage text to std::cerr
+{
+    std::cerr << "arg1: verification (0=no, 1=yes)\n"
+              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; // appends the library's conv-parameter help
+}
+/// Parse the example's command line into `config` and `conv_params`.
+///
+/// Accepted forms:
+///   - no arguments: `config` is reset to its defaults, `conv_params` is left untouched;
+///   - exactly three arguments: verification, init method, time kernel;
+///   - those three, then num_dim_spatial, G, N, K, C and a non-empty multiple of three
+///     further values, which are handed to ck::utils::conv::parse_conv_param.
+///
+/// @param argc        argument count as received by main
+/// @param argv        argument vector as received by main
+/// @param config      receives the execution options
+/// @param conv_params receives the convolution description (only in the third form)
+/// @return true on success; false, after printing the help text, when the argument count
+///         matches none of the forms or a numeric argument cannot be converted.
+inline bool parse_cmd_args(int argc,
+                           char* argv[],
+                           ExecutionConfig& config,
+                           ck::utils::conv::ConvParam& conv_params)
+{
+    constexpr int num_execution_config_args =
+        3; // arguments for do_verification, init_method, time_kernel
+    constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
+    constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
+    constexpr int threshold_to_catch_all_args =
+        threshold_to_catch_partial_args + num_conv_param_leading_args;
+    // std::stoi signals malformed or out-of-range numbers by throwing. Handle that like any
+    // other unusable command line instead of letting the exception terminate the program.
+    try
+    {
+        if(argc == 1)
+        {
+            // use default
+            config = ExecutionConfig{};
+        }
+        // catch only ExecutionConfig arguments
+        else if(argc == threshold_to_catch_partial_args)
+        {
+            config.do_verification = std::stoi(argv[1]);
+            config.init_method     = std::stoi(argv[2]);
+            config.time_kernel     = std::stoi(argv[3]);
+        }
+        // catch both ExecutionConfig & ConvParam arguments
+        else if(threshold_to_catch_all_args < argc &&
+                ((argc - threshold_to_catch_all_args) % 3 == 0))
+        {
+            config.do_verification = std::stoi(argv[1]);
+            config.init_method     = std::stoi(argv[2]);
+            config.time_kernel     = std::stoi(argv[3]);
+            const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+            // NOTE(review): the start index passed below (4) is the slot that was just read
+            // as num_dim_spatial - confirm that parse_conv_param expects to start there
+            // rather than at the following argument (G).
+            conv_params = ck::utils::conv::parse_conv_param(
+                num_dim_spatial, threshold_to_catch_partial_args, argv);
+        }
+        else
+        {
+            print_help_msg();
+            return false;
+        }
+    }
+    catch(const std::invalid_argument&)
+    {
+        // an argument was not a number
+        print_help_msg();
+        return false;
+    }
+    catch(const std::out_of_range&)
+    {
+        // an argument does not fit into int
+        print_help_msg();
+        return false;
+    }
+    return true;
+}
--- a/example/52_image_to_column/image_to_column_f32.cpp
+++ b/example/52_image_to_column/image_to_column_f32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+using InDataType  = FP32;
+using OutDataType = FP32;
+using InLayout = ck::tensor_layout::convolution::GNHWC;
+// clang-format off
+using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+                              < NDimSpatial, InLayout, InDataType, OutDataType,   256,   128,   128, S<16, 16>,     1>;
+// clang-format on
+/// Run image-to-column on the device for `conv_params` and optionally verify the result.
+///
+/// The output is a 2-D tensor of shape
+/// {N * prod(output spatial lengths), C * prod(filter spatial lengths)}.
+///
+/// @param config      selects input initialization, kernel timing and verification
+/// @param conv_params convolution description the problem sizes are taken from
+/// @return false when the device op or the reference op rejects the problem, or when
+///         verification is enabled and check_err reports a mismatch; true otherwise.
+bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)
+{
+    const auto N = conv_params.N_;
+    const auto C = conv_params.C_;
+    // output rows: N times the product of the output spatial lengths
+    const ck::index_t NDoHoWo =
+        N * ck::accumulate_n<ck::index_t>(
+                conv_params.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+    // output columns: C times the product of the filter spatial lengths
+    const ck::index_t CZYX =
+        C * ck::accumulate_n<ck::index_t>(
+                conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+    const auto in_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_params);
+    const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
+    // fixed-size copies of the problem description; these are what MakeArgument receives below
+    std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
+    std::array<ck::index_t, 2> output_m_k_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+    copy(conv_params.input_spatial_lengths_, input_spatial_lengths);
+    copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths);
+    copy(conv_params.output_spatial_lengths_, output_spatial_lengths);
+    copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
+    copy(out_desc.GetStrides(), output_m_k_strides);
+    copy(conv_params.conv_filter_strides_, conv_filter_strides);
+    copy(conv_params.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_params.input_left_pads_, input_left_pads);
+    copy(conv_params.input_right_pads_, input_right_pads);
+    Tensor<InDataType> in(in_desc);
+    Tensor<OutDataType> out_device(out_desc);
+    Tensor<OutDataType> out_host(out_desc);
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "out: " << out_device.mDesc << std::endl;
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1: in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
+    default: in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
+    }
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+    in_device_buf.ToDevice(in.mData.data());
+    // reset output buffer to zero
+    out_device_buf.SetZero();
+    static_assert(std::is_default_constructible_v<DeviceImgToColInstance>);
+    // run image to column on the device
+    auto img2col  = DeviceImgToColInstance{};
+    auto invoker  = img2col.MakeInvoker();
+    auto argument = img2col.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                                         out_device_buf.GetDeviceBuffer(),
+                                         N,
+                                         C,
+                                         input_spatial_lengths,
+                                         filter_spatial_lengths,
+                                         output_spatial_lengths,
+                                         input_g_n_c_wis_strides,
+                                         output_m_k_strides,
+                                         conv_filter_strides,
+                                         conv_filter_dilations,
+                                         input_left_pads,
+                                         input_right_pads);
+    if(!img2col.IsSupportedArgument(argument))
+    {
+        std::cerr << "wrong! device_img2col with the specified compilation parameters does "
+                     "not support this img2col problem"
+                  << std::endl;
+        return false;
+    }
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+    // Widen before multiplying: NDoHoWo and CZYX are ck::index_t, and their product would
+    // otherwise be computed (and could overflow) in that narrower type.
+    std::size_t num_btype =
+        static_cast<std::size_t>(NDoHoWo) * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
+    // ave_time is reported in ms below, so bytes / 1e6 / ms yields GB/s
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+    if(config.do_verification)
+    {
+        auto ref_image_to_column = ck::tensor_operation::host::
+            ReferenceImageToColumn<NDimSpatial, InLayout, InDataType, OutDataType>();
+        auto ref_invoker = ref_image_to_column.MakeInvoker();
+        auto ref_argument = ref_image_to_column.MakeArgument(in,
+                                                             out_host,
+                                                             conv_params.filter_spatial_lengths_,
+                                                             conv_params.conv_filter_strides_,
+                                                             conv_params.conv_filter_dilations_,
+                                                             conv_params.input_left_pads_,
+                                                             conv_params.input_right_pads_);
+        if(!ref_image_to_column.IsSupportedArgument(&ref_argument))
+        {
+            std::cerr << "wrong! ref_img2col with the specified compilation parameters does "
+                         "not support this img2col problem"
+                      << std::endl;
+            return false;
+        }
+        ref_invoker.Run(ref_argument);
+        // fetch the device result and compare it element-wise with the host reference
+        out_device_buf.FromDevice(out_device.mData.data());
+        return ck::utils::check_err(out_device.mData, out_host.mData);
+    }
+    return true;
+}
+// Entry point of the example: parses the command line, checks the spatial dimension count
+// and runs the image-to-column problem. Returns 0 when the run succeeds, non-zero otherwise.
+int RunImageToColumnExample(int argc, char* argv[])
+{
+    ExecutionConfig config;
+    ck::utils::conv::ConvParam conv_params = DefaultConvParams;
+    // the parser has already printed the help text when it reports failure
+    const bool args_ok = parse_cmd_args(argc, argv, config, conv_params);
+    if(!args_ok)
+    {
+        return EXIT_FAILURE;
+    }
+    // this example is compiled for a single, fixed number of spatial dimensions
+    const bool dims_ok = (conv_params.num_dim_spatial_ == NDimSpatial);
+    if(!dims_ok)
+    {
+        std::cerr << "unsupported # of spatial dimensions" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // map the boolean result onto an exit status: success -> 0, failure -> 1
+    const bool passed = RunImageToColumn(config, conv_params);
+    return passed ? 0 : 1;
+}
+int main(int argc, char* argv[]) { return RunImageToColumnExample(argc, argv); } // exit status is the example's return value
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -3,6 +3,8 @@
 #pragma once
+#include "ck/config.h"
 #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
@@ -200,9 +202,6 @@
 // workaround: compiler issue on gfx908
 #define CK_WORKAROUND_SWDEV_388832 1
-// workaround: Grouped Conv2d_bwd_data fails for already implemented instance
-#define CK_WORKAROUND_GITHUB_ISSUE_824 1
 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0

--- a/include/ck/config.h.in
+++ b/include/ck/config.h.in
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_CONFIG_H_IN
+#define CK_CONFIG_H_IN
+// clang-format off
+//
+// Data types supported in the current CK build
+//
+#ifndef DTYPES
+#cmakedefine DTYPES "@DTYPES@"
+#endif
+// if DTYPES is not defined, enable all data types in the header files
+#ifndef CK_ENABLE_ALL_DTYPES
+#cmakedefine CK_ENABLE_ALL_DTYPES @CK_ENABLE_ALL_DTYPES@
+#if defined(CK_ENABLE_ALL_DTYPES)
+#ifndef CK_ENABLE_INT8
+#define CK_ENABLE_INT8 "ON"
+#endif
+#ifndef CK_ENABLE_FP8
+#define CK_ENABLE_FP8 "ON"
+#endif
+#ifndef CK_ENABLE_BF8
+#define CK_ENABLE_BF8 "ON"
+#endif
+#ifndef CK_ENABLE_FP16
+#define CK_ENABLE_FP16 "ON"
+#endif
+#ifndef CK_ENABLE_BF16
+#define CK_ENABLE_BF16 "ON"
+#endif
+#ifndef CK_ENABLE_FP32
+#define CK_ENABLE_FP32 "ON"
+#endif
+#ifndef CK_ENABLE_FP64
+#define CK_ENABLE_FP64 "ON"
+#endif
+#endif
+#endif
+// if DTYPES are selectively enabled
+#ifndef CK_ENABLE_INT8
+#cmakedefine CK_ENABLE_INT8 @CK_ENABLE_INT8@
+#endif
+#ifndef CK_ENABLE_FP8
+#cmakedefine CK_ENABLE_FP8 @CK_ENABLE_FP8@
+#endif
+#ifndef CK_ENABLE_BF8
+#cmakedefine CK_ENABLE_BF8 @CK_ENABLE_BF8@
+#endif
+#ifndef CK_ENABLE_FP16
+#cmakedefine CK_ENABLE_FP16 @CK_ENABLE_FP16@
+#endif
+#ifndef CK_ENABLE_BF16
+#cmakedefine CK_ENABLE_BF16 @CK_ENABLE_BF16@
+#endif
+#ifndef CK_ENABLE_FP32
+#cmakedefine CK_ENABLE_FP32 @CK_ENABLE_FP32@
+#endif
+#ifndef CK_ENABLE_FP64
+#cmakedefine CK_ENABLE_FP64 @CK_ENABLE_FP64@
+#endif
+//
+// Legacy DL kernel support in the current CK build
+// by default DL kernels are turned OFF
+//
+#ifndef CK_ENABLE_DL_KERNELS
+#cmakedefine CK_ENABLE_DL_KERNELS @CK_ENABLE_DL_KERNELS@
+#endif
+//
+// Instance support in the current CK build
+//
+#ifndef CK_ENABLE_INSTANCES_ONLY
+#cmakedefine CK_ENABLE_INSTANCES_ONLY @CK_ENABLE_INSTANCES_ONLY@
+#endif
+// clang-format on
+#endif // CK_CONFIG_H_IN
--- a/include/ck/tensor_description/multi_index_transform.hpp
+++ b/include/ck/tensor_description/multi_index_transform.hpp
@@ -1042,13 +1042,13 @@ struct Merge_v2_magic_division
    using UpLengths =
        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
-    using LowLengthsMagicDivisorMultipiler = decltype(
+    using LowLengthsMagicDivisorMultipiler = decltype(generate_tuple(
-        generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengths>{},
+        lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengths>{},
-                       Number<NDimLow>{}));
+        Number<NDimLow>{}));
-    using LowLengthsMagicDivisorShift = decltype(
+    using LowLengthsMagicDivisorShift = decltype(generate_tuple(
-        generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengths>{},
+        lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengths>{},
-                       Number<NDimLow>{}));
+        Number<NDimLow>{}));
    LowLengths low_lengths_;
    LowLengthsMagicDivisorMultipiler low_lengths_magic_divisor_multiplier_;
@@ -1201,9 +1201,9 @@ struct Merge_v2r2_magic_division
        lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengthsScan>{},
        Number<NDimLow>{}));
-    using LowLengthsScanMagicDivisorShift = decltype(
+    using LowLengthsScanMagicDivisorShift = decltype(generate_tuple(
-        generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengthsScan>{},
+        lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengthsScan>{},
-                       Number<NDimLow>{}));
+        Number<NDimLow>{}));
    LowLengths low_lengths_;
    LowLengthsScan low_lengths_scan_;

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_dpp8.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_dpp8.hpp
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
@@ -221,49 +221,102 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle
        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
            b_thread_desc_.GetElementSpaceSize());
-        static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
+        // choose the loop order: outer loop over M when MRepeat < NRepeat, otherwise over N
-            static_for<0, MRepeat, 1>{}([&](auto m0) {
+        if constexpr(MRepeat < NRepeat)
-                // read A
+        {
-                a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
+            static_for<0, KPerBlock / WmmaK, 1>{}(
-                                   make_tuple(Number<k * WmmaK / A_K1>{}, m0, I0, I0, I0),
+                [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
-                                   a_block_buf,
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
-                                   a_thread_desc_,
+                        // read A
-                                   make_tuple(I0, m0, I0, I0, I0),
+                        a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
-                                   a_thread_buf);
+                                           make_tuple(Number<k * WmmaK / A_K1>{}, m0, I0, I0, I0),
+                                           a_block_buf,
-                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                                           a_thread_desc_,
-                    // read B
+                                           make_tuple(I0, m0, I0, I0, I0),
-                    b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
+                                           a_thread_buf);
-                                       make_tuple(Number<k * WmmaK / B_K1>{}, n0, I0, I0, I0),
-                                       b_block_buf,
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                       b_thread_desc_,
+                            // read B
-                                       make_tuple(I0, n0, I0, I0, I0),
+                            b_thread_copy_.Run(
-                                       b_thread_buf);
+                                b_block_desc_k0_n0_n1_n2_k1,
-                    vector_type<FloatA, WmmaK> a_thread_vec;
+                                make_tuple(Number<k * WmmaK / B_K1>{}, n0, I0, I0, I0),
-                    vector_type<FloatB, WmmaK> b_thread_vec;
+                                b_block_buf,
+                                b_thread_desc_,
-                    static_for<0, WmmaK, 1>{}([&](auto i) {
+                                make_tuple(I0, n0, I0, I0, I0),
-                        a_thread_vec.template AsType<FloatA>()(i) =
+                                b_thread_buf);
-                            a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                            vector_type<FloatA, WmmaK> a_thread_vec;
-                                make_tuple(i / A_K1, m0, 0, 0, i % A_K1))>{}];
+                            vector_type<FloatB, WmmaK> b_thread_vec;
-                        b_thread_vec.template AsType<FloatB>()(i) =
-                            b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                            static_for<0, WmmaK, 1>{}([&](auto i) {
-                                make_tuple(i / B_K1, n0, 0, 0, i % B_K1))>{}];
+                                a_thread_vec.template AsType<FloatA>()(i) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(i / A_K1, m0, 0, 0, i % A_K1))>{}];
+                                b_thread_vec.template AsType<FloatB>()(i) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(i / B_K1, n0, 0, 0, i % B_K1))>{}];
+                            });
+                            using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                            using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+                            wmma_gemm.template Run(
+                                a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                                b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        });
                    });
-                    using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
-                    using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
-                    constexpr index_t c_offset =
-                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                    wmma_gemm.template Run(
-                        a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
-                        b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
-                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                });
-            });
+        }
-        });
+        else
+        {
+            static_for<0, KPerBlock / WmmaK, 1>{}(
+                [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        // read B
+                        b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
+                                           make_tuple(Number<k * WmmaK / B_K1>{}, n0, I0, I0, I0),
+                                           b_block_buf,
+                                           b_thread_desc_,
+                                           make_tuple(I0, n0, I0, I0, I0),
+                                           b_thread_buf);
+                        static_for<0, MRepeat, 1>{}([&](auto m0) {
+                            // read A
+                            a_thread_copy_.Run(
+                                a_block_desc_k0_m0_m1_m2_k1,
+                                make_tuple(Number<k * WmmaK / A_K1>{}, m0, I0, I0, I0),
+                                a_block_buf,
+                                a_thread_desc_,
+                                make_tuple(I0, m0, I0, I0, I0),
+                                a_thread_buf);
+                            vector_type<FloatA, WmmaK> a_thread_vec;
+                            vector_type<FloatB, WmmaK> b_thread_vec;
+                            static_for<0, WmmaK, 1>{}([&](auto i) {
+                                a_thread_vec.template AsType<FloatA>()(i) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(i / A_K1, m0, 0, 0, i % A_K1))>{}];
+                                b_thread_vec.template AsType<FloatB>()(i) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(i / B_K1, n0, 0, 0, i % B_K1))>{}];
+                            });
+                            using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                            using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+                            wmma_gemm.template Run(
+                                a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                                b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        });
+                    });
+                });
+        }
    }
    protected:

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -4,27 +4,13 @@
 #pragma once
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/loop_scheduler.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
 namespace ck {
-enum struct LoopScheduler
-{
-    Default,
-    Interwave,
-};
-constexpr LoopScheduler make_default_loop_scheduler()
-{
-#if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
-    return LoopScheduler::Interwave;
-#else
-    return LoopScheduler::Default;
-#endif // if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
-}
 template <index_t MNXdlPerWave, index_t MNWaves, index_t MNPerXdl, typename TileDesc_K0_MN_K1>
 __host__ __device__ static constexpr auto
 MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(const TileDesc_K0_MN_K1&)

--- a/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
@@ -35,8 +35,8 @@ struct BlockwiseSoftmax
    static constexpr index_t MRepeat = ThreadSliceDesc_M_K{}.GetLength(I0);
    static constexpr index_t KRepeat = ThreadSliceDesc_M_K{}.GetLength(I1);
-    using ThreadSliceDesc_M = decltype(
+    using ThreadSliceDesc_M = decltype(make_naive_tensor_descriptor_packed(
-        make_naive_tensor_descriptor_packed(make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0))));
+        make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0))));
    using ThreadwiseMaxReduce = typename conditional<
        IgnoreNaN,

--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iostream>
+#include <array>
+#include "device_grouped_gemm.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <index_t NumDTensor = 0> // NumDTensor is the length of p_ds_grid and StrideDs (default 0)
+struct GroupedGemmKernelArgument // aggregate of untyped grid pointers, M/N/K sizes and strides
+{
+    const void* p_a_grid; // read-only untyped pointer; presumably the A operand -- TODO confirm
+    const void* p_b_grid; // read-only untyped pointer; presumably the B operand -- TODO confirm
+    std::array<const void*, NumDTensor> p_ds_grid; // NumDTensor read-only untyped pointers
+    void* p_e_grid; // the only non-const pointer here; presumably the output -- TODO confirm
+    index_t M; // presumably the GEMM extents M, N, K -- verify against the consuming kernel
+    index_t N;
+    index_t K;
+    index_t StrideA; // presumably the strides paired with p_a_grid / p_b_grid -- TODO confirm
+    index_t StrideB;
+    std::array<index_t, NumDTensor> StrideDs; // same length (NumDTensor) as p_ds_grid
+    index_t StrideE; // presumably the stride paired with p_e_grid -- TODO confirm
+};
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceGroupedGemmFixedNK : DeviceGroupedGemm<ALayout,
+                                                    BLayout,
+                                                    DsLayout,
+                                                    ELayout,
+                                                    ADataType,
+                                                    BDataType,
+                                                    DsDataType,
+                                                    EDataType,
+                                                    AElementwiseOperation,
+                                                    BElementwiseOperation,
+                                                    CElementwiseOperation>
+{ // Abstract interface: forwards every template parameter to DeviceGroupedGemm, adds 3 pure virtuals.
+    virtual void SetDeviceKernelArgs(BaseArgument* p_arg, const void* kernel_args) const = 0; // layout of kernel_args is not shown here; presumably GroupedGemmKernelArgument entries -- TODO confirm
+    virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const               = 0; // returns a size for p_arg; unit not shown, presumably bytes -- TODO confirm
+    virtual void SetKBatch(BaseArgument* p_arg, index_t k_batch) const                   = 0; // passes k_batch for p_arg; presumably the split-K factor -- TODO confirm
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_image_to_column.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_image_to_column.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <array>
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+/**
+ * \brief Image to column.
+ *
+ * This Device operator converts image ([G, N, Di, Hi, Wi, C]) to the gemm
+ * problem ([N * Do * Ho * Wo, Z * Y * X * C]). G must be equal to 1.
+ *
+ * This struct is an abstract interface: both member functions are pure virtual.
+ *
+ * \tparam NDimSpatial Number of spatial dimensions.
+ * \tparam InputLayout Input Layout.
+ * \tparam InputDataType Input Data Type.
+ * \tparam OutputDataType Output Data Type.
+ */
+template <index_t NDimSpatial,
+          typename InputLayout,
+          typename InputDataType,
+          typename OutputDataType>
+struct DeviceImageToColumn : public BaseOperator
+{
+    /**
+     * \brief Make argument pointer for image to column.
+     *
+     * \param p_in A pointer to the device memory of the input image.
+     * \param p_out A pointer to the device memory of the output.
+     * \param N Convolution batch size.
+     * \param C Convolution number of channels.
+     * \param input_spatial_lengths Input spatial lengths.
+     * \param filter_spatial_lengths Filter spatial lengths.
+     * \param output_spatial_lengths Output spatial lengths.
+     * \param input_g_n_c_wis_strides Input strides in order [G, N, C, D, H, W]
+     *        (NDimSpatial + 3 entries).
+     * \param output_m_k_strides Output strides (2 entries).
+     * \param conv_filter_strides Convolution filter strides.
+     * \param conv_filter_dilations Convolution filter dilations.
+     * \param input_left_pads Convolution left pads.
+     * \param input_right_pads Convolution right pads.
+     * \return Pointer to the argument.
+     */
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_in,
+                        void* p_out,
+                        const ck::index_t N,
+                        const ck::index_t C,
+                        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
+                        const std::array<index_t, 2>& output_m_k_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const std::array<index_t, NDimSpatial>& input_left_pads,
+                        const std::array<index_t, NDimSpatial>& input_right_pads) = 0;
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0; // pure virtual; returns an owning pointer to a BaseInvoker
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp