Merge branch 'amd-develop' into amd-master

5683ea4e · Jun Liu · f0831350 · dddc2115 · 5683ea4e · 5683ea4e
Commit 5683ea4e authored Aug 08, 2023 by Jun Liu
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,11 +60,34 @@ message("checking which targets are supported")
 #This is the list of targets to be used in case GPU_TARGETS is not set on command line
 #These targets will be filtered and only supported ones will be used
 #Setting GPU_TARGETS on command line will override this list
-rocm_check_target_ids(DEFAULT_GPU_TARGETS
-    TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102"
-)
+if(NOT PROFILER_ONLY) # regular build: check the complete default target list
+    rocm_check_target_ids(DEFAULT_GPU_TARGETS
+        TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
+else() # PROFILER_ONLY build: the target list is chosen from GPU_ARCH instead of GPU_TARGETS
+    add_definitions(-DPROFILER_ONLY)
+    if(GPU_TARGETS) # GPU_TARGETS and PROFILER_ONLY are mutually exclusive
+        message(FATAL_ERROR "For PROFILER_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx9, gfx10, or gfx11")
+    endif()
+    if(GPU_ARCH MATCHES "gfx9") # MATCHES is a regex search, so any GPU_ARCH containing "gfx9" lands here
+        rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942")
+    elseif(GPU_ARCH MATCHES "gfx10")
+        rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030")
+    elseif(GPU_ARCH MATCHES "gfx11")
+        rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102")
+    else() # GPU_ARCH unset or not one of the recognised families
+        message(FATAL_ERROR "For PROFILER_ONLY build, please specify GPU_ARCH as gfx9, gfx10, or gfx11")
+    endif()
+endif()
+
 message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}")
+
 set(AMDGPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " ")
+
+if(GPU_TARGETS)
+    message("Building CK for the following targets: ${GPU_TARGETS}")
+else()
+    message("Building CK for the following targets: ${AMDGPU_TARGETS}")
+endif()
 find_package(hip)

 option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
@@ -347,6 +370,7 @@ add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES}  SOURCES ${IN
 add_subdirectory(library)

 if(NOT DEFINED INSTANCES_ONLY)
+ if(NOT DEFINED PROFILER_ONLY)
   rocm_package_setup_component(tests
        LIBRARY_NAME composablekernel
        PACKAGE_NAME tests # Prevent -static suffix on package name
@@ -356,15 +380,22 @@ if(NOT DEFINED INSTANCES_ONLY)
        LIBRARY_NAME composablekernel
        PACKAGE_NAME examples
   )
+   add_subdirectory(example)
+   add_subdirectory(test)

   rocm_package_setup_component(profiler
        LIBRARY_NAME composablekernel
        PACKAGE_NAME ckProfiler
   )
-
-   add_subdirectory(example)
-   add_subdirectory(test)
   add_subdirectory(profiler)
+  else()
+    #When building PROFILER_ONLY, label the package with GPU_ARCH
+    rocm_package_setup_component(profiler
+       LIBRARY_NAME composablekernel
+       PACKAGE_NAME ckProfiler_${GPU_ARCH}
+    )
+    add_subdirectory(profiler)
+  endif()
 endif()

 #Create an interface target for the include only files and call it "composablekernels"

--- a/client_example/11_grouped_conv_bwd_weight/common.hpp
+++ b/client_example/11_grouped_conv_bwd_weight/common.hpp
@@ -32,63 +32,49 @@ struct SimpleDeviceMem
 };

 template <ck::index_t NumDimSpatial>
-std::size_t GetFlops(ck::index_t G,
-                     ck::index_t N,
-                     ck::index_t K,
-                     ck::index_t C,
-                     const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths,
-                     const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths)
+std::size_t GetFlops(const std::array<ck::index_t, NumDimSpatial>& output_lengths,
+                     const std::array<ck::index_t, NumDimSpatial>& filter_lengths)
 {
+    constexpr ck::index_t spatial_offset = 3;
+    const auto C                         = filter_lengths[2];
    // 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
-    return static_cast<std::size_t>(2) * G * N * K * C *
-           std::accumulate(std::begin(output_spatial_lengths),
-                           std::end(output_spatial_lengths),
+    return static_cast<std::size_t>(2) * C *
+           std::accumulate(std::begin(output_lengths),
+                           std::end(output_lengths),
                           static_cast<std::size_t>(1),
                           std::multiplies<>()) *
-           std::accumulate(std::begin(filter_spatial_lengths),
-                           std::end(filter_spatial_lengths),
+           std::accumulate(std::begin(filter_lengths) + spatial_offset,
+                           std::end(filter_lengths),
                           static_cast<std::size_t>(1),
                           std::multiplies<>());
 }

 template <typename InDataType, ck::index_t NumDimSpatial>
-std::size_t GetInputByte(ck::index_t G,
-                         ck::index_t N,
-                         ck::index_t C,
-                         const std::array<ck::index_t, NumDimSpatial>& input_spatial_lengths)
+std::size_t GetInputByte(const std::array<ck::index_t, NumDimSpatial>& input_lengths)
 {
    // sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
-    return sizeof(InDataType) * (G * N * C *
-                                 std::accumulate(std::begin(input_spatial_lengths),
-                                                 std::end(input_spatial_lengths),
+    return sizeof(InDataType) * (std::accumulate(std::begin(input_lengths),
+                                                 std::end(input_lengths),
                                                 static_cast<std::size_t>(1),
                                                 std::multiplies<>()));
 }

 template <typename WeiDataType, ck::index_t NumDimSpatial>
-std::size_t GetWeightByte(ck::index_t G,
-                          ck::index_t K,
-                          ck::index_t C,
-                          const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths)
+std::size_t GetWeightByte(const std::array<ck::index_t, NumDimSpatial>& filter_lengths)
 {
    // sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
-    return sizeof(WeiDataType) * (G * K * C *
-                                  std::accumulate(std::begin(filter_spatial_lengths),
-                                                  std::end(filter_spatial_lengths),
+    return sizeof(WeiDataType) * (std::accumulate(std::begin(filter_lengths),
+                                                  std::end(filter_lengths),
                                                  static_cast<std::size_t>(1),
                                                  std::multiplies<>()));
 }

 template <typename OutDataType, ck::index_t NumDimSpatial>
-std::size_t GetOutputByte(ck::index_t G,
-                          ck::index_t N,
-                          ck::index_t K,
-                          const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths)
+std::size_t GetOutputByte(const std::array<ck::index_t, NumDimSpatial>& output_lengths)
 {
    // sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
-    return sizeof(OutDataType) * (G * N * K *
-                                  std::accumulate(std::begin(output_spatial_lengths),
-                                                  std::end(output_spatial_lengths),
+    return sizeof(OutDataType) * (std::accumulate(std::begin(output_lengths),
+                                                  std::end(output_lengths),
                                                  static_cast<std::size_t>(1),
                                                  std::multiplies<std::size_t>()));
 }
@@ -101,14 +87,11 @@ template <ck::index_t NumDimSpatial,
          typename WeiLayout,
          typename OutLayout>
 bool run_grouped_conv_bwd_weight(
-    const ck::index_t G,
-    const ck::index_t N,
-    const ck::index_t K,
-    const ck::index_t C,
-    const std::array<ck::index_t, NumDimSpatial>& input_spatial_lengths,
-    const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths,
-    const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths,
+    const std::array<ck::index_t, NumDimSpatial + 3>& input_lengths,
    const std::array<ck::index_t, NumDimSpatial + 3>& input_strides,
+    const std::array<ck::index_t, NumDimSpatial + 3>& filter_lengths,
+    const std::array<ck::index_t, NumDimSpatial + 3>& weights_strides,
+    const std::array<ck::index_t, NumDimSpatial + 3>& output_lengths,
    const std::array<ck::index_t, NumDimSpatial + 3>& output_strides,
    const std::array<ck::index_t, NumDimSpatial>& conv_filter_strides,
    const std::array<ck::index_t, NumDimSpatial>& conv_filter_dilations,
@@ -117,9 +100,9 @@ bool run_grouped_conv_bwd_weight(
 {

    ck::index_t split_k = 2;
-    SimpleDeviceMem in(GetInputByte<InDataType, NumDimSpatial>(G, N, C, input_spatial_lengths));
-    SimpleDeviceMem wei(GetWeightByte<WeiDataType, NumDimSpatial>(G, K, C, filter_spatial_lengths));
-    SimpleDeviceMem out(GetOutputByte<OutDataType, NumDimSpatial>(G, N, K, output_spatial_lengths));
+    SimpleDeviceMem in(GetInputByte<InDataType, NumDimSpatial + 3>(input_lengths));
+    SimpleDeviceMem wei(GetWeightByte<WeiDataType, NumDimSpatial + 3>(filter_lengths));
+    SimpleDeviceMem out(GetOutputByte<OutDataType, NumDimSpatial + 3>(output_lengths));

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NumDimSpatial,
                                                                              InLayout,
@@ -143,6 +126,10 @@ bool run_grouped_conv_bwd_weight(
    float best_gb_per_sec = 0;
    float best_tflops     = 0;

+    std::array<ck::index_t, NumDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NumDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NumDimSpatial + 3> b_g_k_c_xs_lengths{};
+
    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

@@ -152,14 +139,11 @@ bool run_grouped_conv_bwd_weight(
        auto argument_ptr   = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
-                                                        G,
-                                                        N,
-                                                        K,
-                                                        C,
-                                                        input_spatial_lengths,
-                                                        filter_spatial_lengths,
-                                                        output_spatial_lengths,
+                                                        input_lengths,
                                                        input_strides,
+                                                        filter_lengths,
+                                                        weights_strides,
+                                                        output_lengths,
                                                        output_strides,
                                                        conv_filter_strides,
                                                        conv_filter_dilations,
@@ -176,12 +160,10 @@ bool run_grouped_conv_bwd_weight(
        {
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

-            std::size_t flop =
-                GetFlops<NumDimSpatial>(G, N, K, C, output_spatial_lengths, filter_spatial_lengths);
-            std::size_t num_bytes =
-                GetInputByte<InDataType, NumDimSpatial>(G, N, C, input_spatial_lengths) +
-                GetWeightByte<WeiDataType, NumDimSpatial>(G, K, C, filter_spatial_lengths) +
-                GetOutputByte<OutDataType, NumDimSpatial>(G, N, K, output_spatial_lengths);
+            std::size_t flop      = GetFlops<NumDimSpatial + 3>(output_lengths, filter_lengths);
+            std::size_t num_bytes = GetInputByte<InDataType, NumDimSpatial + 3>(input_lengths) +
+                                    GetWeightByte<WeiDataType, NumDimSpatial + 3>(filter_lengths) +
+                                    GetOutputByte<OutDataType, NumDimSpatial + 3>(output_lengths);

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;
@@ -221,14 +203,11 @@ bool run_grouped_conv_bwd_weight(
        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
-                                                        G,
-                                                        N,
-                                                        K,
-                                                        C,
-                                                        input_spatial_lengths,
-                                                        filter_spatial_lengths,
-                                                        output_spatial_lengths,
+                                                        input_lengths,
                                                        input_strides,
+                                                        filter_lengths,
+                                                        weights_strides,
+                                                        output_lengths,
                                                        output_strides,
                                                        conv_filter_strides,
                                                        conv_filter_dilations,

--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp
@@ -22,11 +22,12 @@ static constexpr ck::index_t C             = 192;
 static constexpr ck::index_t X             = 3;
 static constexpr ck::index_t Wi            = 28;
 static constexpr ck::index_t Wo            = 28;
-static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Wi};
-static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{X};
-static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Wo};
-static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{N * Wi * C, Wi* C, C, 1};
-static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{N * Wo * K, Wo* K, K, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Wi};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, X};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Wo};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{N * Wi * C, Wi* C, 1, C};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{K * X * C, X* C, 1, C};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{N * Wo * K, Wo* K, 1, K};
 static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1};
 static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1};
 static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1};
@@ -40,14 +41,11 @@ int main()
                                       OutDataType,
                                       InLayout,
                                       WeiLayout,
-                                       OutLayout>(G,
-                                                  N,
-                                                  K,
-                                                  C,
-                                                  input_spatial_lengths,
-                                                  filter_spatial_lengths,
-                                                  output_spatial_lengths,
+                                       OutLayout>(input_lengths,
                                                  input_strides,
+                                                  filter_lengths,
+                                                  weights_strides,
+                                                  output_lengths,
                                                  output_strides,
                                                  conv_filter_strides,
                                                  conv_filter_dilations,

--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp
@@ -25,13 +25,15 @@ static constexpr ck::index_t Hi            = 28;
 static constexpr ck::index_t Wi            = 28;
 static constexpr ck::index_t Ho            = 28;
 static constexpr ck::index_t Wo            = 28;
-static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Hi, Wi};
-static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Y, X};
-static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Ho, Wo};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Hi, Wi};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Y, X};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Ho, Wo};
 static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
-    N * Hi * Wi * C, Hi* Wi* C, Wi* C, C, 1};
+    N * Hi * Wi * C, Hi* Wi* C, 1, Wi* C, C};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
+    K * Y * X * C, Y* X* C, 1, X* C, C};
 static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
-    N * Ho * Wo * K, Ho* Wo* K, Wo* K, K, 1};
+    N * Ho * Wo * K, Ho* Wo* K, 1, Wo* K, K};
 static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1};
 static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1};
 static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
@@ -45,14 +47,11 @@ int main()
                                       OutDataType,
                                       InLayout,
                                       WeiLayout,
-                                       OutLayout>(G,
-                                                  N,
-                                                  K,
-                                                  C,
-                                                  input_spatial_lengths,
-                                                  filter_spatial_lengths,
-                                                  output_spatial_lengths,
+                                       OutLayout>(input_lengths,
                                                  input_strides,
+                                                  filter_lengths,
+                                                  weights_strides,
+                                                  output_lengths,
                                                  output_strides,
                                                  conv_filter_strides,
                                                  conv_filter_dilations,

--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp
@@ -28,13 +28,15 @@ static constexpr ck::index_t Wi            = 3;
 static constexpr ck::index_t Do            = 28;
 static constexpr ck::index_t Ho            = 28;
 static constexpr ck::index_t Wo            = 3;
-static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Di, Hi, Wi};
-static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Z, Y, X};
-static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Do, Ho, Wo};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Z, Y, X};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
 static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
-    N * Di * Hi * Wi * C, Di* Hi* Wi* C, Hi* Wi* C, Wi* C, C, 1};
+    N * Di * Hi * Wi * C, Di* Hi* Wi* C, 1, Hi* Wi* C, Wi* C, C};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
+    K * Z * Y * X * C, Z* Y* X* C, 1, Y* X* C, X* C, C};
 static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
-    N * Do * Ho * Wo * K, Do* Ho* Wo* K, Ho* Wo* K, Wo* K, K, 1};
+    N * Do * Ho * Wo * K, Do* Ho* Wo* K, 1, Ho* Wo* K, Wo* K, K};
 static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
 static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
 static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
@@ -48,14 +50,11 @@ int main()
                                       OutDataType,
                                       InLayout,
                                       WeiLayout,
-                                       OutLayout>(G,
-                                                  N,
-                                                  K,
-                                                  C,
-                                                  input_spatial_lengths,
-                                                  filter_spatial_lengths,
-                                                  output_spatial_lengths,
+                                       OutLayout>(input_lengths,
                                                  input_strides,
+                                                  filter_lengths,
+                                                  weights_strides,
+                                                  output_lengths,
                                                  output_strides,
                                                  conv_filter_strides,
                                                  conv_filter_dilations,

--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp
@@ -28,13 +28,15 @@ static constexpr ck::index_t Wi            = 3;
 static constexpr ck::index_t Do            = 28;
 static constexpr ck::index_t Ho            = 28;
 static constexpr ck::index_t Wo            = 3;
-static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Di, Hi, Wi};
-static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Z, Y, X};
-static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Do, Ho, Wo};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Z, Y, X};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
 static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
-    N * Di * Hi * Wi * C, Di* Hi* Wi* C, Hi* Wi* C, Wi* C, C, 1};
+    N * Di * Hi * Wi * C, Di* Hi* Wi* C, 1, Hi* Wi* C, Wi* C, C};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
+    K * Z * Y * X * C, Z* Y* X* C, 1, Y* X* C, X* C, C};
 static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
-    N * Do * Ho * Wo * K, Do* Ho* Wo* K, Ho* Wo* K, Wo* K, K, 1};
+    N * Do * Ho * Wo * K, Do* Ho* Wo* K, 1, Ho* Wo* K, Wo* K, K};
 static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
 static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
 static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
@@ -48,20 +50,16 @@ int main()
                                       OutDataType,
                                       InLayout,
                                       WeiLayout,
-                                       OutLayout>(
-               G,
-               N,
-               K,
-               C,
-               {Di, Hi, Wi},
-               {Z, Y, X},
-               {Do, Ho, Wo},
-               {N * Di * Hi * Wi * C, Di * Hi * Wi * C, Hi * Wi * C, Wi * C, C, 1},
-               {N * Do * Ho * Wo * K, Do * Ho * Wo * K, Ho * Wo * K, Wo * K, K, 1},
-               {1, 1, 1},
-               {1, 1, 1},
-               {1, 1, 1},
-               {1, 1, 1})
+                                       OutLayout>(input_lengths,
+                                                  input_strides,
+                                                  filter_lengths,
+                                                  weights_strides,
+                                                  output_lengths,
+                                                  output_strides,
+                                                  conv_filter_strides,
+                                                  conv_filter_dilations,
+                                                  input_left_pads,
+                                                  input_right_pads)
               ? EXIT_SUCCESS
               : EXIT_FAILURE;
 }
--- a/docs/API_Reference_Guide.rst
+++ b/docs/API_Reference_Guide.rst
@@ -7,8 +7,8 @@ API Reference Guide
 Introduction
 =================

-This document contains details of the APIs for the Composable Kernel (CK) library and introduces some of the key design
-principles that are used to write new classes that extend CK functionality.
+This document contains details of the APIs for the Composable Kernel (CK) library and introduces
+some of the key design principles that are used to write new classes that extend CK functionality.

 =================
 Using CK API
@@ -30,8 +30,8 @@ DeviceMem
 Kernels For Flashattention
 ---------------------------

-The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`.  This sections lists the classes that are
-used in the CK GPU implementation of Flashattention.
+The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists
+the classes that are used in the CK GPU implementation of Flashattention.

 **Gridwise classes**


--- a/docs/Supported_Primitives_Guide.rst
+++ b/docs/Supported_Primitives_Guide.rst
@@ -2,15 +2,16 @@
 Supported Primitives Guide
 ==========================

-This document contains details of supported primitives in Composable Kernel (CK). In contrast to the API Reference
-Guide, the Supported Primitives Guide is an introduction to the math which underpins the algorithms implemented in CK.
+This document contains details of supported primitives in Composable Kernel (CK). In contrast to the
+API Reference Guide, the Supported Primitives Guide is an introduction to the math which underpins
+the algorithms implemented in CK.

 ------------
 Softmax
 ------------

-For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can decompose the softmax of concatenated
-:math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as,
+For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can decompose the
+softmax of concatenated :math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as,

 .. math::
   :nowrap:
@@ -25,8 +26,8 @@ For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can d
 where :math:`f(x^{(j)}) = \exp( x^{(j)} - m(x^{(j)}) )` is of size :math:`B` and
 :math:`z(x^{(j)}) = f(x_1^{(j)})+ \ldots+ f(x_B^{(j)})` is a scalar.

-For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size :math:`B_r \times B_c` we can
-compute the row-wise softmax as follows.
+For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size
+:math:`B_r \times B_c` we can compute the row-wise softmax as follows.

 For :math:`j` from :math:`1` to :math:`T_c`, and :math:`i` from :math:`1` to :math:`T_r` calculate,


--- a/docs/dockerhub.rst
+++ b/docs/dockerhub.rst
 ===================
-CK docker hub
+CK Docker Hub
 ===================

-`Docker hub <https://hub.docker.com/r/rocm/composable_kernel>`_
-
 -------------------------------------
 Why do I need this?
 -------------------------------------

-To make our lives easier and bring Composable Kernel dependencies together, we recommend using docker images.
+To make our lives easier and bring Composable Kernel dependencies together, we recommend using
+docker images that can be found on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel>`_.

 -------------------------------------
 So what is Composable Kernel?
 -------------------------------------

-Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
+Composable Kernel (CK) library aims to provide a programming model for writing performance critical
+kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc,
+through general purpose kernel languages, like HIP C++.

 To get the CK library::

    git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git


-
 run a docker container::

    docker run                                                            \
@@ -30,7 +30,7 @@ run a docker container::
    --group-add sudo                                                      \
    -w /root/workspace                                                    \
    -v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                         \
-    rocm/composable_kernel:ck_ub20.04_rocm5.3_release                     \
+    rocm/composable_kernel:ck_ub20.04_rocm5.6                             \
    /bin/bash

 and build the CK::
@@ -58,7 +58,9 @@ We can also run specific examples or tests like::
    ./bin/example_gemm_xdl_fp16
    ./bin/test_gemm_fp16

-For more details visit `CK github repo <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_, `CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example)>`_, `even more CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example>`_.
+For more details visit `CK github repository <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_,
+`CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example>`_,
+`even more CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example>`_.

 -------------------------------------
 And what is inside?
@@ -74,12 +76,11 @@ The docker images have everything you need for running CK including:
 Which image is right for me?
 -------------------------------------

-Let's take a look at the image naming, for example "ck_ub20.04_rocm5.4_release". The image specs are:
+Let's take a look at the image naming, for example ``ck_ub20.04_rocm5.6``. The image specs are:

-* "ck" - made for running Composable Kernel
-* "ub20.04" - based on Ubuntu 20.04
-* "rocm5.4" - ROCm platform version 5.4
-* "release" - compiler version is release
+* ``ck`` - made for running Composable Kernel;
+* ``ub20.04`` - based on Ubuntu 20.04;
+* ``rocm5.6`` - ROCm platform version 5.6.

 So just pick the right image for your project dependencies and you're all set.

@@ -87,7 +88,9 @@ So just pick the right image for your project dependencies and you're all set.
 DIY starts here
 -------------------------------------

-If you need to customize a docker image or just can't stop tinkering, feel free to adjust the `Dockerfile <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile>`_ for your needs.
+If you need to customize a docker image or just can't stop tinkering, feel free to adjust the
+`Dockerfile <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile>`_
+for your needs.

 -------------------------------------
 License

--- a/docs/index.rst
+++ b/docs/index.rst
@@ -12,12 +12,15 @@ This document contains instructions for installing, using, and contributing to C
 Methodology
 -----------

-Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
+The Composable Kernel (CK) library aims to provide a programming model for writing
+performance-critical kernels for machine learning workloads across multiple architectures including
+GPUs, CPUs, etc., through general-purpose kernel languages, like HIP C++.

 CK utilizes two concepts to achieve performance portability and code maintainability:

 * A tile-based programming model
-* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation".
+* Algorithm complexity reduction for complex ML operators, using an innovative technique we call
+  "Tensor Coordinate Transformation".

 .. image:: data/ck_component.png
   :alt: CK Components

--- a/docs/tutorial_hello_world.rst
+++ b/docs/tutorial_hello_world.rst
@@ -6,15 +6,26 @@ CK Hello world
 Motivation
 -------------------------------------

-This tutorial is aimed at engineers dealing with artificial intelligence and machine learning who would like to optimize their pipelines and squeeze every performance drop by adding Composable Kernel (CK) library to their projects. We would like to make the CK library approachable so the tutorial is not based on the latest release and doesn't have all the bleeding edge features, but it will be reproducible now and forever.
+This tutorial is aimed at engineers dealing with artificial intelligence and machine learning who
+would like to optimize their pipelines and squeeze out every drop of performance by adding the
+Composable Kernel (CK) library to their projects. We would like to make the CK library approachable,
+so the tutorial is not based on the latest release and doesn't have all the bleeding-edge features,
+but it will be reproducible now and forever.

-During this tutorial we will have an introduction to the CK library, we will build it and run some examples and tests, so to say we will run a "Hello world" example. In future tutorials we will go in depth and breadth and get familiar with other tools and ways to integrate CK into your project.
+During this tutorial we will get an introduction to the CK library, build it, and run some
+examples and tests; in other words, we will run a "Hello world" example. In future tutorials we will
+go in depth and breadth and get familiar with other tools and ways to integrate CK into your project.

 -------------------------------------
 Description
 -------------------------------------

-Modern AI technology solves more and more problems in all imaginable fields, but crafting fast and efficient workflows is still challenging. CK is one of the tools to make AI heavy lifting as fast and efficient as possible. CK is a collection of optimized AI operator kernels and tools to create new ones. The library has components required for majority of modern neural networks architectures including matrix multiplication, convolution, contraction, reduction, attention modules, variety of activation functions, fused operators and many more.
+Modern AI technology solves more and more problems in all imaginable fields, but crafting fast and
+efficient workflows is still challenging. CK is one of the tools to make AI heavy lifting as fast
+and efficient as possible. CK is a collection of optimized AI operator kernels and tools to create
+new ones. The library has components required for the majority of modern neural network
+architectures, including matrix multiplication, convolution, contraction, reduction, attention
+modules, a variety of activation functions, fused operators and many more.

 So how do we (almost) reach the speed of light? CK acceleration abilities are based on:

@@ -24,15 +35,18 @@ So how do we (almost) reach the speed of light? CK acceleration abilities are ba
 * Hardware acceleration use.
 * Support of low precision data types including fp16, bf16, int8 and int4.

-If you are excited and need more technical details and benchmarking results - read this awesome `blog post <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_.
+If you are excited and need more technical details and benchmarking results - read this awesome
+`blog post <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_.

-For more details visit our `github repo <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_.
+For more details visit our `github repository <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_.

 -------------------------------------
 Hardware targets
 -------------------------------------

-CK library fully supports "gfx908" and "gfx90a" GPU architectures and only some operators are supported for "gfx1030". Let's check the hardware you have at hand and decide on the target GPU architecture
+The CK library fully supports the ``gfx908`` and ``gfx90a`` GPU architectures, and only some operators
+are supported for ``gfx1030``. Let's check the hardware you have at hand and decide on the target
+GPU architecture.

 ==========     =========
 GPU Target     AMD GPU
@@ -42,7 +56,8 @@ gfx90a 	       Radeon Instinct MI210, MI250, MI250X
 gfx1030        Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6800 XT, RX 6900 XT, RX 6900 XTX, RX 6950 XT
 ==========     =========

-There are also `cloud options <https://aws.amazon.com/ec2/instance-types/g4/>`_ you can find if you don't have an AMD GPU at hand.
+There are also `cloud options <https://aws.amazon.com/ec2/instance-types/g4/>`_ you can find if
+you don't have an AMD GPU at hand.

 -------------------------------------
 Build the library
@@ -54,9 +69,13 @@ First let's clone the library and rebase to the tested version::
    cd composable_kernel/
    git checkout tutorial_hello_world

-To make our lives easier we prepared `docker images <https://hub.docker.com/r/rocm/composable_kernel>`_ with all the necessary dependencies. Pick the right image and create a container. In this tutorial we use "rocm/composable_kernel:ck_ub20.04_rocm5.3_release" image, it is based on Ubuntu 20.04, ROCm v5.3, compiler release version.
+To make our lives easier we prepared
+`docker images <https://hub.docker.com/r/rocm/composable_kernel>`_ with all the necessary
+dependencies. Pick the right image and create a container. In this tutorial we use the
+``rocm/composable_kernel:ck_ub20.04_rocm5.6`` image, which is based on Ubuntu 20.04 and
+ROCm v5.6.

-If your current folder is ${HOME}, start the docker container with::
+If your current folder is ``${HOME}``, start the docker container with::

    docker run  \
    -it  \
@@ -64,20 +83,23 @@ If your current folder is ${HOME}, start the docker container with::
    --group-add sudo  \
    -w /root/workspace  \
    -v ${HOME}:/root/workspace  \
-    rocm/composable_kernel:ck_ub20.04_rocm5.3_release  \
+    rocm/composable_kernel:ck_ub20.04_rocm5.6  \
    /bin/bash

-If your current folder is different from ${HOME}, adjust the line `-v ${HOME}:/root/workspace` to fit your folder structure.
+If your current folder is different from ``${HOME}``, adjust the line ``-v ${HOME}:/root/workspace``
+to fit your folder structure.

-Inside the docker container current folder is "~/workspace", library path is "~/workspace/composable_kernel", navigate to the library::
+Inside the docker container the current folder is ``~/workspace`` and the library path is
+``~/workspace/composable_kernel``. Navigate to the library::

    cd composable_kernel/

-Create and go to the "build" directory::
+Create and go to the ``build`` directory::

    mkdir build && cd build

-In the previous section we talked about target GPU architecture. Once you decide which one is right for you, run cmake using the right GPU_TARGETS flag::
+In the previous section we talked about target GPU architecture. Once you decide which one is right
+for you, run CMake using the right ``GPU_TARGETS`` flag::

    cmake  \
    -D CMAKE_PREFIX_PATH=/opt/rocm  \
@@ -87,7 +109,7 @@ In the previous section we talked about target GPU architecture. Once you decide
    -D BUILD_DEV=OFF  \
    -D GPU_TARGETS="gfx908;gfx90a;gfx1030" ..

-If everything went well the cmake run will end up with::
+If everything went well the CMake run will end up with::

    -- Configuring done
    -- Generating done
@@ -118,9 +140,12 @@ We can also run them separately, here is a separate example execution::

    ./bin/example_gemm_xdl_fp16 1 1 1

-The arguments "1 1 1" mean that we want to run this example in the mode: verify results with CPU, initialize matrices with integers and benchmark the kernel execution. You can play around with these parameters and see how output and execution results change.
+The arguments ``1 1 1`` mean that we want to run this example in the mode: verify results with CPU,
+initialize matrices with integers and benchmark the kernel execution. You can play around with
+these parameters and see how output and execution results change.

-If everything goes well and you have a device based on gfx908 or gfx90a architecture you should see something like::
+If everything goes well and you have a device based on the ``gfx908`` or ``gfx90a`` architecture you
+should see something like::

    a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
    b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
@@ -130,14 +155,15 @@ If everything goes well and you have a device based on gfx908 or gfx90a architec
    Start running 10 times...
    Perf: 1.10017 ms, 117.117 TFlops, 87.6854 GB/s, DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1

-Meanwhile, running it on a gfx1030 device should result in::
+Meanwhile, running it on a ``gfx1030`` device should result in::

    a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
    b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
    c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
    DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem

-But don't panic, some of the operators are supported on gfx1030 architecture, so you can run a separate example like::
+But don't panic, some of the operators are supported on the ``gfx1030`` architecture, so you can run a
+separate example like::

    ./bin/example_gemm_dl_fp16 1 1 1

@@ -154,7 +180,14 @@ and it should result in something nice similar to::
    Start running 10 times...
    Perf: 3.65695 ms, 35.234 TFlops, 26.3797 GB/s, DeviceGemmDl<256, 128, 128, 16, 2, 4, 4, 1>

-Or we can run a separate test::
+.. note::
+
+    A new CMake flag, ``DL_KERNELS``, was added in recent versions of CK. If you use one of
+    the newest versions of the library and do not see the above results when running
+    ``example_gemm_dl_fp16``, it might be necessary to add ``-D DL_KERNELS=ON`` to your CMake command
+    in order to build the operators supported on the ``gfx1030`` architecture.
+
+We can also run a separate test::

    ctest -R test_gemm_fp16

@@ -169,6 +202,9 @@ If everything goes well you should see something like::
 Summary
 -----------

-In this tutorial we took the first look at the Composable Kernel library, built it on your system and ran some examples and tests. Stay tuned, in the next tutorial we will run kernels with different configs to find out the best one for your hardware and task.
+In this tutorial we took the first look at the Composable Kernel library, built it on your system
+and ran some examples and tests. Stay tuned, in the next tutorial we will run kernels with different
+configs to find out the best one for your hardware and task.

-P.S.: Don't forget to switch out the cloud instance if you have launched one, you can find better ways to spend your money for sure!
+P.S.: Don't forget to switch off the cloud instance if you have launched one; you can surely find
+better ways to spend your money!
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
-add_custom_target(example_gemm_dl)
+if(DL_KERNELS)
+  add_custom_target(example_gemm_dl)

-add_example_executable(example_gemm_dl_fp32 gemm_dl_fp32.cpp)
-add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp)
+  add_example_executable(example_gemm_dl_fp32 gemm_dl_fp32.cpp)
+  add_dependencies(example_gemm_dl example_gemm_dl_fp32)
+  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp)
+    add_dependencies(example_gemm_dl example_gemm_dl_fp16)
+  endif()
+  if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
+    add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
+    add_dependencies(example_gemm_dl example_gemm_dl_int8)
+  endif()

-add_dependencies(example_gemm_dl example_gemm_dl_fp32)
-add_dependencies(example_gemm_dl example_gemm_dl_fp16)
-
-if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-   add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
-   add_dependencies(example_gemm_dl example_gemm_dl_int8)
+  if(USE_BITINT_EXTENSION_INT4)
+    add_example_executable(example_gemm_dl_int4 gemm_dl_int4.cpp)
+    add_dependencies(example_gemm_dl example_gemm_dl_int4)
+  endif(USE_BITINT_EXTENSION_INT4)
 endif()

-if(USE_BITINT_EXTENSION_INT4)
-  add_example_executable(example_gemm_dl_int4 gemm_dl_int4.cpp)
-  add_dependencies(example_gemm_dl example_gemm_dl_int4)
-endif(USE_BITINT_EXTENSION_INT4)
-
-
 add_custom_target(example_gemm_xdl)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+  add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
+  add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp)
+  add_dependencies(example_gemm_xdl example_gemm_xdl_fp16)
+  add_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
+  add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp)
+  add_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16)

-add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
-add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp)
-add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
+  if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
+    add_custom_target(example_gemm_wmma)
+    add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
+    add_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
+  endif()

-add_dependencies(example_gemm_xdl example_gemm_xdl_fp16)
-add_dependencies(example_gemm_xdl example_gemm_xdl_bf16)
-add_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
+endif()
+
+if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+  add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
+  add_dependencies(example_gemm_xdl example_gemm_xdl_bf16)
+endif()

 if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
  add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp)
@@ -37,22 +50,17 @@ if(USE_BITINT_EXTENSION_INT4)
  add_dependencies(example_gemm_xdl example_gemm_xdl_int4)
 endif(USE_BITINT_EXTENSION_INT4)

-add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp)
-# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
-add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
-
-add_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16)
-add_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
-
-if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
-  add_custom_target(example_gemm_wmma)
-  add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
-  add_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
+if(DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
+  # FIXME: re-enable this example as test when SWDEV-335738 is fixed
+  add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
+  add_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
 endif()

 add_example_executable(example_gemm_xdl_streamk gemm_xdl_streamk.cpp)

-if(GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx942")
-  add_example_executable(example_gemm_xdl_f8 gemm_xdl_f8.cpp)
-  add_dependencies(example_gemm_xdl example_gemm_xdl_f8)
+if(DTYPES MATCHES "fp8" OR NOT DEFINED DTYPES)
+  if(GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx942")
+    add_example_executable(example_gemm_xdl_f8 gemm_xdl_f8.cpp)
+    add_dependencies(example_gemm_xdl example_gemm_xdl_f8)
+  endif()
 endif()
--- a/example/02_gemm_bilinear/CMakeLists.txt
+++ b/example/02_gemm_bilinear/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 list(APPEND gpu_list1 gfx1100 gfx1101 gfx1102)
 list(APPEND gpu_list2 gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
@@ -15,3 +16,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
   set(target 1)
 endif()
 endforeach()
+endif()
--- a/example/03_gemm_bias_relu/CMakeLists.txt
+++ b/example/03_gemm_bias_relu/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
@@ -6,3 +7,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
   set(target 1)
 endif()
 endforeach()
+endif()
--- a/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
@@ -3,22 +3,26 @@ set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_custom_target(example_gemm_add_add_fastgelu_xdl)
-
-    add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
-    add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
-    add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
+    if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+      add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
+      add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_bf16)
+    endif()
+    if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+      add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
+      add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp16)
+    endif()
+    if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+      add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
+      add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp32)
+    endif()
    if(USE_BITINT_EXTENSION_INT4)
       add_example_executable(example_gemm_add_add_fastgelu_xdl_int4 gemm_add_add_fastgelu_xdl_int4.cpp)
+       add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
    endif(USE_BITINT_EXTENSION_INT4)
-    add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
-
-    add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_bf16)
-    add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp16)
-    add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp32)
-    if(USE_BITINT_EXTENSION_INT4)
-        add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
-    endif(USE_BITINT_EXTENSION_INT4)
-    add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8)
+    if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
+      add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
+      add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8)
+    endif()
   set(target 1)
 endif()
 endforeach()
\ No newline at end of file
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -2,16 +2,34 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
-    add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp)
-    add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
-    add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
-    add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
+    if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+      add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp)
+    endif()
+    if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+      add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
+    endif()
+    if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+      add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
+    endif()
+    if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
+      add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
+    endif()
    # FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
-    add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
+    if(DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
+      add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
+    endif()
   set(target 1)
 endif()
 endforeach()

-add_example_executable(example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp)
-add_example_executable(example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp)
-add_example_executable(example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp)
+if(DL_KERNELS)
+  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp)
+  endif()
+  if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+    add_example_executable(example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp)
+  endif()
+  if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
+    add_example_executable(example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp)
+  endif()
+endif()
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
@@ -3,14 +3,22 @@ set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(example_convnd_fwd_reduce_xdl)
-   add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp)
-   add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp)
-   add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 convnd_fwd_max_xdl_fp16.cpp)
-   add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp)
-   add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int8)
-   add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_bf16)
-   add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp16)
-   add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp32)
+   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
+    add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp)
+    add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int8)
+   endif()
+   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+    add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp)
+    add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_bf16)
+   endif()
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 convnd_fwd_max_xdl_fp16.cpp)
+    add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp16)
+   endif()
+   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+    add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp)
+    add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp32)
+   endif()
   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_convnd_fwd_max_xdl_int4 convnd_fwd_max_xdl_int4.cpp)
      add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int4)

--- a/example/13_pool2d_fwd/CMakeLists.txt
+++ b/example/13_pool2d_fwd/CMakeLists.txt
-add_example_executable(example_pool2d_fwd_fp16 pool2d_fwd_fp16.cpp)
-add_example_executable(example_pool2d_fwd_fp32 pool2d_fwd_fp32.cpp)
-
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_pool2d_fwd_fp16 pool2d_fwd_fp16.cpp)
+endif()
+if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+    add_example_executable(example_pool2d_fwd_fp32 pool2d_fwd_fp32.cpp)
+endif()
--- a/example/14_gemm_quantization/CMakeLists.txt
+++ b/example/14_gemm_quantization/CMakeLists.txt
 if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
 # dlops
-add_example_executable(example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp)
+if(DL_KERNELS)
+  add_example_executable(example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp)
+endif()

 # xdlops
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)

--- a/example/15_grouped_gemm/CMakeLists.txt
+++ b/example/15_grouped_gemm/CMakeLists.txt
 add_custom_target(example_grouped_gemm_xdl)
-
-add_example_executable(example_grouped_gemm_xdl_fp32 grouped_gemm_xdl_fp32.cpp)
-add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp)
-add_example_executable(example_grouped_gemm_xdl_bfp16 grouped_gemm_xdl_bfp16.cpp)
-add_example_executable(example_grouped_gemm_xdl_int8 grouped_gemm_xdl_int8.cpp)
-add_example_executable(example_grouped_gemm_multiple_d_dl_fp16 grouped_gemm_multiple_d_dl_fp16.cpp)
-add_example_executable(example_grouped_gemm_xdl_splitk_fp16 grouped_gemm_xdl_splitk_fp16.cpp)
-
-
-add_dependencies(example_grouped_gemm_xdl
-                 example_grouped_gemm_xdl_fp32
-                 example_grouped_gemm_xdl_fp16
-                 example_grouped_gemm_xdl_bfp16
-                 example_grouped_gemm_xdl_int8
-                 example_grouped_gemm_multiple_d_dl_fp16
-                 example_grouped_gemm_xdl_splitk_fp16)
-
+if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+  add_example_executable(example_grouped_gemm_xdl_fp32 grouped_gemm_xdl_fp32.cpp)
+  add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fp32)
+endif()
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+  add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp)
+  add_example_executable(example_grouped_gemm_multiple_d_dl_fp16 grouped_gemm_multiple_d_dl_fp16.cpp)
+  add_example_executable(example_grouped_gemm_xdl_splitk_fp16 grouped_gemm_xdl_splitk_fp16.cpp)
+  add_dependencies(example_grouped_gemm_xdl 
+                   example_grouped_gemm_xdl_fp16
+                   example_grouped_gemm_multiple_d_dl_fp16
+                   example_grouped_gemm_xdl_splitk_fp16)
+endif()
+if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+  add_example_executable(example_grouped_gemm_xdl_bfp16 grouped_gemm_xdl_bfp16.cpp)
+  add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_bfp16)
+endif()
+if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
+  add_example_executable(example_grouped_gemm_xdl_int8 grouped_gemm_xdl_int8.cpp)
+  add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int8)
+endif()
 if(USE_BITINT_EXTENSION_INT4)
  add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp)
  add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int4)