Merge branch 'amd-develop' into amd-master

c9013009 · Jun Liu · 114c2646 · 84dcf5d0 · c9013009 · c9013009
Commit c9013009 authored Sep 25, 2023 by Jun Liu
20 changed files
--- a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
+++ b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
@@ -6,30 +6,43 @@ foreach(gpu IN LISTS GPU_TARGETS)
   add_custom_target(example_gemm_reduce_xdl_max)
   add_custom_target(example_gemm_reduce_xdl_mean_meansquare)
   add_custom_target(example_gemm_add_add_mean_meansquare_xdl)
-   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp)
-    add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp)
-    add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp)
+   add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp)
+   if(result EQUAL 0)
    add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_fp16)
+   endif()
+   add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp)
+   if(result EQUAL 0)
    add_dependencies(example_gemm_add_add_mean_meansquare_xdl example_gemm_add_add_mean_meansquare_xdl_fp16)
+   endif()
+   add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp)
+   if(result EQUAL 0)
    add_dependencies(example_gemm_reduce_xdl_mean_meansquare example_gemm_mean_meansquare_xdl_fp16)
   endif()
-   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-    add_example_executable(example_gemm_max_xdl_int8 gemm_max_xdl_int8.cpp)
-    add_example_executable(example_gemm_add_addsquare_xdl_int8 gemm_add_addsquare_xdl_int8.cpp)
+
+   add_example_executable(example_gemm_max_xdl_int8 gemm_max_xdl_int8.cpp)
+   if(result EQUAL 0)
    add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_int8)
+   endif()
+   add_example_executable(example_gemm_add_addsquare_xdl_int8 gemm_add_addsquare_xdl_int8.cpp)
+   if(result EQUAL 0)
    add_dependencies(example_gemm_reduce_xdl_mean_meansquare example_gemm_add_addsquare_xdl_int8)
   endif()
-   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-    add_example_executable(example_gemm_max_xdl_fp32 gemm_max_xdl_fp32.cpp)
-    add_example_executable(example_gemm_mean_meansquare_xdl_fp32 gemm_mean_meansquare_xdl_fp32.cpp)
+
+   add_example_executable(example_gemm_max_xdl_fp32 gemm_max_xdl_fp32.cpp)
+   if(result EQUAL 0)
    add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_fp32)
+   endif()
+   add_example_executable(example_gemm_mean_meansquare_xdl_fp32 gemm_mean_meansquare_xdl_fp32.cpp)
+   if(result EQUAL 0)
    add_dependencies(example_gemm_reduce_xdl_mean_meansquare example_gemm_mean_meansquare_xdl_fp32)
   endif()
-   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_gemm_max_xdl_bf16 gemm_max_xdl_bf16.cpp)
-    add_example_executable(example_gemm_mean_meansquare_xdl_bf16 gemm_mean_meansquare_xdl_bf16.cpp)
+
+   add_example_executable(example_gemm_max_xdl_bf16 gemm_max_xdl_bf16.cpp)
+   if(result EQUAL 0)
    add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_bf16)
+   endif()
+   add_example_executable(example_gemm_mean_meansquare_xdl_bf16 gemm_mean_meansquare_xdl_bf16.cpp)
+   if(result EQUAL 0)
    add_dependencies(example_gemm_reduce_xdl_mean_meansquare example_gemm_mean_meansquare_xdl_bf16)
   endif()
   
@@ -40,7 +53,9 @@ foreach(gpu IN LISTS GPU_TARGETS)

   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_gemm_max_xdl_int4 gemm_max_xdl_int4.cpp)
-      add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_int4)
+      if(result EQUAL 0)
+        add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_int4)
+      endif()
   endif()
   set(target 1)
 endif()

--- a/example/17_convnd_bwd_data/CMakeLists.txt
+++ b/example/17_convnd_bwd_data/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_example_executable(example_convnd_bwd_data_xdl_fp16 convnd_bwd_data_xdl_fp16.cpp)
-   target_link_libraries(example_convnd_bwd_data_xdl_fp16 PRIVATE utility)
+   if(result EQUAL 0)
+      target_link_libraries(example_convnd_bwd_data_xdl_fp16 PRIVATE utility)
+   endif()
   set(target 1)
 endif()
 endforeach()
-  if(DL_KERNELS)
-    add_example_executable(example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp)
-    target_link_libraries(example_convnd_bwd_data_dl_fp16 PRIVATE utility)
-  endif()
+
+add_example_executable(example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp)
+if(result EQUAL 0)
+  target_link_libraries(example_convnd_bwd_data_dl_fp16 PRIVATE utility)
 endif()
--- a/example/18_batched_gemm_reduce/CMakeLists.txt
+++ b/example/18_batched_gemm_reduce/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
@@ -7,4 +6,3 @@ foreach(gpu IN LISTS GPU_TARGETS)
   set(target 1)
 endif()
 endforeach()
-endif()
--- a/example/20_grouped_conv_bwd_weight/CMakeLists.txt
+++ b/example/20_grouped_conv_bwd_weight/CMakeLists.txt
@@ -3,22 +3,20 @@ set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(example_grouped_conv_bwd_weight)
-   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp)
+   add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp)
+   if(result EQUAL 0)
    add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16)
   endif()
-   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp)
+   add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp)
+   if(result EQUAL 0)
    add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_bf16)
   endif()
   set(target 1)
 endif()
 endforeach()

-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-  if(DL_KERNELS)
-    add_custom_target(example_grouped_conv_bwd_weight_dl)
-    add_example_executable(example_grouped_conv_bwd_weight_dl_fp16 grouped_conv_bwd_weight_dl_fp16.cpp)
-    add_dependencies(example_grouped_conv_bwd_weight_dl example_grouped_conv_bwd_weight_dl_fp16)
-  endif()
-endif()
\ No newline at end of file
+add_custom_target(example_grouped_conv_bwd_weight_dl)
+add_example_executable(example_grouped_conv_bwd_weight_dl_fp16 grouped_conv_bwd_weight_dl_fp16.cpp)
+if(result EQUAL 0)
+  add_dependencies(example_grouped_conv_bwd_weight_dl example_grouped_conv_bwd_weight_dl_fp16)
+endif()
--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp
@@ -3,7 +3,7 @@

 #include "common.hpp"

-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp"

 using InDataType  = F16;
 using WeiDataType = F16;
@@ -15,44 +15,55 @@ using WeiElementOp = PassThrough;
 using OutElementOp = PassThrough;

 template <ck::index_t NDimSpatial>
-using DeviceConvBwdWeightInstance =
-    ck::tensor_operation::device::DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl<
-        NDimSpatial,          // NDimSpatial
-        InDataType,           // InDataType
-        WeiDataType,          // WeiDataType
-        OutDataType,          // OutDataType
-        AccDataType,          // AccDataType
-        InElementOp,          // InElementwiseOperation
-        WeiElementOp,         // WeiElementwiseOperation
-        OutElementOp,         // OutElementwiseOperation
-        ConvBwdWeightDefault, // ConvBackwardWeightSpecialization
-        256,                  // BlockSize
-        128,                  // MPerBlock
-        128,                  // NPerBlock
-        16,                   // K0PerBlock
-        2,                    // K1
-        4,                    // M1PerThread
-        4,                    // N1PerThread
-        1,                    // KPerThread
-        S<8, 2>,              // M1N1ThreadClusterM1Xs
-        S<8, 2>,              // M1N1ThreadClusterN1Xs
-        S<1, 8, 1, 1, 2>,     // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
-        S<1, 2, 1, 128, 1>,   // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
-        S<0, 2, 3, 1, 4>,     // ABlockTransferThreadClusterArrangeOrder
-        S<0, 2, 3, 1, 4>,     // ABlockTransferSrcAccessOrder
-        S<1, 1, 1, 1, 1>,     // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
-        S<0, 2, 3, 1, 4>,     // ABlockTransferSrcVectorTensorContiguousDimOrder
-        S<1, 1, 1, 1, 1>,     // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
-        S<1, 1, 1, 8, 2>,     // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
-        S<1, 16, 1, 16, 1>,   // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
-        S<0, 1, 4, 2, 3>,     // BBlockTransferThreadClusterArrangeOrder
-        S<0, 1, 4, 2, 3>,     // BBlockTransferSrcAccessOrder
-        S<1, 1, 1, 8, 1>,     // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
-        S<0, 1, 4, 2, 3>,     // BBlockTransferSrcVectorTensorContiguousDimOrder
-        S<1, 1, 1, 1, 2>,     // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
-        S<0, 1, 2, 3, 4, 5>,  // CThreadTransferSrcDstAccessOrder
-        5,                    // CThreadTransferSrcDstVectorDim
-        4>;                   // CThreadTransferDstScalarPerVector
+using DeviceConvBwdWeightInstance = ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Dl<
+    NDimSpatial, // NDimSpatial
+    ck::tuple_element_t<NDimSpatial - 1,
+                        ck::Tuple<ck::tensor_layout::convolution::GNWC,
+                                  ck::tensor_layout::convolution::GNHWC,
+                                  ck::tensor_layout::convolution::GNDHWC>>, // InLayout
+    ck::tuple_element_t<NDimSpatial - 1,
+                        ck::Tuple<ck::tensor_layout::convolution::GKXC,
+                                  ck::tensor_layout::convolution::GKYXC,
+                                  ck::tensor_layout::convolution::GKZYXC>>, // WeiLayout
+    ck::tuple_element_t<NDimSpatial - 1,
+                        ck::Tuple<ck::tensor_layout::convolution::GNWK,
+                                  ck::tensor_layout::convolution::GNHWK,
+                                  ck::tensor_layout::convolution::GNDHWK>>, // OutLayout
+    InDataType,                                                             // InDataType
+    WeiDataType,                                                            // WeiDataType
+    OutDataType,                                                            // OutDataType
+    AccDataType,                                                            // AccDataType
+    InElementOp,          // InElementwiseOperation
+    WeiElementOp,         // WeiElementwiseOperation
+    OutElementOp,         // OutElementwiseOperation
+    ConvBwdWeightDefault, // ConvBackwardWeightSpecialization
+    256,                  // BlockSize
+    128,                  // MPerBlock
+    128,                  // NPerBlock
+    16,                   // K0PerBlock
+    2,                    // K1
+    4,                    // M1PerThread
+    4,                    // N1PerThread
+    1,                    // KPerThread
+    S<8, 2>,              // M1N1ThreadClusterM1Xs
+    S<8, 2>,              // M1N1ThreadClusterN1Xs
+    S<1, 8, 1, 1, 2>,     // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
+    S<1, 2, 1, 128, 1>,   // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
+    S<0, 2, 3, 1, 4>,     // ABlockTransferThreadClusterArrangeOrder
+    S<0, 2, 3, 1, 4>,     // ABlockTransferSrcAccessOrder
+    S<1, 1, 1, 1, 1>,     // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
+    S<0, 2, 3, 1, 4>,     // ABlockTransferSrcVectorTensorContiguousDimOrder
+    S<1, 1, 1, 1, 1>,     // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
+    S<1, 1, 1, 8, 2>,     // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
+    S<1, 16, 1, 16, 1>,   // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
+    S<0, 1, 4, 2, 3>,     // BBlockTransferThreadClusterArrangeOrder
+    S<0, 1, 4, 2, 3>,     // BBlockTransferSrcAccessOrder
+    S<1, 1, 1, 8, 1>,     // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
+    S<0, 1, 4, 2, 3>,     // BBlockTransferSrcVectorTensorContiguousDimOrder
+    S<1, 1, 1, 1, 2>,     // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
+    S<0, 1, 2, 3, 4, 5>,  // CThreadTransferSrcDstAccessOrder
+    5,                    // CThreadTransferSrcDstVectorDim
+    4>;                   // CThreadTransferDstScalarPerVector

 #include "run_grouped_conv_bwd_weight_example.inc"


--- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
@@ -14,20 +14,8 @@ template <ck::index_t NDimSpatial>
 bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
                                 const ck::utils::conv::ConvParam& conv_param)
 {
-    ck::index_t split_k;
-    // Set split_k = 2 for xdl op, split_k = 1 for dl
    // Dl op doesn't support split_k > 1
-    // TODO: Add Dl op split_k > 1 support
-    if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
-         ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
-         ck::get_device_name() == "gfx1102"))
-    {
-        split_k = 2;
-    }
-    else
-    {
-        split_k = 1;
-    }
+    constexpr ck::index_t split_k = 1;

    const auto in_g_n_c_wis_desc =
        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<

--- a/example/21_gemm_layernorm/CMakeLists.txt
+++ b/example/21_gemm_layernorm/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
@@ -10,4 +9,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
   set(target 1)
 endif()
 endforeach()
-endif()
+
--- a/example/22_cgemm/CMakeLists.txt
+++ b/example/22_cgemm/CMakeLists.txt
 add_custom_target(example_cgemm_xdl)

-if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-  add_example_executable(example_cgemm_xdl_bf16 cgemm_xdl_bf16.cpp)
+add_example_executable(example_cgemm_xdl_bf16 cgemm_xdl_bf16.cpp)
+if(result EQUAL 0)
  add_dependencies(example_cgemm_xdl example_cgemm_xdl_bf16)
 endif()
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-  add_example_executable(example_cgemm_xdl_fp16 cgemm_xdl_fp16.cpp)
+add_example_executable(example_cgemm_xdl_fp16 cgemm_xdl_fp16.cpp)
+if(result EQUAL 0)
  add_dependencies(example_cgemm_xdl example_cgemm_xdl_fp16)
 endif()
-if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
 add_example_executable(example_cgemm_xdl_fp32 cgemm_xdl_fp32.cpp)
-add_dependencies(example_cgemm_xdl example_cgemm_xdl_fp32)
+if(result EQUAL 0)
+  add_dependencies(example_cgemm_xdl example_cgemm_xdl_fp32)
 endif()
-if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-  add_example_executable(example_cgemm_xdl_int8 cgemm_xdl_int8.cpp)
+add_example_executable(example_cgemm_xdl_int8 cgemm_xdl_int8.cpp)
+if(result EQUAL 0)
  add_dependencies(example_cgemm_xdl example_cgemm_xdl_int8)
 endif()
 if(USE_BITINT_EXTENSION_INT4)

--- a/example/24_batched_gemm/CMakeLists.txt
+++ b/example/24_batched_gemm/CMakeLists.txt
 add_custom_target(example_batched_gemm_xdl)
-if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-  add_example_executable(example_batched_gemm_xdl_fp32 batched_gemm_xdl_fp32.cpp)
+add_example_executable(example_batched_gemm_xdl_fp32 batched_gemm_xdl_fp32.cpp)
+if(result EQUAL 0)
  add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp32)
 endif()
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-  add_example_executable(example_batched_gemm_xdl_fp16 batched_gemm_xdl_fp16.cpp)
+add_example_executable(example_batched_gemm_xdl_fp16 batched_gemm_xdl_fp16.cpp)
+if(result EQUAL 0)
  add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp16)
 endif()
-if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-  add_example_executable(example_batched_gemm_xdl_bfp16 batched_gemm_xdl_bfp16.cpp)
-  add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bfp16)
+add_example_executable(example_batched_gemm_xdl_bf16 batched_gemm_xdl_bf16.cpp)
+if(result EQUAL 0)
+  add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bf16)
 endif()
-if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-  add_example_executable(example_batched_gemm_xdl_int8 batched_gemm_xdl_int8.cpp)
+add_example_executable(example_batched_gemm_xdl_int8 batched_gemm_xdl_int8.cpp)
+if(result EQUAL 0)
  add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int8)
 endif()
 if(USE_BITINT_EXTENSION_INT4)
  add_example_executable(example_batched_gemm_xdl_int4 batched_gemm_xdl_int4.cpp)
-  add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int4)
+  if(result EQUAL 0)
+    add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int4)
+  endif()
 endif()
--- a/example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_bf16.cpp
--- a/example/25_gemm_bias_e_permute/CMakeLists.txt
+++ b/example/25_gemm_bias_e_permute/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_gemm_bias_e_permute_g1m3n2k1_xdl_fp16 gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp)
-    add_example_executable(example_gemm_bias_e_permute_g1m2n3k1_xdl_fp16 gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp)
-endif()
+add_example_executable(example_gemm_bias_e_permute_g1m3n2k1_xdl_fp16 gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp)
+add_example_executable(example_gemm_bias_e_permute_g1m2n3k1_xdl_fp16 gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp)
--- a/example/26_contraction/CMakeLists.txt
+++ b/example/26_contraction/CMakeLists.txt
-if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-    add_example_executable(example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp)
-    add_example_executable(example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp)
-endif()
-if(DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
-    add_example_executable(example_contraction_bilinear_xdl_fp64 contraction_bilinear_xdl_fp64.cpp)
-    add_example_executable(example_contraction_scale_xdl_fp64 contraction_scale_xdl_fp64.cpp)
-endif()
+add_example_executable(example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp)
+add_example_executable(example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp)
+add_example_executable(example_contraction_bilinear_xdl_fp64 contraction_bilinear_xdl_fp64.cpp)
+add_example_executable(example_contraction_scale_xdl_fp64 contraction_scale_xdl_fp64.cpp)
--- a/example/27_layernorm/CMakeLists.txt
+++ b/example/27_layernorm/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_layernorm_fp16 layernorm_fp16.cpp)
-    add_example_executable(example_layernorm_splitk_fp16 layernorm_splitk_fp16.cpp)
-endif()
+add_example_executable(example_layernorm_fp16 layernorm_fp16.cpp)
+add_example_executable(example_layernorm_splitk_fp16 layernorm_splitk_fp16.cpp)
--- a/example/28_grouped_gemm_bias_e_permute/CMakeLists.txt
+++ b/example/28_grouped_gemm_bias_e_permute/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_grouped_gemm_bias_e_permute_xdl_fp16 grouped_gemm_bias_e_permute_xdl_fp16.cpp)
-endif()
+add_example_executable(example_grouped_gemm_bias_e_permute_xdl_fp16 grouped_gemm_bias_e_permute_xdl_fp16.cpp)
--- a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt
+++ b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp)
+add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp)

-    if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
-        add_example_executable(example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp)
-    endif()
+if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
+    add_example_executable(example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp)
 endif()
--- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
@@ -5,27 +5,31 @@ set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
   add_custom_target(example_grouped_conv_fwd_multiple_d)
-   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
+   if(result EQUAL 0)
      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
-      add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
+   endif()
+   add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
+   if(result EQUAL 0)
      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
   endif()
-   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
+   if(result EQUAL 0)
      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
   endif()
-   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
+   if(result EQUAL 0)
      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
   endif()
-   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
+   if(result EQUAL 0)
      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
   endif()
   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
-      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
+      if(result EQUAL 0)
+         add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
+      endif()
   endif() # USE_BITINT_EXTENSION_INT4

   set(target 1)
@@ -35,12 +39,8 @@ endforeach()
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
-   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
-   endif()
-   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp)
-   endif()
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp)
   set(target 1)
 endif()
 endforeach()
--- a/example/31_batched_gemm_gemm/CMakeLists.txt
+++ b/example/31_batched_gemm_gemm/CMakeLists.txt
 list(APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942)
-list(APPEND gpu_list2 gfx908 gfx90a)
+
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
-   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-      add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
-   endif()
-   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-      add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
-   endif()
-   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-      add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
-   endif()
+   add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
+   add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
+   add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
   endif(USE_BITINT_EXTENSION_INT4)
@@ -20,7 +14,5 @@ foreach(gpu IN LISTS GPU_TARGETS)
 endforeach()

 if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
-   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-      add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
-   endif()
+   add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
 endif()
--- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
+++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_fp16 batched_gemm_scale_softmax_gemm_xdl_fp16.cpp)
-    add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
-    add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
-    add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp)
-    add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp)
-endif()
-if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_bf16 batched_gemm_scale_softmax_gemm_xdl_bf16.cpp)
-    add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp)
-endif()
-
 add_custom_target(example_gemm_scale_softmax_gemm)
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+
+add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_fp16 batched_gemm_scale_softmax_gemm_xdl_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_fp16)
+endif()
+add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16)
+endif()
+add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16)
+endif()
+add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16)
+endif()
+add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16)
 endif()
-if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_bf16 batched_gemm_scale_softmax_gemm_xdl_bf16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_bf16)
+endif()
+add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16)
 endif()
+
--- a/example/35_splitK_gemm/CMakeLists.txt
+++ b/example/35_splitK_gemm/CMakeLists.txt
@@ -3,25 +3,28 @@ set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(example_splitK_gemm_xdl)
-   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-    add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp)
+
+  add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp)
+  if(result EQUAL 0)
    add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_fp32)
   endif()
-   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp)
+  add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp)
+  if(result EQUAL 0)
    add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_fp16)
   endif()
-   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_splitK_gemm_xdl_bfp16 splitK_gemm_xdl_bfp16.cpp)
-    add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_bfp16)
+  add_example_executable(example_splitK_gemm_xdl_bf16 splitK_gemm_xdl_bf16.cpp)
+  if(result EQUAL 0)
+    add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_bf16)
   endif()
-   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-    add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp)
+  add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp)
+  if(result EQUAL 0)
    add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int8)
   endif()
   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp)
-      add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4)
+      if(result EQUAL 0)
+        add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4)
+      endif()
   endif()
   set(target 1)
 endif()

--- a/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp