Merge branch 'develop' into gemm_v2r3_kpad_fix

a8629a98 · zjing14 · GitHub · 8dc713ea · 94bfa502 · a8629a98
Unverified Commit a8629a98 authored Sep 26, 2023 by zjing14 Committed by GitHub Sep 26, 2023
20 changed files
--- a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt
+++ b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp)
+add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp)

-    if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
-        add_example_executable(example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp)
-    endif()
+if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
+    add_example_executable(example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp)
 endif()
--- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
@@ -5,27 +5,31 @@ set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
   add_custom_target(example_grouped_conv_fwd_multiple_d)
-   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
+   if(result EQUAL 0)
      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
-      add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
+   endif()
+   add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
+   if(result EQUAL 0)
      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
   endif()
-   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
+   if(result EQUAL 0)
      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
   endif()
-   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
+   if(result EQUAL 0)
      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
   endif()
-   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
+   if(result EQUAL 0)
      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
   endif()
   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
-      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
+      if(result EQUAL 0)
+         add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
+      endif()
   endif() # USE_BITINT_EXTENSION_INT4

   set(target 1)
@@ -35,12 +39,8 @@ endforeach()
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
-   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
-   endif()
-   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp)
-   endif()
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp)
   set(target 1)
 endif()
 endforeach()
--- a/example/31_batched_gemm_gemm/CMakeLists.txt
+++ b/example/31_batched_gemm_gemm/CMakeLists.txt
 list(APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942)
-list(APPEND gpu_list2 gfx908 gfx90a)
+
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
-   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-      add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
-   endif()
-   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-      add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
-   endif()
-   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-      add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
-   endif()
+   add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
+   add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
+   add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
   endif(USE_BITINT_EXTENSION_INT4)
@@ -20,7 +14,5 @@ foreach(gpu IN LISTS GPU_TARGETS)
 endforeach()

 if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
-   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-      add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
-   endif()
+   add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
 endif()
--- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
+++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_fp16 batched_gemm_scale_softmax_gemm_xdl_fp16.cpp)
-    add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
-    add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
-    add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp)
-    add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp)
-endif()
-if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_bf16 batched_gemm_scale_softmax_gemm_xdl_bf16.cpp)
-    add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp)
-endif()
-
 add_custom_target(example_gemm_scale_softmax_gemm)
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+
+add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_fp16 batched_gemm_scale_softmax_gemm_xdl_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_fp16)
+endif()
+add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16)
+endif()
+add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16)
+endif()
+add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16)
+endif()
+add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16)
 endif()
-if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_bf16 batched_gemm_scale_softmax_gemm_xdl_bf16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_bf16)
+endif()
+add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16)
 endif()
+
--- a/example/35_splitK_gemm/CMakeLists.txt
+++ b/example/35_splitK_gemm/CMakeLists.txt
@@ -3,25 +3,28 @@ set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(example_splitK_gemm_xdl)
-   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-    add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp)
+
+  add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp)
+  if(result EQUAL 0)
    add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_fp32)
   endif()
-   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp)
+  add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp)
+  if(result EQUAL 0)
    add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_fp16)
   endif()
-   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_splitK_gemm_xdl_bfp16 splitK_gemm_xdl_bfp16.cpp)
-    add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_bfp16)
+  add_example_executable(example_splitK_gemm_xdl_bf16 splitK_gemm_xdl_bf16.cpp)
+  if(result EQUAL 0)
+    add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_bf16)
   endif()
-   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-    add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp)
+  add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp)
+  if(result EQUAL 0)
    add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int8)
   endif()
   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp)
-      add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4)
+      if(result EQUAL 0)
+        add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4)
+      endif()
   endif()
   set(target 1)
 endif()

--- a/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
--- a/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt
+++ b/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_batched_gemm_add_add_relu_gemm_add_xdl_fp16 batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp)
-endif()
+add_example_executable(example_batched_gemm_add_add_relu_gemm_add_xdl_fp16 batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp)
--- a/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
+++ b/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(example_grouped_conv_bwd_data)
   add_example_executable(example_grouped_conv_bwd_data_fp16 grouped_conv_bwd_data_fp16.cpp)
+   if(result EQUAL 0)
+      add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16)
+   endif()
   add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp)
-
-   add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16)
-   add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16)
+   if(result EQUAL 0)
+     add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16)
+   endif()
   set(target 1)
 endif()
 endforeach()
-endif()
--- a/example/39_permute/CMakeLists.txt
+++ b/example/39_permute/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_custom_target(example_permute)
-
-    add_example_executable(example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp)
-    add_example_executable(example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp)
-    add_example_executable(example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp)
+add_custom_target(example_permute)

+add_example_executable(example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_permute example_permute_1xHxW_fp16)
+endif()
+add_example_executable(example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_permute example_permute_NxHxW_fp16)
+endif()
+add_example_executable(example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp)
+if(result EQUAL 0)
    add_dependencies(example_permute example_permute_HxWx4_fp16)
 endif()
--- a/example/40_conv2d_fwd_quantization/CMakeLists.txt
+++ b/example/40_conv2d_fwd_quantization/CMakeLists.txt
-if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
@@ -11,7 +10,6 @@ foreach(gpu IN LISTS GPU_TARGETS)
 endif()
 endforeach()

-  if(DL_KERNELS)
  # Conv perlayer quantization
  add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp)
  # Conv perchannel quantization
@@ -24,5 +22,3 @@ endforeach()
  add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
  # Conv + bias + tanh perchannel quantization
  add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
-  endif()
-endif()
\ No newline at end of file
--- a/example/41_grouped_conv_conv_fwd/CMakeLists.txt
+++ b/example/41_grouped_conv_conv_fwd/CMakeLists.txt
@@ -3,15 +3,9 @@ list(APPEND gpu_list2 gfx908 gfx90a)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
-   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp)
-   endif()
-   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp)
-   endif()
-   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp)
-   endif()
+   add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp)
+   add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp)
+   add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp)
   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp)
   endif(USE_BITINT_EXTENSION_INT4)
@@ -20,7 +14,5 @@ foreach(gpu IN LISTS GPU_TARGETS)
 endforeach()

 if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
-   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-      add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
-   endif()
+   add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
 endif()
--- a/example/42_groupnorm/CMakeLists.txt
+++ b/example/42_groupnorm/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp)
-    add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp)
-    add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp)
-endif()
+add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp)
+add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp)
+add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp)
--- a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
@@ -14,18 +14,22 @@ using ComputeDataType = float;

 struct YElementOp
 {
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    template <typename Y, typename X>
+    __host__ __device__ void operator()(Y& y, const X& x) const
    {
-        static_assert(ck::is_same<T, float>::value || ck::is_same<T, double>::value ||
-                          ck::is_same<T, ck::half_t>::value,
+        static_assert(ck::is_same<X, float>::value || ck::is_same<X, double>::value ||
+                          ck::is_same<X, ck::half_t>::value,
                      "Data type is not supported by this operation!");

-        T a;
+        static_assert(ck::is_same<Y, float>::value || ck::is_same<Y, double>::value ||
+                          ck::is_same<Y, ck::half_t>::value,
+                      "Data type is not supported by this operation!");
+
+        X a;

        ck::tensor_operation::element_wise::Sigmoid{}(a, x);

-        y = x * a;
+        y = ck::type_convert<Y>(x * a);
    };
 };


--- a/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt
+++ b/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp)
-endif()
-if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-    add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp)
-endif()
+add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp)
+add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp)
--- a/example/44_elementwise_permute/CMakeLists.txt
+++ b/example/44_elementwise_permute/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
-    add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
-endif()
+add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
+add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
--- a/example/46_gemm_add_multiply/CMakeLists.txt
+++ b/example/46_gemm_add_multiply/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    if(DL_KERNELS)
-        add_example_executable(example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp)
-    endif()
-    add_example_executable(example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp)
-endif()
+add_example_executable(example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp)
+add_example_executable(example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp)
--- a/example/48_pool3d_fwd/CMakeLists.txt
+++ b/example/48_pool3d_fwd/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp)
-endif()
+add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp)
--- a/example/49_maxpool2d_bwd/CMakeLists.txt
+++ b/example/49_maxpool2d_bwd/CMakeLists.txt
-if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp)
-endif()
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp)
-endif()
-if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-    add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp)
-endif()
+add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp)
+add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp)
+add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp)
--- a/example/50_put_element/CMakeLists.txt
+++ b/example/50_put_element/CMakeLists.txt
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
-endif()
+add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
--- a/example/52_image_to_column/CMakeLists.txt
+++ b/example/52_image_to_column/CMakeLists.txt
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
+   add_custom_target(example_image_to_column)
+   add_example_executable(example_image_to_column_f32 image_to_column_f32.cpp)
+   add_dependencies(example_image_to_column example_image_to_column_f32)
+   set(target 1)
+ endif()
+endforeach()