Merge branch 'develop' of...

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into add_fp16_wmma_conv_instance

Merge branch 'develop' of...
Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into add_fp16_wmma_conv_instance
e57bd240 · aska-0096 · 4b70d68e · 37a8c1f7 · e57bd240 · e57bd240
Commit e57bd240 authored Sep 07, 2023 by aska-0096
20 changed files
--- a/example/39_permute/CMakeLists.txt
+++ b/example/39_permute/CMakeLists.txt
-add_custom_target(example_permute)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_custom_target(example_permute)

-add_example_executable(example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp)
-add_example_executable(example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp)
-add_example_executable(example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp)
+    add_example_executable(example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp)
+    add_example_executable(example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp)
+    add_example_executable(example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp)

-add_dependencies(example_permute example_permute_1xHxW_fp16)
-add_dependencies(example_permute example_permute_NxHxW_fp16)
-add_dependencies(example_permute example_permute_HxWx4_fp16)
+    add_dependencies(example_permute example_permute_1xHxW_fp16)
+    add_dependencies(example_permute example_permute_NxHxW_fp16)
+    add_dependencies(example_permute example_permute_HxWx4_fp16)
+endif()
--- a/example/40_conv2d_fwd_quantization/CMakeLists.txt
+++ b/example/40_conv2d_fwd_quantization/CMakeLists.txt
@@ -10,21 +10,19 @@ foreach(gpu IN LISTS GPU_TARGETS)
   set(target 1)
 endif()
 endforeach()
-# Conv perlayer quantization
-add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp)

-# Conv perchannel quantization
-add_example_executable(example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp)
-
-# Conv + bias + relu perlayer quantization
-add_example_executable(example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp)
-
-# Conv + bias + relu perchannel quantization
-add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
-
-# Conv + bias + tanh perlayer quantization
-add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
-
-# Conv + bias + tanh perchannel quantization
-add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
+  if(DL_KERNELS)
+  # Conv perlayer quantization
+  add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp)
+  # Conv perchannel quantization
+  add_example_executable(example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp)
+  # Conv + bias + relu perlayer quantization
+  add_example_executable(example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp)
+  # Conv + bias + relu perchannel quantization
+  add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
+  # Conv + bias + tanh perlayer quantization
+  add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
+  # Conv + bias + tanh perchannel quantization
+  add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
+  endif()
 endif()
\ No newline at end of file
--- a/example/41_grouped_conv_conv_fwd/CMakeLists.txt
+++ b/example/41_grouped_conv_conv_fwd/CMakeLists.txt
@@ -3,9 +3,15 @@ list(APPEND gpu_list2 gfx908 gfx90a)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
+   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
      add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp)
+   endif()
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
      add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp)
+   endif()
+   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
      add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp)
+   endif()
   if(USE_BITINT_EXTENSION_INT4)
      add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp)
   endif(USE_BITINT_EXTENSION_INT4)
@@ -14,5 +20,7 @@ foreach(gpu IN LISTS GPU_TARGETS)
 endforeach()

 if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
+   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
      add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
+   endif()
 endif()
--- a/example/42_groupnorm/CMakeLists.txt
+++ b/example/42_groupnorm/CMakeLists.txt
-add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp)
-add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp)
-add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp)
+    add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp)
+    add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp)
+endif()
--- a/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt
+++ b/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt
-add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp)
-add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp)
+endif()
+if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+    add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp)
+endif()
--- a/example/44_elementwise_permute/CMakeLists.txt
+++ b/example/44_elementwise_permute/CMakeLists.txt
-add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
-add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
+    add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
+endif()
--- a/example/46_gemm_add_multiply/CMakeLists.txt
+++ b/example/46_gemm_add_multiply/CMakeLists.txt
-add_example_executable(example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp)
-add_example_executable(example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    if(DL_KERNELS)
+        add_example_executable(example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp)
+    endif()
+    add_example_executable(example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp)
+endif()
--- a/example/48_pool3d_fwd/CMakeLists.txt
+++ b/example/48_pool3d_fwd/CMakeLists.txt
-add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp)
-
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp)
+endif()
--- a/example/48_pool3d_fwd/pool3d_fwd_common.hpp
+++ b/example/48_pool3d_fwd/pool3d_fwd_common.hpp
@@ -18,7 +18,45 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"

-template <typename InDataType,
+template <typename TensorLayout>
+std::vector<ck::index_t> f_tensor_strides_ncdhw(ck::index_t N_,
+                                                ck::index_t C_,
+                                                ck::index_t D,
+                                                ck::index_t H,
+                                                ck::index_t W,
+                                                TensorLayout layout)
+{
+    using namespace ck::literals;
+    (void)N_;
+    if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
+        return {C_ * D * H * W, D * H * W, H * W, W, 1_uz};
+    else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
+        return {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_};
+};
+
+template <typename TensorLayout>
+HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
+                                              std::size_t C_,
+                                              std::size_t D,
+                                              std::size_t H,
+                                              std::size_t W,
+                                              TensorLayout layout)
+{
+    using namespace ck::literals;
+
+    if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
+    {
+        return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
+    }
+    else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
+    {
+        return HostTensorDescriptor({N_, C_, D, H, W},
+                                    {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
+    }
+};
+
+template <typename DevicePoolFwdInstance,
+          typename InDataType,
          typename OutDataType,
          typename ComputeDataType,
          typename IndexDataType,
@@ -40,6 +78,9 @@ bool pool3d_test(bool do_verification,
                 ck::index_t window_stride_d,
                 ck::index_t window_stride_h,
                 ck::index_t window_stride_w,
+                 ck::index_t window_dilation_d,
+                 ck::index_t window_dilation_h,
+                 ck::index_t window_dilation_w,
                 ck::index_t in_left_pad_d,
                 ck::index_t in_left_pad_h,
                 ck::index_t in_left_pad_w,
@@ -47,53 +88,21 @@ bool pool3d_test(bool do_verification,
                 ck::index_t in_right_pad_h,
                 ck::index_t in_right_pad_w)
 {
-    using DevicePoolFwdInstance =
-        ck::tensor_operation::device::DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<
-            InDataType,      // InDataType
-            OutDataType,     // OutDataType
-            IndexDataType,   // IndexDataType
-            ComputeDataType, // ComputeDataType
-            ReduceOpId,
-            OutputIndex,
-            64, // BlockSize
-            64, // ReduceMThreadClusterSize
-            1,  // ReduceKThreadClusterSize
-            4,  // ReduceMThreadSliceSize
-            1,  // ReduceKThreadSliceSize
-            4>; // InSrcOutDstVectorSize
-
-    const ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Z) / window_stride_d + 1;
-    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
-    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
+    const ck::index_t Zs = (Z - 1) * window_dilation_d + 1;
+    const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
+    const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
+    const ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Zs) / window_stride_d + 1;
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;

    const std::vector<ck::index_t> window_spatial_lengths{Z, Y, X};
    const std::vector<ck::index_t> window_strides{
        window_stride_d, window_stride_h, window_stride_w};
+    const std::vector<ck::index_t> window_dilations{
+        window_dilation_d, window_dilation_h, window_dilation_w};
    const std::vector<ck::index_t> input_left_pads{in_left_pad_d, in_left_pad_h, in_left_pad_w};
    const std::vector<ck::index_t> input_right_pads{in_right_pad_d, in_right_pad_h, in_right_pad_w};

-    // tensor layout
-    auto f_host_tensor_descriptor = [](std::size_t N_,
-                                       std::size_t C_,
-                                       std::size_t D,
-                                       std::size_t H,
-                                       std::size_t W,
-                                       auto layout) {
-        using namespace ck::literals;
-
-        if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
-        {
-            return HostTensorDescriptor({N_, C_, D, H, W},
-                                        {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
-        }
-        else if constexpr(ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::NDHWC>::value)
-        {
-            return HostTensorDescriptor({N_, C_, D, H, W},
-                                        {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
-        }
-    };
-
    Tensor<InDataType> in_n_c_di_hi_wi(f_host_tensor_descriptor(N, C, Di, Hi, Wi, InLayout{}));
    Tensor<OutDataType> out_n_c_do_ho_wo_host(
        f_host_tensor_descriptor(N, C, Do, Ho, Wo, OutLayout{}));
@@ -126,10 +135,11 @@ bool pool3d_test(bool do_verification,
        {N, C, Di, Hi, Wi},
        {Z, Y, X},
        {N, C, Do, Ho, Wo},
-        {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C},
-        {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
-        {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
+        f_tensor_strides_ncdhw(N, C, Di, Hi, Wi, InLayout{}),
+        f_tensor_strides_ncdhw(N, C, Do, Ho, Wo, OutLayout{}),
+        f_tensor_strides_ncdhw(N, C, Do, Ho, Wo, OutLayout{}),
        window_strides,
+        window_dilations,
        input_left_pads,
        input_right_pads,
        {2, 3, 4});
@@ -165,6 +175,7 @@ bool pool3d_test(bool do_verification,
                                                             out_indices_n_c_do_ho_wo_host,
                                                             window_spatial_lengths,
                                                             window_strides,
+                                                             window_dilations,
                                                             input_left_pads,
                                                             input_right_pads);


--- a/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
+++ b/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
@@ -27,6 +27,20 @@ static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
 static constexpr bool OutputIndex  = false;
 static constexpr bool PropagateNan = false;

+using DevicePoolFwdInstance =
+    ck::tensor_operation::device::DevicePool3dFwd_NDHWC_NDHWC<InDataType,
+                                                              OutDataType,
+                                                              IndexDataType,
+                                                              ComputeDataType,
+                                                              ReduceOpId,
+                                                              OutputIndex,
+                                                              64, // BlockSize
+                                                              64, // ReduceMThreadClusterSize
+                                                              1,  // ReduceKThreadClusterSize
+                                                              1,  // ReduceMThreadSliceSize
+                                                              1,  // ReduceKThreadSliceSize
+                                                              1>; // InSrcOutDstVectorSize
+
 int main()
 {
    bool do_verification = true;
@@ -44,6 +58,9 @@ int main()
    ck::index_t window_stride_d   = 2;
    ck::index_t window_stride_h   = 2;
    ck::index_t window_stride_w   = 2;
+    ck::index_t window_dilation_d = 1;
+    ck::index_t window_dilation_h = 1;
+    ck::index_t window_dilation_w = 1;
    ck::index_t in_left_pad_d     = 1;
    ck::index_t in_left_pad_h     = 1;
    ck::index_t in_left_pad_w     = 1;
@@ -51,7 +68,8 @@ int main()
    ck::index_t in_right_pad_h    = 1;
    ck::index_t in_right_pad_w    = 1;

-    bool pass = pool3d_test<InDataType,
+    bool pass = pool3d_test<DevicePoolFwdInstance,
+                            InDataType,
                            OutDataType,
                            ComputeDataType,
                            IndexDataType,
@@ -72,6 +90,9 @@ int main()
                                         window_stride_d,
                                         window_stride_h,
                                         window_stride_w,
+                                         window_dilation_d,
+                                         window_dilation_h,
+                                         window_dilation_w,
                                         in_left_pad_d,
                                         in_left_pad_h,
                                         in_left_pad_w,

--- a/example/49_maxpool2d_bwd/CMakeLists.txt
+++ b/example/49_maxpool2d_bwd/CMakeLists.txt
--- a/example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
--- a/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
--- a/example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
--- a/example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
--- a/example/50_put_element/CMakeLists.txt
+++ b/example/50_put_element/CMakeLists.txt
--- a/example/50_put_element/put_element_fp16.cpp
+++ b/example/50_put_element/put_element_fp16.cpp
--- a/example/51_avgpool3d_bwd/CMakeLists.txt
+++ b/example/51_avgpool3d_bwd/CMakeLists.txt
--- a/example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp
+++ b/example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp
--- a/example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp
+++ b/example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp