merge from develop and revisison for pr#881

29448ffd · Harisankar Sadasivan · 9223a5e2 · 8f84a012 · 29448ffd · 29448ffd
Commit 29448ffd authored Sep 08, 2023 by Harisankar Sadasivan
20 changed files
--- a/example/17_convnd_bwd_data/CMakeLists.txt
+++ b/example/17_convnd_bwd_data/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
@@ -7,5 +8,8 @@ foreach(gpu IN LISTS GPU_TARGETS)
   set(target 1)
 endif()
 endforeach()
-add_example_executable(example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp)
+  if(DL_KERNELS)
-target_link_libraries(example_convnd_bwd_data_dl_fp16 PRIVATE utility)
+    add_example_executable(example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp)
+    target_link_libraries(example_convnd_bwd_data_dl_fp16 PRIVATE utility)
+  endif()
+endif()
--- a/example/17_convnd_bwd_data/README.md
+++ b/example/17_convnd_bwd_data/README.md
--- a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
--- a/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
--- a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
--- a/example/18_batched_gemm_reduce/CMakeLists.txt
+++ b/example/18_batched_gemm_reduce/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
@@ -6,3 +7,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
   set(target 1)
 endif()
 endforeach()
+endif()
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
--- a/example/19_binary_elementwise/CMakeLists.txt
+++ b/example/19_binary_elementwise/CMakeLists.txt
--- a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
+++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
--- a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
+++ b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
--- a/example/19_binary_elementwise/elementwise_add_1d.cpp
+++ b/example/19_binary_elementwise/elementwise_add_1d.cpp
--- a/example/19_binary_elementwise/elementwise_add_4d.cpp
+++ b/example/19_binary_elementwise/elementwise_add_4d.cpp
--- a/example/20_grouped_conv_bwd_weight/CMakeLists.txt
+++ b/example/20_grouped_conv_bwd_weight/CMakeLists.txt
@@ -3,18 +3,22 @@ set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(example_grouped_conv_bwd_weight)
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-   add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp)
+    add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp)
-   add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp)
+    add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16)
+   endif()
-   add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16
+   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-                                                 example_grouped_conv_bwd_weight_xdl_bf16)
+    add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp)
+    add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_bf16)
+   endif()
   set(target 1)
 endif()
 endforeach()
-add_custom_target(example_grouped_conv_bwd_weight_dl)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+  if(DL_KERNELS)
-add_example_executable(example_grouped_conv_bwd_weight_dl_fp16 grouped_conv_bwd_weight_dl_fp16.cpp)
+    add_custom_target(example_grouped_conv_bwd_weight_dl)
+    add_example_executable(example_grouped_conv_bwd_weight_dl_fp16 grouped_conv_bwd_weight_dl_fp16.cpp)
-add_dependencies(example_grouped_conv_bwd_weight_dl example_grouped_conv_bwd_weight_dl_fp16)
+    add_dependencies(example_grouped_conv_bwd_weight_dl example_grouped_conv_bwd_weight_dl_fp16)
+  endif()
+endif()
\ No newline at end of file
--- a/example/20_grouped_conv_bwd_weight/common.hpp
+++ b/example/20_grouped_conv_bwd_weight/common.hpp
--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp
--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
@@ -3,7 +3,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
 using InDataType = BF16;
 // bf16 kernel use fp32 atomic add to accumulate Weight tensor into global memory
@@ -17,8 +17,20 @@ using OutElementOp = PassThrough;
 template <ck::index_t NDimSpatial>
 using DeviceConvBwdWeightInstance =
-    ck::tensor_operation::device::DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<
+    ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
-        NDimSpatial,          // NDimSpatial
+        NDimSpatial,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GNWC,
+                                      ck::tensor_layout::convolution::GNHWC,
+                                      ck::tensor_layout::convolution::GNDHWC>>,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GKXC,
+                                      ck::tensor_layout::convolution::GKYXC,
+                                      ck::tensor_layout::convolution::GKZYXC>>,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GNWK,
+                                      ck::tensor_layout::convolution::GNHWK,
+                                      ck::tensor_layout::convolution::GNDHWK>>,
        InDataType,           // InDataType
        WeiDataType,          // WeiDataType
        OutDataType,          // OutDataType

--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
@@ -3,7 +3,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
 using InDataType  = F16;
 using WeiDataType = F16;
@@ -16,8 +16,20 @@ using OutElementOp = PassThrough;
 template <ck::index_t NDimSpatial>
 using DeviceConvBwdWeightInstance =
-    ck::tensor_operation::device::DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<
+    ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
-        NDimSpatial,          // NDimSpatial
+        NDimSpatial,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GNWC,
+                                      ck::tensor_layout::convolution::GNHWC,
+                                      ck::tensor_layout::convolution::GNDHWC>>,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GKXC,
+                                      ck::tensor_layout::convolution::GKYXC,
+                                      ck::tensor_layout::convolution::GKZYXC>>,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GNWK,
+                                      ck::tensor_layout::convolution::GNHWK,
+                                      ck::tensor_layout::convolution::GNDHWK>>,
        InDataType,           // InDataType
        WeiDataType,          // WeiDataType
        OutDataType,          // OutDataType

--- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
@@ -72,9 +72,12 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
    // init to 0
    wei_device_buf.SetZero();
-    std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> input_lengths{};
-    std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> input_strides{};
-    std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> filter_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> weights_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> output_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> output_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
    std::array<ck::index_t, NDimSpatial> input_left_pads{};
@@ -82,9 +85,12 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
    auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };
-    range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths));
+    range_copy(in_g_n_c_wis_desc.GetLengths(), begin(input_lengths));
-    range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
+    range_copy(in_g_n_c_wis_desc.GetStrides(), begin(input_strides));
-    range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
+    range_copy(wei_g_k_c_xs_desc.GetLengths(), begin(filter_lengths));
+    range_copy(wei_g_k_c_xs_desc.GetStrides(), begin(weights_strides));
+    range_copy(out_g_n_k_wos_desc.GetLengths(), begin(output_lengths));
+    range_copy(out_g_n_k_wos_desc.GetStrides(), begin(output_strides));
    range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
    range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
    range_copy(conv_param.input_left_pads_, begin(input_left_pads));
@@ -96,13 +102,12 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                      conv_param.G_,
+                                      input_lengths,
-                                      conv_param.N_,
+                                      input_strides,
-                                      conv_param.K_,
+                                      filter_lengths,
-                                      conv_param.C_,
+                                      weights_strides,
-                                      input_spatial_lengths,
+                                      output_lengths,
-                                      filter_spatial_lengths,
+                                      output_strides,
-                                      output_spatial_lengths,
                                      conv_filter_strides,
                                      conv_filter_dilations,
                                      input_left_pads,

--- a/example/21_gemm_layernorm/CMakeLists.txt
+++ b/example/21_gemm_layernorm/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
@@ -9,3 +10,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
   set(target 1)
 endif()
 endforeach()
+endif()
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp