Commit 05c484e2 authored by Chao Liu's avatar Chao Liu
Browse files

adding group conv

parent 3474c777
...@@ -93,7 +93,7 @@ void print_helper_msg() ...@@ -93,7 +93,7 @@ void print_helper_msg()
<< "Following arguments (depending on number of spatial dims):\n" << "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n" << " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n" << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <in_n_hi_wi_c image spatial dimensions>, (ie Hi, Wi for 2D)\n" << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n" << " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n" << " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n" << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
......
add_example_executable(example_group_convnd_fwd_bias_relu_xdl_fp16 group_convnd_fwd_bias_relu_xdl_fp16.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "../09_convnd_fwd/convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_multiple_d_nwc_kxc_nwk_xdl_cshuffle.hpp"
// fp16 in/weights/out with fp32 accumulation.
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
// Shorthand for compile-time integer sequences used in the tuning parameters below.
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
// NOTE(review): output op is UnaryConvert and the Ds tuple below is empty, yet the
// example is named "bias_relu" — confirm whether a bias/relu epilogue was intended.
using OutElementOp = ck::tensor_operation::element_wise::UnaryConvert;
using CShuffleDataType = ck::half_t;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// Tuned XDL (matrix-core) forward-convolution device instance, parameterized on the
// number of spatial dimensions (1/2/3). The positional arguments are block/wave/thread
// tiling parameters of the underlying implicit-GEMM kernel.
template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance =
ck::tensor_operation::device::DeviceConvNdFwdMultipleD_NwcKxcNwk_Xdl_CShuffle<
NDimSpatial, //
InDataType, //
WeiDataType, //
AccDataType, //
CShuffleDataType, //
ck::Tuple<>, // DsDataType: no extra D tensors fused into the output
OutDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // presumably NumGemmKPrefetchStage — TODO confirm against the device op
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1
1, // BBlockLdsExtraN
1, // presumably CShuffleMXdlPerWavePerShuffle — TODO confirm
1, // presumably CShuffleNXdlPerWavePerShuffle — TODO confirm
S<1, 32, 1, 8>, // CDE block-transfer thread cluster lengths (see device op decl)
8>; // CDEBlockTransferScalarPerVector_NPerBlock
/// Example driver: runs an XDL forward convolution (1/2/3 spatial dims) and
/// optionally verifies against a reference.
///
/// Usage:
///   <exe>                                  — run the default 2D problem
///   <exe> verify init time                 — override run flags only
///   <exe> verify init time ndim <params…>  — full problem specification
///
/// Returns run_conv_fwd's status for supported dims, 1 on bad usage.
int main(int argc, char* argv[])
{
    print_helper_msg();

    bool do_verification = true;
    int init_method      = 1; // initialization scheme for input/weight tensors
    bool time_kernel     = false;

    int num_dim_spatial = 2;

    // Default problem: 2D conv, N=128, K=256, C=192, 3x3 filter, 71x71 input,
    // stride 2, dilation 1, pad 1.
    ck::utils::conv::ConvParam params{
        2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};

    if(argc == 1)
    {
        // use default
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else if(argc >= 5)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
        num_dim_spatial = std::stoi(argv[4]);

        // parse_conv_params consumes the remaining arguments starting at argv[5]
        params = parse_conv_params(num_dim_spatial, 5, argv);
    }
    else
    {
        // argc == 2 or 3: the original fell through and read argv[2]/argv[3],
        // dereferencing argv[argc] (a null pointer) — reject instead.
        std::cerr << "wrong number of arguments" << std::endl;
        return 1;
    }

    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};

    if(num_dim_spatial == 1)
    {
        return run_conv_fwd<1,
                            ck::tensor_layout::convolution::NWC,
                            ck::tensor_layout::convolution::KXC,
                            ck::tensor_layout::convolution::NWK,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<1>>(do_verification,
                                                        init_method,
                                                        time_kernel,
                                                        params,
                                                        in_element_op,
                                                        wei_element_op,
                                                        out_element_op);
    }
    else if(num_dim_spatial == 2)
    {
        return run_conv_fwd<2,
                            ck::tensor_layout::convolution::NHWC,
                            ck::tensor_layout::convolution::KYXC,
                            ck::tensor_layout::convolution::NHWK,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<2>>(do_verification,
                                                        init_method,
                                                        time_kernel,
                                                        params,
                                                        in_element_op,
                                                        wei_element_op,
                                                        out_element_op);
    }
    else if(num_dim_spatial == 3)
    {
        return run_conv_fwd<3,
                            ck::tensor_layout::convolution::NDHWC,
                            ck::tensor_layout::convolution::KZYXC,
                            ck::tensor_layout::convolution::NDHWK,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<3>>(do_verification,
                                                        init_method,
                                                        time_kernel,
                                                        params,
                                                        in_element_op,
                                                        wei_element_op,
                                                        out_element_op);
    }

    // Previously this silently returned 0 for unsupported dims; report it instead.
    std::cerr << "unsupported num_dim_spatial: " << num_dim_spatial << std::endl;
    return 1;
}
...@@ -44,3 +44,4 @@ add_subdirectory(24_batched_gemm_c_permute) ...@@ -44,3 +44,4 @@ add_subdirectory(24_batched_gemm_c_permute)
add_subdirectory(25_gemm_bias_c_permute) add_subdirectory(25_gemm_bias_c_permute)
add_subdirectory(26_contraction) add_subdirectory(26_contraction)
add_subdirectory(27_layernorm) add_subdirectory(27_layernorm)
add_subdirectory(28_group_convnd_fwd_bias_relu)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// Abstract device-op interface: convolution forward expressed as an implicit GEMM,
// with optional extra "D" tensors fused into the epilogue.
//   input  : input image A[N, Hi, Wi, C]
//   input  : weight B[K, Y, X, C]
//   input  : D0[N, Ho, Wo, K], D1[N, Ho, Wo, K], ...
//   output : output image E[N, Ho, Wo, K]
//   C = a_op(A) * b_op(B)
//   E = cde_op(C, D0, D1, ...)
// Assume:
//   D0, D1, ... and E have the same layout (DELayout)
template <ck::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename DELayout,
typename ADataType,
typename BDataType,
typename DsDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation>
struct DeviceConvFwdMultipleD : public BaseOperator
{
// Number of fused D tensors, taken from the DsDataType tuple.
static constexpr index_t NumDTensor = DsDataType::Size();
// Build a type-erased argument object for a concrete problem size.
// NOTE(review): this signature takes no D-tensor pointers even though
// NumDTensor may be > 0 — confirm how derived implementations receive Ds.
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const ADataType* p_a,
const BDataType* p_b,
EDataType* p_e,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) = 0;
// Build the invoker that launches the kernel for an argument created above.
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
#include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" #include "ck/tensor_operation/gpu/device/device_conv_fwd_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
...@@ -151,25 +151,27 @@ template <index_t NDimSpatial, ...@@ -151,25 +151,27 @@ template <index_t NDimSpatial,
index_t CDEBlockTransferScalarPerVector_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock,
LoopScheduler LoopSched = make_default_loop_scheduler()> LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceConvNdFwdMultipleD_NwcKxcNwk_Xdl_CShuffle struct DeviceConvNdFwdMultipleD_NwcKxcNwk_Xdl_CShuffle
: public DeviceConvFwd<NDimSpatial, : public DeviceConvFwdMultipleD<
ck::tuple_element_t<NDimSpatial - 1, NDimSpatial,
ck::Tuple<ck::tensor_layout::convolution::NWC, ck::tuple_element_t<NDimSpatial - 1,
ck::tensor_layout::convolution::NHWC, ck::Tuple<ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::NDHWC>>, ck::tensor_layout::convolution::NHWC,
ck::tuple_element_t<NDimSpatial - 1, ck::tensor_layout::convolution::NDHWC>>,
ck::Tuple<ck::tensor_layout::convolution::KXC, ck::tuple_element_t<NDimSpatial - 1,
ck::tensor_layout::convolution::KYXC, ck::Tuple<ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::KZYXC>>, ck::tensor_layout::convolution::KYXC,
ck::tuple_element_t<NDimSpatial - 1, ck::tensor_layout::convolution::KZYXC>>,
ck::Tuple<ck::tensor_layout::convolution::NWK, ck::tuple_element_t<NDimSpatial - 1,
ck::tensor_layout::convolution::NHWK, ck::Tuple<ck::tensor_layout::convolution::NWK,
ck::tensor_layout::convolution::NDHWK>>, ck::tensor_layout::convolution::NHWK,
ADataType, ck::tensor_layout::convolution::NDHWK>>,
BDataType, ADataType,
EDataType, BDataType,
AElementwiseOperation, DsDataType,
BElementwiseOperation, EDataType,
CDEElementwiseOperation> AElementwiseOperation,
BElementwiseOperation,
CDEElementwiseOperation>
{ {
using DeviceOp = DeviceConvNdFwdMultipleD_NwcKxcNwk_Xdl_CShuffle; using DeviceOp = DeviceConvNdFwdMultipleD_NwcKxcNwk_Xdl_CShuffle;
...@@ -1130,9 +1132,9 @@ struct DeviceConvNdFwdMultipleD_NwcKxcNwk_Xdl_CShuffle ...@@ -1130,9 +1132,9 @@ struct DeviceConvNdFwdMultipleD_NwcKxcNwk_Xdl_CShuffle
static auto MakeInvoker() { return Invoker{}; } static auto MakeInvoker() { return Invoker{}; }
std::unique_ptr<BaseArgument> std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in_grid, MakeArgumentPointer(const ADataType* p_in_grid,
const void* p_wei_grid, const BDataType* p_wei_grid,
void* p_out_grid, EDataType* p_out_grid,
index_t N, index_t N,
index_t K, index_t K,
index_t C, index_t C,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment