Commit 1f8e8231 authored by Chao Liu

refactor

parent aa0a891a
#ifndef DEVICE_CONV_FWD_XDL_BIAS_ACTIVATION_ADD_NHWC_KYXC_NHWK_HPP
#define DEVICE_CONV_FWD_XDL_BIAS_ACTIVATION_ADD_NHWC_KYXC_NHWK_HPP
#ifndef DEVICE_CONV2D_FWD_XDL_BIAS_ACTIVATION_ADD_NHWC_KYXC_NHWK_HPP
#define DEVICE_CONV2D_FWD_XDL_BIAS_ACTIVATION_ADD_NHWC_KYXC_NHWK_HPP
#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_base.hpp"
#include "device_conv.hpp"
@@ -10,13 +11,13 @@
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r5.hpp"
#include "example/4_conv_xdl_bias_relu_add/include/device_conv_fwd_xdl_bias_activation_add.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// specialization for 2D conv: in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
// out[N, Ho, Wo, K] =
// activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K]) + residual[N, Ho, Wo, K]
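// A minimal host-side sketch of the fused per-pixel semantics above, assuming
// the activation is ReLU (as in the bias_relu_add example); the function name
// and signature are illustrative only, not part of this header:
//
//   inline float fused_conv_pixel(float conv_acc, float bias, float residual)
//   {
//       const float activated = (conv_acc + bias) > 0.f ? (conv_acc + bias) : 0.f;
//       return activated + residual;
//   }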
template <typename InDataType,
typename WeiDataType,
typename OutDataType,
@@ -51,49 +52,12 @@ template <typename InDataType,
ck::index_t CThreadTransferDstScalarPerVector,
bool ABlockLdsAddExtraM,
bool BBlockLdsAddExtraN>
struct DeviceConvFwdXdl_bias_activation_add<
2, // ck::index_t NDimSpatial,
InDataType, // typename InDataType,
WeiDataType, // typename WeiDataType,
OutDataType, // typename OutDataType,
AccDataType, // typename AccDataType,
ck::tensor_layout::convolution::NHWC, // typename InLayout,
ck::tensor_layout::convolution::KYXC, // typename WeiLayout,
ck::tensor_layout::convolution::NHWK, // typename OutLayout,
InElementwiseOperation, // typename InElementwiseOperation,
WeiElementwiseOperation, // typename WeiElementwiseOperation,
OutElementwiseOperation, // typename OutElementwiseOperation,
BlockSize, // ck::index_t BlockSize,
MPerBlock, // ck::index_t MPerBlock,
NPerBlock, // ck::index_t NPerBlock,
K0PerBlock, // ck::index_t K0PerBlock,
K1, // ck::index_t K1,
MPerXDL, // ck::index_t MPerXDL,
NPerXDL, // ck::index_t NPerXDL,
MXdlPerWave, // ck::index_t MXdlPerWave,
NXdlPerWave, // ck::index_t NXdlPerWave,
ABlockTransferThreadSliceLengths_K0_M_K1, // typename ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1, // typename
// ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder, // typename ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder, // typename ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim, // ck::index_t ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector, // ck::index_t ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1, // ck::index_t ABlockTransferDstScalarPerVector_K1,
BBlockTransferThreadSliceLengths_K0_N_K1, // typename BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1, // typename
// BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder, // typename BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder, // typename BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim, // ck::index_t BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector, // ck::index_t BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1, // ck::index_t BBlockTransferDstScalarPerVector_K1,
CThreadTransferSrcDstVectorDim, // ck::index_t CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector, // ck::index_t CThreadTransferDstScalarPerVector,
ABlockLdsAddExtraM, // bool ABlockLdsAddExtraM,
BBlockLdsAddExtraN // bool BBlockLdsAddExtraN>
> : public BaseOperator
struct DeviceConv2dFwdXdl_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
: public BaseOperator
{
using DeviceOp =
DeviceConv2dFwdXdl_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K;
using ADataType = InDataType;
using BDataType = WeiDataType;
using CDataType = OutDataType;
@@ -241,7 +205,7 @@ struct DeviceConvFwdXdl_bias_activation_add<
// C0: bias tensor: assume a contiguous vector
const auto bias_grid_desc_gemmm_gemmn =
make_naive_tensor_descriptor(make_tuple(GemmM, GemmN), make_tuple(0, 1));
make_naive_tensor_descriptor(make_tuple(GemmM, GemmN), make_tuple(I0, I1));
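// Note: with strides (I0, I1) the (GemmM, GemmN) view computes
// offset(m, n) = m * 0 + n * 1 = n, so every row of the GEMM output reads the
// same K-length bias vector; the bias is broadcast along GemmM without being
// materialized GemmM times.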
// C1: residual tensor: assume same layout as output tensor
const auto resi_grid_desc_gemmm_gemmn = out_gemmm_gemmn_grid_desc;
@@ -407,17 +371,17 @@ struct DeviceConvFwdXdl_bias_activation_add<
wei_element_op_{wei_element_op},
out_element_op_{out_element_op}
{
const auto descs = DeviceConvFwdXdl_bias_activation_add::
MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
const auto descs =
DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
a_grid_desc_k0_m_k1_ = descs[I0];
b_grid_desc_k0_n_k1_ = descs[I1];
@@ -466,7 +430,7 @@ struct DeviceConvFwdXdl_bias_activation_add<
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceConvFwdXdl_bias_activation_add::Argument;
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1)
{
@@ -513,18 +477,15 @@ struct DeviceConvFwdXdl_bias_activation_add<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
remove_reference_t<DeviceConvFwdXdl_bias_activation_add::AGridDesc_K0_M_K1>,
remove_reference_t<DeviceConvFwdXdl_bias_activation_add::BGridDesc_K0_N_K1>,
remove_reference_t<
DeviceConvFwdXdl_bias_activation_add::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<
DeviceConvFwdXdl_bias_activation_add::C0GridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<
DeviceConvFwdXdl_bias_activation_add::C1GridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
remove_reference_t<DeviceOp::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<DeviceOp::C0GridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<DeviceOp::C1GridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation,
remove_reference_t<DeviceConvFwdXdl_bias_activation_add::Block2CTileMap>,
remove_reference_t<DeviceOp::Block2CTileMap>,
true>;
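// The trailing 'true' presumably selects the kernel variant compiled with the
// gridwise GEMM's main K-block loop; the 'false' instantiation below covers
// the single-pass case, and only one of the two is launched per problem.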
ave_time = launch_and_time_kernel(kernel,
@@ -553,18 +514,15 @@ struct DeviceConvFwdXdl_bias_activation_add<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
remove_reference_t<DeviceConvFwdXdl_bias_activation_add::AGridDesc_K0_M_K1>,
remove_reference_t<DeviceConvFwdXdl_bias_activation_add::BGridDesc_K0_N_K1>,
remove_reference_t<
DeviceConvFwdXdl_bias_activation_add::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<
DeviceConvFwdXdl_bias_activation_add::C0GridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<
DeviceConvFwdXdl_bias_activation_add::C1GridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
remove_reference_t<DeviceOp::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<DeviceOp::C0GridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<DeviceOp::C1GridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation,
remove_reference_t<DeviceConvFwdXdl_bias_activation_add::Block2CTileMap>,
remove_reference_t<DeviceOp::Block2CTileMap>,
false>;
ave_time = launch_and_time_kernel(kernel,
@@ -591,7 +549,6 @@ struct DeviceConvFwdXdl_bias_activation_add<
return ave_time;
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
@@ -613,7 +570,6 @@ struct DeviceConvFwdXdl_bias_activation_add<
arg.N01_);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
@@ -661,6 +617,23 @@ struct DeviceConvFwdXdl_bias_activation_add<
}
static auto MakeInvoker() { return Invoker{}; }
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceConv2dFwdXdl_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< K0PerBlock
<< ">";
// clang-format on
return str.str();
}
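// For the instance configured in the example below (BlockSize = 256,
// MPerBlock = 128, NPerBlock = 256, K0PerBlock = 4), GetTypeString() yields:
// "DeviceConv2dFwdXdl_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<256, 128, 256, 4>"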
}; // struct DeviceConv2dFwdXdl_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
} // namespace device
......
@@ -16,7 +16,7 @@ namespace ck {
namespace tensor_operation {
namespace device {
// conv2d: in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
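// Viewed as an implicit GEMM, the usual NHWC/KYXC mapping (a sketch matching
// the descriptor shapes built in this header) is:
//   GemmM = N * Ho * Wo   (one row per output pixel)
//   GemmN = K             (one column per output channel)
//   GemmK = C * Y * X     (reduction over input channels and filter taps)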
template <typename InDataType,
typename WeiDataType,
typename OutDataType,
@@ -484,7 +484,6 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
return ave_time;
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
@@ -506,7 +505,6 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
arg.N01_);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
@@ -551,7 +549,6 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in_grid,
const void* p_wei_grid,
@@ -590,13 +587,11 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
out_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
......
@@ -11,8 +11,7 @@
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "example/4_conv_xdl_bias_relu_add/include/device_conv_fwd_xdl_bias_activation_add.hpp"
#include "example/4_conv_xdl_bias_relu_add/include/device_conv_fwd_xdl_bias_activation_add_nhwc_kyxc_nhwk.hpp"
#include "device_conv2d_fwd_xdl_bias_activation_add_nhwc_kyxc_nhwk.hpp"
struct PassThrough
{
@@ -170,13 +169,14 @@ using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = BiasReluAdd;
// clang-format off
using DeviceConvFwdInstance =
//################################################################| NDim| InData| WeiData| OutData| AccData| In| Wei| Out| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
//################################################################| Spatial| Type| Type| Type| Type| Layout| Layout| Layout| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
//################################################################| | | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
//################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
ck::tensor_operation::device::DeviceConvFwdXdl_bias_activation_add< 2, InDataType, WeiDataType, OutDataType, AccDataType, InLayout, WeiLayout, OutLayout, InElementOp, WeiElementOp, OutElementOp, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 2, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, S<1, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 7, 1, true, true>;
using DeviceConvFwdInstance = ck::tensor_operation::device::
DeviceConv2dFwdXdl_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
// clang-format off
// | InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
// | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 2, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, S<1, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 7, 1, true, true>;
// clang-format on
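// A hedged usage sketch for this instance; buffer allocation is elided and the
// exact MakeArgumentPointer parameter list (problem sizes, strides, pads,
// element ops, plus the bias and residual pointers) is an assumption based on
// the header above:
//
//   DeviceConvFwdInstance conv;
//   auto argument_ptr = conv.MakeArgumentPointer(in_device_buf, wei_device_buf,
//                                                out_device_buf,
//                                                /* ...conv sizes, strides,
//                                                   pads, element ops... */);
//   if(!conv.IsSupportedArgument(argument_ptr.get()))
//       throw std::runtime_error("instance does not support this problem");
//   auto invoker_ptr = conv.MakeInvokerPointer();
//   float ave_time = invoker_ptr->Run(argument_ptr.get(), /*nrepeat=*/1);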
template <typename TIn,
......