gaoqiong / composable_kernel

Commit fe427fd1, authored Jul 23, 2021 by Jing Zhang
init commit for conv+activ
parent b8bb1480
Showing 6 changed files with 901 additions and 36 deletions (+901, -36):
- composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp (+711, -2)
- composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp (+1, -1)
- composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp (+13, -0)
- host/driver_offline/conv_fwd_driver_offline.cpp (+62, -29)
- host/driver_offline/include/device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp (+7, -4)
- host/host_tensor/include/host_conv.hpp (+107, -0)
composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp (+711, -2)

(This diff is collapsed.)
composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp (+1, -1)

@@ -32,7 +32,7 @@ __host__ __device__ constexpr auto make_left_pad_transform(
     return DynamicLeftPad<LowLength, LeftPad, SkipIsValidCheck>{low_length, left_pad};
 }
 
-template <typename LowLength, typename RightPad, bool SkipIsValidCheck>
+template <typename LowLength, typename RightPad, bool SkipIsValidCheck = false>
 __host__ __device__ constexpr auto make_right_pad_transform(const LowLength& low_length,
                                                             const RightPad& right_pad,
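The single-line change above gives SkipIsValidCheck a default of false, matching make_left_pad_transform a few lines earlier. Because a trailing non-type template parameter cannot be deduced from the function arguments, the default is what lets a plain two-argument call compile. A minimal stand-alone illustration (stand-in names, not the library's API):

// Illustration only: `right_pad_demo` stands in for make_right_pad_transform.
// Without "= false", the two-argument call below would fail to compile,
// since SkipIsValidCheck cannot be deduced from low_length/right_pad.
template <typename LowLength, typename RightPad, bool SkipIsValidCheck = false>
constexpr bool right_pad_demo(const LowLength&, const RightPad&)
{
    return SkipIsValidCheck;
}

static_assert(right_pad_demo(4, 1) == false, "default used when omitted");
static_assert(right_pad_demo<int, int, true>(4, 1) == true, "still settable explicitly");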
composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp (+13, -0)

@@ -346,6 +346,19 @@ struct GridwiseStaticGemm_km_kn_mn_v3
             blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
         }
 
+        // activ
+        {
+            constexpr index_t activ_type = 2;
+
+            static_for<0, c_k_n_ho_wo_thread_desc.GetElementSpaceSize(), 1>{}([&](auto i) {
+                if constexpr(activ_type == 1)
+                    c_thread_buf(i) = c_thread_buf[i] >= 0 ? c_thread_buf[i] : 0.0;
+                else if constexpr(activ_type == 2)
+                    c_thread_buf(i) = 1.0 / (1.0 + exp(-c_thread_buf[i]));
+            });
+        }
+
         // output: register to global memory
         {
             // hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
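The added block applies the activation in registers, between the blockwise GEMM and the write-out to global memory; activ_type is hard-coded to 2 (sigmoid) in this initial commit, with 1 selecting ReLU, and static_for unrolls the loop over the thread-local element space at compile time. A rough host-side analogue of the epilogue (plain C++17, illustrative only; the real code runs on a per-thread register buffer):

// Host-side analogue of the in-register epilogue above. ACTIV_TYPE mirrors
// the hard-coded constant in the kernel (1 = relu, 2 = sigmoid).
#include <cmath>

constexpr int ACTIV_TYPE = 2;

template <int N>
void epilogue(double (&c_thread)[N])
{
    for(int i = 0; i < N; ++i)
    {
        if constexpr(ACTIV_TYPE == 1)
            c_thread[i] = c_thread[i] >= 0 ? c_thread[i] : 0.0; // relu
        else if constexpr(ACTIV_TYPE == 2)
            c_thread[i] = 1.0 / (1.0 + std::exp(-c_thread[i])); // sigmoid
    }
}

int main()
{
    double acc[4] = {-2.0, -0.5, 0.5, 2.0};
    epilogue(acc); // applies sigmoid in place, as each GPU thread does
}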
host/driver_offline/conv_fwd_driver_offline.cpp (+62, -29)

@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     const bool do_log  = atoi(argv[5]);
     const int nrepeat  = atoi(argv[6]);
 
-#if 0
+#if 1
     constexpr index_t N  = 1;
     constexpr index_t C  = 16;
     constexpr index_t Hi = 1080;
@@ -127,7 +127,7 @@ int main(int argc, char* argv[])
     constexpr index_t K = 16;
     constexpr index_t Y = 3;
     constexpr index_t X = 3;
-#elif 1
+#elif 0
     constexpr index_t N  = 1;
     constexpr index_t C  = 16;
     constexpr index_t Hi = 240;
@@ -135,7 +135,7 @@ int main(int argc, char* argv[])
     constexpr index_t K = 16;
     constexpr index_t Y = 3;
     constexpr index_t X = 3;
-#elif 1
+#elif 0
     constexpr index_t N  = 1;
     constexpr index_t C  = 16;
     constexpr index_t Hi = 1080;
@@ -143,6 +143,38 @@ int main(int argc, char* argv[])
     constexpr index_t K = 16;
     constexpr index_t Y = 1;
     constexpr index_t X = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 540;
+    constexpr index_t Wi = 960;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 480;
+    constexpr index_t Wi = 270;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 8;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 4;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
 #endif
 
     const index_t conv_stride_h = 1;
@@ -420,7 +452,7 @@ int main(int argc, char* argv[])
 #else
     device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
 #endif
-        <in_data_t, 8, acc_data_t, out_data_t>(tmp[I0],
+        <in_data_t, 8, 8, acc_data_t, out_data_t>(tmp[I0],
                                                   tmp[I1],
                                                   tmp[I2],
                                                   tmp[I3],
@@ -490,14 +522,15 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-        host_direct_convolution(in,
+        host_direct_convolution_activ(in,
                                       wei,
                                       out_host,
                                       make_tuple(conv_stride_h, conv_stride_w),
                                       make_tuple(conv_dilation_h, conv_dilation_w),
                                       make_tuple(in_left_pad_h, in_left_pad_w),
                                       make_tuple(in_right_pad_h, in_right_pad_w),
-                                      layout);
+                                      layout,
+                                      ActivType_t::sigmoid);
 
         check_error(out_host, out_device);
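For orientation, the output spatial size implied by these configurations follows the standard convolution formula. A sketch (the formula is standard; the pad values are an assumption for a "same" 3x3 convolution, since the actual pads are set further down in the driver):

// Standard convolution output-length formula, evaluated for the config this
// commit enables (Hi = 1080, Wi = 1920, Y = X = 3, stride 1, dilation 1).
// Pads of 1 are assumed here for illustration.
#include <cstdio>

int conv_out_len(int in_len, int pad_l, int pad_r, int filter, int dilation, int stride)
{
    const int eff_filter = dilation * (filter - 1) + 1; // dilated filter extent
    return (in_len + pad_l + pad_r - eff_filter) / stride + 1;
}

int main()
{
    std::printf("Ho = %d\n", conv_out_len(1080, 1, 1, 3, 1, 1)); // 1080
    std::printf("Wo = %d\n", conv_out_len(1920, 1, 1, 3, 1, 1)); // 1920
}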
host/driver_offline/include/device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp (+7, -4)

@@ -6,6 +6,7 @@
 template <typename TInWei,
           ck::index_t InWeiVectorSize,
+          ck::index_t OutVectorSize,
           typename TAcc,
           typename TOut,
           typename InLengths,
@@ -53,8 +54,8 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
     const auto C0 = C / Number<InWeiVectorSize>{};
     const auto C1 = Number<InWeiVectorSize>{};
 
-    const auto K0 = K / Number<InWeiVectorSize>{};
-    const auto K1 = Number<InWeiVectorSize>{};
+    const auto K0 = K / Number<OutVectorSize>{};
+    const auto K1 = Number<OutVectorSize>{};
 
     Tensor<TInWei> in_n_c0_hi_wi_c1(
         HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
@@ -105,7 +106,7 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
     constexpr index_t WoPerThread = 2;
     constexpr index_t EPerThread  = EPerBlock;
 
-    using ABlockTransferThreadSliceLengths_E_K   = Sequence<9, 1>;
+    using ABlockTransferThreadSliceLengths_E_K   = Sequence<Y * X, 1>;
     using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, KPerBlock>;
 
     constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
@@ -120,8 +121,10 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
     constexpr auto conv_driver =
 #if 0
         DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
-#else
+#elif 1
        DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
+#elif 1
+        DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad_1x1
 #endif
         <BlockSize,
          typename vector_type<TInWei, InWeiVectorSize>::type,
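The new OutVectorSize template parameter decouples the output-side K vectorization from the input/weight vector width: K0/K1 previously reused InWeiVectorSize. The driver now instantiates the device function as <in_data_t, 8, 8, acc_data_t, out_data_t>, passing both widths. A trivial sketch of the split and its divisibility constraint (plain ints standing in for ck::Number):

// Sketch of the K-dimension split introduced above: K = K0 * K1, with K1
// taken from OutVectorSize rather than InWeiVectorSize. Values mirror the
// driver's K = 16, OutVectorSize = 8.
#include <cassert>

int main()
{
    const int K             = 16; // output channels
    const int OutVectorSize = 8;  // new template parameter

    const int K0 = K / OutVectorSize; // number of output vectors
    const int K1 = OutVectorSize;     // elements per output vector

    assert(K0 * K1 == K); // K must be divisible by OutVectorSize
    return 0;
}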
host/host_tensor/include/host_conv.hpp (+107, -0)

 #pragma once
 #include "host_tensor.hpp"
 
+typedef enum
+{
+    passthrough = 0,
+    relu,
+    sigmoid
+} ActivType_t;
+
 template <typename TIn,
           typename TWei,
           typename TOut,
@@ -88,6 +95,106 @@ void host_direct_convolution(const Tensor<TIn>& in,
     }
 }
 
+template <typename T>
+inline auto activ(T v, const ActivType_t activ_type)
+{
+    switch(activ_type)
+    {
+    case passthrough: return v;
+    case relu: return (v >= 0 ? v : 0);
+    case sigmoid: return (1 / (1 + exp(-v)));
+    default: throw std::runtime_error("unsupported activ type"); break;
+    }
+}
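A note on the helper: activ has a deduced (auto) return type, so all three return statements must agree in type; that holds because the reference convolution below accumulates into a double. A self-contained usage sketch (duplicating the helper so it compiles on its own):

// Usage sketch for the reference activation helper, mirroring how the
// convolution lambdas below call it on a double accumulator.
#include <cmath>
#include <cstdio>
#include <stdexcept>

typedef enum { passthrough = 0, relu, sigmoid } ActivType_t;

template <typename T>
inline auto activ(T v, const ActivType_t activ_type)
{
    switch(activ_type)
    {
    case passthrough: return v;
    case relu: return (v >= 0 ? v : 0);
    case sigmoid: return (1 / (1 + std::exp(-v)));
    default: throw std::runtime_error("unsupported activ type");
    }
}

int main()
{
    const double v = -1.5;
    std::printf("relu:    %f\n", activ(v, relu));    // 0.000000
    std::printf("sigmoid: %f\n", activ(v, sigmoid)); // ~0.182426
}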
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void host_direct_convolution_activ(const Tensor<TIn>& in,
+                                   const Tensor<TWei>& wei,
+                                   Tensor<TOut>& out,
+                                   const ConvStrides& conv_strides,
+                                   const ConvDilations& conv_dilations,
+                                   const InLeftPads& in_left_pads,
+                                   const InRightPads& in_right_pads,
+                                   const ConvTensorLayout layout = ConvTensorLayout::NCHW,
+                                   const ActivType_t activ_type  = ActivType_t::passthrough)
+{
+    using namespace ck;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+        double v = 0;
+
+        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[3])
+                    {
+                        v += static_cast<const double>(in(n, c, hi, wi)) *
+                             static_cast<const double>(wei(k, c, y, x));
+                    }
+                }
+            }
+        }
+        out(n, k, ho, wo) = activ(v, activ_type);
+    };
+
+    auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
+        double v = 0;
+
+        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[2])
+                    {
+                        v += static_cast<const double>(in(n, hi, wi, c)) *
+                             static_cast<const double>(wei(k, y, x, c));
+                    }
+                }
+            }
+        }
+        out(n, k, ho, wo) = activ(v, activ_type);
+    };
+
+    switch(layout)
+    {
+    case ConvTensorLayout::NCHW:
+        make_ParallelTensorFunctor(f_nchw,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    case ConvTensorLayout::NHWC:
+        make_ParallelTensorFunctor(f_nhwc,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    default: throw std::runtime_error("wrong! not supported layout");
+    }
+}
+
 template <typename TIn,
           typename TWei,
           typename TOut,
           typename InLeftPads,
           typename InRightPads>
 void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                                    const Tensor<TWei>& wei_kcyx,
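The reference path fans the per-output-element functor out across std::thread::hardware_concurrency() workers via the library's make_ParallelTensorFunctor, applying activ after the accumulation, exactly as the device kernel now does in registers. A minimal stand-in for that parallelization pattern (illustrative, not the library's implementation):

// Illustrative stand-in for the make_ParallelTensorFunctor pattern: split one
// output dimension across hardware threads, each worker handling a strided
// subset of indices.
#include <algorithm>
#include <thread>
#include <vector>

template <typename F>
void parallel_over_n(int N, F f)
{
    const unsigned num_threads = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers;
    for(unsigned t = 0; t < num_threads; ++t)
    {
        workers.emplace_back([=] {
            for(int n = static_cast<int>(t); n < N; n += static_cast<int>(num_threads))
                f(n); // distinct n per worker, so no write conflicts below
        });
    }
    for(auto& w : workers)
        w.join();
}

int main()
{
    std::vector<int> out(64, 0);
    parallel_over_n(64, [&](int n) { out[n] = n * n; }); // e.g. one output element per task
}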