Commit e5f7ded6 authored by Jing Zhang

merge develop

parent ed068043
@@ -3,7 +3,7 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
-//#include <half.hpp>
+#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
@@ -11,7 +11,6 @@
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "host_tensor_generator.hpp" #include "host_tensor_generator.hpp"
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" #include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
@@ -23,6 +22,78 @@ enum ConvForwardAlgo
V5R1NCHWC // 0
};
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& add,
const Tensor<TOut>& bias,
Tensor<TOut>& add_host,
Tensor<TOut>& out_host,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ck::ActivTypeEnum_t activ_type)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
{
v += static_cast<double>(in(n, c0, hi, wi, c1)) *
static_cast<double>(wei(k, c0, y, x, c1));
}
}
}
}
}
v += bias(k0, k1);
v = activ(v, activ_type);
const int hox2 = ho * 2;
const int wox2 = wo * 2;
out_host(n, k0, ho, wo, k1) = v;
add_host(n, k0, hox2, wox2, k1) = v + add(n, k0, hox2, wox2, k1);
add_host(n, k0, hox2, wox2 + 1, k1) = v + add(n, k0, hox2, wox2 + 1, k1);
add_host(n, k0, hox2 + 1, wox2, k1) = v + add(n, k0, hox2 + 1, wox2, k1);
add_host(n, k0, hox2 + 1, wox2 + 1, k1) = v + add(n, k0, hox2 + 1, wox2 + 1, k1);
};
make_ParallelTensorFunctor(f_nchw,
out_host.mDesc.GetLengths()[0],
out_host.mDesc.GetLengths()[1],
out_host.mDesc.GetLengths()[2],
out_host.mDesc.GetLengths()[3],
out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using namespace ck;
@@ -334,10 +405,8 @@ int main(int argc, char* argv[])
if(do_log)
{
-// LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
-// LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
-// LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") <<
-// std::endl;
+LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
+LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "add_host: ", add_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "add_device: ", add_device.mData, ",") << std::endl;
}
...
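Aside: the new host_direct_convolution_add_nchwc above fuses convolution, bias, and activation, then adds each output value into a 2x2 block of the twice-as-large add tensor — a nearest-neighbour 2x upsample followed by an elementwise add. A minimal standalone sketch of just that upsample-and-add step, using plain arrays and hypothetical sizes rather than the repo's Tensor class:

#include <cstddef>
#include <vector>

// Nearest-neighbour 2x upsample of v (Ho x Wo) added into add (2Ho x 2Wo),
// mirroring the four add_host(...) writes in host_direct_convolution_add_nchwc.
void upsample2x_add(const std::vector<double>& v, std::vector<double>& add,
                    std::size_t Ho, std::size_t Wo)
{
    const std::size_t Wx = 2 * Wo; // row stride of the upsampled tensor
    for(std::size_t ho = 0; ho < Ho; ++ho)
        for(std::size_t wo = 0; wo < Wo; ++wo)
        {
            const double x = v[ho * Wo + wo];
            add[(2 * ho) * Wx + (2 * wo)] += x;
            add[(2 * ho) * Wx + (2 * wo + 1)] += x;
            add[(2 * ho + 1) * Wx + (2 * wo)] += x;
            add[(2 * ho + 1) * Wx + (2 * wo + 1)] += x;
        }
}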
@@ -3,7 +3,7 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
-//#include <half.hpp>
+#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
@@ -11,7 +11,6 @@
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "host_tensor_generator.hpp" #include "host_tensor_generator.hpp"
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" #include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
@@ -23,6 +22,64 @@ enum ConvForwardAlgo
V5R1NCHWC // 0
};
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& bias,
Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ck::ActivTypeEnum_t activ_type)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
const int k = k0 * out.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
{
v += static_cast<double>(in(n, c0, hi, wi, c1)) *
static_cast<double>(wei(k, c0, y, x, c1));
}
}
}
}
}
v += bias(k0, k1);
out(n, k0, ho, wo, k1) = activ(v, activ_type);
};
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3],
out.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using namespace ck;
@@ -111,8 +168,8 @@ int main(int argc, char* argv[])
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
-constexpr auto C0 = Number<1>{};
-constexpr auto C1 = Number<4>{};
+constexpr auto C0 = Number<2>{};
+constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
@@ -152,7 +209,7 @@ int main(int argc, char* argv[])
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
-#if 0
+#if 1
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
constexpr auto in_right_pad_h = I1;
...
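For orientation: the NC0HWc1 layout used throughout splits the channel dimension into C0 blocks of C1 lanes (and K into K0 x K1, recovered in the reference as k = k0 * K1 + k1). A hedged sketch of packing a plain NCHW buffer into that layout, with illustrative std::vector storage in place of the repo's Tensor class:

#include <cstddef>
#include <vector>

// Pack an NCHW tensor into the blocked NC0HWc1 layout used by these tests
// (C = C0 * C1; flat channel c maps to block c0 = c / C1, lane c1 = c % C1).
std::vector<float> pack_nchw_to_nc0hwc1(const std::vector<float>& src,
                                        std::size_t N, std::size_t C0,
                                        std::size_t C1, std::size_t H,
                                        std::size_t W)
{
    const std::size_t C = C0 * C1;
    std::vector<float> dst(N * C0 * H * W * C1);
    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t h = 0; h < H; ++h)
                for(std::size_t w = 0; w < W; ++w)
                {
                    const std::size_t c0 = c / C1, c1 = c % C1;
                    dst[(((n * C0 + c0) * H + h) * W + w) * C1 + c1] =
                        src[((n * C + c) * H + h) * W + w];
                }
    return dst;
}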
@@ -3,7 +3,7 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
-//#include <half.hpp>
+#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
@@ -11,7 +11,6 @@
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "host_tensor_generator.hpp" #include "host_tensor_generator.hpp"
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" #include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
@@ -23,6 +22,87 @@ enum ConvForwardAlgo
V5R1NCHWC // 0
};
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& bias,
Tensor<TOut>& out_host,
Tensor<TOut>& max_host,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ck::ActivTypeEnum_t activ_type)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
{
v += static_cast<double>(in(n, c0, hi, wi, c1)) *
static_cast<double>(wei(k, c0, y, x, c1));
}
}
}
}
}
v += bias(k0, k1);
v = activ(v, activ_type);
out_host(n, k0, ho, wo, k1) = v;
};
make_ParallelTensorFunctor(f_nchw,
out_host.mDesc.GetLengths()[0],
out_host.mDesc.GetLengths()[1],
out_host.mDesc.GetLengths()[2],
out_host.mDesc.GetLengths()[3],
out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
auto maxpool_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
auto hx = ho * 2;
auto wx = wo * 2;
auto v0 = out_host(n, k0, hx, wx, k1);
auto v1 = out_host(n, k0, hx, wx + 1, k1);
auto v2 = out_host(n, k0, hx + 1, wx, k1);
auto v3 = out_host(n, k0, hx + 1, wx + 1, k1);
max_host(n, k0, ho, wo, k1) = std::max({v0, v1, v2, v3});
};
make_ParallelTensorFunctor(maxpool_nchw,
max_host.mDesc.GetLengths()[0],
max_host.mDesc.GetLengths()[1],
max_host.mDesc.GetLengths()[2],
max_host.mDesc.GetLengths()[3],
max_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using namespace ck;
@@ -98,17 +178,17 @@ int main(int argc, char* argv[])
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
-#if 0
+#if 1
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
-constexpr auto C0 = Number<3>{};
-constexpr auto C1 = Number<4>{};
+constexpr auto C0 = Number<2>{};
+constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
-#elif 1
+#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
...
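The maxpool test's host reference runs the convolution first and then reduces each non-overlapping 2x2 window with std::max, exactly as in the maxpool_nchw lambda above. A small standalone equivalent over one H x W plane (plain arrays; even Ho and Wo assumed):

#include <algorithm>
#include <cstddef>
#include <vector>

// 2x2, stride-2 max pooling over a Ho x Wo plane, matching maxpool_nchw.
std::vector<float> maxpool2x2(const std::vector<float>& in,
                              std::size_t Ho, std::size_t Wo)
{
    std::vector<float> out((Ho / 2) * (Wo / 2));
    for(std::size_t h = 0; h < Ho / 2; ++h)
        for(std::size_t w = 0; w < Wo / 2; ++w)
        {
            const std::size_t hx = 2 * h, wx = 2 * w;
            out[h * (Wo / 2) + w] = std::max({in[hx * Wo + wx],
                                              in[hx * Wo + wx + 1],
                                              in[(hx + 1) * Wo + wx],
                                              in[(hx + 1) * Wo + wx + 1]});
        }
    return out;
}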
@@ -74,4 +74,17 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes
return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
}
template <typename T>
inline T activ(T v, const ck::ActivTypeEnum_t activ_type)
{
const T alpha = 0.3;
switch(activ_type)
{
case ck::ActivTypeEnum_t::None: return v;
case ck::ActivTypeEnum_t::LeakyRelu: return (v >= 0 ? v : alpha * v);
case ck::ActivTypeEnum_t::Sigmoid: return static_cast<T>(1 / (1 + std::exp(-v)));
default: throw std::runtime_error("unsupported activ type");
}
}
#endif
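A quick sanity check of the activ helper's intended behaviour (hard-coded alpha of 0.3 for LeakyRelu; sigmoid of 0 gives 0.5). This assumes the activ<T> template and ck::ActivTypeEnum_t as defined above are visible:

#include <cassert>
#include <cmath>

// Smoke test for activ<T>; tolerances avoid exact floating-point comparison.
void activ_smoke_test()
{
    assert(activ(2.0, ck::ActivTypeEnum_t::None) == 2.0);
    assert(std::fabs(activ(-2.0, ck::ActivTypeEnum_t::LeakyRelu) - (-0.6)) < 1e-12);
    assert(std::fabs(activ(0.0, ck::ActivTypeEnum_t::Sigmoid) - 0.5) < 1e-12);
}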
@@ -230,27 +230,23 @@ struct Tensor
{
switch(mDesc.GetNumOfDimension())
{
-case 1:
-{
+case 1: {
auto f = [&](auto i) { (*this)(i) = g(i); };
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
break;
}
-case 2:
-{
+case 2: {
auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
break;
}
-case 3:
-{
+case 3: {
auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
make_ParallelTensorFunctor(
f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
break;
}
-case 4:
-{
+case 4: {
auto f = [&](auto i0, auto i1, auto i2, auto i3) {
(*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
};
@@ -261,8 +257,7 @@ struct Tensor
mDesc.GetLengths()[3])(num_thread);
break;
}
-case 5:
-{
+case 5: {
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
(*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
};
...
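The brace reflow above is cosmetic; each case still builds a rank-specific lambda and hands it to make_ParallelTensorFunctor. For readers without the repo at hand, a hedged sketch of the same dispatch idea for the 2-D case, with std::thread standing in for the repo's functor helper:

#include <cstddef>
#include <thread>
#include <vector>

// Same idea as GenerateTensorValue's case 2: split the outer dimension across
// threads and apply g(i0, i1) to every element. Standalone sketch only; not
// the repo's make_ParallelTensorFunctor.
template <typename F>
void parallel_fill_2d(std::vector<float>& data, std::size_t d0, std::size_t d1,
                      F g, unsigned num_thread = std::thread::hardware_concurrency())
{
    if(num_thread == 0)
        num_thread = 1; // hardware_concurrency() may report 0
    std::vector<std::thread> workers;
    for(unsigned t = 0; t < num_thread; ++t)
        workers.emplace_back([&, t] {
            for(std::size_t i0 = t; i0 < d0; i0 += num_thread) // strided rows
                for(std::size_t i1 = 0; i1 < d1; ++i1)
                    data[i0 * d1 + i1] = g(i0, i1);
        });
    for(auto& w : workers)
        w.join();
}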