Commit e5f7ded6 authored by Jing Zhang

merge develop

parent ed068043
@@ -3,7 +3,7 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
-//#include <half.hpp>
+#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
@@ -11,7 +11,6 @@
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "host_tensor_generator.hpp" #include "host_tensor_generator.hpp"
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" #include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
@@ -23,6 +22,78 @@ enum ConvForwardAlgo
V5R1NCHWC // 0
};
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& add,
const Tensor<TOut>& bias,
Tensor<TOut>& add_host,
Tensor<TOut>& out_host,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ck::ActivTypeEnum_t activ_type)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
{
v += static_cast<double>(in(n, c0, hi, wi, c1)) *
static_cast<double>(wei(k, c0, y, x, c1));
}
}
}
}
}
v += bias(k0, k1);
v = activ(v, activ_type);
const int hox2 = ho * 2;
const int wox2 = wo * 2;
out_host(n, k0, ho, wo, k1) = v;
add_host(n, k0, hox2, wox2, k1) = v + add(n, k0, hox2, wox2, k1);
add_host(n, k0, hox2, wox2 + 1, k1) = v + add(n, k0, hox2, wox2 + 1, k1);
add_host(n, k0, hox2 + 1, wox2, k1) = v + add(n, k0, hox2 + 1, wox2, k1);
add_host(n, k0, hox2 + 1, wox2 + 1, k1) = v + add(n, k0, hox2 + 1, wox2 + 1, k1);
};
make_ParallelTensorFunctor(f_nchw,
out_host.mDesc.GetLengths()[0],
out_host.mDesc.GetLengths()[1],
out_host.mDesc.GetLengths()[2],
out_host.mDesc.GetLengths()[3],
out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using namespace ck;
@@ -334,10 +405,8 @@ int main(int argc, char* argv[])
if(do_log)
{
-// LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
-// LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
-// LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") <<
-// std::endl;
+LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
+LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "add_host: ", add_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "add_device: ", add_device.mData, ",") << std::endl;
}
...
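Aside: the new host_direct_convolution_add_nchwc above fuses convolution, bias, and activation, then adds each output value into a 2x2 block of the twice-as-large add tensor — a nearest-neighbour 2x upsample followed by an elementwise add. A minimal standalone sketch of just that upsample-and-add step, using plain arrays and hypothetical sizes rather than the repo's Tensor class:

#include <cstddef>
#include <vector>

// Nearest-neighbour 2x upsample of v (Ho x Wo) added into add (2Ho x 2Wo),
// mirroring the four add_host(...) writes in host_direct_convolution_add_nchwc.
void upsample2x_add(const std::vector<double>& v, std::vector<double>& add,
                    std::size_t Ho, std::size_t Wo)
{
    const std::size_t Wx = 2 * Wo; // row stride of the upsampled tensor
    for(std::size_t ho = 0; ho < Ho; ++ho)
        for(std::size_t wo = 0; wo < Wo; ++wo)
        {
            const double x = v[ho * Wo + wo];
            add[(2 * ho) * Wx + (2 * wo)] += x;
            add[(2 * ho) * Wx + (2 * wo + 1)] += x;
            add[(2 * ho + 1) * Wx + (2 * wo)] += x;
            add[(2 * ho + 1) * Wx + (2 * wo + 1)] += x;
        }
}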
@@ -3,7 +3,7 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
-//#include <half.hpp>
+#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
@@ -11,7 +11,6 @@
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "host_tensor_generator.hpp" #include "host_tensor_generator.hpp"
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" #include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
@@ -23,6 +22,64 @@ enum ConvForwardAlgo
V5R1NCHWC // 0
};
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& bias,
Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ck::ActivTypeEnum_t activ_type)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
const int k = k0 * out.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
{
v += static_cast<double>(in(n, c0, hi, wi, c1)) *
static_cast<double>(wei(k, c0, y, x, c1));
}
}
}
}
}
v += bias(k0, k1);
out(n, k0, ho, wo, k1) = activ(v, activ_type);
};
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3],
out.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using namespace ck;
@@ -111,8 +168,8 @@ int main(int argc, char* argv[])
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
-constexpr auto C0 = Number<1>{};
-constexpr auto C1 = Number<4>{};
+constexpr auto C0 = Number<2>{};
+constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
@@ -152,7 +209,7 @@ int main(int argc, char* argv[])
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
-#if 0
+#if 1
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
constexpr auto in_right_pad_h = I1;
...
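For orientation: the NC0HWc1 layout used throughout splits the channel dimension into C0 blocks of C1 lanes (and K into K0 x K1, recovered in the reference as k = k0 * K1 + k1). A hedged sketch of packing a plain NCHW buffer into that layout, with illustrative std::vector storage in place of the repo's Tensor class:

#include <cstddef>
#include <vector>

// Pack an NCHW tensor into the blocked NC0HWc1 layout used by these tests
// (C = C0 * C1; flat channel c maps to block c0 = c / C1, lane c1 = c % C1).
std::vector<float> pack_nchw_to_nc0hwc1(const std::vector<float>& src,
                                        std::size_t N, std::size_t C0,
                                        std::size_t C1, std::size_t H,
                                        std::size_t W)
{
    const std::size_t C = C0 * C1;
    std::vector<float> dst(N * C0 * H * W * C1);
    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t h = 0; h < H; ++h)
                for(std::size_t w = 0; w < W; ++w)
                {
                    const std::size_t c0 = c / C1, c1 = c % C1;
                    dst[(((n * C0 + c0) * H + h) * W + w) * C1 + c1] =
                        src[((n * C + c) * H + h) * W + w];
                }
    return dst;
}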
@@ -3,7 +3,7 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
-//#include <half.hpp>
+#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
@@ -11,7 +11,6 @@
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "host_tensor_generator.hpp" #include "host_tensor_generator.hpp"
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" #include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
@@ -23,6 +22,87 @@ enum ConvForwardAlgo
V5R1NCHWC // 0
};
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& bias,
Tensor<TOut>& out_host,
Tensor<TOut>& max_host,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ck::ActivTypeEnum_t activ_type)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
{
v += static_cast<double>(in(n, c0, hi, wi, c1)) *
static_cast<double>(wei(k, c0, y, x, c1));
}
}
}
}
}
v += bias(k0, k1);
v = activ(v, activ_type);
out_host(n, k0, ho, wo, k1) = v;
};
make_ParallelTensorFunctor(f_nchw,
out_host.mDesc.GetLengths()[0],
out_host.mDesc.GetLengths()[1],
out_host.mDesc.GetLengths()[2],
out_host.mDesc.GetLengths()[3],
out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
auto maxpool_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
auto hx = ho * 2;
auto wx = wo * 2;
auto v0 = out_host(n, k0, hx, wx, k1);
auto v1 = out_host(n, k0, hx, wx + 1, k1);
auto v2 = out_host(n, k0, hx + 1, wx, k1);
auto v3 = out_host(n, k0, hx + 1, wx + 1, k1);
max_host(n, k0, ho, wo, k1) = std::max({v0, v1, v2, v3});
};
make_ParallelTensorFunctor(maxpool_nchw,
max_host.mDesc.GetLengths()[0],
max_host.mDesc.GetLengths()[1],
max_host.mDesc.GetLengths()[2],
max_host.mDesc.GetLengths()[3],
max_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using namespace ck;
@@ -98,17 +178,17 @@ int main(int argc, char* argv[])
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
-#if 0
+#if 1
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
-constexpr auto C0 = Number<3>{};
-constexpr auto C1 = Number<4>{};
+constexpr auto C0 = Number<2>{};
+constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
-#elif 1
+#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
...
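The maxpool test's host reference runs the convolution first and then reduces each non-overlapping 2x2 window with std::max, exactly as in the maxpool_nchw lambda above. A small standalone equivalent over one H x W plane (plain arrays; even Ho and Wo assumed):

#include <algorithm>
#include <cstddef>
#include <vector>

// 2x2, stride-2 max pooling over a Ho x Wo plane, matching maxpool_nchw.
std::vector<float> maxpool2x2(const std::vector<float>& in,
                              std::size_t Ho, std::size_t Wo)
{
    std::vector<float> out((Ho / 2) * (Wo / 2));
    for(std::size_t h = 0; h < Ho / 2; ++h)
        for(std::size_t w = 0; w < Wo / 2; ++w)
        {
            const std::size_t hx = 2 * h, wx = 2 * w;
            out[h * (Wo / 2) + w] = std::max({in[hx * Wo + wx],
                                              in[hx * Wo + wx + 1],
                                              in[(hx + 1) * Wo + wx],
                                              in[(hx + 1) * Wo + wx + 1]});
        }
    return out;
}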
@@ -74,4 +74,17 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes
return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
}
template <typename T>
inline T activ(T v, const ck::ActivTypeEnum_t activ_type)
{
const T alpha = 0.3;
switch(activ_type)
{
case ck::ActivTypeEnum_t::None: return v;
case ck::ActivTypeEnum_t::LeakyRelu: return (v >= 0 ? v : alpha * v);
case ck::ActivTypeEnum_t::Sigmoid: return static_cast<T>(1 / (1 + std::exp(-v)));
default: throw std::runtime_error("unsupported activ type");
}
}
#endif
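A quick sanity check of the activ helper's intended behaviour (hard-coded alpha of 0.3 for LeakyRelu; sigmoid of 0 gives 0.5). This assumes the activ<T> template and ck::ActivTypeEnum_t as defined above are visible:

#include <cassert>
#include <cmath>

// Smoke test for activ<T>; tolerances avoid exact floating-point comparison.
void activ_smoke_test()
{
    assert(activ(2.0, ck::ActivTypeEnum_t::None) == 2.0);
    assert(std::fabs(activ(-2.0, ck::ActivTypeEnum_t::LeakyRelu) - (-0.6)) < 1e-12);
    assert(std::fabs(activ(0.0, ck::ActivTypeEnum_t::Sigmoid) - 0.5) < 1e-12);
}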
@@ -230,27 +230,23 @@ struct Tensor
{
switch(mDesc.GetNumOfDimension())
{
-case 1:
-{
+case 1: {
auto f = [&](auto i) { (*this)(i) = g(i); };
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
break;
}
-case 2:
-{
+case 2: {
auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
break;
}
-case 3:
-{
+case 3: {
auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
make_ParallelTensorFunctor(
f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
break;
}
-case 4:
-{
+case 4: {
auto f = [&](auto i0, auto i1, auto i2, auto i3) {
(*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
};
@@ -261,8 +257,7 @@ struct Tensor
mDesc.GetLengths()[3])(num_thread);
break;
}
-case 5:
-{
+case 5: {
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
(*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
};
...
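The brace reflow above is cosmetic; each case still builds a rank-specific lambda and hands it to make_ParallelTensorFunctor. For readers without the repo at hand, a hedged sketch of the same dispatch idea for the 2-D case, with std::thread standing in for the repo's functor helper:

#include <cstddef>
#include <thread>
#include <vector>

// Same idea as GenerateTensorValue's case 2: split the outer dimension across
// threads and apply g(i0, i1) to every element. Standalone sketch only; not
// the repo's make_ParallelTensorFunctor.
template <typename F>
void parallel_fill_2d(std::vector<float>& data, std::size_t d0, std::size_t d1,
                      F g, unsigned num_thread = std::thread::hardware_concurrency())
{
    if(num_thread == 0)
        num_thread = 1; // hardware_concurrency() may report 0
    std::vector<std::thread> workers;
    for(unsigned t = 0; t < num_thread; ++t)
        workers.emplace_back([&, t] {
            for(std::size_t i0 = t; i0 < d0; i0 += num_thread) // strided rows
                for(std::size_t i1 = 0; i1 < d1; ++i1)
                    data[i0 * d1 + i1] = g(i0, i1);
        });
    for(auto& w : workers)
        w.join();
}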