"example/vscode:/vscode.git/clone" did not exist on "1deb01b2baf66a683b09a0225946f701a49eac7c"
Commit fe427fd1 authored by Jing Zhang

init commit for conv+activ

parent b8bb1480
@@ -32,7 +32,7 @@ __host__ __device__ constexpr auto make_left_pad_transform(
     return DynamicLeftPad<LowLength, LeftPad, SkipIsValidCheck>{low_length, left_pad};
 }
 
-template <typename LowLength, typename RightPad, bool SkipIsValidCheck>
+template <typename LowLength, typename RightPad, bool SkipIsValidCheck = false>
 __host__ __device__ constexpr auto make_right_pad_transform(
     const LowLength& low_length,
     const RightPad& right_pad,
...
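The only change here gives `SkipIsValidCheck` a default of `false`, so call sites can omit the flag. A minimal sketch of the pattern, with hypothetical stand-in types (not the actual CK `DynamicRightPad`):

#include <cstdio>

// Stand-in for DynamicRightPad: the defaulted non-type template
// parameter keeps existing call sites source-compatible.
template <typename LowLength, bool SkipIsValidCheck = false>
struct RightPadSketch
{
    LowLength low_length;
    static constexpr bool skip_is_valid_check = SkipIsValidCheck;
};

template <typename LowLength, bool SkipIsValidCheck = false>
constexpr auto make_right_pad_sketch(const LowLength& low_length)
{
    return RightPadSketch<LowLength, SkipIsValidCheck>{low_length};
}

int main()
{
    auto checked   = make_right_pad_sketch(4);            // default: validity check stays on
    auto unchecked = make_right_pad_sketch<int, true>(4); // opting out explicitly
    std::printf("%d %d\n", checked.skip_is_valid_check, unchecked.skip_is_valid_check);
    return 0;
}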
@@ -346,6 +346,19 @@ struct GridwiseStaticGemm_km_kn_mn_v3
             blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
         }
+
+        // activ: elementwise activation in registers (1 = relu, 2 = sigmoid)
+        {
+            constexpr index_t activ_type = 2;
+
+            static_for<0, c_k_n_ho_wo_thread_desc.GetElementSpaceSize(), 1>{}([&](auto i) {
+                if constexpr(activ_type == 1)
+                    c_thread_buf(i) = c_thread_buf[i] >= 0 ? c_thread_buf[i] : 0.0;
+                else if constexpr(activ_type == 2)
+                    c_thread_buf(i) = 1.0 / (1.0 + exp(-c_thread_buf[i]));
+            });
+        }
+
         // output: register to global memory
         {
             // hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
...
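The new block applies the activation while the results are still in registers, before the store to global memory; `activ_type` is hard-coded to 2 (sigmoid) for now. The per-element math, sketched on a plain array without the CK `static_for`/buffer machinery:

#include <cmath>
#include <cstdio>

// 1 = ReLU: max(x, 0); 2 = sigmoid: 1 / (1 + e^(-x)), mapping into (0, 1).
template <int ActivType, int N>
void apply_activ(float (&buf)[N])
{
    for(int i = 0; i < N; ++i)
    {
        if constexpr(ActivType == 1)
            buf[i] = buf[i] >= 0.0f ? buf[i] : 0.0f;
        else if constexpr(ActivType == 2)
            buf[i] = 1.0f / (1.0f + std::exp(-buf[i]));
    }
}

int main()
{
    float acc[4] = {-2.0f, -0.5f, 0.5f, 2.0f};
    apply_activ<2>(acc); // same choice as the hard-coded activ_type = 2
    for(float v : acc)
        std::printf("%.4f\n", v);
    return 0;
}

Fusing the activation here costs a few extra ALU operations per element and avoids a second pass over the output tensor.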
@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     const bool do_log = atoi(argv[5]);
     const int nrepeat = atoi(argv[6]);
 
-#if 0
+#if 1
    constexpr index_t N = 1;
    constexpr index_t C = 16;
    constexpr index_t Hi = 1080;
...
@@ -127,7 +127,7 @@ int main(int argc, char* argv[])
    constexpr index_t K = 16;
    constexpr index_t Y = 3;
    constexpr index_t X = 3;
-#elif 1
+#elif 0
    constexpr index_t N = 1;
    constexpr index_t C = 16;
    constexpr index_t Hi = 240;
...
@@ -135,7 +135,7 @@ int main(int argc, char* argv[])
    constexpr index_t K = 16;
    constexpr index_t Y = 3;
    constexpr index_t X = 3;
-#elif 1
+#elif 0
    constexpr index_t N = 1;
    constexpr index_t C = 16;
    constexpr index_t Hi = 1080;
...
@@ -143,6 +143,38 @@ int main(int argc, char* argv[])
    constexpr index_t K = 16;
    constexpr index_t Y = 1;
    constexpr index_t X = 1;
+#elif 0
+    constexpr index_t N = 1;
+    constexpr index_t C = 16;
+    constexpr index_t Hi = 540;
+    constexpr index_t Wi = 960;
+    constexpr index_t K = 16;
+    constexpr index_t Y = 1;
+    constexpr index_t X = 1;
+#elif 0
+    constexpr index_t N = 1;
+    constexpr index_t C = 16;
+    constexpr index_t Hi = 480;
+    constexpr index_t Wi = 270;
+    constexpr index_t K = 16;
+    constexpr index_t Y = 1;
+    constexpr index_t X = 1;
+#elif 0
+    constexpr index_t N = 1;
+    constexpr index_t C = 8;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K = 16;
+    constexpr index_t Y = 3;
+    constexpr index_t X = 3;
+#elif 0
+    constexpr index_t N = 1;
+    constexpr index_t C = 16;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K = 4;
+    constexpr index_t Y = 3;
+    constexpr index_t X = 3;
 #endif
 
     const index_t conv_stride_h = 1;
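For the preset problem sizes above, the output extent follows the usual forward-convolution relation Ho = (Hi + pad_top + pad_bottom - dilation_h * (Y - 1) - 1) / stride_h + 1, and likewise for Wo. A quick check for the now-enabled 1080-high, 3x3-filter case, assuming Wi = 1920 as in the other presets and symmetric padding of 1 (the pad values are not shown in this excerpt):

#include <cstdio>

// Ho = (Hi + pad_l + pad_r - dilation * (Y - 1) - 1) / stride + 1
constexpr int out_len(int in, int pad_l, int pad_r, int dilation, int k, int stride)
{
    return (in + pad_l + pad_r - dilation * (k - 1) - 1) / stride + 1;
}

int main()
{
    // 1080x1920 input, 3x3 filter, stride 1, dilation 1, assumed pad 1:
    // a "same" convolution, so the output stays 1080x1920.
    std::printf("Ho=%d Wo=%d\n",
                out_len(1080, 1, 1, 1, 3, 1),
                out_len(1920, 1, 1, 1, 3, 1));
    return 0;
}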
@@ -420,7 +452,7 @@ int main(int argc, char* argv[])
 #else
     device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
 #endif
-        <in_data_t, 8, acc_data_t, out_data_t>(tmp[I0],
+        <in_data_t, 8, 8, acc_data_t, out_data_t>(tmp[I0],
                                                tmp[I1],
                                                tmp[I2],
                                                tmp[I3],
...
@@ -490,14 +522,15 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-        host_direct_convolution(in,
+        host_direct_convolution_activ(in,
                                 wei,
                                 out_host,
                                 make_tuple(conv_stride_h, conv_stride_w),
                                 make_tuple(conv_dilation_h, conv_dilation_w),
                                 make_tuple(in_left_pad_h, in_left_pad_w),
                                 make_tuple(in_right_pad_h, in_right_pad_w),
-                                layout);
+                                layout,
+                                ActivType_t::sigmoid);
 
         check_error(out_host, out_device);
...
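Since the device kernel now bakes sigmoid into the output, the host reference must apply the same activation or verification will report large mismatches. A generic comparison sketch (this is not CK's `check_error`, whose signature is outside this excerpt):

#include <cmath>
#include <cstdio>
#include <vector>

// Max absolute difference between a host reference and a device result.
float max_abs_error(const std::vector<float>& host_out, const std::vector<float>& device_out)
{
    float err = 0.0f;
    for(std::size_t i = 0; i < host_out.size(); ++i)
        err = std::fmax(err, std::fabs(host_out[i] - device_out[i]));
    return err;
}

int main()
{
    std::vector<float> host_out{0.1f, 0.5f, 0.9f};
    std::vector<float> device_out{0.1f, 0.5f, 0.9001f};
    std::printf("max abs error: %g\n", max_abs_error(host_out, device_out));
    return 0;
}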
@@ -6,6 +6,7 @@
 template <typename TInWei,
           ck::index_t InWeiVectorSize,
+          ck::index_t OutVectorSize,
           typename TAcc,
           typename TOut,
           typename InLengths,
...
@@ -53,8 +54,8 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
     const auto C0 = C / Number<InWeiVectorSize>{};
     const auto C1 = Number<InWeiVectorSize>{};
 
-    const auto K0 = K / Number<InWeiVectorSize>{};
-    const auto K1 = Number<InWeiVectorSize>{};
+    const auto K0 = K / Number<OutVectorSize>{};
+    const auto K1 = Number<OutVectorSize>{};
 
     Tensor<TInWei> in_n_c0_hi_wi_c1(
         HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
...
@@ -105,7 +106,7 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
     constexpr index_t WoPerThread = 2;
     constexpr index_t EPerThread = EPerBlock;
 
-    using ABlockTransferThreadSliceLengths_E_K   = Sequence<9, 1>;
+    using ABlockTransferThreadSliceLengths_E_K   = Sequence<Y * X, 1>;
     using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, KPerBlock>;
 
     constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
...
@@ -120,8 +121,10 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
     constexpr auto conv_driver =
 #if 0
         DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
-#else
+#elif 1
         DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
+#elif 1
+        DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad_1x1
 #endif
         <BlockSize,
          typename vector_type<TInWei, InWeiVectorSize>::type,
...
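The new `OutVectorSize` template parameter decouples the output-side K split from the input/weight vectorization: K is factored as K0 x K1 with K1 = OutVectorSize, where previously both splits reused InWeiVectorSize. The `Sequence<Y * X, 1>` change likewise replaces the hard-coded 9 (valid only for 3x3 filters) with the general filter size. The factorization, spelled out with the values the driver passes (`<in_data_t, 8, 8, ...>` above):

#include <cstdio>

int main()
{
    constexpr int C = 16, K = 16;
    constexpr int InWeiVectorSize = 8; // vector width for input/weight loads
    constexpr int OutVectorSize   = 8; // now an independent width for the output side

    constexpr int C0 = C / InWeiVectorSize, C1 = InWeiVectorSize;
    constexpr int K0 = K / OutVectorSize,   K1 = OutVectorSize;

    static_assert(C0 * C1 == C && K0 * K1 == K,
                  "C and K must be divisible by their vector sizes");

    std::printf("C = %d x %d, K = %d x %d\n", C0, C1, K0, K1);
    return 0;
}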
 #pragma once
 
 #include "host_tensor.hpp"
 
+typedef enum
+{
+    passthrough = 0,
+    relu,
+    sigmoid
+} ActivType_t;
+
 template <typename TIn,
           typename TWei,
           typename TOut,
...
@@ -88,6 +95,106 @@ void host_direct_convolution(const Tensor<TIn>& in,
     }
 }
 
+// scalar activation used by the host reference; the explicit T return
+// type (rather than auto) keeps all branches deducing to the same type
+template <typename T>
+inline T activ(T v, const ActivType_t activ_type)
+{
+    switch(activ_type)
+    {
+    case passthrough: return v;
+    case relu: return (v >= 0 ? v : 0);
+    case sigmoid: return (1 / (1 + exp(-v)));
+    default: throw std::runtime_error("unsupported activ type");
+    }
+}
+
+// host reference convolution with a fused elementwise activation
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void host_direct_convolution_activ(const Tensor<TIn>& in,
+                                   const Tensor<TWei>& wei,
+                                   Tensor<TOut>& out,
+                                   const ConvStrides& conv_strides,
+                                   const ConvDilations& conv_dilations,
+                                   const InLeftPads& in_left_pads,
+                                   const InRightPads& in_right_pads,
+                                   const ConvTensorLayout layout = ConvTensorLayout::NCHW,
+                                   const ActivType_t activ_type = ActivType_t::passthrough)
+{
+    using namespace ck;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+        double v = 0;
+        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[3])
+                    {
+                        v += static_cast<double>(in(n, c, hi, wi)) *
+                             static_cast<double>(wei(k, c, y, x));
+                    }
+                }
+            }
+        }
+        // apply the activation to the accumulated value before the store
+        out(n, k, ho, wo) = activ(v, activ_type);
+    };
+
+    auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
+        double v = 0;
+        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[2])
+                    {
+                        v += static_cast<double>(in(n, hi, wi, c)) *
+                             static_cast<double>(wei(k, y, x, c));
+                    }
+                }
+            }
+        }
+        // NHWC output indexing follows the (n, ho, wo, k) iteration order
+        out(n, ho, wo, k) = activ(v, activ_type);
+    };
+
+    switch(layout)
+    {
+    case ConvTensorLayout::NCHW:
+        make_ParallelTensorFunctor(f_nchw,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    case ConvTensorLayout::NHWC:
+        make_ParallelTensorFunctor(f_nhwc,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    default: throw std::runtime_error("wrong! not supported layout");
+    }
+}
+
 template <typename TIn, typename TWei, typename TOut, typename InLeftPads, typename InRightPads>
 void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                                    const Tensor<TWei>& wei_kcyx,
...
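A standalone check of the new activation dispatch, mirroring the enum and `activ` helper above outside the `Tensor` machinery:

#include <cmath>
#include <cstdio>
#include <stdexcept>

typedef enum { passthrough = 0, relu, sigmoid } ActivType_t;

template <typename T>
T activ(T v, ActivType_t activ_type)
{
    switch(activ_type)
    {
    case passthrough: return v;
    case relu: return v >= 0 ? v : T(0);
    case sigmoid: return T(1) / (T(1) + std::exp(-v));
    default: throw std::runtime_error("unsupported activ type");
    }
}

int main()
{
    const double v = -1.5;
    // relu clamps negatives to 0; sigmoid(-1.5) is roughly 0.1824
    std::printf("passthrough=%.4f relu=%.4f sigmoid=%.4f\n",
                activ(v, passthrough), activ(v, relu), activ(v, sigmoid));
    return 0;
}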