"git@developer.sourcefind.cn:OpenDAS/dlib.git" did not exist on "e8baa7d1a4d24a01cdbb010f3096fed44cbca0d8"
Commit df228b3c authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent 0b8e67ef
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include "nvToolsExt.h" #include "nvToolsExt.h"
#include "tensor.hpp" #include "tensor.hpp"
#include "constant_tensor_descriptor.cuh" #include "constant_tensor_descriptor.cuh"
#include "conv_common.cuh"
#include "device_direct_convolution_1.cuh" #include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh" #include "device_direct_convolution_2.cuh"
//#include "device_implicit_gemm_convolution.cuh" //#include "device_implicit_gemm_convolution.cuh"
...@@ -367,7 +368,7 @@ int main() ...@@ -367,7 +368,7 @@ int main()
auto in_desc = make_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{}); auto in_desc = make_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
auto wei_desc = make_ConstantTensorDescriptor(Sequence<K, C, S, R>{}); auto wei_desc = make_ConstantTensorDescriptor(Sequence<K, C, S, R>{});
auto out_desc = get_convolution_output_4d_tensor_descriptor(in_desc, wei_desc); auto out_desc = get_convolution_output_default_4d_tensor_descriptor(in_desc, wei_desc);
ostream_ConstantTensorDescriptor(in_desc, std::cout << "in_desc: "); ostream_ConstantTensorDescriptor(in_desc, std::cout << "in_desc: ");
ostream_ConstantTensorDescriptor(wei_desc, std::cout << "wei_desc: "); ostream_ConstantTensorDescriptor(wei_desc, std::cout << "wei_desc: ");
......
...@@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc, ...@@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{}); make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
constexpr auto out_thread_desc = constexpr auto out_thread_desc =
get_convolution_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc); get_convolution_output_default_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
constexpr auto in_thread_block_desc = constexpr auto in_thread_block_desc =
make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides()); make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides());
......
...@@ -23,14 +23,6 @@ struct Sequence ...@@ -23,14 +23,6 @@ struct Sequence
return mData[I]; return mData[I];
} }
template <unsigned I>
__host__ __device__ constexpr auto GetConstant(Number<I>) const
{
constexpr unsigned N = Get(I);
return Number<N>{};
}
template <unsigned I0, unsigned I1> template <unsigned I0, unsigned I1>
__host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>) const __host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>) const
{ {
...@@ -61,17 +53,15 @@ struct Sequence ...@@ -61,17 +53,15 @@ struct Sequence
return Sequence<IR0, IR1, IR2, IR3>{}; return Sequence<IR0, IR1, IR2, IR3>{};
} }
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3, unsigned I4> template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto Reorder(Sequence<I0, I1, I2, I3>) const
Reorder(Number<I0>, Number<I1>, Number<I2>, Number<I3>, Number<I4>) const
{ {
constexpr unsigned IR0 = Get(Number<I0>{}); constexpr unsigned IR0 = Get(Number<I0>{});
constexpr unsigned IR1 = Get(Number<I1>{}); constexpr unsigned IR1 = Get(Number<I1>{});
constexpr unsigned IR2 = Get(Number<I2>{}); constexpr unsigned IR2 = Get(Number<I2>{});
constexpr unsigned IR3 = Get(Number<I3>{}); constexpr unsigned IR3 = Get(Number<I3>{});
constexpr unsigned IR4 = Get(Number<I4>{});
return Sequence<IR0, IR1, IR2, IR3, IR4>{}; return Sequence<IR0, IR1, IR2, IR3>{};
} }
}; };
...@@ -132,7 +122,8 @@ struct ConstantTensorDescriptor ...@@ -132,7 +122,8 @@ struct ConstantTensorDescriptor
} }
// this is ugly, only for 4d // this is ugly, only for 4d
__host__ __device__ unsigned Get1dIndex(unsigned n, unsigned c, unsigned h, unsigned w) const __host__ __device__ unsigned
Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3) const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -140,24 +131,24 @@ struct ConstantTensorDescriptor ...@@ -140,24 +131,24 @@ struct ConstantTensorDescriptor
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
static_assert(nDim == 4, "nDim is not 4"); static_assert(nDim == 4, "nDim is not 4");
return n * GetStride(I0) + c * GetStride(I1) + h * GetStride(I2) + w * GetStride(I3); return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3);
}
template <class... Is>
__host__ __device__ constexpr auto Reorder(Is... is) const
{
constexpr auto lengths = Lengths{}.Reorder(is...);
constexpr auto strides = Strides{}.Reorder(is...);
return ConstantTensorDescriptor<decltype(lengths), decltype(strides)>{};
} }
}; };
// this is ugly, only for 4d // this is ugly, only for 4d
template <unsigned N, unsigned C, unsigned H, unsigned W> template <unsigned L0, unsigned L1, unsigned L2, unsigned L3>
__host__ __device__ constexpr auto calculate_default_strides(Sequence<N, C, H, W>) __host__ __device__ constexpr auto calculate_default_strides(Sequence<L0, L1, L2, L3>)
{
return Sequence<L1 * L2 * L3, L2 * L3, L3, 1>{};
}
// this is ugly, only for 4d
template <unsigned S0, unsigned S1, unsigned S2, unsigned S3>
__host__ __device__ constexpr auto calculate_full_lengths(Sequence<S0, S1, S2, S3>)
{ {
return Sequence<C * H * W, H * W, W, 1>{}; static_assert((S0 % S1 == 0) && (S1 % S2 == 0) && (S2 % S3 == 0), "cannot be evenly divided!");
return Sequence<1, S0 / S1, S1 / S2, S2 / S3>{};
} }
template <class Lengths> template <class Lengths>
...@@ -173,37 +164,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride ...@@ -173,37 +164,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
return ConstantTensorDescriptor<Lengths, Strides>{}; return ConstantTensorDescriptor<Lengths, Strides>{};
} }
// this is ugly, only for 4d
template <class InDesc, class WeiDesc>
__host__ __device__ constexpr auto get_convolution_output_4d_tensor_descriptor(InDesc, WeiDesc)
{
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
static_assert(in_desc.GetDimension() == 4, "input nDim is not 4");
static_assert(wei_desc.GetDimension() == 4, "weight nDim is not 4");
static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
"input & weight dimension not consistent");
constexpr auto N = in_desc.GetLength(I0);
constexpr auto HI = in_desc.GetLength(I2);
constexpr auto WI = in_desc.GetLength(I3);
constexpr auto K = wei_desc.GetLength(I0);
constexpr auto S = wei_desc.GetLength(I2);
constexpr auto R = wei_desc.GetLength(I3);
constexpr auto HO = HI - S + 1;
constexpr auto WO = WI - R + 1;
return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
}
// this is ugly, only for 4d // this is ugly, only for 4d
template <class TDesc> template <class TDesc>
__host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
......
#pragma once
#include "constant_tensor_descriptor.cuh"
// this is ugly, only for 4d
template <class InDesc, class WeiDesc>
__host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc,
WeiDesc)
{
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
static_assert(in_desc.GetDimension() == 4, "input nDim is not 4");
static_assert(wei_desc.GetDimension() == 4, "weight nDim is not 4");
static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
"input & weight dimension not consistent");
constexpr auto N = in_desc.GetLength(I0);
constexpr auto HI = in_desc.GetLength(I2);
constexpr auto WI = in_desc.GetLength(I3);
constexpr auto K = wei_desc.GetLength(I0);
constexpr auto S = wei_desc.GetLength(I2);
constexpr auto R = wei_desc.GetLength(I3);
constexpr auto HO = HI - S + 1;
constexpr auto WO = WI - R + 1;
return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
}
...@@ -69,8 +69,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, ...@@ -69,8 +69,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides()); Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());
constexpr auto out_thread_desc = constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc); in_thread_block_desc, wei_thread_block_desc);
// register // register
Float p_out_thread[out_thread_desc.GetElementSpace()]; Float p_out_thread[out_thread_desc.GetElementSpace()];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment