Commit 21c91816 authored by Chao Liu's avatar Chao Liu
Browse files

added blockwise tensor reorder operation

parent 057c10e5
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "constant_tensor_descriptor.cuh" #include "constant_tensor_descriptor.cuh"
#include "device_direct_convolution_1.cuh" #include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh" #include "device_direct_convolution_2.cuh"
//#include "device_implicit_gemm_convolution.cuh"
//#include "device_winograd_convolution.cuh" //#include "device_winograd_convolution.cuh"
struct GeneratorTensor_1 struct GeneratorTensor_1
...@@ -366,7 +367,7 @@ int main() ...@@ -366,7 +367,7 @@ int main()
auto in_desc = make_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{}); auto in_desc = make_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
auto wei_desc = make_ConstantTensorDescriptor(Sequence<K, C, S, R>{}); auto wei_desc = make_ConstantTensorDescriptor(Sequence<K, C, S, R>{});
auto out_desc = get_output_4d_tensor_descriptor(in_desc, wei_desc); auto out_desc = get_convolution_output_4d_tensor_descriptor(in_desc, wei_desc);
ostream_ConstantTensorDescriptor(in_desc, std::cout << "in_desc: "); ostream_ConstantTensorDescriptor(in_desc, std::cout << "in_desc: ");
ostream_ConstantTensorDescriptor(wei_desc, std::cout << "wei_desc: "); ostream_ConstantTensorDescriptor(wei_desc, std::cout << "wei_desc: ");
...@@ -393,6 +394,8 @@ int main() ...@@ -393,6 +394,8 @@ int main()
device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device); device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device);
#elif 1 #elif 1
device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device); device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device);
#elif 0
device_implicit_gemm_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
#elif 0 #elif 0
device_winograd_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); device_winograd_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
#endif #endif
......
...@@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc, ...@@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{}); make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
constexpr auto out_thread_desc = constexpr auto out_thread_desc =
get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc); get_convolution_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
constexpr auto in_thread_block_desc = constexpr auto in_thread_block_desc =
make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides()); make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides());
......
...@@ -2,7 +2,8 @@ ...@@ -2,7 +2,8 @@
#include "constant_tensor_descriptor.cuh" #include "constant_tensor_descriptor.cuh"
template <class TFloat, class DstDesc, class F, unsigned BlockSize> template <class TFloat, class DstDesc, class F, unsigned BlockSize>
__device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f) __device__ void
blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -75,82 +76,94 @@ __device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restri ...@@ -75,82 +76,94 @@ __device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restri
} }
} }
template <class TFloat, class DescA, class DescB, class DescRef, class F, unsigned BlockSize> template <class TFloat,
__device__ void blockwise_4d_tensor_pointwise_op_binary( class SrcDesc,
DescA, TFloat* const __restrict__ p_a, DescB, TFloat* __restrict__ p_b, DescRef, F f) class DstDesc,
class RefDesc,
class Reorder,
class F,
unsigned BlockSize>
__device__ void
blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
TFloat* const __restrict__ p_src,
DstDesc,
TFloat* __restrict__ p_dst,
RefDesc,
Reorder,
F f)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
constexpr auto desc_a = DescA{}; constexpr unsigned IT0 = Reorder{}.Get(I0);
constexpr auto desc_b = DescB{}; constexpr unsigned IT1 = Reorder{}.Get(I1);
constexpr auto desc_ref = DescRef{}; constexpr unsigned IT2 = Reorder{}.Get(I2);
constexpr unsigned IT3 = Reorder{}.Get(I3);
#if 0 constexpr auto src_desc = SrcDesc{};
if(threadIdx.x == 0) constexpr auto dst_desc = DstDesc{};
{ constexpr auto ref_desc = RefDesc{};
print_ConstantTensorDescriptor(desc_a, "blockwise_4d_tensor_op_binary: desc_a: ");
print_ConstantTensorDescriptor(desc_b, "blockwise_4d_tensor_op_binary: desc_b: ");
print_ConstantTensorDescriptor(desc_ref, "blockwise_4d_tensor_op_binary: desc_ref: ");
}
#endif
constexpr unsigned NLoop = desc_ref.GetElementSize() / BlockSize; constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
for(unsigned iloop = 0; iloop < NLoop; ++iloop) for(unsigned iloop = 0; iloop < NLoop; ++iloop)
{ {
unsigned is = threadIdx.x + iloop * BlockSize; unsigned is = threadIdx.x + iloop * BlockSize;
const unsigned did0 = is / desc_ref.GetStride(I0); unsigned did[4];
is -= did0 * desc_ref.GetStride(I0); did[0] = is / ref_desc.GetStride(I0);
const unsigned did1 = is / desc_ref.GetStride(I1); is -= did[0] * ref_desc.GetStride(I0);
is -= did1 * desc_ref.GetStride(I1); did[1] = is / ref_desc.GetStride(I1);
const unsigned did2 = is / desc_ref.GetStride(I2); is -= did[1] * ref_desc.GetStride(I1);
is -= did2 * desc_ref.GetStride(I2); did[2] = is / ref_desc.GetStride(I2);
const unsigned did3 = is / desc_ref.GetStride(I3); is -= did[2] * ref_desc.GetStride(I2);
const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3); did[3] = is / ref_desc.GetStride(I3);
const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3); const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
f(p_a[aindex], p_b[bindex]); const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);
f(p_src[aindex], p_dst[bindex]);
} }
constexpr bool has_tail = (desc_ref.GetElementSize() > NLoop * BlockSize); constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
if(has_tail) if(has_tail)
{ {
unsigned is = threadIdx.x + NLoop * BlockSize; unsigned is = threadIdx.x + NLoop * BlockSize;
if(is < desc_ref.GetElementSize()) if(is < ref_desc.GetElementSize())
{ {
const unsigned did0 = is / desc_ref.GetStride(I0); unsigned did[4];
did[0] = is / ref_desc.GetStride(I0);
is -= did0 * desc_ref.GetStride(I0); is -= did[0] * ref_desc.GetStride(I0);
const unsigned did1 = is / desc_ref.GetStride(I1); did[1] = is / ref_desc.GetStride(I1);
is -= did1 * desc_ref.GetStride(I1); is -= did[1] * ref_desc.GetStride(I1);
const unsigned did2 = is / desc_ref.GetStride(I2); did[2] = is / ref_desc.GetStride(I2);
is -= did2 * desc_ref.GetStride(I2); is -= did[2] * ref_desc.GetStride(I2);
const unsigned did3 = is / desc_ref.GetStride(I3); did[3] = is / ref_desc.GetStride(I3);
const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3); const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3); const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);
f(p_a[aindex], p_b[bindex]); f(p_src[aindex], p_dst[bindex]);
} }
} }
} }
...@@ -160,21 +173,53 @@ __device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst ...@@ -160,21 +173,53 @@ __device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst
{ {
auto f_set_zero = [](TFloat& v) { v = TFloat(0); }; auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
blockwise_4d_tensor_pointwise_op_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>( blockwise_4d_tensor_pointwise_operation_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
DstDesc{}, p_dst, f_set_zero); DstDesc{}, p_dst, f_set_zero);
} }
template <class TFloat,
class SrcDesc,
class DstDesc,
class RefDesc,
class Reorder,
unsigned BlockSize>
__device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
TFloat* const __restrict__ p_src,
DstDesc,
TFloat* __restrict__ p_dst,
RefDesc,
Reorder)
{
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
blockwise_4d_tensor_pointwise_operation_binary_reorder<TFloat,
SrcDesc,
DstDesc,
RefDesc,
Reorder,
decltype(f_copy),
BlockSize>(
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
}
template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, unsigned BlockSize> template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, unsigned BlockSize>
__device__ void blockwise_4d_tensor_copy( __device__ void blockwise_4d_tensor_copy(
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc) SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
{ {
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; constexpr auto reorder = Sequence<0, 1, 2, 3>{};
blockwise_4d_tensor_copy_reorder<TFloat,
SrcDesc,
DstDesc,
RefDesc,
decltype(reorder),
BlockSize>(
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
}
blockwise_4d_tensor_pointwise_op_binary<TFloat, template <class TFloat, class ImDesc, class WDesc, class ColDesc, unsigned BlockSize>
SrcDesc, __device__ void blockwise_4d_tensor_im2col(
DstDesc, ImDesc, const __restrict__ TFloat* p_im, WDesc, ColDesc, __restrict__ TFloat* p_col)
RefDesc, {
decltype(f_copy), // do nothing
BlockSize>(
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, f_copy);
} }
...@@ -22,6 +22,14 @@ struct Sequence ...@@ -22,6 +22,14 @@ struct Sequence
{ {
return mData[I]; return mData[I];
} }
template <unsigned I>
__host__ __device__ constexpr auto GetNumber(Number<I>) const
{
constexpr unsigned N = Get(I);
return Number<N>{};
}
}; };
template <class Lengths, class Strides> template <class Lengths, class Strides>
...@@ -113,9 +121,31 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride ...@@ -113,9 +121,31 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
return ConstantTensorDescriptor<Lengths, Strides>{}; return ConstantTensorDescriptor<Lengths, Strides>{};
} }
// this is ugly, only for 4d
template <class Desc, class Reorder>
__host__ __device__ constexpr auto get_reordered_4d_tensor_descriptor(Desc, Reorder)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto IT0 = Reorder{}.GetNumber(I0);
constexpr auto IT1 = Reorder{}.GetNumber(I1);
constexpr auto IT2 = Reorder{}.GetNumber(I2);
constexpr auto IT3 = Reorder{}.GetNumber(I3);
constexpr unsigned L0 = Desc{}.GetLength(IT0);
constexpr unsigned L1 = Desc{}.GetLength(IT1);
constexpr unsigned L2 = Desc{}.GetLength(IT2);
constexpr unsigned L3 = Desc{}.GetLength(IT3);
return make_ConstantTensorDescriptor(Sequence<L0, L1, L2, L3>{});
}
// this is ugly, only for 4d // this is ugly, only for 4d
template <class InDesc, class WeiDesc> template <class InDesc, class WeiDesc>
__host__ __device__ constexpr auto get_output_4d_tensor_descriptor(InDesc, WeiDesc) __host__ __device__ constexpr auto get_convolution_output_4d_tensor_descriptor(InDesc, WeiDesc)
{ {
constexpr auto in_desc = InDesc{}; constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{}; constexpr auto wei_desc = WeiDesc{};
......
...@@ -70,7 +70,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, ...@@ -70,7 +70,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides()); Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());
constexpr auto out_thread_desc = constexpr auto out_thread_desc =
get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc); get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);
// register // register
TFloat p_out_thread[out_thread_desc.GetElementSpace()]; TFloat p_out_thread[out_thread_desc.GetElementSpace()];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment