added blockwise tensor reorder operation

21c91816 · Chao Liu · 057c10e5 · 21c91816 · 21c91816 · 21c91816
Commit 21c91816 authored Jan 04, 2019 by Chao Liu
5 changed files
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -7,6 +7,7 @@
 #include "constant_tensor_descriptor.cuh"
 #include "device_direct_convolution_1.cuh"
 #include "device_direct_convolution_2.cuh"
+//#include "device_implicit_gemm_convolution.cuh"
 //#include "device_winograd_convolution.cuh"

 struct GeneratorTensor_1
@@ -366,7 +367,7 @@ int main()

    auto in_desc  = make_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
    auto wei_desc = make_ConstantTensorDescriptor(Sequence<K, C, S, R>{});
-    auto out_desc = get_output_4d_tensor_descriptor(in_desc, wei_desc);
+    auto out_desc = get_convolution_output_4d_tensor_descriptor(in_desc, wei_desc);

    ostream_ConstantTensorDescriptor(in_desc, std::cout << "in_desc: ");
    ostream_ConstantTensorDescriptor(wei_desc, std::cout << "wei_desc: ");
@@ -393,6 +394,8 @@ int main()
        device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device);
 #elif 1
        device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device);
+#elif 0
+        device_implicit_gemm_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
 #elif 0
        device_winograd_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
 #endif

--- a/src/include/blockwise_direct_convolution.cuh
+++ b/src/include/blockwise_direct_convolution.cuh
@@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});

    constexpr auto out_thread_desc =
-        get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
+        get_convolution_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);

    constexpr auto in_thread_block_desc =
        make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides());

--- a/src/include/blockwise_tensor_op.cuh
+++ b/src/include/blockwise_tensor_op.cuh
@@ -2,7 +2,8 @@
 #include "constant_tensor_descriptor.cuh"

 template <class TFloat, class DstDesc, class F, unsigned BlockSize>
-__device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
+__device__ void
+blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
@@ -75,82 +76,94 @@ __device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restri
    }
 }

-template <class TFloat, class DescA, class DescB, class DescRef, class F, unsigned BlockSize>
-__device__ void blockwise_4d_tensor_pointwise_op_binary(
-    DescA, TFloat* const __restrict__ p_a, DescB, TFloat* __restrict__ p_b, DescRef, F f)
+template <class TFloat,
+          class SrcDesc,
+          class DstDesc,
+          class RefDesc,
+          class Reorder,
+          class F,
+          unsigned BlockSize>
+__device__ void
+blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
+                                                       TFloat* const __restrict__ p_src,
+                                                       DstDesc,
+                                                       TFloat* __restrict__ p_dst,
+                                                       RefDesc,
+                                                       Reorder,
+                                                       F f)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

-    constexpr auto desc_a   = DescA{};
-    constexpr auto desc_b   = DescB{};
-    constexpr auto desc_ref = DescRef{};
+    constexpr unsigned IT0 = Reorder{}.Get(I0);
+    constexpr unsigned IT1 = Reorder{}.Get(I1);
+    constexpr unsigned IT2 = Reorder{}.Get(I2);
+    constexpr unsigned IT3 = Reorder{}.Get(I3);

-#if 0
-    if(threadIdx.x == 0)
-    {
-        print_ConstantTensorDescriptor(desc_a, "blockwise_4d_tensor_op_binary: desc_a: ");
-        print_ConstantTensorDescriptor(desc_b, "blockwise_4d_tensor_op_binary: desc_b: ");
-        print_ConstantTensorDescriptor(desc_ref, "blockwise_4d_tensor_op_binary: desc_ref: ");
-    }
-#endif
+    constexpr auto src_desc = SrcDesc{};
+    constexpr auto dst_desc = DstDesc{};
+    constexpr auto ref_desc = RefDesc{};

-    constexpr unsigned NLoop = desc_ref.GetElementSize() / BlockSize;
+    constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;

    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

-        const unsigned did0 = is / desc_ref.GetStride(I0);
+        unsigned did[4];

-        is -= did0 * desc_ref.GetStride(I0);
+        did[0] = is / ref_desc.GetStride(I0);

-        const unsigned did1 = is / desc_ref.GetStride(I1);
+        is -= did[0] * ref_desc.GetStride(I0);

-        is -= did1 * desc_ref.GetStride(I1);
+        did[1] = is / ref_desc.GetStride(I1);

-        const unsigned did2 = is / desc_ref.GetStride(I2);
+        is -= did[1] * ref_desc.GetStride(I1);

-        is -= did2 * desc_ref.GetStride(I2);
+        did[2] = is / ref_desc.GetStride(I2);

-        const unsigned did3 = is / desc_ref.GetStride(I3);
+        is -= did[2] * ref_desc.GetStride(I2);

-        const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3);
+        did[3] = is / ref_desc.GetStride(I3);

-        const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3);
+        const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);

-        f(p_a[aindex], p_b[bindex]);
+        const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);
+
+        f(p_src[aindex], p_dst[bindex]);
    }

-    constexpr bool has_tail = (desc_ref.GetElementSize() > NLoop * BlockSize);
+    constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

-        if(is < desc_ref.GetElementSize())
+        if(is < ref_desc.GetElementSize())
        {
-            const unsigned did0 = is / desc_ref.GetStride(I0);
+            unsigned did[4];
+
+            did[0] = is / ref_desc.GetStride(I0);

-            is -= did0 * desc_ref.GetStride(I0);
+            is -= did[0] * ref_desc.GetStride(I0);

-            const unsigned did1 = is / desc_ref.GetStride(I1);
+            did[1] = is / ref_desc.GetStride(I1);

-            is -= did1 * desc_ref.GetStride(I1);
+            is -= did[1] * ref_desc.GetStride(I1);

-            const unsigned did2 = is / desc_ref.GetStride(I2);
+            did[2] = is / ref_desc.GetStride(I2);

-            is -= did2 * desc_ref.GetStride(I2);
+            is -= did[2] * ref_desc.GetStride(I2);

-            const unsigned did3 = is / desc_ref.GetStride(I3);
+            did[3] = is / ref_desc.GetStride(I3);

-            const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3);
+            const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);

-            const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3);
+            const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);

-            f(p_a[aindex], p_b[bindex]);
+            f(p_src[aindex], p_dst[bindex]);
        }
    }
 }
@@ -160,21 +173,53 @@ __device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst
 {
    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };

-    blockwise_4d_tensor_pointwise_op_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
+    blockwise_4d_tensor_pointwise_operation_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
        DstDesc{}, p_dst, f_set_zero);
 }

+template <class TFloat,
+          class SrcDesc,
+          class DstDesc,
+          class RefDesc,
+          class Reorder,
+          unsigned BlockSize>
+__device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
+                                                 TFloat* const __restrict__ p_src,
+                                                 DstDesc,
+                                                 TFloat* __restrict__ p_dst,
+                                                 RefDesc,
+                                                 Reorder)
+{
+    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
+
+    blockwise_4d_tensor_pointwise_operation_binary_reorder<TFloat,
+                                                           SrcDesc,
+                                                           DstDesc,
+                                                           RefDesc,
+                                                           Reorder,
+                                                           decltype(f_copy),
+                                                           BlockSize>(
+        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
+}
+
 template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, unsigned BlockSize>
 __device__ void blockwise_4d_tensor_copy(
    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
 {
-    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
+    constexpr auto reorder = Sequence<0, 1, 2, 3>{};
+
+    blockwise_4d_tensor_copy_reorder<TFloat,
+                                     SrcDesc,
+                                     DstDesc,
+                                     RefDesc,
+                                     decltype(reorder),
+                                     BlockSize>(
+        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
+}

-    blockwise_4d_tensor_pointwise_op_binary<TFloat,
-                                            SrcDesc,
-                                            DstDesc,
-                                            RefDesc,
-                                            decltype(f_copy),
-                                            BlockSize>(
-        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, f_copy);
+template <class TFloat, class ImDesc, class WDesc, class ColDesc, unsigned BlockSize>
+__device__ void blockwise_4d_tensor_im2col(
+    ImDesc, const __restrict__ TFloat* p_im, WDesc, ColDesc, __restrict__ TFloat* p_col)
+{
+    // do nothing
 }
--- a/src/include/constant_tensor_descriptor.cuh
+++ b/src/include/constant_tensor_descriptor.cuh
@@ -22,6 +22,14 @@ struct Sequence
    {
        return mData[I];
    }
+
+    template <unsigned I>
+    __host__ __device__ constexpr auto GetNumber(Number<I>) const
+    {
+        constexpr unsigned N = Get(I);
+
+        return Number<N>{};
+    }
 };

 template <class Lengths, class Strides>
@@ -113,9 +121,31 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
    return ConstantTensorDescriptor<Lengths, Strides>{};
 }

+// this is ugly, only for 4d
+template <class Desc, class Reorder>
+__host__ __device__ constexpr auto get_reordered_4d_tensor_descriptor(Desc, Reorder)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto IT0 = Reorder{}.GetNumber(I0);
+    constexpr auto IT1 = Reorder{}.GetNumber(I1);
+    constexpr auto IT2 = Reorder{}.GetNumber(I2);
+    constexpr auto IT3 = Reorder{}.GetNumber(I3);
+
+    constexpr unsigned L0 = Desc{}.GetLength(IT0);
+    constexpr unsigned L1 = Desc{}.GetLength(IT1);
+    constexpr unsigned L2 = Desc{}.GetLength(IT2);
+    constexpr unsigned L3 = Desc{}.GetLength(IT3);
+
+    return make_ConstantTensorDescriptor(Sequence<L0, L1, L2, L3>{});
+}
+
 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>
-__host__ __device__ constexpr auto get_output_4d_tensor_descriptor(InDesc, WeiDesc)
+__host__ __device__ constexpr auto get_convolution_output_4d_tensor_descriptor(InDesc, WeiDesc)
 {
    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};

--- a/src/include/gridwise_direct_convolution_2.cuh
+++ b/src/include/gridwise_direct_convolution_2.cuh
@@ -70,7 +70,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
        Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());

    constexpr auto out_thread_desc =
-        get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);
+        get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);

    // register
    TFloat p_out_thread[out_thread_desc.GetElementSpace()];