refactor

df228b3c · Chao Liu · 0b8e67ef · df228b3c · df228b3c · df228b3c
Commit df228b3c authored Jan 08, 2019 by Chao Liu
5 changed files
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -5,6 +5,7 @@
 #include "nvToolsExt.h"
 #include "tensor.hpp"
 #include "constant_tensor_descriptor.cuh"
+#include "conv_common.cuh"
 #include "device_direct_convolution_1.cuh"
 #include "device_direct_convolution_2.cuh"
 //#include "device_implicit_gemm_convolution.cuh"
@@ -367,7 +368,7 @@ int main()

    auto in_desc  = make_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
    auto wei_desc = make_ConstantTensorDescriptor(Sequence<K, C, S, R>{});
-    auto out_desc = get_convolution_output_4d_tensor_descriptor(in_desc, wei_desc);
+    auto out_desc = get_convolution_output_default_4d_tensor_descriptor(in_desc, wei_desc);

    ostream_ConstantTensorDescriptor(in_desc, std::cout << "in_desc: ");
    ostream_ConstantTensorDescriptor(wei_desc, std::cout << "wei_desc: ");

--- a/src/include/blockwise_direct_convolution.cuh
+++ b/src/include/blockwise_direct_convolution.cuh
@@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});

    constexpr auto out_thread_desc =
-        get_convolution_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
+        get_convolution_output_default_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);

    constexpr auto in_thread_block_desc =
        make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides());

--- a/src/include/constant_tensor_descriptor.cuh
+++ b/src/include/constant_tensor_descriptor.cuh
@@ -23,14 +23,6 @@ struct Sequence
        return mData[I];
    }

-    template <unsigned I>
-    __host__ __device__ constexpr auto GetConstant(Number<I>) const
-    {
-        constexpr unsigned N = Get(I);
-
-        return Number<N>{};
-    }
-
    template <unsigned I0, unsigned I1>
    __host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>) const
    {
@@ -61,17 +53,15 @@ struct Sequence
        return Sequence<IR0, IR1, IR2, IR3>{};
    }

-    template <unsigned I0, unsigned I1, unsigned I2, unsigned I3, unsigned I4>
-    __host__ __device__ constexpr auto
-        Reorder(Number<I0>, Number<I1>, Number<I2>, Number<I3>, Number<I4>) const
+    template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
+    __host__ __device__ constexpr auto Reorder(Sequence<I0, I1, I2, I3>) const
    {
        constexpr unsigned IR0 = Get(Number<I0>{});
        constexpr unsigned IR1 = Get(Number<I1>{});
        constexpr unsigned IR2 = Get(Number<I2>{});
        constexpr unsigned IR3 = Get(Number<I3>{});
-        constexpr unsigned IR4 = Get(Number<I4>{});

-        return Sequence<IR0, IR1, IR2, IR3, IR4>{};
+        return Sequence<IR0, IR1, IR2, IR3>{};
    }
 };

@@ -132,7 +122,8 @@ struct ConstantTensorDescriptor
    }

    // this is ugly, only for 4d
-    __host__ __device__ unsigned Get1dIndex(unsigned n, unsigned c, unsigned h, unsigned w) const
+    __host__ __device__ unsigned
+    Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
@@ -140,24 +131,24 @@ struct ConstantTensorDescriptor
        constexpr auto I3 = Number<3>{};

        static_assert(nDim == 4, "nDim is not 4");
-        return n * GetStride(I0) + c * GetStride(I1) + h * GetStride(I2) + w * GetStride(I3);
-    }
-
-    template <class... Is>
-    __host__ __device__ constexpr auto Reorder(Is... is) const
-    {
-        constexpr auto lengths = Lengths{}.Reorder(is...);
-        constexpr auto strides = Strides{}.Reorder(is...);
-
-        return ConstantTensorDescriptor<decltype(lengths), decltype(strides)>{};
+        return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3);
    }
 };

 // this is ugly, only for 4d
-template <unsigned N, unsigned C, unsigned H, unsigned W>
-__host__ __device__ constexpr auto calculate_default_strides(Sequence<N, C, H, W>)
+template <unsigned L0, unsigned L1, unsigned L2, unsigned L3>
+__host__ __device__ constexpr auto calculate_default_strides(Sequence<L0, L1, L2, L3>)
+{
+    return Sequence<L1 * L2 * L3, L2 * L3, L3, 1>{};
+}
+
+// this is ugly, only for 4d
+template <unsigned S0, unsigned S1, unsigned S2, unsigned S3>
+__host__ __device__ constexpr auto calculate_full_lengths(Sequence<S0, S1, S2, S3>)
 {
-    return Sequence<C * H * W, H * W, W, 1>{};
+    static_assert((S0 % S1 == 0) && (S1 % S2 == 0) && (S2 % S3 == 0), "cannot be evenly divided!");
+
+    return Sequence<1, S0 / S1, S1 / S2, S2 / S3>{};
 }

 template <class Lengths>
@@ -173,37 +164,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
    return ConstantTensorDescriptor<Lengths, Strides>{};
 }

-// this is ugly, only for 4d
-template <class InDesc, class WeiDesc>
-__host__ __device__ constexpr auto get_convolution_output_4d_tensor_descriptor(InDesc, WeiDesc)
-{
-    constexpr auto in_desc  = InDesc{};
-    constexpr auto wei_desc = WeiDesc{};
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    static_assert(in_desc.GetDimension() == 4, "input nDim is not 4");
-    static_assert(wei_desc.GetDimension() == 4, "weight nDim is not 4");
-    static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
-                  "input & weight dimension not consistent");
-
-    constexpr auto N  = in_desc.GetLength(I0);
-    constexpr auto HI = in_desc.GetLength(I2);
-    constexpr auto WI = in_desc.GetLength(I3);
-
-    constexpr auto K = wei_desc.GetLength(I0);
-    constexpr auto S = wei_desc.GetLength(I2);
-    constexpr auto R = wei_desc.GetLength(I3);
-
-    constexpr auto HO = HI - S + 1;
-    constexpr auto WO = WI - R + 1;
-
-    return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
-}
-
 // this is ugly, only for 4d
 template <class TDesc>
 __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)

--- a/src/include/conv_common.cuh
+++ b/src/include/conv_common.cuh
+#pragma once
+#include "constant_tensor_descriptor.cuh"
+
+// Builds the output tensor descriptor of a direct convolution from the input
+// and weight descriptors. This is ugly, only for 4d.
+//
+// Only the lengths of the two arguments are read; their strides are ignored.
+//   InDesc  lengths: <N, C, HI, WI>
+//   WeiDesc lengths: <K, C, S, R>
+//   result  lengths: <N, K, HI - S + 1, WI - R + 1>
+// The result is produced by make_ConstantTensorDescriptor(lengths), so it gets
+// whatever strides that single-argument overload assigns (presumably the packed
+// default strides -- confirm in constant_tensor_descriptor.cuh).
+//
+// Compile-time checks: both descriptors are 4d, dimension 1 (C) of input and
+// weight agree, and the filter extents do not make the output extents wrap.
+template <class InDesc, class WeiDesc>
+__host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc,
+                                                                                       WeiDesc)
+{
+    constexpr auto in_desc  = InDesc{};
+    constexpr auto wei_desc = WeiDesc{};
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    static_assert(in_desc.GetDimension() == 4, "input nDim is not 4");
+    static_assert(wei_desc.GetDimension() == 4, "weight nDim is not 4");
+    static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
+                  "input & weight dimension not consistent");
+
+    constexpr auto N  = in_desc.GetLength(I0);
+    constexpr auto HI = in_desc.GetLength(I2);
+    constexpr auto WI = in_desc.GetLength(I3);
+
+    constexpr auto K = wei_desc.GetLength(I0);
+    constexpr auto S = wei_desc.GetLength(I2);
+    constexpr auto R = wei_desc.GetLength(I3);
+
+    // lengths are unsigned: without these checks a filter larger than the input
+    // would silently wrap HO/WO around to a huge value instead of failing
+    static_assert(HI + 1 >= S, "filter height exceeds input height, HO would wrap around");
+    static_assert(WI + 1 >= R, "filter width exceeds input width, WO would wrap around");
+
+    constexpr auto HO = HI - S + 1;
+    constexpr auto WO = WI - R + 1;
+
+    return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
+}
--- a/src/include/gridwise_direct_convolution_2.cuh
+++ b/src/include/gridwise_direct_convolution_2.cuh
@@ -69,8 +69,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
    constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
        Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());

-    constexpr auto out_thread_desc =
-        get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);
+    constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
+        in_thread_block_desc, wei_thread_block_desc);

    // register
    Float p_out_thread[out_thread_desc.GetElementSpace()];