Commit 569ad66e authored by Chao Liu

Added implicit GEMM v1r3 lds_double_buffer for NCHW * CYXK = KNHW; reworked static functionals.

parent 87d8740b
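A note on the title: lds_double_buffer refers to the classic two-buffer pipelining scheme in LDS (shared memory). While the GEMM math consumes tile i from one buffer, the loads for tile i+1 land in the other, so global-memory latency overlaps with compute. Below is a minimal self-contained sketch of the pattern, not this commit's kernel: TileSize, load_tile, compute_tile and the kernel itself are made-up stand-ins, and the CUDA spelling is used (the same intrinsics exist under HIP).

```cpp
constexpr int TileSize = 256; // one LDS tile per 256-thread block (assumption)

__device__ void load_tile(float* lds_buf, const float* p_global, int tile)
{
    // each thread fetches one element of the tile from global memory
    lds_buf[threadIdx.x] = p_global[tile * TileSize + threadIdx.x];
}

__device__ float compute_tile(const float* lds_buf)
{
    return 2.0f * lds_buf[threadIdx.x]; // placeholder for the real GEMM math
}

// assumes num_tiles >= 1 and blockDim.x == TileSize
__global__ void double_buffered_kernel(const float* p_in, float* p_out, int num_tiles)
{
    __shared__ float lds[2][TileSize]; // the two LDS buffers
    float acc = 0.0f;

    load_tile(lds[0], p_in, 0); // prologue: fetch tile 0
    __syncthreads();

    for(int i = 0; i + 1 < num_tiles; ++i)
    {
        load_tile(lds[(i + 1) & 1], p_in, i + 1); // prefetch the next tile...
        acc += compute_tile(lds[i & 1]);          // ...while computing on this one
        __syncthreads(); // publish the prefetch before anyone reads it
    }

    acc += compute_tile(lds[(num_tiles - 1) & 1]); // epilogue: last tile
    p_out[blockIdx.x * TileSize + threadIdx.x] = acc;
}
```

The barrier at the end of each iteration also orders this iteration's reads of buffer i & 1 before the next iteration overwrites it; a real kernel depends on that because threads read each other's LDS data.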
@@ -139,6 +139,7 @@ __device__ void threadwise_4d_tensor_copy_reorder_given_dst2src(SrcDesc,
         SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
 }
 
+#if 0 // replaced by threadwise_nd_tensor_copy
 template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths>
 __device__ void threadwise_4d_tensor_copy(
     SrcDesc, const SrcData* __restrict__ p_src, DstDesc, DstData* __restrict__ p_dst, SrcOpLengths)
@@ -210,6 +211,7 @@ __device__ void threadwise_4d_tensor_copy_v2(SrcDesc,
         }
     }
 }
+#endif
 
 template <class SrcData,
           class DstData,
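The two hunks above compile the fixed-rank 4d copies out with #if 0 rather than deleting them; callers are expected to move to the rank-generic threadwise_nd_tensor_copy introduced in the next file. For orientation, a hedged sketch of what such a migration shim could look like (not part of the commit; it assumes src and dst share one element type):

```cpp
// Sketch only: forwards a former rank-4 copy call to the rank-generic
// replacement. Number<1>{} (DataPerRead = 1) reproduces the old scalar
// element-by-element loop; larger values additionally require last-dim
// stride 1 on both descriptors.
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
__device__ void threadwise_4d_tensor_copy_compat(
    SrcDesc, const Float* __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths)
{
    static_assert(SrcOpLengths::GetSize() == 4, "this shim is for rank-4 copies only");
    threadwise_nd_tensor_copy(SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, Number<1>{});
}
```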
@@ -3,7 +3,7 @@
 
 // need to assume src and dst is aligned
 template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
-__device__ void threadwise_6d_tensor_copy(SrcDesc,
+__device__ void threadwise_nd_tensor_copy(SrcDesc,
                                           const Float* __restrict__ p_src,
                                           DstDesc,
                                           Float* __restrict__ p_dst,
@@ -12,268 +12,53 @@ __device__ void threadwise_nd_tensor_copy(SrcDesc,
 {
     using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
 
-    static_assert(SrcDesc{}.GetDimension() == 6 && DstDesc{}.GetDimension() == 6 &&
-                      SrcOpLengths::nDim == 6,
-                  "wrong! should be 6 dimension");
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
+    constexpr index_t nDim = SrcOpLengths::GetSize();
+
+    static_assert(SrcDesc{}.GetDimension() == nDim && DstDesc{}.GetDimension() == nDim,
+                  "wrong! dimension not consistent");
 
     constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};
     constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
 
+#if 0
+    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
+    {
+        print_ConstantTensorDescriptor(src_desc, "src_desc");
+        print_ConstantTensorDescriptor(dst_desc, "dst_desc");
+        print_ConstantTensorDescriptor(ref_desc, "ref_desc");
+    }
+#endif
+
-    static_assert(SrcDesc{}.GetStride(I5) == 1 && DstDesc{}.GetStride(I5) == 1,
-                  "wrong! only support stride5 == 1!\n");
+    static_assert(DataPerRead == 1 || (SrcDesc{}.GetStride(Number<nDim - 1>{}) == 1 &&
+                                       DstDesc{}.GetStride(Number<nDim - 1>{}) == 1),
+                  "wrong! only support stride[nDim-1] == 1!\n");
 
     static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
                   "wrong! only support DataPerRead == 1, 2 or 4!\n");
 
-    static_assert(SrcDesc{}.GetStride(I4) % DataPerRead == 0 &&
-                      DstDesc{}.GetStride(I4) % DataPerRead == 0,
-                  "wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
+    static_assert(
+        SrcDesc{}.GetStride(Number<nDim - 2>{}) % DataPerRead == 0 &&
+            DstDesc{}.GetStride(Number<nDim - 2>{}) % DataPerRead == 0,
+        "wrong! src and dst stride[nDim-2] should be multiple of DataPerRead to keep alignment");
 
-    constexpr index_t L5 = SrcOpLengths{}.Get(I5);
-    static_assert(L5 % DataPerRead == 0, "wrong! L5 should be evenly divided by DataPerRead");
-    constexpr index_t nloop_d5 = L5 / DataPerRead;
+    constexpr index_t L_Back = SrcOpLengths{}.Back();
+    static_assert(L_Back % DataPerRead == 0,
+                  "wrong! lengths[nDim-1] should be evenly divided by DataPerRead");
+    constexpr index_t nRead = L_Back / DataPerRead;
 
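A note on the three static_asserts just above: the copy moves the innermost dimension DataPerRead elements at a time through vector_t, so the last dimension must be contiguous (stride 1), its length divisible by DataPerRead, and every stride[nDim-2] a multiple of DataPerRead so each vector access stays aligned. A host-side C++ analogue of the idea (float4_t is a stand-in for vector_type<float, 4>::MemoryType; the pointer punning mirrors the kernel's reinterpret_cast and is shown for illustration, strictly portable code would use memcpy):

```cpp
#include <cstdio>

// 16-byte-aligned stand-in for vector_type<float, 4>::MemoryType
struct alignas(16) float4_t
{
    float x, y, z, w;
};

// copy `rows` rows of `row_len` floats, 4 at a time; row_len and row_stride
// must be multiples of 4, or some float4_t access below would be misaligned
void copy_rows_vectorized(const float* src, float* dst, int rows, int row_len, int row_stride)
{
    for(int r = 0; r < rows; ++r)
    {
        for(int i = 0; i < row_len; i += 4)
        {
            *reinterpret_cast<float4_t*>(dst + r * row_stride + i) =
                *reinterpret_cast<const float4_t*>(src + r * row_stride + i);
        }
    }
}

int main()
{
    alignas(16) float src[2][8], dst[2][8] = {};
    for(int r = 0; r < 2; ++r)
        for(int i = 0; i < 8; ++i)
            src[r][i] = static_cast<float>(r * 8 + i);

    copy_rows_vectorized(&src[0][0], &dst[0][0], 2, 8, 8);
    std::printf("dst[1][7] = %g\n", dst[1][7]); // prints 15
}
```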
-    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
-    {
-        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
-        {
-            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
-            {
-                for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
-                {
-                    for(index_t did4 = 0; did4 < ref_desc.GetLength(I4); ++did4)
-                    {
-                        for(index_t iloop_d5 = 0; iloop_d5 < nloop_d5; ++iloop_d5)
-                        {
-                            const index_t src_index = src_desc.Get1dIndex(
-                                did0, did1, did2, did3, did4, iloop_d5 * DataPerRead);
-
-                            const index_t dst_index = dst_desc.Get1dIndex(
-                                did0, did1, did2, did3, did4, iloop_d5 * DataPerRead);
-
-                            *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
-                                *(reinterpret_cast<const vector_t*>(p_src + src_index));
-                        }
-                    }
-                }
-            }
-        }
-    }
+    static_ford<decltype(ref_desc.GetLengths().PopBack())>{}([=](auto Ids) {
+        static_for<0, nRead, 1>{}([=](auto IRead) {
+            constexpr auto multi_id = decltype(Ids){}.PushBack(Number<IRead.Get() * DataPerRead>{});
+
+            const index_t src_index = src_desc.Get1dIndex(multi_id);
+            const index_t dst_index = dst_desc.Get1dIndex(multi_id);
+
+            *(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
+                *(reinterpret_cast<const vector_t*>(&p_src[src_index]));
+        });
+    });
 }
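The rewrite above replaces the hand-written six-deep loop nest with static_ford over every dimension but the last, plus static_for over the vectorized reads of the last dimension, so one function body serves any rank. Below is a simplified, self-contained stand-in for static_ford (not the library's implementation, which passes the indices as compile-time Number<> objects rather than packing them into a std::array); it shows the mechanism: the loop structure is generated at compile time from the lengths, and the functor runs once per multi-index. C++17:

```cpp
#include <array>
#include <cstdio>
#include <utility>

// Compile-time nested "for" over a set of lengths: visits every multi-index
// (i0, ..., i_{n-1}) with 0 <= ik < Lengths[k].
template <std::size_t... Lengths>
struct static_ford_sketch
{
    template <class F>
    void operator()(F f) const
    {
        run(f, std::index_sequence<>{}, std::index_sequence<Lengths...>{});
    }

    private:
    // all lengths consumed: hand the accumulated multi-index to f
    template <class F, std::size_t... Fixed>
    static void run(F f, std::index_sequence<Fixed...>, std::index_sequence<>)
    {
        f(std::array<std::size_t, sizeof...(Fixed)>{{Fixed...}});
    }

    // peel off the next length L and recurse once per index value 0..L-1
    template <class F, std::size_t... Fixed, std::size_t L, std::size_t... Rest>
    static void run(F f, std::index_sequence<Fixed...>, std::index_sequence<L, Rest...>)
    {
        expand(f, std::index_sequence<Fixed...>{}, std::index_sequence<Rest...>{},
               std::make_index_sequence<L>{});
    }

    template <class F, std::size_t... Fixed, std::size_t... Rest, std::size_t... Is>
    static void expand(F f,
                       std::index_sequence<Fixed...>,
                       std::index_sequence<Rest...>,
                       std::index_sequence<Is...>)
    {
        // C++17 fold over the comma operator: one recursive call per index
        (run(f, std::index_sequence<Fixed..., Is>{}, std::index_sequence<Rest...>{}), ...);
    }
};

int main()
{
    // visits (0,0) (0,1) (0,2) (1,0) (1,1) (1,2); the loop structure is fixed
    // at compile time by the lengths 2 and 3
    static_ford_sketch<2, 3>{}([](auto multi_id) {
        std::printf("(%zu, %zu)\n", multi_id[0], multi_id[1]);
    });
}
```

Because every trip count is a compile-time constant, the compiler can fully unroll the nest and fold the Get1dIndex address arithmetic into constants, which is presumably what the commit title's "reworked static functionals" is about.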
-
-// need to assume src and dst is aligned
-template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
-__device__ void threadwise_8d_tensor_copy(SrcDesc,
-                                          const Float* __restrict__ p_src,
-                                          DstDesc,
-                                          Float* __restrict__ p_dst,
-                                          SrcOpLengths,
-                                          Number<DataPerRead>)
-{
-    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
-
-    static_assert(SrcDesc{}.GetDimension() == 8 && DstDesc{}.GetDimension() == 8 &&
-                      SrcOpLengths::nDim == 8,
-                  "wrong! should be 8 dimension");
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-    constexpr auto I7 = Number<7>{};
-
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
-
-    static_assert(SrcDesc{}.GetStride(I7) == 1 && DstDesc{}.GetStride(I7) == 1,
-                  "wrong! only support stride7 == 1!\n");
-
-    static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
-                  "wrong! only support DataPerRead == 1, 2 or 4!\n");
-
-    static_assert(SrcDesc{}.GetStride(I6) % DataPerRead == 0 &&
-                      DstDesc{}.GetStride(I6) % DataPerRead == 0,
-                  "wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
-
-    constexpr index_t L7 = SrcOpLengths{}.Get(I7);
-    static_assert(L7 % DataPerRead == 0, "wrong! L7 should be evenly divided by DataPerRead");
-
-    constexpr index_t nloop_d7 = L7 / DataPerRead;
-
-    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
-    {
-        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
-        {
-            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
-            {
-                for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
-                {
-                    for(index_t did4 = 0; did4 < ref_desc.GetLength(I4); ++did4)
-                    {
-                        for(index_t did5 = 0; did5 < ref_desc.GetLength(I5); ++did5)
-                        {
-                            for(index_t did6 = 0; did6 < ref_desc.GetLength(I6); ++did6)
-                            {
-                                for(index_t iloop_d7 = 0; iloop_d7 < nloop_d7; ++iloop_d7)
-                                {
-                                    const index_t src_index = src_desc.Get1dIndex(
-                                        did0, did1, did2, did3, did4, did5, did6,
-                                        iloop_d7 * DataPerRead);
-
-                                    const index_t dst_index = dst_desc.Get1dIndex(
-                                        did0, did1, did2, did3, did4, did5, did6,
-                                        iloop_d7 * DataPerRead);
-
-                                    *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
-                                        *(reinterpret_cast<const vector_t*>(p_src + src_index));
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-// need to assume src and dst is aligned
-template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
-__device__ void threadwise_10d_tensor_copy(SrcDesc,
-                                           const Float* __restrict__ p_src,
-                                           DstDesc,
-                                           Float* __restrict__ p_dst,
-                                           SrcOpLengths,
-                                           Number<DataPerRead>)
-{
-    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
-
-    static_assert(SrcDesc{}.GetDimension() == 10 && DstDesc{}.GetDimension() == 10 &&
-                      SrcOpLengths::GetSize() == 10,
-                  "wrong! should be 10 dimension");
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-    constexpr auto I7 = Number<7>{};
-    constexpr auto I8 = Number<8>{};
-    constexpr auto I9 = Number<9>{};
-
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
-
-    static_assert(SrcDesc{}.GetStride(I9) == 1 && DstDesc{}.GetStride(I9) == 1,
-                  "wrong! only support stride7 == 1!\n");
-
-    static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
-                  "wrong! only support DataPerRead == 1, 2 or 4!\n");
-
-    static_assert(SrcDesc{}.GetStride(I8) % DataPerRead == 0 &&
-                      DstDesc{}.GetStride(I8) % DataPerRead == 0,
-                  "wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
-
-    constexpr index_t L9 = SrcOpLengths{}.Get(I9);
-    static_assert(L9 % DataPerRead == 0, "wrong! L9 should be evenly divided by DataPerRead");
-
-    constexpr index_t nloop_d9 = L9 / DataPerRead;
-
-#pragma unroll
-    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
-    {
-#pragma unroll
-        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
-        {
-#pragma unroll
-            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
-            {
-#pragma unroll
-                for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
-                {
-#pragma unroll
-                    for(index_t did4 = 0; did4 < ref_desc.GetLength(I4); ++did4)
-                    {
-#pragma unroll
-                        for(index_t did5 = 0; did5 < ref_desc.GetLength(I5); ++did5)
-                        {
-#pragma unroll
-                            for(index_t did6 = 0; did6 < ref_desc.GetLength(I6); ++did6)
-                            {
-#pragma unroll
-                                for(index_t did7 = 0; did7 < ref_desc.GetLength(I7); ++did7)
-                                {
-#pragma unroll
-                                    for(index_t did8 = 0; did8 < ref_desc.GetLength(I8); ++did8)
-                                    {
-#pragma unroll
-                                        for(index_t iloop_d9 = 0; iloop_d9 < nloop_d9; ++iloop_d9)
-                                        {
-                                            const index_t src_index = src_desc.Get1dIndex(
-                                                did0, did1, did2, did3, did4, did5, did6, did7,
-                                                did8, iloop_d9 * DataPerRead);
-
-                                            const index_t dst_index = dst_desc.Get1dIndex(
-                                                did0, did1, did2, did3, did4, did5, did6, did7,
-                                                did8, iloop_d9 * DataPerRead);
-
-                                            *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
-                                                *(reinterpret_cast<const vector_t*>(p_src +
-                                                                                    src_index));
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
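Finally, a hypothetical call site for the new entry point (a sketch, not from the commit: the 2x4x8 shape is made up, and Sequence<> is assumed to be the lengths type that make_ConstantTensorDescriptor consumes above):

```cpp
// Copies a rank-3 2x4x8 block from a thread's source window into LDS,
// moving the contiguous last dimension 4 floats per read.
__device__ void copy_block_into_lds(const float* p_src, float* p_dst_lds)
{
    constexpr auto copy_lengths = Sequence<2, 4, 8>{};

    // packed (contiguous, row-major) descriptors on both sides
    constexpr auto src_desc = make_ConstantTensorDescriptor(copy_lengths);
    constexpr auto dst_desc = make_ConstantTensorDescriptor(copy_lengths);

    // last stride is 1 on both sides and lengths[nDim-1] = 8 is divisible by
    // DataPerRead = 4, so every static_assert in the function above passes
    threadwise_nd_tensor_copy(src_desc, p_src, dst_desc, p_dst_lds, copy_lengths, Number<4>{});
}
```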