Commit 9a2744a6 authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent 18328e2f
......@@ -86,15 +86,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 8;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
......@@ -116,15 +107,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
......@@ -147,15 +129,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
......@@ -177,15 +150,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
......@@ -207,15 +171,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
......@@ -257,8 +212,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
CPerBlock,
BPerThread,
KPerThread,
GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
......@@ -266,10 +219,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1,
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead>,
dim3(GridSize),
......
......@@ -593,9 +593,9 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 1
// 1x1 filter, 14x14 image, C = 128
// 1x1 filter, 14x14 image, C = 512
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t C = 512;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 512;
......
......@@ -21,21 +21,45 @@ struct is_same<T, T>
static const bool value = true;
};
#if DEVICE_BACKEND_CUDA
template <typename T>
__host__ __device__ constexpr T max(T a, T b)
__host__ __device__ constexpr index_t integer_divide_ceil(index_t a, index_t b)
{
return a > b ? a : b;
return (a + b - 1) / b;
}
template <typename T>
__host__ __device__ constexpr T min(T a, T b)
namespace mod_conv {
template <class T>
__host__ __device__ constexpr T max(T x, T y)
{
return a < b ? a : b;
return x > y ? x : y;
}
#endif
__host__ __device__ constexpr index_t integer_divide_ceil(index_t a, index_t b)
template <class T, class... Ts>
__host__ __device__ constexpr T max(T x, Ts... xs)
{
return (a + b - 1) / b;
static_assert(sizeof...(xs) > 0, "not enough argument");
auto y = max(xs...);
static_assert(is_same<decltype(y), T>::value, "not the same type");
return x > y ? x : y;
}
template <class T>
__host__ __device__ constexpr T min(T x, T y)
{
return x < y ? x : y;
}
template <class T, class... Ts>
__host__ __device__ constexpr T min(T x, Ts... xs)
{
static_assert(sizeof...(xs) > 0, "not enough argument");
auto y = min(xs...);
static_assert(is_same<decltype(y), T>::value, "not the same type");
return x < y ? x : y;
}
}
......@@ -19,8 +19,6 @@ template <index_t GridSize,
index_t CPerBlock,
index_t BPerThread,
index_t KPerThread,
index_t GemmThreadPerColumnPerCluster,
index_t GemmThreadPerRowPerCluster,
index_t GemmMPerThreadSubC,
index_t GemmNPerThreadSubC,
index_t GemmMLevel0Cluster,
......@@ -28,10 +26,6 @@ template <index_t GridSize,
index_t GemmMLevel1Cluster,
index_t GemmNLevel1Cluster,
index_t GemmKPerThreadLoop,
index_t InBlockCopyThreadPerDim0,
index_t InBlockCopyThreadPerDim1,
index_t WeiBlockCopyThreadPerDim0,
index_t WeiBlockCopyThreadPerDim1,
index_t InBlockCopyDataPerRead,
index_t WeiBlockCopyDataPerRead>
__global__ void
......@@ -111,57 +105,23 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
}
#endif
// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths()),
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>{};
#elif 1
// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths()),
InBlockCopyDataPerRead>{};
#endif
// blockwise wei copy
// format is [CPerBlock*Y*X,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_wei_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1>{};
#elif 1
// blockwise wei copy
// format is [CPerBlock*Y*X,KPerBlock]
const auto blockwise_wei_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyDataPerRead>{};
#endif
// a series of blockwise GEMM
// c_mtx += transpose(a_mtx) * b_mtx
......@@ -198,16 +158,27 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
constexpr index_t wei_block_size =
wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead;
constexpr index_t max_align =
mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
// LDS
// LDS
#if 1
__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
#if 1
const Float* p_lds_begin = p_in_block < p_wei_block ? p_in_block : p_wei_block;
const Float* const p_lds_begin = p_in_block < p_wei_block ? p_in_block : p_wei_block;
#else // debug
constexpr index_t lds_byte = 8 * 1024;
constexpr index_t in_block_space = max_align * ((in_block_size + max_align - 1) / max_align);
constexpr index_t wei_block_space = max_align * ((wei_block_size + max_align - 1) / max_align);
static_assert(lds_byte >= (in_block_space + wei_block_space) * sizeof(Float),
"lds allocation not enough");
__shared__ Float p_lds_begin[lds_byte / sizeof(Float)];
Float* const p_in_block = p_lds_begin;
Float* const p_wei_block = p_lds_begin + in_block_space;
#endif
const Float* p_in_global_block_offset =
......@@ -240,11 +211,11 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
for(index_t x = 0; x < X; ++x)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 1
#if 0
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_asm
#elif 0
#elif 1
blockwise_gemm.Run_RegisterDoubleBuffer
#endif
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
......
......@@ -55,17 +55,66 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
#elif 1
static_assert(NCol == 4, "only for NCol == 4");
using vector_t = typename vector_type<Float, 4>::MemoryType;
for(index_t i = 0; i < NRow; ++i)
{
const index_t src_index = src_mtx.Get1dIndex(i, 0);
const index_t dst_index = dst_mtx.Get1dIndex(i, 0);
#if 1
using vector_t = typename vector_type<Float, 4>::MemoryType;
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
*(reinterpret_cast<const vector_t*>(p_src + src_index));
#elif 1
#elif 0
// ds_read_b32
asm volatile(
"\n \
ds_read_b32 %0, %1 \n \
"
: "=v"(p_dst[dst_index])
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
asm volatile(
"\n \
ds_read_b32 %0, %1 \n \
"
: "=v"(p_dst[dst_index + 1])
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 1) - p_lds_begin))));
asm volatile(
"\n \
ds_read_b32 %0, %1 \n \
"
: "=v"(p_dst[dst_index + 2])
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 2) - p_lds_begin))));
asm volatile(
"\n \
ds_read_b32 %0, %1 \n \
"
: "=v"(p_dst[dst_index + 3])
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 3) - p_lds_begin))));
#elif 0
// ds_read_b64
using vector_t = typename vector_type<Float, 2>::MemoryType;
asm volatile(
"\n \
ds_read_b64 %0, %1 \n \
"
: "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index)))
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
asm volatile(
"\n \
ds_read_b64 %0, %1 \n \
"
: "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index + 2)))
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 2) - p_lds_begin))));
#elif 0
// ds_read_b128
using vector_t = typename vector_type<Float, 4>::MemoryType;
asm volatile(
"\n \
ds_read_b128 %0, %1 \n \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment