Commit e43d7bc6 authored by Chao Liu

refactor

parent d058d164
@@ -270,7 +270,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
for(index_t i = 0; i < nrepeat; ++i)
{
-float time = launch_kernel(
+constexpr auto gridwise_conv =
#if 1
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
#else
@@ -301,12 +301,14 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1,
InBlockCopyDataPerRead,
-WeiBlockCopyDataPerRead>,
-dim3(GridSize),
-dim3(BlockSize),
-static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
-static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
-static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
+WeiBlockCopyDataPerRead>();
+
+float time = launch_kernel(gridwise_conv.Run,
+dim3(GridSize),
+dim3(BlockSize),
+static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
+static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
+static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
......
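Note: the change above separates kernel selection from the timing loop. The gridwise operator is first instantiated as a constexpr object, and its Run member is then handed to launch_kernel together with the launch configuration and the device buffers. A minimal sketch of a timing launcher with that shape, assuming HIP events for the measurement (the repo's actual launch_kernel may be implemented differently; the name launch_kernel_sketch is hypothetical):

    #include <hip/hip_runtime.h>

    // Hypothetical timing launcher (illustrative, not the repo's helper):
    // launches a __global__ kernel and returns the elapsed time in ms.
    template <class Kernel, class... Args>
    float launch_kernel_sketch(Kernel kernel, dim3 grid_dim, dim3 block_dim, Args... args)
    {
        hipEvent_t start, stop;
        hipEventCreate(&start);
        hipEventCreate(&stop);

        hipEventRecord(start, 0);
        hipLaunchKernelGGL(kernel, grid_dim, block_dim, 0, 0, args...);
        hipEventRecord(stop, 0);
        hipEventSynchronize(stop);

        float elapsed_ms = 0;
        hipEventElapsedTime(&elapsed_ms, start, stop);

        hipEventDestroy(start);
        hipEventDestroy(stop);
        return elapsed_ms;
    }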
@@ -580,7 +580,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
-#elif 0
+#elif 1
// 1x1 filter, 14x14 image, C = 2048
constexpr index_t N = 128;
constexpr index_t C = 2048;
......
@@ -137,7 +137,10 @@ struct ConstantTensorDescriptor
}
};
-return static_const_reduce_n<nDim>{}(GetElementSpace_f{}, add{}) + align.Get();
+index_t element_space_unaligned =
+static_const_reduce_n<nDim>{}(GetElementSpace_f{}, add{}) + 1;
+
+return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
}
template <class... Is>
......
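The rewritten GetElementSpace rounds the unaligned element count up to the next multiple of align.Get() using the integer round-up identity align * ((n + align - 1) / align); for example, n = 13 with align = 4 gives 4 * ((13 + 3) / 4) = 16. A standalone sketch of the same identity (the function name is illustrative):

    // Round n up to the nearest multiple of align, in pure integer math.
    // Same identity as the new GetElementSpace body above.
    constexpr unsigned round_up_to_multiple(unsigned n, unsigned align)
    {
        return align * ((n + align - 1) / align);
    }

    static_assert(round_up_to_multiple(13, 4) == 16, "13 rounds up to 16");
    static_assert(round_up_to_multiple(16, 4) == 16, "aligned input unchanged");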
@@ -5,8 +5,6 @@
#include "Array.hip.hpp"
#include "functional.hip.hpp"
extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; }
@@ -23,21 +21,45 @@ struct is_same<T, T>
static const bool value = true;
};
-#if DEVICE_BACKEND_CUDA
-template <typename T>
-__host__ __device__ constexpr T max(T a, T b)
+__host__ __device__ constexpr index_t integer_divide_ceil(index_t a, index_t b)
{
-return a > b ? a : b;
+return (a + b - 1) / b;
}

-template <typename T>
-__host__ __device__ constexpr T min(T a, T b)
+namespace mod_conv {
+
+template <class T>
+__host__ __device__ constexpr T max(T x, T y)
{
-return a < b ? a : b;
+return x > y ? x : y;
}
-#endif

-__host__ __device__ constexpr index_t integer_divide_ceil(index_t a, index_t b)
+template <class T, class... Ts>
+__host__ __device__ constexpr T max(T x, Ts... xs)
{
-return (a + b - 1) / b;
+static_assert(sizeof...(xs) > 0, "not enough argument");
+auto y = max(xs...);
+static_assert(is_same<decltype(y), T>::value, "not the same type");
+return x > y ? x : y;
}
+
+template <class T>
+__host__ __device__ constexpr T min(T x, T y)
+{
+return x < y ? x : y;
+}
+
+template <class T, class... Ts>
+__host__ __device__ constexpr T min(T x, Ts... xs)
+{
+static_assert(sizeof...(xs) > 0, "not enough argument");
+auto y = min(xs...);
+static_assert(is_same<decltype(y), T>::value, "not the same type");
+return x < y ? x : y;
+}
+
+} // namespace mod_conv
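The new variadic max/min overloads peel one argument per recursion step, and the static_asserts reject both empty tails and mixed argument types at compile time. A usage sketch (values illustrative):

    // mod_conv::max/min usage sketch: all arguments must share one type.
    constexpr index_t largest  = mod_conv::max(index_t(1), index_t(5), index_t(3)); // 5
    constexpr index_t smallest = mod_conv::min(index_t(7), index_t(2));             // 2
    // Mixing types, e.g. mod_conv::max(1, 2u, 3u), trips the "not the same type" assert.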
@@ -59,12 +59,12 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
constexpr auto out_block_desc =
make_ConstantTensorDescriptor(out_block_global_desc.GetLengths());
-constexpr index_t in_block_size = in_block_desc.GetElementSpace();
-constexpr index_t wei_block_size = wei_block_desc.GetElementSpace();
-constexpr index_t out_block_size = out_block_desc.GetElementSpace();
+constexpr index_t in_block_element_size = in_block_desc.GetElementSpace();
+constexpr index_t wei_block_element_size = wei_block_desc.GetElementSpace();
+constexpr index_t out_block_size = out_block_desc.GetElementSpace();

-__shared__ Float p_in_block[in_block_size];
-__shared__ Float p_wei_block[wei_block_size];
+__shared__ Float p_in_block[in_block_element_size];
+__shared__ Float p_wei_block[wei_block_element_size];
__shared__ Float p_out_block[out_block_size];
const index_t block_id = blockIdx.x;
......
@@ -63,17 +63,18 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});
// shared mem
-constexpr index_t in_block_size =
+constexpr index_t in_block_element_size =
in_nchw_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-constexpr index_t wei_block_size =
+constexpr index_t wei_block_element_size =
wei_kcyx_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead;
-__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
-__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
+__shared__ Float p_in_block[max_align * ((in_block_element_size + max_align - 1) / max_align)];
+__shared__ Float
+p_wei_block[max_align * ((wei_block_element_size + max_align - 1) / max_align)];
// threadwise tensors
constexpr index_t HiPerThread = HoPerThread + Y - 1;
......
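max_align above still picks the larger vector-read width with a raw ternary; with the mod_conv helpers introduced in this same commit, the computation could equivalently read (a possible follow-up, not part of this diff):

    // Possible follow-up using the new helper (not in this commit):
    constexpr index_t max_align =
        mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);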
@@ -73,10 +73,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
Sequence<wei_ke_vec_block_desc.GetStride(I0), Y * X, X, 1>{});
// shared mem
-constexpr index_t in_block_size =
+constexpr index_t in_block_element_size =
in_nchw_vec_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-constexpr index_t wei_block_size =
+constexpr index_t wei_block_element_size =
wei_kcyx_vec_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
@@ -84,9 +84,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
: WeiBlockCopyDataPerRead;
__shared__ in_vector_mem_t
-p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)];
+p_in_vec_block[max_align * ((in_block_element_size + max_align - 1) / max_align)];
__shared__ in_vector_mem_t
-p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
+p_wei_vec_block[max_align * ((wei_block_element_size + max_align - 1) / max_align)];
// threadwise tensors
constexpr index_t HiPerThread = HoPerThread + Y - 1;
......
@@ -164,18 +164,19 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
HoPerThread>{};
// LDS: be careful of alignment
-constexpr index_t in_block_size =
+constexpr index_t in_block_element_size =
in_chwn_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-constexpr index_t wei_block_size =
+constexpr index_t wei_block_element_size =
wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead;
-__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
-__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
+__shared__ Float p_in_block[max_align * ((in_block_element_size + max_align - 1) / max_align)];
+__shared__ Float
+p_wei_block[max_align * ((wei_block_element_size + max_align - 1) / max_align)];
// register
Float p_out_thread[out_khwn_thread_desc.GetElementSpace()];
......
@@ -204,11 +204,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
true>{};
// LDS
-constexpr index_t in_block_size = in_chwn_block_desc.GetElementSpace();
-constexpr index_t wei_block_size = wei_cyxk_block_desc.GetElementSpace();
+constexpr index_t in_block_element_size = in_chwn_block_desc.GetElementSpace();
+constexpr index_t wei_block_element_size = wei_cyxk_block_desc.GetElementSpace();

-__shared__ Float p_in_block[in_block_size];
-__shared__ Float p_wei_block[wei_block_size];
+__shared__ Float p_in_block[in_block_element_size];
+__shared__ Float p_wei_block[wei_block_element_size];
// register
Float p_out_thread[out_hkwn_thread_desc.GetElementSpace()];
......
@@ -10,11 +10,9 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
constexpr auto src_mtx = SrcMatrix{};
constexpr auto dst_mtx = DstMatrix{};
-#if 1
-//NRow = 1
+#if 0
for(index_t i = 0; i < NRow; ++i)
{
-//NCol = 4
for(index_t j = 0; j < NCol; ++j)
{
const index_t src_index = src_mtx.Get1dIndex(i, j);
@@ -23,7 +21,7 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
p_dst[dst_index] = p_src[src_index];
}
}
-#elif 0
+#elif 1
static_assert(NCol == 4, "only for NCol == 4");
using vector_t = typename vector_type<Float, 4>::MemoryType;
@@ -33,22 +31,8 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
const index_t src_index = src_mtx.Get1dIndex(i, 0);
const index_t dst_index = dst_mtx.Get1dIndex(i, 0);
-#if 0
-*(reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+*(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
*(reinterpret_cast<const vector_t*>(&p_src[src_index]));
-#elif 0
-asm volatile("\n \
-ds_read2_b64 %0, %1 offset1:1 \n \
-s_waitcnt lgkmcnt(0)"
-: "=v"(*(reinterpret_cast<vector_t*>(&p_dst[dst_index])))
-: "v"(__to_local((void*)(&p_src[src_index]))));
-#elif 1
-asm volatile("\n \
-ds_read_b128 %0, %1 \n \
-s_waitcnt lgkmcnt(0)"
-: "=v"(*(reinterpret_cast<vector_t*>(&p_dst[dst_index])))
-: "v"(__to_local((void*)(&p_src[src_index]))));
-#endif
}
#endif
}
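With the inline ds_read asm paths (and the __to_local declaration removed earlier) gone, the copy relies on the plain reinterpret_cast vector load/store, and the fixed line also repairs the unbalanced parenthesis in the old cast. A self-contained sketch of the surviving 4-wide pattern, using a clang ext_vector_type in place of the repo's vector_type (all names illustrative):

    // Copy one row of 4 floats as a single vector load/store.
    // Assumes p_src and p_dst are 16-byte aligned; illustrative only.
    using float4_t = float __attribute__((ext_vector_type(4)));

    __device__ void copy_row_of_4(const float* p_src, float* p_dst)
    {
        *reinterpret_cast<float4_t*>(p_dst) =
            *reinterpret_cast<const float4_t*>(p_src);
    }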
@@ -84,13 +68,10 @@ __device__ void threadwise_gemm(MatrixA,
constexpr index_t N = c_mtx.NCol();
constexpr index_t K = a_mtx.NRow(); // A is transposed
-// K = 1
for(index_t k = 0; k < K; ++k)
{
-// M = 8
for(index_t i = 0; i < M; ++i)
{
-// N = 8
for(index_t j = 0; j < N; ++j)
{
const index_t aindex = a_mtx.Get1dIndex(k, i); // A is transposed
......
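For reference, the loop nest whose unrolled-size comments were removed computes C(i, j) += A(k, i) * B(k, j) over k, with A stored transposed (K rows, M columns). A scalar, self-contained sketch of the same nest (row-major layout, names illustrative):

    // Threadwise GEMM sketch: A is K x M (transposed), B is K x N, C is M x N.
    template <int M, int N, int K>
    __device__ void threadwise_gemm_sketch(const float* a, const float* b, float* c)
    {
        for(int k = 0; k < K; ++k)
            for(int i = 0; i < M; ++i)
                for(int j = 0; j < N; ++j)
                    c[i * N + j] += a[k * M + i] * b[k * N + j];
    }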