Commit 5245a016 authored by Chao Liu

clean up

parent 7a251a09
@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
-#elif 1
+#elif 0
// 1x1 filter, 14x14 image, C = 512
constexpr index_t N = 128;
constexpr index_t C = 512;
......
#pragma once
-typedef float Float4 __attribute__((ext_vector_type(4)));
-extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
+#include "common.hip.hpp"
#define NO_VM_WAIT 0
#define NO_LGKM_WAIT 0
@@ -10,7 +7,10 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
#define NO_DS_WRITE 0
#define NO_GLB_READ 0
-inline __device__ void vmcnt(index_t cnt)
+// cast a pointer of LDS to its address
+extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
+__device__ void vmcnt(index_t cnt)
{
#if !NO_VM_WAIT
if(cnt == 0)
@@ -44,7 +44,7 @@ inline __device__ void vmcnt(index_t cnt)
#endif
}
-inline __device__ void lgkmcnt(index_t cnt)
+__device__ void lgkmcnt(index_t cnt)
{
#if !NO_LGKM_WAIT
if(cnt == 0)
@@ -84,7 +84,7 @@ inline __device__ void lgkmcnt(index_t cnt)
#endif
}
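A note on the two wrappers above: `s_waitcnt` on AMD GCN only accepts immediate counts, which is why `vmcnt`/`lgkmcnt` take a runtime `index_t` and dispatch through an `if`-ladder onto distinct asm strings (the full ladders are collapsed in this diff). A minimal sketch of what each branch presumably emits; the committed bodies are elided, so treat the asm text as an assumption:

```cpp
// sketch: one branch per count, since "s_waitcnt vmcnt(N)" cannot take a register operand
__device__ void vmcnt_sketch(index_t cnt)
{
    if(cnt == 0)
        asm volatile("s_waitcnt vmcnt(0)" : : : "memory"); // wait for all vector-memory ops
    else if(cnt == 1)
        asm volatile("s_waitcnt vmcnt(1)" : : : "memory"); // allow 1 outstanding op
    // ... further counts follow the same pattern; lgkmcnt uses
    // "s_waitcnt lgkmcnt(N)" to wait on LDS/GDS/scalar-memory traffic
}
```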
-inline __device__ void outerProduct1x4(const float* a, const float* b, float* c)
+__device__ void outerProduct1x4(const float* a, const float* b, float* c)
{
asm volatile("\n \
v_mac_f32 %0, %4, %5 \n \
@@ -104,7 +104,9 @@ inline __device__ void outerProduct1x4(const float* a, const float* b, float* c)
"3"(c[3]));
}
-inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4& c)
+__device__ void outerProduct1x4(const float& a,
+                                const vector_type<float, 4>::MemoryType& b,
+                                vector_type<float, 4>::MemoryType& c)
{
#if 0
asm volatile(
@@ -123,8 +125,12 @@ inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4&
#endif
}
-inline __device__ void
-outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4& c2, Float4& c3)
+__device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a,
+                                const vector_type<float, 4>::MemoryType& b,
+                                vector_type<float, 4>::MemoryType& c0,
+                                vector_type<float, 4>::MemoryType& c1,
+                                vector_type<float, 4>::MemoryType& c2,
+                                vector_type<float, 4>::MemoryType& c3)
{
#if 0
asm volatile(
@@ -179,7 +185,9 @@ outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4
#endif
}
-inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4* c)
+__device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
+                                const vector_type<float, 4>::MemoryType* b,
+                                vector_type<float, 4>::MemoryType* c)
{
outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]);
outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]);
@@ -187,7 +195,7 @@ inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4*
outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]);
}
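For reference, the semantics behind these helpers: `outerProduct1x4` is a rank-1 update `c[i] += a * b[i]` lowered to four `v_mac_f32`, `outerProduct4x4` applies it once per accumulator row, and `outerProduct8x8` above tiles the thread's 8x8 accumulator as a 2x2 grid of 4x4 blocks. A plain C++ sketch of the same math (reference semantics only, not the committed asm):

```cpp
// c[i] += a * b[i], matching the v_mac_f32 sequence in outerProduct1x4
__device__ void outerProduct1x4_ref(float a, const float* b, float* c)
{
    for(int i = 0; i < 4; ++i)
        c[i] += a * b[i];
}

// 4x4 rank-1 update: row r of the accumulator gets a[r] * b[0..3]
__device__ void outerProduct4x4_ref(const float* a, const float* b, float* c /*4x4 row-major*/)
{
    for(int r = 0; r < 4; ++r)
        outerProduct1x4_ref(a[r], b, c + 4 * r);
}
```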
-inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0)
+__device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
{
#if !NO_DS_READ
if(offset == 0)
@@ -413,7 +421,9 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0)
#endif
}
-inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset = 0)
+__device__ void global_load(vector_type<float, 4>::MemoryType& r,
+                            const vector_type<float, 4>::MemoryType* ptr,
+                            index_t offset = 0)
{
#if !NO_GLB_READ
if(offset == 0)
@@ -431,7 +441,8 @@ inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset
#endif
}
-inline __device__ void ds_write_b128(const Float4& r, void* lds, index_t offset = 0)
+__device__ void
+ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
{
#if !NO_DS_WRITE
if(offset == 0)
......
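The LDS and global accessors above follow the same immediate-operand rule: the `offset:` field of `ds_read_b128`/`ds_write_b128` is encoded in the instruction, so each helper branches on `offset` and the default-`0` overload is the common case. A hedged sketch of the zero-offset branch of `ds_read_b128`, assuming the visible signature and the `__to_local` cast; the committed body is collapsed in this diff:

```cpp
// sketch: read 128 bits from LDS into a float4-typed register quadruple
__device__ void ds_read_b128_sketch(vector_type<float, 4>::MemoryType& r, void* lds)
{
    // __to_local reinterprets the generic pointer as an address-space-3 (LDS) address
    asm volatile("ds_read_b128 %0, %1" : "=v"(r) : "v"(__to_local(lds)));
}
```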
@@ -130,6 +130,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
const FloatB* __restrict__ p_b_block,
FloatC* __restrict__ p_c_thread) const
{
+static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
+                  is_same<FloatC, float>::value,
+              "Run_asm only deal with float\n");
constexpr auto True = integral_constant<bool, true>{};
constexpr auto False = integral_constant<bool, false>{};
@@ -158,6 +162,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
+static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
+                  MPerThread == 8 && NPerThread == 8,
+              "Run_asm cannot deal with this GEMM shape yet\n");
+using Float4 = vector_type<float, 4>::MemoryType;
float p_thread[a_thread_mtx.GetElementSpace() + b_thread_mtx.GetElementSpace()];
FloatA* p_a_thread = p_thread;
......
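The two `static_assert`s added to `Run_asm` pin down exactly what the hand-written path supports: `float` data only, and an 8x8-per-thread C tile (`MPerThread == NPerThread == 8`) built from 4x4 sub-tiles with `KPerThreadLoop == 1`, which is precisely the shape `outerProduct8x8` consumes. Per k-step that implies the following (a sketch of the implied inner iteration, not the elided body):

```cpp
using Float4 = vector_type<float, 4>::MemoryType;

// one KPerThreadLoop == 1 step: two Float4 fragments of A, two of B,
// and the thread's 8x8 accumulator held as 16 Float4s
__device__ void gemm_step_sketch(const Float4 a[2], const Float4 b[2], Float4 c[16])
{
    outerProduct8x8(a, b, c);
}
```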
@@ -5,8 +5,8 @@
#include "Array.hip.hpp"
#include "functional.hip.hpp"
-#if DEVICE_BACKEDN_HIP
-#include "inline_asm.hpp"
+#if DEVICE_BACKEND_HIP
+#include "amd_inline_asm.hip.hpp"
#endif
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
@@ -67,8 +67,3 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
return x < y ? x : y;
}
} // namespace mod_conv
-#if DEVICE_BACKEND_HIP
-// cast a pointer of LDS to its address
-extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
-#endif
@@ -16,12 +16,14 @@ template <>
struct vector_type<float, 2>
{
#if DEVICE_BACKEND_HIP
-// For some reason, HIP compiler need this definition to generate optimal load and store instruction
+// For some reason, HIP compiler need this definition to generate optimal load and store
+// instruction
typedef float MemoryType __attribute__((ext_vector_type(2)));
#elif DEVICE_BACKEND_CUDA
// For some reason, CUDA need this definition to, otherwise
// compiler won't generate optimal load and store instruction, and
-// kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
+// kernel would produce wrong result, indicating the compiler fail to generate correct
+// instruction,
using MemoryType = float2;
#endif
@@ -43,12 +45,14 @@ template <>
struct vector_type<float, 4>
{
#if DEVICE_BACKEND_HIP
-// For some reason, HIP compiler need this definition to generate optimal load and store instruction
+// For some reason, HIP compiler need this definition to generate optimal load and store
+// instruction
typedef float MemoryType __attribute__((ext_vector_type(4)));
#elif DEVICE_BACKEND_CUDA
// For some reason, CUDA need this definition to, otherwise
// compiler won't generate optimal load and store instruction, and
-// kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
+// kernel would produce wrong result, indicating the compiler fail to generate correct
+// instruction,
using MemoryType = float4;
#endif
};
......
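What `MemoryType` buys in both specializations: addressing memory through the vector alias is what lets the compiler collapse four scalar accesses into single 128-bit (`dwordx4`) load and store instructions. A minimal usage sketch (`copy_float4` is a hypothetical name):

```cpp
using Float4 = vector_type<float, 4>::MemoryType;

// one 128-bit load plus one 128-bit store instead of four scalar pairs
__device__ void copy_float4(const float* __restrict__ src, float* __restrict__ dst)
{
    *reinterpret_cast<Float4*>(dst) = *reinterpret_cast<const Float4*>(src);
}
```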
@@ -34,11 +34,6 @@ template <index_t GridSize,
index_t WeiBlockCopyDataPerRead>
struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
-__host__
-__device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer()
-{
-}
__device__ void Run(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global) const
@@ -203,7 +198,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
// preload data into LDS
{
-#if 1
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
@@ -212,19 +206,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
p_wei_register_clipboard);
+blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
+                                            p_in_block_double);
-#if 0
-blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double);
-#elif 0
-Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
-Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
-blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
-                                           p_in_register_clipboard);
-blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
-                                            p_wei_register_clipboard);
-#elif 1
vmcnt(0);
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_double);
@@ -278,7 +264,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
for(index_t x = 0; x < X; ++x)
{
-#if 1
+#if 0
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
@@ -291,15 +277,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
}
}
-#if 1
+#if 0
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_next);
-#elif 0
-// if work with RunLoadRegisterClipboard_asm, need to wait
+#elif 1
vmcnt(0);
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
@@ -329,7 +313,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
for(index_t x = 0; x < X; ++x)
{
-#if 1
+#if 0
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
@@ -342,19 +326,16 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
}
}
-#if 1
+#if 0
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double + wei_block_space);
#else
-// if work with RunLoadRegisterClipboard_asm, need to wait
vmcnt(0);
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
p_wei_block_double + wei_block_space);
#endif
@@ -366,7 +347,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
for(index_t x = 0; x < X; ++x)
{
-#if 1
+#if 0
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
......
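The flag flips in this file switch the double-buffered pipeline from the compiler-scheduled copies (`RunStoreRegisterClipboard`) to the inline-asm ones (`RunStoreRegisterClipboard_asm`), and the recurring `vmcnt(0)` is what makes that legal: the global loads issued through inline asm complete asynchronously, and the compiler cannot see the register dependency through the asm blocks, so an explicit wait has to separate each load from the LDS write that consumes its destination registers. The hazard pattern in isolation, using this commit's own helpers (`stage_to_lds` is a hypothetical name):

```cpp
using Float4 = vector_type<float, 4>::MemoryType;

__device__ void stage_to_lds(const Float4* p_global, void* p_lds)
{
    Float4 r;
    global_load(r, p_global); // asynchronous vector-memory load into r
    vmcnt(0);                 // s_waitcnt vmcnt(0): every outstanding load has landed in r
    ds_write_b128(r, p_lds);  // the LDS write may now safely read r
}
```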