Commit d6d9a8e4 authored by Chao Liu

Jing's ds_read inline asm

parent 766b0a9e
......@@ -190,8 +190,8 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t BlockSize = 256;
#elif 1
// 1x1, 14x14, Vega 10
#elif 0
// 1x1, 14x14, Vega 20
constexpr index_t BPerBlock = 64;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
......@@ -219,6 +219,36 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t BlockSize = 128;
#elif 1
// 1x1, 14x14, Vega 20, hack CPerBlock = 1
constexpr index_t BPerBlock = 64;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 1;
constexpr index_t BPerThread = 8;
constexpr index_t KPerThread = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t BlockSize = 128;
#endif
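
For reference, a minimal standalone sketch (not part of the commit) of how the tuning parameters in the `CPerBlock = 1` hack above are assumed to fit together; the pairing of the K/B block dimensions with the GEMM M/N cluster parameters is inferred from the Run_asm code further down and should be read as an assumption, not a statement of the kernel's design:

    // Hedged, compile-time-only sanity check; constants mirror the block above.
    constexpr int BPerBlock = 64, KPerBlock = 128;
    constexpr int BPerThread = 8, KPerThread = 8;
    constexpr int GemmMPerThreadSubC = 4, GemmNPerThreadSubC = 4;
    constexpr int GemmMLevel0Cluster = 4, GemmNLevel0Cluster = 2;
    constexpr int GemmMLevel1Cluster = 4, GemmNLevel1Cluster = 4;
    constexpr int BlockSize = 128;

    // Assumption: the M/N thread clusters account for every thread in the block.
    static_assert(GemmMLevel0Cluster * GemmNLevel0Cluster * GemmMLevel1Cluster *
                          GemmNLevel1Cluster ==
                      BlockSize,
                  "cluster layout does not fill the block");
    // Assumption: each thread's C tile is two 4x4 sub-tiles per dimension (MRepeat = NRepeat = 2).
    static_assert(KPerThread == 2 * GemmMPerThreadSubC && BPerThread == 2 * GemmNPerThreadSubC,
                  "per-thread tile does not match the sub-tile repeats");
    // Assumption: per-block tile = per-thread tile times the thread clusters in that dimension.
    static_assert(KPerBlock == KPerThread * GemmMLevel0Cluster * GemmMLevel1Cluster &&
                      BPerBlock == BPerThread * GemmNLevel0Cluster * GemmNLevel1Cluster,
                  "block tile does not match the thread clusters");
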
......
......@@ -420,9 +420,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
}
template <class FloatA, class FloatB, class FloatC, class Accumulator>
__device__ void Run_asm(const FloatA* __restrict__ p_a_block,
const FloatB* __restrict__ p_b_block,
FloatC* __restrict__ p_c_thread,
__device__ void Run_asm(const FloatA* const __restrict__ p_a_block,
const FloatB* const __restrict__ p_b_block,
FloatC* const __restrict__ p_c_thread,
Accumulator f_accum) const
{
constexpr auto True = integral_constant<bool, true>{};
......@@ -462,11 +462,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && MRepeat == 2 && NRepeat == 2 &&
KPerThreadLoop == 1 && K == 1,
"asm is not for this mtx shape");
const FloatA* const p_a_block_thread_offset = p_a_block + mMyThreadOffsetA;
#pragma unroll
// loop over k
for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop)
{
//#pragma unroll
#if 0
#pragma unroll
// copy A-sub to form A
for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{
......@@ -475,9 +482,65 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
mMyThreadOffsetA,
a_thread_mtx,
p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
a_thread_sub_mtx.GetLengths());
}
#elif 1
// this produces the right result
using vectorA_t = typename vector_type<FloatA, 4>::MemoryType; // float4 when FloatA is float
asm volatile(
"\n \
ds_read_b128 %0, %1 \n \
s_waitcnt lgkmcnt(0)"
: "=v"(*(reinterpret_cast<vectorA_t*>(p_a_thread + a_thread_mtx.Get1dIndex(0, 0))))
: "v"(__to_local(
(void*)(p_a_block + a_block_mtx.Get1dIndex(k_begin, 0) + mMyThreadOffsetA))));
asm volatile("\n \
ds_read_b128 %0, %1 \n \
s_waitcnt lgkmcnt(0)"
: "=v"(*(reinterpret_cast<vectorA_t*>(
p_a_thread + a_thread_mtx.Get1dIndex(0, MPerThreadSubC))))
: "v"(__to_local((
void*)(p_a_block + a_block_mtx.Get1dIndex(k_begin, MPerLevel1Cluster) +
mMyThreadOffsetA))));
#elif 0
// this produces a wrong result
using vectorA_t = typename vector_type<FloatA, 4>::MemoryType; // float4 when FloatA is float
asm volatile(
"\n \
ds_read_b128 %0, %2 \n \
ds_read_b128 %1, %3 \n \
s_waitcnt lgkmcnt(0)"
: "=v"(*(reinterpret_cast<vectorA_t*>(p_a_thread + a_thread_mtx.Get1dIndex(0, 0)))),
"=v"(*(reinterpret_cast<vectorA_t*>(p_a_thread +
a_thread_mtx.Get1dIndex(0, MPerThreadSubC))))
: "v"(__to_local(
(void*)(p_a_block + a_block_mtx.Get1dIndex(k_begin, 0) + mMyThreadOffsetA))),
"v"(__to_local((void*)(p_a_block +
a_block_mtx.Get1dIndex(k_begin, MPerLevel1Cluster) +
mMyThreadOffsetA))));
#elif 1
// this produces a wrong result
using vectorA_t = typename vector_type<FloatA, 4>::MemoryType; // float4 when FloatA is float
asm volatile(
"\n \
ds_read_b128 %0, %1 \n \
s_waitcnt lgkmcnt(0)"
: "=v"(*(reinterpret_cast<vectorA_t*>(p_a_thread + a_thread_mtx.Get1dIndex(0, 0))))
: "v"(__to_local((void*)(p_a_block_thread_offset))));
asm volatile("\n \
ds_read_b128 %0, %1 offset:16 \n \
s_waitcnt lgkmcnt(0)"
: "=v"(*(reinterpret_cast<vectorA_t*>(
p_a_thread + a_thread_mtx.Get1dIndex(0, MPerThreadSubC))))
: "v"(__to_local((void*)(p_a_block_thread_offset))));
#endif
//#pragma unroll
// copy B-sub to form B
......
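
To summarize the pattern the "right result" branch above relies on, here is a minimal hedged sketch: one asm volatile statement per 16-byte LDS load, with the destination held as a 128-bit vector and the address produced by __to_local (declared elsewhere in this commit). The helper name lds_read_float4 and the use of plain float4 in place of vector_type<FloatA, 4>::MemoryType are illustrative only. One plausible reason the fused two-read variant misbehaves is that its outputs are not marked early-clobber ("=&v"), so an output register could alias the second read's address operand; that is a guess, not something the commit states.

    // Hedged sketch: load one float4 (16 bytes) from LDS via ds_read_b128.
    __device__ void lds_read_float4(const float* p_lds_src, float4& dst)
    {
        asm volatile("\n \
        ds_read_b128 %0, %1 \n \
        s_waitcnt lgkmcnt(0)"                           // block until the LDS read has landed in %0
                     : "=v"(dst)                        // four VGPRs receive the 128-bit result
                     : "v"(__to_local((void*)p_lds_src))); // 32-bit LDS address in one VGPR
    }
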
......@@ -5,6 +5,8 @@
#include "Array.hip.hpp"
#include "functional.hip.hpp"
extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; }
......
......@@ -238,7 +238,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 0
blockwise_gemm.Run
#elif 0
#elif 1
blockwise_gemm.Run_asm
#elif 1
blockwise_gemm.Run_RegisterDoubleBuffer
......
......@@ -289,10 +289,10 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer(
#else
blockwise_gemm.Run_RegisterDoubleBuffer
#endif
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_now + y * Wi + x,
p_out_thread,
f_accum);
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_now + y * Wi + x,
p_out_thread,
f_accum);
}
}
......@@ -319,10 +319,10 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer(
#else
blockwise_gemm.Run_RegisterDoubleBuffer
#endif
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_now + y * Wi + x,
p_out_thread,
f_accum);
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_now + y * Wi + x,
p_out_thread,
f_accum);
}
}
}
......
......@@ -10,7 +10,7 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
constexpr auto src_mtx = SrcMatrix{};
constexpr auto dst_mtx = DstMatrix{};
#if 0
#if 1
for(index_t i = 0; i < NRow; ++i)
{
for(index_t j = 0; j < NCol; ++j)
......@@ -21,7 +21,7 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
p_dst[dst_index] = p_src[src_index];
}
}
#elif 1
#elif 0
static_assert(NCol == 4, "only for NCol == 4");
using vector_t = typename vector_type<Float, 4>::MemoryType;
......@@ -31,15 +31,21 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
const index_t src_index = src_mtx.Get1dIndex(i, 0);
const index_t dst_index = dst_mtx.Get1dIndex(i, 0);
#if 1
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
*(reinterpret_cast<const vector_t*>(p_src + src_index));
#if 0
*(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
*(reinterpret_cast<const vector_t*>(&p_src[src_index]));
#elif 0
asm volatile("\n \
ds_read2_b64 %0, %1 offset1:1 \n \
s_waitcnt lgkmcnt(0)"
: "=v"(*(reinterpret_cast<vector_t*>(&p_dst[dst_index])))
: "v"(__to_local((void*)(&p_src[src_index]))));
#elif 1
asm volatile("\n \
ds_read_b128 %0, %1, offset:0 \n \
"
: "=v"(*(reinterpret_cast<vector_t*>(p_dst+dst_index)))
: "v"((uint32_t)(p_src + src_index)));
ds_read_b128 %0, %1 \n \
s_waitcnt lgkmcnt(0)"
: "=v"(*(reinterpret_cast<vector_t*>(&p_dst[dst_index])))
: "v"(__to_local((void*)(&p_src[src_index]))));
#endif
}
#endif
......
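
The ds_read2_b64 ... offset1:1 branch above is an alternative encoding of the same 16-byte copy: two 8-byte LDS reads into adjacent register pairs, where offset0/offset1 are given in 8-byte units for the _b64 form, so offset1:1 means the second half comes from byte offset 8. A minimal hedged sketch of that variant, with the same caveats as before (hypothetical helper name, float4 standing in for vector_type<Float, 4>::MemoryType, __to_local as declared in this commit):

    // Hedged sketch: the ds_read2_b64 form of the 16-byte LDS load.
    __device__ void lds_read_2x_b64(const float* p_lds_src, float4& dst)
    {
        asm volatile("\n \
        ds_read2_b64 %0, %1 offset1:1 \n \
        s_waitcnt lgkmcnt(0)"              // offsets in 8-byte units: reads bytes [0,8) and [8,16)
                     : "=v"(dst)
                     : "v"(__to_local((void*)p_lds_src)));
    }
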