Commit 18328e2f authored by Chao Liu

experimenting with LDS read

parent 4facbe99
@@ -593,9 +593,9 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
 #elif 1
-    // 1x1 filter, 14x14 image, C = 256
+    // 1x1 filter, 14x14 image, C = 128
     constexpr index_t N  = 128;
-    constexpr index_t C  = 256;
+    constexpr index_t C  = 128;
     constexpr index_t HI = 14;
     constexpr index_t WI = 14;
     constexpr index_t K  = 512;
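For reference, with a 1x1 filter, stride 1 and zero padding (so Ho = HI = 14 and Wo = WI = 14), this configuration corresponds to an implicit GEMM with M = K = 512, N_gemm = N * Ho * Wo = 128 * 14 * 14 = 25088, and a reduction length of C; the edit halves that reduction length from 256 to 128 and updates the comment to match (dimension mapping assumed from the chwn/cyxk/khwn kernel exercised by this commit).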
@@ -638,10 +638,16 @@ int main(int argc, char* argv[])

     if(do_verification)
     {
-#if 1
+#if 0
         in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 0
+        in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+#elif 0
+        in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+#elif 1
         in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #elif 0
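The two new #elif branches allow verification with ones in one tensor and small random integers in the other, which makes it easier to tell whether a mismatch comes from the input path or the weight path. A minimal sketch of generators with this interface (the real GeneratorTensor_1 / GeneratorTensor_2 live elsewhere in the repo and may differ in detail):

// Sketch only: hypothetical functors matching the call sites above.
#include <cstdlib>

struct GeneratorTensor_1
{
    // fill every element with 1
    template <class... Is>
    float operator()(Is...) const { return 1.0f; }
};

struct GeneratorTensor_2
{
    // fill with integers drawn from [min_value, max_value]
    int min_value = 0;
    int max_value = 1;

    template <class... Is>
    float operator()(Is...) const
    {
        return static_cast<float>(min_value + std::rand() % (max_value - min_value + 1));
    }
};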
......
@@ -336,7 +336,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     __device__ void Run(const FloatA* __restrict__ p_a_block,
                         const FloatB* __restrict__ p_b_block,
                         FloatC* __restrict__ p_c_thread,
-                        Accumulator f_accum) const
+                        Accumulator f_accum,
+                        const float* const p_lds_begin) const
     {
         constexpr auto True  = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};
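Run now takes the LDS base pointer so that the threadwise copy below can express its ds_read addresses as byte offsets from a single, known LDS origin. A hedged sketch of the caller-side wiring, mirroring the gridwise kernel change further down in this commit:

// Sketch only: how a caller is expected to provide p_lds_begin.
// Whichever __shared__ array has the lower address becomes the zero point,
// so every LDS-resident operand yields a non-negative offset later on.
const Float* p_lds_begin = p_in_block < p_wei_block ? p_in_block : p_wei_block;

blockwise_gemm.Run(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
                   p_in_block + y * Wi + x,
                   p_out_thread,
                   f_accum,
                   p_lds_begin);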
@@ -383,28 +384,36 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
             // copy A-sub to form A
             for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
             {
-                threadwise_matrix_copy(
+                threadwise_matrix_copy_v2(
                     a_block_mtx,
                     p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
                         mMyThreadOffsetA,
                     a_thread_mtx,
                     p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
-                    a_thread_sub_mtx.GetLengths());
+                    a_thread_sub_mtx.GetLengths(),
+                    p_lds_begin);
             }

 #pragma unroll
             // copy B-sub to form B
             for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
             {
-                threadwise_matrix_copy(
+                threadwise_matrix_copy_v2(
                     b_block_mtx,
                     p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
                         mMyThreadOffsetB,
                     b_thread_mtx,
                     p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
-                    b_thread_sub_mtx.GetLengths());
+                    b_thread_sub_mtx.GetLengths(),
+                    p_lds_begin);
             }

+#if 0
+            asm volatile("\n \
+            s_waitcnt lgkmcnt(0) \n \
+            " ::);
+#endif
+
             // C = A * B
             threadwise_gemm(a_thread_mtx,
                             True,
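The new (and currently disabled) s_waitcnt block documents a real hazard: once the LDS loads are issued as raw ds_read instructions inside inline asm, the compiler still tracks the output registers but knows nothing about the hardware's LGKM counter, so nothing guarantees the data has arrived before threadwise_gemm reads p_a_thread / p_b_thread. A hedged sketch of the explicit wait (draining lgkmcnt to 0 waits for all outstanding LDS traffic):

// Sketch only: force all outstanding LDS reads to complete before the
// FMA loop consumes the thread-local A/B tiles. This is what the "#if 0"
// block above would do if enabled; the "memory" clobber keeps the compiler
// from reordering accesses across the wait.
asm volatile("s_waitcnt lgkmcnt(0)" ::: "memory");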
@@ -564,7 +573,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                                          FloatB* const p_b_block,
                                          FloatC* p_c_thread,
                                          Accumulator f_accum,
-                                         float* p_lds_begin) const
+                                         const float* const p_lds_begin) const
     {
         constexpr auto True  = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};
@@ -669,7 +678,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                     p_lds_begin);
             }

-#if 1
+#if 0
             asm volatile("\n \
             s_waitcnt lgkmcnt(0) \n \
             " ::);
......
@@ -207,7 +207,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
     __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];

 #if 1
-    constexpr Float* p_lds_begin = p_wei_block;
+    const Float* p_lds_begin = p_in_block < p_wei_block ? p_in_block : p_wei_block;
 #endif

     const Float* p_in_global_block_offset =
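Two fixes in one line: the address of a __shared__ array is not a compile-time constant, so the constexpr qualifier is dropped, and taking the lower of the two LDS arrays as the base means both p_in_block and p_wei_block later yield non-negative offsets when the threadwise copy computes p_src - p_lds_begin. A minimal helper showing the byte-offset computation this base pointer enables (a sketch; the experiment additionally assumes the lowest-addressed array sits at LDS byte address 0):

// Sketch only: turn an LDS pointer into the byte offset that ds_read expects,
// assuming p_lds_begin is the lowest-addressed __shared__ array in the kernel.
template <typename Float>
__device__ uint32_t lds_byte_offset(const Float* p, const Float* p_lds_begin)
{
    return static_cast<uint32_t>(sizeof(Float) * (p - p_lds_begin));
}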
@@ -240,18 +240,18 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
             for(index_t x = 0; x < X; ++x)
             {
                 auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
-#if 0
+#if 1
                 blockwise_gemm.Run
 #elif 0
                 blockwise_gemm.Run_asm
-#elif 1
+#elif 0
                 blockwise_gemm.Run_RegisterDoubleBuffer
 #endif
                     (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
                      p_in_block + y * Wi + x,
                      p_out_thread,
                      f_accum,
                      p_lds_begin);
             }
         }
     }
......
@@ -28,12 +28,12 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
                                           DstMatrix,
                                           Float* __restrict__ p_dst,
                                           Sequence<NRow, NCol>,
-                                          const float* p_lds_begin)
+                                          const float* const p_lds_begin)
 {
     constexpr auto src_mtx = SrcMatrix{};
     constexpr auto dst_mtx = DstMatrix{};

-#if 1
+#if 0
     for(index_t i = 0; i < NRow; ++i)
     {
         for(index_t j = 0; j < NCol; ++j)
@@ -48,11 +48,11 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
             ds_read_b32 %0, %1 \n \
             "
             : "=v"(p_dst[dst_index])
-            : "v"((uint32_t)((uintptr_t)((p_src + src_index) - p_lds_begin))));
+            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
 #endif
         }
     }
-#elif 0
+#elif 1
     static_assert(NCol == 4, "only for NCol == 4");

     using vector_t = typename vector_type<Float, 4>::MemoryType;
@@ -66,11 +66,12 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
         *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
             *(reinterpret_cast<const vector_t*>(p_src + src_index));
 #elif 1
-        asm volatile("\n \
-            ds_read_b128 %0, %1, offset:0 \n \
+        asm volatile(
+            "\n \
+            ds_read_b128 %0, %1 \n \
             "
             : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index)))
-            : "v"((uint32_t)((uintptr_t)(p_src + src_index - p_lds_begin))));
+            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
 #endif
     }
 #endif
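The key correction in both asm branches is that the address operand passed to ds_read_b32 / ds_read_b128 is now a byte offset (the element distance from p_lds_begin scaled by sizeof(Float)) rather than a raw element count; the ds_read_b128 line also drops its explicit offset:0 suffix. A hedged sketch of what the vector branch works out to, using the same names as above:

// Sketch only: the corrected ds_read_b128 operand. %1 must hold a byte
// address into LDS, so the element distance is scaled by sizeof(Float);
// the 16-byte alignment ds_read_b128 needs is what the max_align padding
// of the __shared__ arrays is meant to provide.
const uint32_t lds_byte_addr = static_cast<uint32_t>(
    sizeof(Float) * ((p_src + src_index) - p_lds_begin));

asm volatile("ds_read_b128 %0, %1\n"
             : "=v"(*reinterpret_cast<vector_t*>(p_dst + dst_index))
             : "v"(lds_byte_addr));

// Note: an s_waitcnt lgkmcnt(...) still has to execute before p_dst is read,
// which is what the (currently disabled) waits in the blockwise GEMM are for.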
......