clean code

caa91db0 · root · 269adde8 · caa91db0 · caa91db0 · caa91db0
Commit caa91db0 authored Mar 17, 2021 by root
6 changed files
--- a/composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
@@ -160,7 +160,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
 #if 1
            a_thread_copy.Run(p_a_block + a_block_mtx.CalculateOffset(make_tuple(cyx_begin, 0)) +
                                  mMyThreadOffsetA,
-                              p_a_thread);
+                              p_a_thread +
+                                  b_thread_mtx.CalculateOffset(make_tuple(cyx_begin, 0, 0, 0)));
 #else
            for(index_t i = 0; i < a_thread_mtx.GetElementSpaceSize(); i++)
                p_a_thread[i] = 1;

--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
@@ -102,6 +102,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
        const index_t k_block_work_id  = get_block_1d_id() / hw_block_work_num;
        const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;

+#else
+        // Hack: this force result into SGPR
+        const index_t m_block_work_num  = __builtin_amdgcn_readfirstlane(K / KPerBlock);
+        const index_t h_block_work_num  = __builtin_amdgcn_readfirstlane(H / HPerBlock);
+        const index_t w_block_work_num  = __builtin_amdgcn_readfirstlane(W / WPerBlock);
+        const index_t hw_block_work_num = h_block_work_num * w_block_work_num;
+
+        const index_t k_block_work_id =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / hw_block_work_num);
+        const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
+#endif
+
        const index_t h_block_work_id = hw_block_work_id / w_block_work_num;
        const index_t w_block_work_id = hw_block_work_id - h_block_work_id * w_block_work_num;

@@ -110,23 +122,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v2

        static_assert(KPerBlock == KPerThread, "");

+        const auto k_thread_id = 0;
        const auto h_thread_id = get_thread_local_1d_id() / w_num_threads;
        const auto w_thread_id = get_thread_local_1d_id() % w_num_threads;
-#else
-        // Hack: this force result into SGPR
-        const index_t m_block_work_num  = __builtin_amdgcn_readfirstlane(K / KPerBlock);
-        const index_t hw_block_work_num = __builtin_amdgcn_readfirstlane(N / HWPerBlock);
-
-        const index_t k_block_work_id =
-            __builtin_amdgcn_readfirstlane(get_block_1d_id() / hw_block_work_num);
-        const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
-#endif
-
-        const index_t m_block_data_on_global = k_block_work_id * KPerBlock;

+        const index_t k_block_data_on_global = k_block_work_id * KPerBlock;
        const index_t h_block_data_on_global = h_block_work_id * HPerBlock;
        const index_t w_block_data_on_global = w_block_work_id * WPerBlock;

+        const index_t k_thread_data_on_global = k_block_data_on_global + k_thread_id * KPerThread;
+        const index_t h_thread_data_on_global = h_block_data_on_global + h_thread_id * HPerThread;
+        const index_t w_thread_data_on_global = w_block_data_on_global + w_thread_id * WPerThread;
+
        // lds max alignment
        constexpr auto max_lds_align =
            math::lcm(Number<ABlockTransferDstScalarPerVector_M>{}, Number<KPerThread>{});
@@ -167,20 +174,20 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
                                                   AThreadTransferSrcResetCoordinateAfterRun,
                                                   true>(
                a_cyx_k_global_desc,
-                make_multi_index(0, m_block_data_on_global),
+                make_multi_index(0, k_block_data_on_global),
                a_cyx_k_block_desc,
                make_multi_index(0, 0));

        constexpr auto b_cyx_n_h_w_thread_desc =
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
-                Number<CYXPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
+                Number<CYXPerBlock>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));

        using ThreadwiseTensorSliceTransferB = ThreadwiseDynamicTensorSliceTransfer_v2<
            Float,
            Float,
            decltype(b_cyx_n_h_w_global_desc),
            decltype(b_cyx_n_h_w_thread_desc),
-            Sequence<CYXPerThread, 1, HPerThread, WPerThread>,
+            Sequence<CYXPerBlock, 1, HPerThread, WPerThread>,
            Sequence<3, 2, 0, 1>, // BBlockTransferSrcAccessOrder,
            3,                    // BBlockTransferSrcVectorDim,
            1,                    // BBlockTransferSrcScalarPerVector,
@@ -192,10 +199,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2

        ThreadwiseTensorSliceTransferB b_threadwise_transfer(
            b_cyx_n_h_w_global_desc,
-            make_multi_index(0,
-                             0,
-                             h_block_data_on_global + h_thread_id * HPerThread,
-                             w_block_data_on_global + w_thread_id * WPerThread));
+            make_multi_index(0, 0, h_thread_data_on_global, w_thread_data_on_global));

        // c_thread_mtx definition: this is a mess
        // TODO:: more elegent way of defining c_thread_mtx
@@ -229,13 +233,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
        // register allocation for output
        AccFloat p_c_thread[c_k_n_h_w_thread_desc.GetElementSpaceSize()];

-        for(index_t i = 0; i < c_k_n_h_w_thread_desc.GetElementSpaceSize(); i++)
-        {
-            p_c_thread[i] = 0;
-        }
-
        // zero out threadwise output
-        // threadwise_matrix_set_zero_v2(c_k_n_h_w_thread_desc, p_c_thread);
+        threadwise_matrix_set_zero_v3(c_k_n_h_w_thread_desc, p_c_thread);

        constexpr auto a_block_slice_copy_step  = make_multi_index(CYXPerBlock, 0);
        constexpr auto b_thread_slice_copy_step = make_multi_index(CYXPerBlock, 0, 0, 0);
@@ -272,10 +271,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
 #if 0
            __syncthreads();

-            //index_t sum = 0;
-            //for(index_t i = 0; i < b_cyx_n_h_w_thread_desc.GetElementSpaceSize(); i++)
-                //sum += p_b_thread_double[i];
-
            p_c_thread[0] += p_b_thread_double[0] + p_b_thread_double[1] + p_b_thread_double[2];
            p_c_thread[0] += p_b_thread_double[3] + p_b_thread_double[4] + p_b_thread_double[5];
            p_c_thread[0] += p_b_thread_double[6] + p_b_thread_double[7] + p_b_thread_double[8];
@@ -407,24 +402,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
                make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
                    Number<KPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));

-            // calculate origin of thread input tensor on global memory
-            //     blockwise GEMM c matrix starting index
-#if 0
-            const auto c_thread_mtx_on_block =
-                blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
-
-            const index_t m_thread_data_on_global =
-                m_block_data_on_global + c_thread_mtx_on_block.row;
-
-            const index_t n_thread_data_on_global =
-                n_block_data_on_global + c_thread_mtx_on_block.col;
-#endif
-            const index_t m_thread_data_on_global = m_block_data_on_global;
-            const index_t h_thread_data_on_global =
-                h_block_data_on_global + h_thread_id * HPerThread;
-            const index_t w_thread_data_on_global =
-                w_block_data_on_global + w_thread_id * WPerThread;
-
            // hack to control index calculation when iterating over c_k_n_h_w_global tensor
            constexpr auto c_k_n_h_w_global_tensor_iterator_hacks = CGlobalIteratorHacks{};

@@ -444,7 +421,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
                true>(
                c_k_n_h_w_global_desc,
                make_multi_index(
-                    m_thread_data_on_global, 0, h_thread_data_on_global, w_thread_data_on_global))
+                    k_thread_data_on_global, 0, h_thread_data_on_global, w_thread_data_on_global))
                .Run(c_k_n_h_w_thread_desc,
                     make_tuple(I0, I0, I0, I0),
                     p_c_thread,

--- a/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
@@ -12,20 +12,24 @@ __device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread
    static_assert(Desc::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time");

    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};

    constexpr auto desc = Desc{};

-    constexpr auto M = desc.GetLength(I0);
-    constexpr auto N = desc.GetLength(I1);
+    constexpr auto K = desc.GetLength(I0);
+    constexpr auto H = desc.GetLength(I2);
+    constexpr auto W = desc.GetLength(I3);

-    static_for<0, M, 1>{}([&](auto i) {
-        static_for<0, N, 1>{}([&](auto j) {
-            constexpr auto offset = desc.CalculateOffset(make_tuple(i, j));
+    static_for<0, K, 1>{}([&](auto i) {
+        static_for<0, H, 1>{}([&](auto j) {
+            static_for<0, W, 1>{}([&](auto k) {
+                constexpr auto offset = desc.CalculateOffset(make_tuple(i, 0, j, k));

                p_thread[offset] = Float(0);
            });
        });
+    });
 }

 // C[M, N] += transpose(A[K, M]) * B[K, N]

--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
@@ -71,12 +71,12 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
    constexpr index_t BlockSize = 64;

    constexpr index_t KPerBlock   = 16;
-    constexpr index_t HPerBlock   = 8;
+    constexpr index_t HPerBlock   = 16;
    constexpr index_t WPerBlock   = 16;
    constexpr index_t CYXPerBlock = 4;

-    constexpr index_t KPerThread   = 16;
-    constexpr index_t HPerThread   = 1;
+    constexpr index_t KPerThread   = KPerBlock;
+    constexpr index_t HPerThread   = 2;
    constexpr index_t WPerThread   = 2;
    constexpr index_t CYXPerThread = 4;


--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -82,8 +82,8 @@ int main(int argc, char* argv[])
 #elif 1
    constexpr index_t N  = 1;
    constexpr index_t C  = 4;
-    constexpr index_t HI = 1080;
-    constexpr index_t WI = 1920;
+    constexpr index_t HI = 1024;
+    constexpr index_t WI = 2048;
    constexpr index_t K  = 16;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

--- a/script/cmake-rocm3.7.sh
+++ b/script/cmake-rocm3.7.sh
@@ -10,7 +10,7 @@ cmake
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                                                                      \
 -D CMAKE_BUILD_TYPE=Release                                                                                                        \
 -D DEVICE_BACKEND="AMD"                                                                                                            \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0"                   \
+-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -save-temps=$CWD -ftemplate-backtrace-limit=0"                   \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                                                          \
 -D CMAKE_PREFIX_PATH="/opt/rocm"                                                                                                   \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                                                  \