refactor

1ab4ca7e · Chao Liu · 9a2744a6 · 9a2744a6 · 9a2744a6 · 1ab4ca7e
Commit 1ab4ca7e authored Mar 26, 2019 by Chao Liu
5 changed files
--- a/build/cmake-cuda.sh
+++ b/build/cmake-cuda.sh
-#!/bin/bash
-
-rm -f CMakeCache.txt
-rm -f *.cmake
-rm -rf CMakeFiles
-
-MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
-MY_PROJECT_INSTALL=../install.dir
-
-cmake                                                                                       \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                               \
-D CMAKE_CXX_COMPILER=clang++                                                               \
-D CMAKE_BUILD_TYPE=Release                                                                 \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                           \
-D DEVICE_BACKEND=CUDA                                                                      \
-D BOOST_ROOT="/package/install/boost_1.67.0"                                               \
-D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc"  \
-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \
-${MY_PROJECT_SOURCE}
-
-
-#-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc"                                \
-#-D CMAKE_CUDA_FLAGS="-G -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61"            \
-#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \
-#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61 -Xptxas -v -maxrregcount=128" \
--- a/build/cmake-hip.sh
+++ b/build/cmake-hip.sh
-#!/bin/bash
-
-rm -f CMakeCache.txt
-rm -f *.cmake
-rm -rf CMakeFiles
-
-MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
-MY_PROJECT_INSTALL=../install.dir
-
-cmake                                                                                       \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                               \
-D CMAKE_BUILD_TYPE=Release                                                                 \
-D DEVICE_BACKEND="HIP"                                                                     \
-D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only"                                  \
-D CMAKE_CXX_FLAGS="-gline-tables-only"                                                     \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                   \
-D CMAKE_PREFIX_PATH="/opt/rocm;/home/package/build/mlopen_dep"                             \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                           \
-${MY_PROJECT_SOURCE}
--- a/src/include/blockwise_gemm.hip.hpp
+++ b/src/include/blockwise_gemm.hip.hpp
@@ -408,7 +408,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                    p_lds_begin);
            }

-#if 0
+#if 1
            asm volatile("\n \
            s_waitcnt lgkmcnt(0) \n \
            " ::);

--- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
@@ -211,18 +211,18 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
            for(index_t x = 0; x < X; ++x)
            {
                auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
-#if 0
+#if 1
                blockwise_gemm.Run
 #elif 0
                blockwise_gemm.Run_asm
-#elif 1
+#elif 0
                blockwise_gemm.Run_RegisterDoubleBuffer
 #endif
-                (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
-                 p_in_block + y * Wi + x,
-                 p_out_thread,
-                 f_accum,
-                 p_lds_begin);
+                    (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
+                     p_in_block + y * Wi + x,
+                     p_out_thread,
+                     f_accum,
+                     p_lds_begin);
            }
        }
    }

--- a/src/include/threadwise_gemm.hip.hpp
+++ b/src/include/threadwise_gemm.hip.hpp
@@ -94,6 +94,23 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
                    "
            : "=v"(p_dst[dst_index + 3])
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 3) - p_lds_begin))));
+#elif 0
+        // ds_read2_b32
+        using vector_t = typename vector_type<Float, 2>::MemoryType;
+
+        asm volatile(
+            "\n \
+                    ds_read2_b32 %0, %1 offset1:1\n \
+                    "
+            : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index)))
+            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
+
+        asm volatile(
+            "\n \
+                    ds_read2_b32 %0, %1 offset1:1\n \
+                    "
+            : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index + 2)))
+            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 2) - p_lds_begin))));
 #elif 0
        // ds_read_b64
        using vector_t = typename vector_type<Float, 2>::MemoryType;