"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "8cb5ac1eb22e20f83db2cab985bc879d93b5fabc"
Commit 60d30627 authored by Jing Zhang's avatar Jing Zhang
Browse files

added inline assembly into thread-wise gemm

parent 2c9b8c24
#!/bin/bash
rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles
MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
MY_PROJECT_INSTALL=../install.dir
cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_CXX_COMPILER=clang++ \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D DEVICE_BACKEND=CUDA \
-D BOOST_ROOT="/package/install/boost_1.67.0" \
-D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc" \
-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \
${MY_PROJECT_SOURCE}
#-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc" \
#-D CMAKE_CUDA_FLAGS="-G -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61 -Xptxas -v -maxrregcount=128" \
#!/bin/bash
rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles
MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
MY_PROJECT_INSTALL=../install.dir
cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \
-D DEVICE_BACKEND="HIP" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
${MY_PROJECT_SOURCE}
...@@ -53,17 +53,39 @@ __device__ void threadwise_gemm(MatrixA, ...@@ -53,17 +53,39 @@ __device__ void threadwise_gemm(MatrixA,
constexpr unsigned N = c_mtx.NCol(); constexpr unsigned N = c_mtx.NCol();
constexpr unsigned K = a_mtx.NRow(); // A is transposed constexpr unsigned K = a_mtx.NRow(); // A is transposed
assert(M == 8);
assert(N == 8);
assert(K == 1);
for(unsigned k = 0; k < K; ++k) for(unsigned k = 0; k < K; ++k)
{ {
const unsigned bindex = b_mtx.Get1dIndex(k, 0);
for(unsigned i = 0; i < M; ++i) for(unsigned i = 0; i < M; ++i)
{ {
for(unsigned j = 0; j < N; ++j) const unsigned aindex = a_mtx.Get1dIndex(k, i); // A is transposed
const unsigned cindex = c_mtx.Get1dIndex(i, 0);
//N = 8
//for(unsigned j = 0; j < N; ++j)
{ {
const unsigned aindex = a_mtx.Get1dIndex(k, i); // A is transposed //const unsigned bindex = b_mtx.Get1dIndex(k, j);
const unsigned bindex = b_mtx.Get1dIndex(k, j); //const unsigned cindex = c_mtx.Get1dIndex(i, j);
const unsigned cindex = c_mtx.Get1dIndex(i, j); //f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); asm volatile("\n \
v_mac_f32 %0, %8, %9 \n \
v_mac_f32 %1, %8, %10 \n \
v_mac_f32 %2, %8, %11 \n \
v_mac_f32 %3, %8, %12 \n \
v_mac_f32 %4, %8, %13 \n \
v_mac_f32 %5, %8, %14 \n \
v_mac_f32 %6, %8, %15 \n \
v_mac_f32 %7, %8, %16 \n \
"
: "=v"(p_c_thread[cindex + 0]),"=v"(p_c_thread[cindex + 1]),"=v"(p_c_thread[cindex + 2]),"=v"(p_c_thread[cindex + 3]),"=v"(p_c_thread[cindex + 4]),"=v"(p_c_thread[cindex + 5]),"=v"(p_c_thread[cindex + 6]),"=v"(p_c_thread[cindex + 7])
: "v"(p_a_thread[aindex]), "v"(p_b_thread[bindex + 0]), "v"(p_b_thread[bindex + 1]),"v"(p_b_thread[bindex + 2]),"v"(p_b_thread[bindex + 3]),"v"(p_b_thread[bindex + 4]),"v"(p_b_thread[bindex + 5]),"v"(p_b_thread[bindex + 6]),"v"(p_b_thread[bindex + 7]),
"0"(p_c_thread[cindex + 0]),"1"(p_c_thread[cindex + 1]),"2"(p_c_thread[cindex + 2]),"3"(p_c_thread[cindex + 3]),"4"(p_c_thread[cindex + 4]),"5"(p_c_thread[cindex + 5]),"6"(p_c_thread[cindex + 6]),"7"(p_c_thread[cindex + 7])
);
} }
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment