Commit 1ab4ca7e authored by Chao Liu
Browse files

refactor

parent 9a2744a6
#!/bin/bash
# Configure a CMake build with DEVICE_BACKEND=CUDA.
#
# Intended to be run from the build directory: CMake state left in the
# current directory by a previous configuration is deleted first, so every
# run starts from a clean cache.
#
# Optional environment overrides:
#   MY_PROJECT_SOURCE   path to the source tree handed to cmake
#   MY_PROJECT_INSTALL  value used for CMAKE_INSTALL_PREFIX
set -euo pipefail

# Remove stale configuration. "--" stops a file whose name starts with "-"
# from being interpreted as an option to rm.
rm -f -- CMakeCache.txt
rm -f -- *.cmake
rm -rf -- CMakeFiles

# Defaults match the original hard-coded locations; export either variable
# before calling this script to build from / install to somewhere else.
MY_PROJECT_SOURCE="${MY_PROJECT_SOURCE:-/home/chao/code/modular_convolution}"
MY_PROJECT_INSTALL="${MY_PROJECT_INSTALL:-../install.dir}"

# Expansions are quoted so paths containing spaces or glob characters reach
# cmake as a single argument each.
cmake \
  -D CMAKE_INSTALL_PREFIX="${MY_PROJECT_INSTALL}" \
  -D CMAKE_CXX_COMPILER=clang++ \
  -D CMAKE_BUILD_TYPE=Release \
  -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
  -D DEVICE_BACKEND=CUDA \
  -D BOOST_ROOT="/package/install/boost_1.67.0" \
  -D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc" \
  -D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \
  "${MY_PROJECT_SOURCE}"

# Alternative option lines kept for reference. They are comments and are not
# part of the cmake invocation above; to use one, move it into the argument
# list before the source path.
#-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc" \
#-D CMAKE_CUDA_FLAGS="-G -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61 -Xptxas -v -maxrregcount=128" \
#!/bin/bash
# Configure a CMake build with DEVICE_BACKEND=HIP, using /opt/rocm/bin/hipcc
# as the C++ compiler.
#
# Intended to be run from the build directory: CMake state left in the
# current directory by a previous configuration is deleted first, so every
# run starts from a clean cache.
#
# Optional environment overrides:
#   MY_PROJECT_SOURCE   path to the source tree handed to cmake
#   MY_PROJECT_INSTALL  value used for CMAKE_INSTALL_PREFIX
#   HIP_HIPCC_FLAGS     extra hipcc flags; "-gline-tables-only" is appended
set -euo pipefail

# Remove stale configuration. "--" stops a file whose name starts with "-"
# from being interpreted as an option to rm.
rm -f -- CMakeCache.txt
rm -f -- *.cmake
rm -rf -- CMakeFiles

# Defaults match the original hard-coded locations; export either variable
# before calling this script to build from / install to somewhere else.
MY_PROJECT_SOURCE="${MY_PROJECT_SOURCE:-/home/chao/code/modular_convolution}"
MY_PROJECT_INSTALL="${MY_PROJECT_INSTALL:-../install.dir}"

# HIP_HIPCC_FLAGS comes from the caller's environment and is usually unset;
# the ":-" default keeps "set -u" from aborting in that case and yields the
# same " -gline-tables-only" value the unguarded expansion produced.
cmake \
  -D CMAKE_INSTALL_PREFIX="${MY_PROJECT_INSTALL}" \
  -D CMAKE_BUILD_TYPE=Release \
  -D DEVICE_BACKEND="HIP" \
  -D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS:-} -gline-tables-only" \
  -D CMAKE_CXX_FLAGS="-gline-tables-only" \
  -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
  -D CMAKE_PREFIX_PATH="/opt/rocm;/home/package/build/mlopen_dep" \
  -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
  "${MY_PROJECT_SOURCE}"
......@@ -408,7 +408,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
p_lds_begin);
}
#if 0
#if 1
asm volatile("\n \
s_waitcnt lgkmcnt(0) \n \
" ::);
......
......@@ -211,18 +211,18 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
for(index_t x = 0; x < X; ++x)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 0
#if 1
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_asm
#elif 1
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
#endif
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block + y * Wi + x,
p_out_thread,
f_accum,
p_lds_begin);
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block + y * Wi + x,
p_out_thread,
f_accum,
p_lds_begin);
}
}
}
......
......@@ -94,6 +94,23 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
"
: "=v"(p_dst[dst_index + 3])
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 3) - p_lds_begin))));
#elif 0
// ds_read2_b32
using vector_t = typename vector_type<Float, 2>::MemoryType;
asm volatile(
"\n \
ds_read2_b32 %0, %1 offset1:1\n \
"
: "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index)))
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
asm volatile(
"\n \
ds_read2_b32 %0, %1 offset1:1\n \
"
: "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index + 2)))
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 2) - p_lds_begin))));
#elif 0
// ds_read_b64
using vector_t = typename vector_type<Float, 2>::MemoryType;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment