"...composable_kernel_rocm.git" did not exist on "37bfa01c0dce53959ea05abf31a9802ecca66c48"
Commit caa91db0 authored by root's avatar root
Browse files

clean code

parent 269adde8
...@@ -160,7 +160,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3 ...@@ -160,7 +160,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
#if 1 #if 1
a_thread_copy.Run(p_a_block + a_block_mtx.CalculateOffset(make_tuple(cyx_begin, 0)) + a_thread_copy.Run(p_a_block + a_block_mtx.CalculateOffset(make_tuple(cyx_begin, 0)) +
mMyThreadOffsetA, mMyThreadOffsetA,
p_a_thread); p_a_thread +
b_thread_mtx.CalculateOffset(make_tuple(cyx_begin, 0, 0, 0)));
#else #else
for(index_t i = 0; i < a_thread_mtx.GetElementSpaceSize(); i++) for(index_t i = 0; i < a_thread_mtx.GetElementSpaceSize(); i++)
p_a_thread[i] = 1; p_a_thread[i] = 1;
......
...@@ -102,6 +102,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -102,6 +102,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
const index_t k_block_work_id = get_block_1d_id() / hw_block_work_num; const index_t k_block_work_id = get_block_1d_id() / hw_block_work_num;
const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num; const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
#else
// Hack: this force result into SGPR
const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(K / KPerBlock);
const index_t h_block_work_num = __builtin_amdgcn_readfirstlane(H / HPerBlock);
const index_t w_block_work_num = __builtin_amdgcn_readfirstlane(W / WPerBlock);
const index_t hw_block_work_num = h_block_work_num * w_block_work_num;
const index_t k_block_work_id =
__builtin_amdgcn_readfirstlane(get_block_1d_id() / hw_block_work_num);
const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
#endif
const index_t h_block_work_id = hw_block_work_id / w_block_work_num; const index_t h_block_work_id = hw_block_work_id / w_block_work_num;
const index_t w_block_work_id = hw_block_work_id - h_block_work_id * w_block_work_num; const index_t w_block_work_id = hw_block_work_id - h_block_work_id * w_block_work_num;
...@@ -110,23 +122,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -110,23 +122,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
static_assert(KPerBlock == KPerThread, ""); static_assert(KPerBlock == KPerThread, "");
const auto k_thread_id = 0;
const auto h_thread_id = get_thread_local_1d_id() / w_num_threads; const auto h_thread_id = get_thread_local_1d_id() / w_num_threads;
const auto w_thread_id = get_thread_local_1d_id() % w_num_threads; const auto w_thread_id = get_thread_local_1d_id() % w_num_threads;
#else
// Hack: this force result into SGPR
const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(K / KPerBlock);
const index_t hw_block_work_num = __builtin_amdgcn_readfirstlane(N / HWPerBlock);
const index_t k_block_work_id =
__builtin_amdgcn_readfirstlane(get_block_1d_id() / hw_block_work_num);
const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
#endif
const index_t m_block_data_on_global = k_block_work_id * KPerBlock;
const index_t k_block_data_on_global = k_block_work_id * KPerBlock;
const index_t h_block_data_on_global = h_block_work_id * HPerBlock; const index_t h_block_data_on_global = h_block_work_id * HPerBlock;
const index_t w_block_data_on_global = w_block_work_id * WPerBlock; const index_t w_block_data_on_global = w_block_work_id * WPerBlock;
const index_t k_thread_data_on_global = k_block_data_on_global + k_thread_id * KPerThread;
const index_t h_thread_data_on_global = h_block_data_on_global + h_thread_id * HPerThread;
const index_t w_thread_data_on_global = w_block_data_on_global + w_thread_id * WPerThread;
// lds max alignment // lds max alignment
constexpr auto max_lds_align = constexpr auto max_lds_align =
math::lcm(Number<ABlockTransferDstScalarPerVector_M>{}, Number<KPerThread>{}); math::lcm(Number<ABlockTransferDstScalarPerVector_M>{}, Number<KPerThread>{});
...@@ -167,20 +174,20 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -167,20 +174,20 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
AThreadTransferSrcResetCoordinateAfterRun, AThreadTransferSrcResetCoordinateAfterRun,
true>( true>(
a_cyx_k_global_desc, a_cyx_k_global_desc,
make_multi_index(0, m_block_data_on_global), make_multi_index(0, k_block_data_on_global),
a_cyx_k_block_desc, a_cyx_k_block_desc,
make_multi_index(0, 0)); make_multi_index(0, 0));
constexpr auto b_cyx_n_h_w_thread_desc = constexpr auto b_cyx_n_h_w_thread_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<CYXPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{})); Number<CYXPerBlock>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
using ThreadwiseTensorSliceTransferB = ThreadwiseDynamicTensorSliceTransfer_v2< using ThreadwiseTensorSliceTransferB = ThreadwiseDynamicTensorSliceTransfer_v2<
Float, Float,
Float, Float,
decltype(b_cyx_n_h_w_global_desc), decltype(b_cyx_n_h_w_global_desc),
decltype(b_cyx_n_h_w_thread_desc), decltype(b_cyx_n_h_w_thread_desc),
Sequence<CYXPerThread, 1, HPerThread, WPerThread>, Sequence<CYXPerBlock, 1, HPerThread, WPerThread>,
Sequence<3, 2, 0, 1>, // BBlockTransferSrcAccessOrder, Sequence<3, 2, 0, 1>, // BBlockTransferSrcAccessOrder,
3, // BBlockTransferSrcVectorDim, 3, // BBlockTransferSrcVectorDim,
1, // BBlockTransferSrcScalarPerVector, 1, // BBlockTransferSrcScalarPerVector,
...@@ -192,10 +199,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -192,10 +199,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
ThreadwiseTensorSliceTransferB b_threadwise_transfer( ThreadwiseTensorSliceTransferB b_threadwise_transfer(
b_cyx_n_h_w_global_desc, b_cyx_n_h_w_global_desc,
make_multi_index(0, make_multi_index(0, 0, h_thread_data_on_global, w_thread_data_on_global));
0,
h_block_data_on_global + h_thread_id * HPerThread,
w_block_data_on_global + w_thread_id * WPerThread));
// c_thread_mtx definition: this is a mess // c_thread_mtx definition: this is a mess
// TODO:: more elegent way of defining c_thread_mtx // TODO:: more elegent way of defining c_thread_mtx
...@@ -229,13 +233,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -229,13 +233,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
// register allocation for output // register allocation for output
AccFloat p_c_thread[c_k_n_h_w_thread_desc.GetElementSpaceSize()]; AccFloat p_c_thread[c_k_n_h_w_thread_desc.GetElementSpaceSize()];
for(index_t i = 0; i < c_k_n_h_w_thread_desc.GetElementSpaceSize(); i++)
{
p_c_thread[i] = 0;
}
// zero out threadwise output // zero out threadwise output
// threadwise_matrix_set_zero_v2(c_k_n_h_w_thread_desc, p_c_thread); threadwise_matrix_set_zero_v3(c_k_n_h_w_thread_desc, p_c_thread);
constexpr auto a_block_slice_copy_step = make_multi_index(CYXPerBlock, 0); constexpr auto a_block_slice_copy_step = make_multi_index(CYXPerBlock, 0);
constexpr auto b_thread_slice_copy_step = make_multi_index(CYXPerBlock, 0, 0, 0); constexpr auto b_thread_slice_copy_step = make_multi_index(CYXPerBlock, 0, 0, 0);
...@@ -272,10 +271,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -272,10 +271,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
#if 0 #if 0
__syncthreads(); __syncthreads();
//index_t sum = 0;
//for(index_t i = 0; i < b_cyx_n_h_w_thread_desc.GetElementSpaceSize(); i++)
//sum += p_b_thread_double[i];
p_c_thread[0] += p_b_thread_double[0] + p_b_thread_double[1] + p_b_thread_double[2]; p_c_thread[0] += p_b_thread_double[0] + p_b_thread_double[1] + p_b_thread_double[2];
p_c_thread[0] += p_b_thread_double[3] + p_b_thread_double[4] + p_b_thread_double[5]; p_c_thread[0] += p_b_thread_double[3] + p_b_thread_double[4] + p_b_thread_double[5];
p_c_thread[0] += p_b_thread_double[6] + p_b_thread_double[7] + p_b_thread_double[8]; p_c_thread[0] += p_b_thread_double[6] + p_b_thread_double[7] + p_b_thread_double[8];
...@@ -407,24 +402,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -407,24 +402,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<KPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{})); Number<KPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
// calculate origin of thread input tensor on global memory
// blockwise GEMM c matrix starting index
#if 0
const auto c_thread_mtx_on_block =
blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
const index_t m_thread_data_on_global =
m_block_data_on_global + c_thread_mtx_on_block.row;
const index_t n_thread_data_on_global =
n_block_data_on_global + c_thread_mtx_on_block.col;
#endif
const index_t m_thread_data_on_global = m_block_data_on_global;
const index_t h_thread_data_on_global =
h_block_data_on_global + h_thread_id * HPerThread;
const index_t w_thread_data_on_global =
w_block_data_on_global + w_thread_id * WPerThread;
// hack to control index calculation when iterating over c_k_n_h_w_global tensor // hack to control index calculation when iterating over c_k_n_h_w_global tensor
constexpr auto c_k_n_h_w_global_tensor_iterator_hacks = CGlobalIteratorHacks{}; constexpr auto c_k_n_h_w_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
...@@ -444,7 +421,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -444,7 +421,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
true>( true>(
c_k_n_h_w_global_desc, c_k_n_h_w_global_desc,
make_multi_index( make_multi_index(
m_thread_data_on_global, 0, h_thread_data_on_global, w_thread_data_on_global)) k_thread_data_on_global, 0, h_thread_data_on_global, w_thread_data_on_global))
.Run(c_k_n_h_w_thread_desc, .Run(c_k_n_h_w_thread_desc,
make_tuple(I0, I0, I0, I0), make_tuple(I0, I0, I0, I0),
p_c_thread, p_c_thread,
......
...@@ -12,18 +12,22 @@ __device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread ...@@ -12,18 +12,22 @@ __device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread
static_assert(Desc::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time"); static_assert(Desc::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time");
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto desc = Desc{}; constexpr auto desc = Desc{};
constexpr auto M = desc.GetLength(I0); constexpr auto K = desc.GetLength(I0);
constexpr auto N = desc.GetLength(I1); constexpr auto H = desc.GetLength(I2);
constexpr auto W = desc.GetLength(I3);
static_for<0, M, 1>{}([&](auto i) { static_for<0, K, 1>{}([&](auto i) {
static_for<0, N, 1>{}([&](auto j) { static_for<0, H, 1>{}([&](auto j) {
constexpr auto offset = desc.CalculateOffset(make_tuple(i, j)); static_for<0, W, 1>{}([&](auto k) {
constexpr auto offset = desc.CalculateOffset(make_tuple(i, 0, j, k));
p_thread[offset] = Float(0); p_thread[offset] = Float(0);
});
}); });
}); });
} }
......
...@@ -71,12 +71,12 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc ...@@ -71,12 +71,12 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
constexpr index_t BlockSize = 64; constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 16; constexpr index_t KPerBlock = 16;
constexpr index_t HPerBlock = 8; constexpr index_t HPerBlock = 16;
constexpr index_t WPerBlock = 16; constexpr index_t WPerBlock = 16;
constexpr index_t CYXPerBlock = 4; constexpr index_t CYXPerBlock = 4;
constexpr index_t KPerThread = 16; constexpr index_t KPerThread = KPerBlock;
constexpr index_t HPerThread = 1; constexpr index_t HPerThread = 2;
constexpr index_t WPerThread = 2; constexpr index_t WPerThread = 2;
constexpr index_t CYXPerThread = 4; constexpr index_t CYXPerThread = 4;
......
...@@ -82,8 +82,8 @@ int main(int argc, char* argv[]) ...@@ -82,8 +82,8 @@ int main(int argc, char* argv[])
#elif 1 #elif 1
constexpr index_t N = 1; constexpr index_t N = 1;
constexpr index_t C = 4; constexpr index_t C = 4;
constexpr index_t HI = 1080; constexpr index_t HI = 1024;
constexpr index_t WI = 1920; constexpr index_t WI = 2048;
constexpr index_t K = 16; constexpr index_t K = 16;
constexpr index_t Y = 3; constexpr index_t Y = 3;
constexpr index_t X = 3; constexpr index_t X = 3;
......
...@@ -10,7 +10,7 @@ cmake ...@@ -10,7 +10,7 @@ cmake
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \ -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D DEVICE_BACKEND="AMD" \ -D DEVICE_BACKEND="AMD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0" \ -D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -save-temps=$CWD -ftemplate-backtrace-limit=0" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm" \ -D CMAKE_PREFIX_PATH="/opt/rocm" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment