Commit 2ec3f4c3 authored by aska-0096's avatar aska-0096
Browse files

1. change blockwise gemm loopover direction from kmn to mnk ( ~1% improvement)

2. change kernel timing mode to 50 warmup + 50 timed repeat
parent 5bf77d8b
...@@ -37,14 +37,14 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle ...@@ -37,14 +37,14 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
GemmDefault, GemmDefault,
2, // Prefetch stage 2, // Prefetch stage
128, // BlockSize 128, // BlockSize
64, // MPerBlock 128, // MPerBlock
128, // NPerBlock 64, // NPerBlock
64, // KPerBlock 64, // KPerBlock
8, // K1 8, // K1
16, // MPerWmma 16, // MPerWmma
16, // NPerWmma 16, // NPerWmma
2, // M-Repeat // M-PerWmma / M-Repeat = M-Wave 4, // M-Repeat // M-PerWmma / M-Repeat = M-Wave
4, // N-Repeat // N-PerWmma / N-Repeat = N-Wave 2, // N-Repeat // N-PerWmma / N-Repeat = N-Wave
S<4, 32, 1>, S<4, 32, 1>,
S<1, 0, 2>, S<1, 0, 2>,
S<1, 0, 2>, S<1, 0, 2>,
......
...@@ -32,10 +32,13 @@ float launch_and_time_kernel(const StreamConfig& stream_config, ...@@ -32,10 +32,13 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
printf("Warm up 1 time\n"); printf("Warm up 1 time\n");
#endif #endif
// warm up const int nrepeat = 50;
// kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
for(int i = 0; i < nrepeat; ++i)
{
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
}
const int nrepeat = 100;
#if DEBUG_LOG #if DEBUG_LOG
printf("Start running %d times...\n", nrepeat); printf("Start running %d times...\n", nrepeat);
#endif #endif
......
...@@ -312,10 +312,14 @@ struct BlockwiseGemmWMMA ...@@ -312,10 +312,14 @@ struct BlockwiseGemmWMMA
// basic intrinsic to determine loopover direction // basic intrinsic to determine loopover direction
if constexpr(MRepeat < NRepeat) if constexpr(MRepeat < NRepeat)
{ {
static_for<0, KPerBlock / WmmaK, 1>{}(
[&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
static_for<0, MRepeat, 1>{}([&](auto m0) { static_for<0, MRepeat, 1>{}([&](auto m0) {
// read A
static_for<0, NRepeat, 1>{}([&](auto n0) {
static_for<0, KPerBlock / WmmaK, 1>{}(
[&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
// read A
a_thread_copy_.Run( a_thread_copy_.Run(
a_block_desc_k0_m0_m1_m2_k1, a_block_desc_k0_m0_m1_m2_k1,
make_tuple(Number<k * WmmaK / A_K1 * A_Data_Duplicated_Rate / 2>{}, make_tuple(Number<k * WmmaK / A_K1 * A_Data_Duplicated_Rate / 2>{},
...@@ -327,8 +331,6 @@ struct BlockwiseGemmWMMA ...@@ -327,8 +331,6 @@ struct BlockwiseGemmWMMA
a_thread_desc_, a_thread_desc_,
make_tuple(I0, m0, I0, I0, I0), make_tuple(I0, m0, I0, I0, I0),
a_thread_buf); a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
// read B // read B
b_thread_copy_.Run( b_thread_copy_.Run(
b_block_desc_k0_n0_n1_n2_k1, b_block_desc_k0_n0_n1_n2_k1,
...@@ -370,10 +372,14 @@ struct BlockwiseGemmWMMA ...@@ -370,10 +372,14 @@ struct BlockwiseGemmWMMA
} }
else else
{ {
static_for<0, KPerBlock / WmmaK, 1>{}(
[&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
static_for<0, NRepeat, 1>{}([&](auto n0) { static_for<0, NRepeat, 1>{}([&](auto n0) {
// read B
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, KPerBlock / WmmaK, 1>{}(
[&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
// read B
b_thread_copy_.Run( b_thread_copy_.Run(
b_block_desc_k0_n0_n1_n2_k1, b_block_desc_k0_n0_n1_n2_k1,
make_tuple(Number<k * WmmaK / B_K1 * B_Data_Duplicated_Rate / 2>{}, make_tuple(Number<k * WmmaK / B_K1 * B_Data_Duplicated_Rate / 2>{},
...@@ -385,8 +391,6 @@ struct BlockwiseGemmWMMA ...@@ -385,8 +391,6 @@ struct BlockwiseGemmWMMA
b_thread_desc_, b_thread_desc_,
make_tuple(I0, n0, I0, I0, I0), make_tuple(I0, n0, I0, I0, I0),
b_thread_buf); b_thread_buf);
static_for<0, MRepeat, 1>{}([&](auto m0) {
// read A // read A
a_thread_copy_.Run( a_thread_copy_.Run(
a_block_desc_k0_m0_m1_m2_k1, a_block_desc_k0_m0_m1_m2_k1,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment