Commit 362e470d authored by ltqin
Browse files

add sched barrier

parent 9e03ca59
...@@ -587,7 +587,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1 ...@@ -587,7 +587,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
read_a_lds_data(); read_a_lds_data();
read_b_last_half_data(); read_b_last_half_data();
// s_nop(); sched_barrier();
static_for<0, BaseMultK0 / 2, 1>{}([&](auto ii) { static_for<0, BaseMultK0 / 2, 1>{}([&](auto ii) {
blockwise_gemm.Run(a_thread_buf(Number<ii>{}), blockwise_gemm.Run(a_thread_buf(Number<ii>{}),
...@@ -598,7 +598,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1 ...@@ -598,7 +598,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
read_a_lds_data(); read_a_lds_data();
read_b_first_half_data(); read_b_first_half_data();
// s_nop(); sched_barrier();
static_for<BaseMultK0 / 2, BaseMultK0, 1>{}([&](auto ii) { static_for<BaseMultK0 / 2, BaseMultK0, 1>{}([&](auto ii) {
blockwise_gemm.Run(a_thread_buf(Number<ii - BaseMultK0 / 2>{}), blockwise_gemm.Run(a_thread_buf(Number<ii - BaseMultK0 / 2>{}),
...@@ -626,7 +626,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1 ...@@ -626,7 +626,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
read_a_lds_data(); read_a_lds_data();
read_b_last_half_data(); read_b_last_half_data();
// s_nop(); sched_barrier();
static_for<0, BaseMultK0 / 2, 1>{}([&](auto ii) { static_for<0, BaseMultK0 / 2, 1>{}([&](auto ii) {
blockwise_gemm.Run( blockwise_gemm.Run(
...@@ -640,7 +640,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1 ...@@ -640,7 +640,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
read_b_first_half_data(); read_b_first_half_data();
} }
// s_nop(); sched_barrier();
static_for<BaseMultK0 / 2, BaseMultK0, 1>{}([&](auto ii) { static_for<BaseMultK0 / 2, BaseMultK0, 1>{}([&](auto ii) {
blockwise_gemm.Run(a_thread_buf(Number<ii - BaseMultK0 / 2>{}), blockwise_gemm.Run(a_thread_buf(Number<ii - BaseMultK0 / 2>{}),
......
...@@ -16,11 +16,15 @@ __device__ void block_sync_lds() ...@@ -16,11 +16,15 @@ __device__ void block_sync_lds()
__syncthreads(); __syncthreads();
#endif #endif
} }
__device__ void s_nop() __device__ void sched_barrier()
{ {
#if 1
asm volatile("\ asm volatile("\
s_nop 0 \n \ s_nop 0 \n \
" ::); " ::);
#else
__builtin_amdgcn_sched_barrier(0);
#endif
} }
__device__ void s_barrier() __device__ void s_barrier()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment