Commit 01136036 authored by wangshaojie6

try improve pipeline for wrw

parent 438138c0
@@ -561,26 +561,26 @@ BB0_1: ; %_ZZN2ck22move_tensor_coordinateINS_16
     v_add_u32_e32 v48, s42, v48
-    ;v_add_u32_e32 v9, s2, v48
-    ;v_add_u32_e32 v13, s2, v9
+    ;v_add_u32_e32 v9, s2, v48;-
+    ;v_add_u32_e32 v13, s2, v9;-
     v_cmp_le_i32_e32 vcc, s28, v36
     v_cmp_gt_i32_e64 s[0:1], s27, v36
-    ;v_lshlrev_b32_e32 v1, 1, v48
-    ;v_add_u32_e32 v48, s2, v13
+    ;v_lshlrev_b32_e32 v1, 1, v48;-
+    ;v_add_u32_e32 v48, s2, v13;-
     s_and_b64 s[44:45], vcc, s[0:1]
     v_cmp_le_i32_e32 vcc, s25, v37
     v_cmp_gt_i32_e64 s[0:1], s24, v37
-    ;v_lshlrev_b32_e32 v5, 1, v9
-    ;v_lshlrev_b32_e32 v9, 1, v13
-    ;v_lshlrev_b32_e32 v13, 1, v48
+    ;v_lshlrev_b32_e32 v5, 1, v9;-
+    ;v_lshlrev_b32_e32 v9, 1, v13;-
+    ;v_lshlrev_b32_e32 v13, 1, v48;-
     s_and_b64 s[0:1], vcc, s[0:1]
-    ;buffer_load_dwordx4 v[1:4], v1, s[12:15], 0 offen
+    ;buffer_load_dwordx4 v[1:4], v1, s[12:15], 0 offen ;-
     s_and_b64 s[0:1], s[0:1], s[44:45]
-    ;buffer_load_dwordx4 v[5:8], v5, s[12:15], 0 offen
+    ;buffer_load_dwordx4 v[5:8], v5, s[12:15], 0 offen ;-
     v_add_u32_e32 v55, s39, v38
-    ;buffer_load_dwordx4 v[9:12], v9, s[12:15], 0 offen
+    ;buffer_load_dwordx4 v[9:12], v9, s[12:15], 0 offen ;-
     v_add_u32_e32 v71, s29, v29
-    ;buffer_load_dwordx4 v[21:24], v13, s[12:15], 0 offen
+    ;buffer_load_dwordx4 v[21:24], v13, s[12:15], 0 offen ;-
     ;;#ASMSTART
     s_waitcnt lgkmcnt(0)
     s_barrier
@@ -597,6 +597,8 @@ BB0_1: ; %_ZZN2ck22move_tensor_coordinateINS_16
     buffer_load_dwordx4 v[13:16], v13, s[16:19], 0 offen
     v_add_u32_e32 v63, s29, v31
     buffer_load_dwordx4 v[17:20], v17, s[16:19], 0 offen
+    ;s_setprio 1
     ;ds_read2_b64 v[55:58], v55 offset1:1
     ;ds_read2_b64 v[59:62], v31 offset1:1
     ;ds_read2_b64 v[67:70], v29 offset1:1
@@ -855,6 +857,7 @@ BB0_1: ; %_ZZN2ck22move_tensor_coordinateINS_16
     ds_write2_b32 v32, v7, v8 offset0:24 offset1:28
     v_mfma_f32_32x32x8f16 a[48:63], v[78:79], v[73:74], a[48:63]
+    ;s_setprio 0
     s_cbranch_scc1 BB0_1
 ; %bb.2: ; %_ZZN2ck23Merge_v2_magic_divisionINS_5TupleIJNS_17integral_constantIiLi4EEENS2_IiLi2EEEiiiEEEEC1ERKS5_ENKUlT_E_clIS4_EEDaS9_.exit.i.i.i.i.i.i.i.i
     ;;#ASMSTART
...
@@ -304,18 +304,18 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
     private:
     // A[M0, M1, M2, KPerThread]
     static constexpr auto a_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
+        make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{}, I1, I1, Number<KPerThread>{}));
     // B[N0, N1, N2, KPerThread]
     static constexpr auto b_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
+        make_naive_tensor_descriptor_packed(make_tuple(Number<NRepeat>{}, I1, I1, Number<KPerThread>{}));
     // C[M, N, NumRegXdlops]
     static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
         make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()));
-    // static constexpr index_t A_K1_vec = A_K1 / 2;
-    // static constexpr index_t B_K1_vec = B_K1 / 2;
+    static constexpr index_t A_K1_vec = A_K1 / 2;
+    static constexpr index_t B_K1_vec = B_K1 / 2;
     using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
                                                          FloatAB,
@@ -324,7 +324,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                          Sequence<1, 1, 1, KPerThread>,
                                                          Sequence<0, 1, 2, 3>,
                                                          3,
-                                                         A_K1,
+                                                         A_K1_vec,
                                                          A_K1>;
     using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
@@ -334,7 +334,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                          Sequence<1, 1, 1, KPerThread>,
                                                          Sequence<0, 1, 2, 3>,
                                                          3,
-                                                         B_K1,
+                                                         B_K1_vec,
                                                          B_K1>;
     AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()};
...
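Note: the descriptor edits above grow each thread's A/B staging buffers from one repeat (1 x 1 x 1 x KPerThread) to all repeats (MRepeat / NRepeat x 1 x 1 x KPerThread), and the newly enabled A_K1_vec = A_K1 / 2 and B_K1_vec = B_K1 / 2 halve the per-read vector width of the LDS-to-register copies. Below is a rough sketch of the scheduling this appears to aim for; read_a_slice, read_b_slice and mfma are placeholder stubs (not ck APIs), the KPerThread loop is omitted, and the values are illustrative.

```cpp
#include <cstdio>

constexpr int MRepeat = 2; // illustrative values, not taken from the commit
constexpr int NRepeat = 2;

// Placeholder stubs standing in for a_thread_copy_.Run / b_thread_copy_.Run
// (LDS -> registers) and one xdlops accumulation step.
inline void read_a_slice(int m) { std::printf("ds_read A[%d]\n", m); }
inline void read_b_slice(int n) { std::printf("ds_read B[%d]\n", n); }
inline void mfma(int m, int n)  { std::printf("v_mfma C[%d][%d]\n", m, n); }

// With 1 x 1 x 1 x KPerThread thread buffers, the single A/B slot must be
// refilled right before each MFMA, so ds_read latency sits in front of the math.
void gemm_one_repeat_at_a_time()
{
    for(int m = 0; m < MRepeat; ++m)
    {
        read_a_slice(m); // refill the single A slot
        for(int n = 0; n < NRepeat; ++n)
        {
            read_b_slice(n); // refill the single B slot
            mfma(m, n);      // waits on the reads issued just above
        }
    }
}

// With MRepeat x / NRepeat x KPerThread buffers, every LDS read can be issued
// up front and the MFMA chain that follows hides the read latency; halving the
// read width (A_K1_vec) splits each read into two shorter ones that are easier
// to interleave.
void gemm_reads_hoisted()
{
    for(int m = 0; m < MRepeat; ++m) read_a_slice(m);
    for(int n = 0; n < NRepeat; ++n) read_b_slice(n);

    for(int m = 0; m < MRepeat; ++m)
        for(int n = 0; n < NRepeat; ++n)
            mfma(m, n);
}

int main()
{
    gemm_one_repeat_at_a_time();
    gemm_reads_hoisted();
    return 0;
}
```

Keeping all repeats resident in registers is consistent with the commented-out ds_read2_b64 / s_setprio experiments in the assembly hunks above, which try to pull the LDS reads ahead of the v_mfma chain.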
@@ -658,6 +658,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         a_blockwise_copy.RunWrite(a_b_k0_m_k1_block_desc, a_block_buf);
         b_blockwise_copy.RunWrite(b_b_k0_n_k1_block_desc, b_block_buf);
+        a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf);
     }
     // Initialize C
@@ -671,11 +675,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     do
     {
-        a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf);
+        //a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf);
         block_sync_lds();
-        b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf);
+        //b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf);
         blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
@@ -687,6 +691,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         a_blockwise_copy.RunWrite(a_b_k0_m_k1_block_desc, a_block_buf);
         b_blockwise_copy.RunWrite(b_b_k0_n_k1_block_desc, b_block_buf);
+        a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf);
         k0_block_data_begin += K0PerBlock;
     } while(k0_block_data_begin < (K0 - K0PerBlock));
     }
...
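Note: taken together, the three hunks above move the global-memory RunRead calls out of the top of the main loop (where they bracketed block_sync_lds, right before blockwise_gemm.Run) to immediately after the RunWrite calls, and add an extra RunRead pair before the loop, so the buffer loads for the next tile are already in flight across the barrier and the MFMA work of the following iteration. A simplified skeleton of the resulting loop shape is below; the stub type, the argument-free RunRead/RunWrite signatures, and the omitted slice-window moves and second barrier are simplifications for illustration, not the real kernel code.

```cpp
#include <cstdio>

// Stub standing in for the blockwise copy objects; the real RunRead/RunWrite
// take a tensor descriptor and a buffer argument, as shown in the diff.
struct BlockwiseCopyStub
{
    const char* name;
    void RunRead()  { std::printf("%s: global -> registers\n", name); }
    void RunWrite() { std::printf("%s: registers -> LDS\n", name); }
};

inline void block_sync_lds()     { std::printf("s_barrier\n"); }
inline void blockwise_gemm_run() { std::printf("MFMAs on the tile in LDS\n"); }

// Main-loop shape after this commit (window moves and the second barrier of
// the real kernel are omitted).
void main_loop(int K0, int K0PerBlock)
{
    BlockwiseCopyStub a_copy{"A"}, b_copy{"B"};

    // Prologue: read and stage tile 0, then immediately issue the reads for
    // tile 1 (the RunRead pair added before the loop in this commit).
    a_copy.RunRead();  b_copy.RunRead();
    a_copy.RunWrite(); b_copy.RunWrite();
    a_copy.RunRead();  b_copy.RunRead();

    int k0_block_data_begin = 0;
    do
    {
        block_sync_lds();
        blockwise_gemm_run(); // overlaps with the loads issued last iteration

        // ... move source slice windows, sync ...

        a_copy.RunWrite(); b_copy.RunWrite(); // stage the prefetched tile
        a_copy.RunRead();  b_copy.RunRead();  // start loading the tile after it

        k0_block_data_begin += K0PerBlock;
    } while(k0_block_data_begin < (K0 - K0PerBlock));
}

int main()
{
    main_loop(/*K0=*/16, /*K0PerBlock=*/4);
    return 0;
}
```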