"...composable_kernel_rocm.git" did not exist on "44789d992add03ae064b9b0f89c0542734d487b2"
Commit 01136036 authored by wangshaojie6's avatar wangshaojie6
Browse files

try improve pipeline for wrw

parent 438138c0
......@@ -561,26 +561,26 @@ BB0_1: ; %_ZZN2ck22move_tensor_coordinateINS_16
v_add_u32_e32 v48, s42, v48
;v_add_u32_e32 v9, s2, v48
;v_add_u32_e32 v13, s2, v9
;v_add_u32_e32 v9, s2, v48;-
;v_add_u32_e32 v13, s2, v9;-
v_cmp_le_i32_e32 vcc, s28, v36
v_cmp_gt_i32_e64 s[0:1], s27, v36
;v_lshlrev_b32_e32 v1, 1, v48
;v_add_u32_e32 v48, s2, v13
;v_lshlrev_b32_e32 v1, 1, v48;-
;v_add_u32_e32 v48, s2, v13;-
s_and_b64 s[44:45], vcc, s[0:1]
v_cmp_le_i32_e32 vcc, s25, v37
v_cmp_gt_i32_e64 s[0:1], s24, v37
;v_lshlrev_b32_e32 v5, 1, v9
;v_lshlrev_b32_e32 v9, 1, v13
;v_lshlrev_b32_e32 v13, 1, v48
;v_lshlrev_b32_e32 v5, 1, v9;-
;v_lshlrev_b32_e32 v9, 1, v13;-
;v_lshlrev_b32_e32 v13, 1, v48;-
s_and_b64 s[0:1], vcc, s[0:1]
;buffer_load_dwordx4 v[1:4], v1, s[12:15], 0 offen
;buffer_load_dwordx4 v[1:4], v1, s[12:15], 0 offen ;-
s_and_b64 s[0:1], s[0:1], s[44:45]
;buffer_load_dwordx4 v[5:8], v5, s[12:15], 0 offen
;buffer_load_dwordx4 v[5:8], v5, s[12:15], 0 offen ;-
v_add_u32_e32 v55, s39, v38
;buffer_load_dwordx4 v[9:12], v9, s[12:15], 0 offen
;buffer_load_dwordx4 v[9:12], v9, s[12:15], 0 offen ;-
v_add_u32_e32 v71, s29, v29
;buffer_load_dwordx4 v[21:24], v13, s[12:15], 0 offen
;buffer_load_dwordx4 v[21:24], v13, s[12:15], 0 offen ;-
;;#ASMSTART
s_waitcnt lgkmcnt(0)
s_barrier
......@@ -597,6 +597,8 @@ BB0_1: ; %_ZZN2ck22move_tensor_coordinateINS_16
buffer_load_dwordx4 v[13:16], v13, s[16:19], 0 offen
v_add_u32_e32 v63, s29, v31
buffer_load_dwordx4 v[17:20], v17, s[16:19], 0 offen
;s_setprio 1
;ds_read2_b64 v[55:58], v55 offset1:1
;ds_read2_b64 v[59:62], v31 offset1:1
;ds_read2_b64 v[67:70], v29 offset1:1
......@@ -855,6 +857,7 @@ BB0_1: ; %_ZZN2ck22move_tensor_coordinateINS_16
ds_write2_b32 v32, v7, v8 offset0:24 offset1:28
v_mfma_f32_32x32x8f16 a[48:63], v[78:79], v[73:74], a[48:63]
;s_setprio 0
s_cbranch_scc1 BB0_1
; %bb.2: ; %_ZZN2ck23Merge_v2_magic_divisionINS_5TupleIJNS_17integral_constantIiLi4EEENS2_IiLi2EEEiiiEEEEC1ERKS5_ENKUlT_E_clIS4_EEDaS9_.exit.i.i.i.i.i.i.i.i
;;#ASMSTART
......
This diff is collapsed.
......@@ -304,18 +304,18 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
private:
// A[M0, M1, M2, KPerThread]
static constexpr auto a_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{}, I1, I1, Number<KPerThread>{}));
// B[N0, N1, N2, KPerThread]
static constexpr auto b_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
make_naive_tensor_descriptor_packed(make_tuple(Number<NRepeat>{}, I1, I1, Number<KPerThread>{}));
// C[M, N, NumRegXdlops]
static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()));
// static constexpr index_t A_K1_vec = A_K1 / 2;
// static constexpr index_t B_K1_vec = B_K1 / 2;
static constexpr index_t A_K1_vec = A_K1 / 2;
static constexpr index_t B_K1_vec = B_K1 / 2;
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
FloatAB,
......@@ -324,7 +324,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
Sequence<1, 1, 1, KPerThread>,
Sequence<0, 1, 2, 3>,
3,
A_K1,
A_K1_vec,
A_K1>;
using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
......@@ -334,7 +334,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
Sequence<1, 1, 1, KPerThread>,
Sequence<0, 1, 2, 3>,
3,
B_K1,
B_K1_vec,
B_K1>;
AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()};
......
......@@ -658,6 +658,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
a_blockwise_copy.RunWrite(a_b_k0_m_k1_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_b_k0_n_k1_block_desc, b_block_buf);
a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf);
}
// Initialize C
......@@ -671,11 +675,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
do
{
a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf);
//a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf);
block_sync_lds();
b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf);
//b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf);
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
......@@ -687,6 +691,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
a_blockwise_copy.RunWrite(a_b_k0_m_k1_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_b_k0_n_k1_block_desc, b_block_buf);
a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf);
k0_block_data_begin += K0PerBlock;
} while(k0_block_data_begin < (K0 - K0PerBlock));
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment