Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
01136036
Commit
01136036
authored
May 04, 2022
by
wangshaojie6
Browse files
try improve pipeline for wrw
parent
438138c0
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
31851 additions
and
19 deletions
+31851
-19
conv2d_asm_interleave_ds_write_part.s
conv2d_asm_interleave_ds_write_part.s
+14
-11
conv2d_asm_interleave_ds_write_part_256x256x32.s
conv2d_asm_interleave_ds_write_part_256x256x32.s
+10542
-0
conv2d_asm_interleave_ds_write_part_buffer_load_part_256x256x32.s
...sm_interleave_ds_write_part_buffer_load_part_256x256x32.s
+10603
-0
conv2d_asm_interleave_ds_write_part_buffer_load_part_shuffle_buffer_load_256x256x32.s
...te_part_buffer_load_part_shuffle_buffer_load_256x256x32.s
+10677
-0
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
...e/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+6
-6
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
...tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+9
-2
No files found.
conv2d_asm_interleave_ds_write_part.s
View file @
01136036
...
@@ -561,26 +561,26 @@ BB0_1: ; %_ZZN2ck22move_tensor_coordinateINS_16
...
@@ -561,26 +561,26 @@ BB0_1: ; %_ZZN2ck22move_tensor_coordinateINS_16
v_add_u32_e32
v48
,
s42
,
v48
v_add_u32_e32
v48
,
s42
,
v48
;v_add_u32_e32 v9, s2, v48
;v_add_u32_e32 v9, s2, v48
;-
;v_add_u32_e32 v13, s2, v9
;v_add_u32_e32 v13, s2, v9
;-
v_cmp_le_i32_e32
vcc
,
s28
,
v36
v_cmp_le_i32_e32
vcc
,
s28
,
v36
v_cmp_gt_i32_e64
s
[
0
:
1
],
s27
,
v36
v_cmp_gt_i32_e64
s
[
0
:
1
],
s27
,
v36
;v_lshlrev_b32_e32 v1, 1, v48
;v_lshlrev_b32_e32 v1, 1, v48
;-
;v_add_u32_e32 v48, s2, v13
;v_add_u32_e32 v48, s2, v13
;-
s_and_b64
s
[
44
:
45
],
vcc
,
s
[
0
:
1
]
s_and_b64
s
[
44
:
45
],
vcc
,
s
[
0
:
1
]
v_cmp_le_i32_e32
vcc
,
s25
,
v37
v_cmp_le_i32_e32
vcc
,
s25
,
v37
v_cmp_gt_i32_e64
s
[
0
:
1
],
s24
,
v37
v_cmp_gt_i32_e64
s
[
0
:
1
],
s24
,
v37
;v_lshlrev_b32_e32 v5, 1, v9
;v_lshlrev_b32_e32 v5, 1, v9
;-
;v_lshlrev_b32_e32 v9, 1, v13
;v_lshlrev_b32_e32 v9, 1, v13
;-
;v_lshlrev_b32_e32 v13, 1, v48
;v_lshlrev_b32_e32 v13, 1, v48
;-
s_and_b64
s
[
0
:
1
],
vcc
,
s
[
0
:
1
]
s_and_b64
s
[
0
:
1
],
vcc
,
s
[
0
:
1
]
;buffer_load_dwordx4 v[1:4], v1, s[12:15], 0 offen
;buffer_load_dwordx4 v[1:4], v1, s[12:15], 0 offen
;-
s_and_b64
s
[
0
:
1
],
s
[
0
:
1
],
s
[
44
:
45
]
s_and_b64
s
[
0
:
1
],
s
[
0
:
1
],
s
[
44
:
45
]
;buffer_load_dwordx4 v[5:8], v5, s[12:15], 0 offen
;buffer_load_dwordx4 v[5:8], v5, s[12:15], 0 offen
;-
v_add_u32_e32
v55
,
s39
,
v38
v_add_u32_e32
v55
,
s39
,
v38
;buffer_load_dwordx4 v[9:12], v9, s[12:15], 0 offen
;buffer_load_dwordx4 v[9:12], v9, s[12:15], 0 offen
;-
v_add_u32_e32
v71
,
s29
,
v29
v_add_u32_e32
v71
,
s29
,
v29
;buffer_load_dwordx4 v[21:24], v13, s[12:15], 0 offen
;buffer_load_dwordx4 v[21:24], v13, s[12:15], 0 offen
;-
;;#ASMSTART
;;#ASMSTART
s_waitcnt
lgkmcnt
(
0
)
s_waitcnt
lgkmcnt
(
0
)
s_barrier
s_barrier
...
@@ -597,6 +597,8 @@ BB0_1: ; %_ZZN2ck22move_tensor_coordinateINS_16
...
@@ -597,6 +597,8 @@ BB0_1: ; %_ZZN2ck22move_tensor_coordinateINS_16
buffer_load_dwordx4
v
[
13
:
16
],
v13
,
s
[
16
:
19
],
0
offen
buffer_load_dwordx4
v
[
13
:
16
],
v13
,
s
[
16
:
19
],
0
offen
v_add_u32_e32
v63
,
s29
,
v31
v_add_u32_e32
v63
,
s29
,
v31
buffer_load_dwordx4
v
[
17
:
20
],
v17
,
s
[
16
:
19
],
0
offen
buffer_load_dwordx4
v
[
17
:
20
],
v17
,
s
[
16
:
19
],
0
offen
;s_setprio 1
;ds_read2_b64 v[55:58], v55 offset1:1
;ds_read2_b64 v[55:58], v55 offset1:1
;ds_read2_b64 v[59:62], v31 offset1:1
;ds_read2_b64 v[59:62], v31 offset1:1
;ds_read2_b64 v[67:70], v29 offset1:1
;ds_read2_b64 v[67:70], v29 offset1:1
...
@@ -855,6 +857,7 @@ BB0_1: ; %_ZZN2ck22move_tensor_coordinateINS_16
...
@@ -855,6 +857,7 @@ BB0_1: ; %_ZZN2ck22move_tensor_coordinateINS_16
ds_write2_b32
v32
,
v7
,
v8
offset0
:
24
offset1
:
28
ds_write2_b32
v32
,
v7
,
v8
offset0
:
24
offset1
:
28
v_mfma_f32_32x32x8f16
a
[
48
:
63
],
v
[
78
:
79
],
v
[
73
:
74
],
a
[
48
:
63
]
v_mfma_f32_32x32x8f16
a
[
48
:
63
],
v
[
78
:
79
],
v
[
73
:
74
],
a
[
48
:
63
]
;s_setprio 0
s_cbranch_scc1
BB0_1
s_cbranch_scc1
BB0_1
; %bb.2: ; %_ZZN2ck23Merge_v2_magic_divisionINS_5TupleIJNS_17integral_constantIiLi4EEENS2_IiLi2EEEiiiEEEEC1ERKS5_ENKUlT_E_clIS4_EEDaS9_.exit.i.i.i.i.i.i.i.i
; %bb.2: ; %_ZZN2ck23Merge_v2_magic_divisionINS_5TupleIJNS_17integral_constantIiLi4EEENS2_IiLi2EEEiiiEEEEC1ERKS5_ENKUlT_E_clIS4_EEDaS9_.exit.i.i.i.i.i.i.i.i
;;#ASMSTART
;;#ASMSTART
...
...
conv2d_asm_interleave_ds_write_part_256x256x32.s
0 → 100755
View file @
01136036
This diff is collapsed.
Click to expand it.
conv2d_asm_interleave_ds_write_part_buffer_load_part_256x256x32.s
0 → 100755
View file @
01136036
This diff is collapsed.
Click to expand it.
conv2d_asm_interleave_ds_write_part_buffer_load_part_shuffle_buffer_load_256x256x32.s
0 → 100755
View file @
01136036
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
View file @
01136036
...
@@ -304,18 +304,18 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -304,18 +304,18 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
private:
private:
// A[M0, M1, M2, KPerThread]
// A[M0, M1, M2, KPerThread]
static
constexpr
auto
a_thread_desc_
=
static
constexpr
auto
a_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
I1
,
I1
,
Number
<
KPerThread
>
{}));
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MRepeat
>
{}
,
I1
,
I1
,
Number
<
KPerThread
>
{}));
// B[N0, N1, N2, KPerThread]
// B[N0, N1, N2, KPerThread]
static
constexpr
auto
b_thread_desc_
=
static
constexpr
auto
b_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
I1
,
I1
,
Number
<
KPerThread
>
{}));
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
NRepeat
>
{}
,
I1
,
I1
,
Number
<
KPerThread
>
{}));
// C[M, N, NumRegXdlops]
// C[M, N, NumRegXdlops]
static
constexpr
auto
c_thread_desc_
=
make_naive_tensor_descriptor_packed
(
static
constexpr
auto
c_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MRepeat
>
{},
Number
<
NRepeat
>
{},
xdlops_gemm
.
GetRegSizePerXdlops
()));
make_tuple
(
Number
<
MRepeat
>
{},
Number
<
NRepeat
>
{},
xdlops_gemm
.
GetRegSizePerXdlops
()));
//
static constexpr index_t A_K1_vec = A_K1 / 2;
static
constexpr
index_t
A_K1_vec
=
A_K1
/
2
;
//
static constexpr index_t B_K1_vec = B_K1 / 2;
static
constexpr
index_t
B_K1_vec
=
B_K1
/
2
;
using
AThreadCopy
=
ThreadwiseTensorSliceTransfer_v4
<
FloatAB
,
using
AThreadCopy
=
ThreadwiseTensorSliceTransfer_v4
<
FloatAB
,
FloatAB
,
FloatAB
,
...
@@ -324,7 +324,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -324,7 +324,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
Sequence
<
1
,
1
,
1
,
KPerThread
>
,
Sequence
<
1
,
1
,
1
,
KPerThread
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
3
,
3
,
A_K1
,
A_K1
_vec
,
A_K1
>
;
A_K1
>
;
using
BThreadCopy
=
ThreadwiseTensorSliceTransfer_v4
<
FloatAB
,
using
BThreadCopy
=
ThreadwiseTensorSliceTransfer_v4
<
FloatAB
,
...
@@ -334,7 +334,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -334,7 +334,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
Sequence
<
1
,
1
,
1
,
KPerThread
>
,
Sequence
<
1
,
1
,
1
,
KPerThread
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
3
,
3
,
B_K1
,
B_K1
_vec
,
B_K1
>
;
B_K1
>
;
AThreadCopy
a_thread_copy_
{
CalculateAThreadOriginDataIndex
()};
AThreadCopy
a_thread_copy_
{
CalculateAThreadOriginDataIndex
()};
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
View file @
01136036
...
@@ -658,6 +658,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
...
@@ -658,6 +658,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
a_blockwise_copy
.
RunWrite
(
a_b_k0_m_k1_block_desc
,
a_block_buf
);
a_blockwise_copy
.
RunWrite
(
a_b_k0_m_k1_block_desc
,
a_block_buf
);
b_blockwise_copy
.
RunWrite
(
b_b_k0_n_k1_block_desc
,
b_block_buf
);
b_blockwise_copy
.
RunWrite
(
b_b_k0_n_k1_block_desc
,
b_block_buf
);
a_blockwise_copy
.
RunRead
(
a_b_k0_m_k1_grid_desc
,
a_grid_buf
);
b_blockwise_copy
.
RunRead
(
b_b_k0_n_k1_grid_desc
,
b_grid_buf
);
}
}
// Initialize C
// Initialize C
...
@@ -671,11 +675,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
...
@@ -671,11 +675,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
do
do
{
{
a_blockwise_copy
.
RunRead
(
a_b_k0_m_k1_grid_desc
,
a_grid_buf
);
//
a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf);
block_sync_lds
();
block_sync_lds
();
b_blockwise_copy
.
RunRead
(
b_b_k0_n_k1_grid_desc
,
b_grid_buf
);
//
b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf);
blockwise_gemm
.
Run
(
a_block_buf
,
b_block_buf
,
c_thread_buf
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_block_buf
,
c_thread_buf
);
...
@@ -687,6 +691,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
...
@@ -687,6 +691,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
a_blockwise_copy
.
RunWrite
(
a_b_k0_m_k1_block_desc
,
a_block_buf
);
a_blockwise_copy
.
RunWrite
(
a_b_k0_m_k1_block_desc
,
a_block_buf
);
b_blockwise_copy
.
RunWrite
(
b_b_k0_n_k1_block_desc
,
b_block_buf
);
b_blockwise_copy
.
RunWrite
(
b_b_k0_n_k1_block_desc
,
b_block_buf
);
a_blockwise_copy
.
RunRead
(
a_b_k0_m_k1_grid_desc
,
a_grid_buf
);
b_blockwise_copy
.
RunRead
(
b_b_k0_n_k1_grid_desc
,
b_grid_buf
);
k0_block_data_begin
+=
K0PerBlock
;
k0_block_data_begin
+=
K0PerBlock
;
}
while
(
k0_block_data_begin
<
(
K0
-
K0PerBlock
));
}
while
(
k0_block_data_begin
<
(
K0
-
K0PerBlock
));
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment