; conv2d_wrw_outline.s (2.72 KB) -- committed by wangshaojie6
;origin loop
; Baseline schedule for one loop trip, covering two k-steps.
;
; This is an outline, not assemblable code: v_mfma, v_pack,
; v_move_slice_window and the "offset: next k" operand are shorthand.
; Accumulator operands, addresses, wait counts and the loop-exit test are
; not shown here.
;
; Register use visible in this loop:
;   v_lda[0:7], v_ldb[0:7]  operands read by ds_read2_b64; reloaded in place
;                           for the second k-step
;   v_gla[0:3], v_glb[0:3]  destinations of buffer_load_dwordx4
;   v_lda[0:3], v_pkb[0:3]  v_pack outputs that feed ds_write2_b64
;
; The second MFMA group is split in two halves, with the pack / ds_write /
; slice-window / buffer_load work placed between them as separate groups.
.origin_loop_start:
	; k-step 0: read both operand sets
	ds_read2_b64 v_lda[0:3]
	ds_read2_b64 v_ldb[0:3]
	ds_read2_b64 v_lda[4:7]
	ds_read2_b64 v_ldb[4:7]

	; k-step 0: MFMA group (8 issues)
	v_mfma v_lda[0:1], v_ldb[0:1]
	v_mfma v_lda[2:3], v_ldb[2:3]
	v_mfma v_lda[0:1], v_ldb[4:5]
	v_mfma v_lda[2:3], v_ldb[6:7]
	v_mfma v_lda[4:5], v_ldb[0:1]
	v_mfma v_lda[6:7], v_ldb[2:3]
	v_mfma v_lda[4:5], v_ldb[4:5]
	v_mfma v_lda[6:7], v_ldb[6:7]

	; k-step 1: reload the same v_lda[0:7] / v_ldb[0:7] registers
	ds_read2_b64 v_lda[0:3] offset: next k
	ds_read2_b64 v_lda[4:7] offset: next k
	ds_read2_b64 v_ldb[0:3] offset: next k
	ds_read2_b64 v_ldb[4:7] offset: next k

	s_barrier

	; k-step 1: first half of the MFMA group (uses v_lda[0:3] only)
	v_mfma v_lda[0:1], v_ldb[0:1]
	v_mfma v_lda[2:3], v_ldb[2:3]
	v_mfma v_lda[0:1], v_ldb[4:5]
	v_mfma v_lda[2:3], v_ldb[6:7]

	; Pack v_gla[0:3] into v_lda[0:3] (lo/hi halves) and write it out.
	; v_lda[0:3] is overwritten here; the remaining MFMAs of this trip
	; only read v_lda[4:7].
	v_pack v_lda[0], v_gla[0], v_gla[1], lo
	v_pack v_lda[1], v_gla[0], v_gla[1], hi
	v_pack v_lda[2], v_gla[2], v_gla[3], lo
	v_pack v_lda[3], v_gla[2], v_gla[3], hi
	ds_write2_b64 v_lda[0:1], v_lda[2:3]

	; Pack v_glb[0:3] into the separate v_pkb[0:3] registers and write it
	; out; v_ldb[0:7] is still read by the MFMAs below.
	v_pack v_pkb[0], v_glb[0], v_glb[1], lo
	v_pack v_pkb[1], v_glb[0], v_glb[1], hi
	v_pack v_pkb[2], v_glb[2], v_glb[3], lo
	v_pack v_pkb[3], v_glb[2], v_glb[3], hi
	ds_write2_b64 v_pkb[0:1], v_pkb[2:3]

	s_barrier

	; Advance the slice windows (two invocations, arguments 0 and 1).
	v_move_slice_window 0
	v_move_slice_window 1
	; ... ~60 valus (elided in this outline)

	; Issue the next loads into v_gla / v_glb.
	buffer_load_dwordx4 v_gla[0:3]
	buffer_load_dwordx4 v_glb[0:3]

	; k-step 1: second half of the MFMA group (uses v_lda[4:7] only)
	v_mfma v_lda[4:5], v_ldb[0:1]
	v_mfma v_lda[6:7], v_ldb[2:3]
	v_mfma v_lda[4:5], v_ldb[4:5]
	v_mfma v_lda[6:7], v_ldb[6:7]

	; Branch target carries the leading dot so it names the label defined
	; above. The outline loops unconditionally; no exit test is shown.
	s_branch .origin_loop_start


;optimized loop
; Reworked schedule for one loop trip, covering two k-steps.
;
; This is an outline, not assemblable code: v_mfma, v_pack,
; v_move_slice_window and the "offset: next k" operand are shorthand.
; Accumulator operands, addresses, wait counts and the loop-exit test are
; not shown here.
;
; Differences from the origin loop that are visible in this file:
;   * the second k-step is read into separate registers, v_lda[8:15] and
;     v_ldb[8:15], instead of reloading v_lda[0:7] / v_ldb[0:7];
;   * the eight MFMAs of the second k-step are interleaved one or two at a
;     time with the barriers, v_pack / ds_write2_b64 groups,
;     v_move_slice_window calls and buffer loads, rather than being issued
;     as two groups of four.
.optimized_loop_start:
	; k-step 0: read both operand sets
	ds_read2_b64 v_lda[0:3]
	ds_read2_b64 v_ldb[0:3]
	ds_read2_b64 v_lda[4:7]
	ds_read2_b64 v_ldb[4:7]

	; k-step 0: MFMA group (8 issues)
	v_mfma v_lda[0:1], v_ldb[0:1]
	v_mfma v_lda[2:3], v_ldb[2:3]
	v_mfma v_lda[0:1], v_ldb[4:5]
	v_mfma v_lda[2:3], v_ldb[6:7]
	v_mfma v_lda[4:5], v_ldb[0:1]
	v_mfma v_lda[6:7], v_ldb[2:3]
	v_mfma v_lda[4:5], v_ldb[4:5]
	v_mfma v_lda[6:7], v_ldb[6:7]

	; k-step 1: read into the second register set, v_lda/v_ldb[8:15]
	ds_read2_b64 v_lda[8:11] offset: next k
	ds_read2_b64 v_lda[12:15] offset: next k
	ds_read2_b64 v_ldb[8:11] offset: next k
	ds_read2_b64 v_ldb[12:15] offset: next k

	; k-step 1 MFMAs 1-2, issued on either side of the barrier
	v_mfma v_lda[8:9], v_ldb[8:9]
	s_barrier
	v_mfma v_lda[10:11], v_ldb[10:11]

	; Pack v_gla[0:3] into v_lda[0:3] (lo/hi halves). v_lda[0:3] was last
	; read by the k-step 0 MFMA group above; k-step 1 reads v_lda[8:15].
	v_pack v_lda[0], v_gla[0], v_gla[1], lo
	v_pack v_lda[1], v_gla[0], v_gla[1], hi
	v_pack v_lda[2], v_gla[2], v_gla[3], lo
	v_pack v_lda[3], v_gla[2], v_gla[3], hi

	ds_write2_b64 v_lda[0:1], v_lda[2:3]
	; k-step 1 MFMA 3
	v_mfma v_lda[8:9], v_ldb[12:13]

	; Pack v_glb[0:3] into v_pkb[0:3] and write it out.
	v_pack v_pkb[0], v_glb[0], v_glb[1], lo
	v_pack v_pkb[1], v_glb[0], v_glb[1], hi
	v_pack v_pkb[2], v_glb[2], v_glb[3], lo
	v_pack v_pkb[3], v_glb[2], v_glb[3], hi
	ds_write2_b64 v_pkb[0:1], v_pkb[2:3]
	; k-step 1 MFMA 4
	v_mfma v_lda[10:11], v_ldb[14:15]

	s_barrier
	; k-step 1 MFMA 5
	v_mfma v_lda[12:13], v_ldb[8:9]

	v_move_slice_window 0
	; k-step 1 MFMA 6: pairs v_lda[14:15] with v_ldb[10:11], the same slot
	; as "v_lda[6:7], v_ldb[2:3]" in the k-step 0 group shifted by 8.
	v_mfma v_lda[14:15], v_ldb[10:11]
	v_move_slice_window 1

	; Issue the next loads into v_gla / v_glb, interleaved with the last
	; two k-step 1 MFMAs.
	buffer_load_dwordx4 v_gla[0:3]
	v_mfma v_lda[12:13], v_ldb[12:13]
	buffer_load_dwordx4 v_glb[0:3]
	v_mfma v_lda[14:15], v_ldb[14:15]

	; Branch target carries the leading dot so it names the label defined
	; above. The outline loops unconditionally; no exit test is shown.
	s_branch .optimized_loop_start