Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
1d9b9701
Commit
1d9b9701
authored
Mar 29, 2019
by
Jing Zhang
Browse files
inline gemm 8x8
parent
57a8ccf3
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
223 additions
and
63 deletions
+223
-63
src/include/blockwise_gemm.hip.hpp
src/include/blockwise_gemm.hip.hpp
+223
-63
No files found.
src/include/blockwise_gemm.hip.hpp
View file @
1d9b9701
...
@@ -464,20 +464,19 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
...
@@ -464,20 +464,19 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
const
float4
*
b_loc
=
(
const
float4
*
)(
p_b_block
+
b_src_index
);
const
float4
*
b_loc
=
(
const
float4
*
)(
p_b_block
+
b_src_index
);
float4
*
reg
=
(
float4
*
)(
p_a_thread
+
dst_index
);
float4
*
reg
=
(
float4
*
)(
p_a_thread
+
dst_index
);
//reg[0] = a_loc[0];
reg
[
0
]
=
a_loc
[
0
];
//reg[1] = a_loc[16];
reg
[
1
]
=
a_loc
[
16
];
//reg[2] = b_loc[0];
reg
[
2
]
=
b_loc
[
0
];
//reg[3] = b_loc[8];
reg
[
3
]
=
b_loc
[
8
];
//s_waitcnt lgkmcnt(0) // 000000001398: BF8CC07F
//asm volatile("\n \
asm
volatile
(
"
\n
\
//ds_read2_b64 %0, %4 offset1:1 \n \
ds_read2_b64 %0, %4 offset1:1
\n
\
//ds_read2_b64 %1, %4 offset0:32 offset1:33 \n \
ds_read2_b64 %1, %4 offset0:32 offset1:33
\n
\
//ds_read2_b64 %2, %5 offset1:1 \n \
ds_read2_b64 %2, %5 offset1:1
\n
\
//ds_read2_b64 %3, %5 offset0:16 offset1:17 \n \
ds_read2_b64 %3, %5 offset0:16 offset1:17
\n
\
//s_waitcnt lgkmcnt(0)"
s_waitcnt lgkmcnt(0)"
//: "=v"(reg[0]), "=v"(reg[1]), "=v"(reg[2]), "=v"(reg[3])
:
"=v"
(
reg
[
0
]),
"=v"
(
reg
[
1
]),
"=v"
(
reg
[
2
]),
"=v"
(
reg
[
3
])
//: "v"(__to_local((void *)(a_loc))), "v"(__to_local((void *)(b_loc)))
:
"v"
(
__to_local
((
void
*
)(
a_loc
))),
"v"
(
__to_local
((
void
*
)(
b_loc
)))
//);
);
#endif
#endif
...
@@ -495,58 +494,219 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
...
@@ -495,58 +494,219 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
p_c_thread,
p_c_thread,
f_accum);
f_accum);
#else
#else
for
(
index_t
k
=
0
;
k
<
1
;
++
k
)
{
// M = 8
const
index_t
bindex
=
b_thread_sub_mtx
.
Get1dIndex
(
k
,
0
);
for
(
index_t
i
=
0
;
i
<
8
;
++
i
)
{
// N = 8
const
index_t
aindex
=
a_thread_sub_mtx
.
Get1dIndex
(
k
,
i
);
// A is transposed
const
index_t
cindex
=
c_thread_mtx
.
Get1dIndex
(
i
,
0
);
//for(index_t j = 0; j < 8; ++j)
{
//p_c_thread[cindex] += p_a_thread[aindex] * p_b_thread[bindex];
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
v_mac_f32 %0, %8, %9
\n
\
v_mac_f32 %0, %64, %72
\n
\
v_mac_f32 %1, %8, %10
\n
\
v_mac_f32 %1, %64, %73
\n
\
v_mac_f32 %2, %8, %11
\n
\
v_mac_f32 %2, %64, %74
\n
\
v_mac_f32 %3, %8, %12
\n
\
v_mac_f32 %3, %64, %75
\n
\
v_mac_f32 %4, %8, %13
\n
\
v_mac_f32 %4, %64, %76
\n
\
v_mac_f32 %5, %8, %14
\n
\
v_mac_f32 %5, %64, %77
\n
\
v_mac_f32 %6, %8, %15
\n
\
v_mac_f32 %6, %64, %78
\n
\
v_mac_f32 %7, %8, %16
\n
\
v_mac_f32 %7, %64, %79
\n
\
v_mac_f32 %8, %65, %72
\n
\
v_mac_f32 %9, %65, %73
\n
\
v_mac_f32 %10, %65, %74
\n
\
v_mac_f32 %11, %65, %75
\n
\
v_mac_f32 %12, %65, %76
\n
\
v_mac_f32 %13, %65, %77
\n
\
v_mac_f32 %14, %65, %78
\n
\
v_mac_f32 %15, %65, %79
\n
\
v_mac_f32 %16, %66, %72
\n
\
v_mac_f32 %17, %66, %73
\n
\
v_mac_f32 %18, %66, %74
\n
\
v_mac_f32 %19, %66, %75
\n
\
v_mac_f32 %20, %66, %76
\n
\
v_mac_f32 %21, %66, %77
\n
\
v_mac_f32 %22, %66, %78
\n
\
v_mac_f32 %23, %66, %79
\n
\
v_mac_f32 %24, %67, %72
\n
\
v_mac_f32 %25, %67, %73
\n
\
v_mac_f32 %26, %67, %74
\n
\
v_mac_f32 %27, %67, %75
\n
\
v_mac_f32 %28, %67, %76
\n
\
v_mac_f32 %29, %67, %77
\n
\
v_mac_f32 %30, %67, %78
\n
\
v_mac_f32 %31, %67, %79
\n
\
v_mac_f32 %32, %68, %72
\n
\
v_mac_f32 %33, %68, %73
\n
\
v_mac_f32 %34, %68, %74
\n
\
v_mac_f32 %35, %68, %75
\n
\
v_mac_f32 %36, %68, %76
\n
\
v_mac_f32 %37, %68, %77
\n
\
v_mac_f32 %38, %68, %78
\n
\
v_mac_f32 %39, %68, %79
\n
\
v_mac_f32 %40, %69, %72
\n
\
v_mac_f32 %41, %69, %73
\n
\
v_mac_f32 %42, %69, %74
\n
\
v_mac_f32 %43, %69, %75
\n
\
v_mac_f32 %44, %69, %76
\n
\
v_mac_f32 %45, %69, %77
\n
\
v_mac_f32 %46, %69, %78
\n
\
v_mac_f32 %47, %69, %79
\n
\
v_mac_f32 %48, %70, %72
\n
\
v_mac_f32 %49, %70, %73
\n
\
v_mac_f32 %50, %70, %74
\n
\
v_mac_f32 %51, %70, %75
\n
\
v_mac_f32 %52, %70, %76
\n
\
v_mac_f32 %53, %70, %77
\n
\
v_mac_f32 %54, %70, %78
\n
\
v_mac_f32 %55, %70, %79
\n
\
v_mac_f32 %56, %71, %72
\n
\
v_mac_f32 %57, %71, %73
\n
\
v_mac_f32 %58, %71, %74
\n
\
v_mac_f32 %59, %71, %75
\n
\
v_mac_f32 %60, %71, %76
\n
\
v_mac_f32 %61, %71, %77
\n
\
v_mac_f32 %62, %71, %78
\n
\
v_mac_f32 %63, %71, %79
\n
\
"
"
:
"=v"
(
p_c_thread
[
cindex
+
0
]),
:
"=v"
(
p_c_thread
[
cindex
+
1
]),
"=v"
(
p_c_thread
[
0
]),
"=v"
(
p_c_thread
[
cindex
+
2
]),
"=v"
(
p_c_thread
[
1
]),
"=v"
(
p_c_thread
[
cindex
+
3
]),
"=v"
(
p_c_thread
[
2
]),
"=v"
(
p_c_thread
[
cindex
+
4
]),
"=v"
(
p_c_thread
[
3
]),
"=v"
(
p_c_thread
[
cindex
+
5
]),
"=v"
(
p_c_thread
[
4
]),
"=v"
(
p_c_thread
[
cindex
+
6
]),
"=v"
(
p_c_thread
[
5
]),
"=v"
(
p_c_thread
[
cindex
+
7
])
"=v"
(
p_c_thread
[
6
]),
:
"v"
(
p_a_thread
[
aindex
]),
"=v"
(
p_c_thread
[
7
]),
"v"
(
p_b_thread
[
bindex
+
0
]),
"=v"
(
p_c_thread
[
8
]),
"v"
(
p_b_thread
[
bindex
+
1
]),
"=v"
(
p_c_thread
[
9
]),
"v"
(
p_b_thread
[
bindex
+
2
]),
"=v"
(
p_c_thread
[
10
]),
"v"
(
p_b_thread
[
bindex
+
3
]),
"=v"
(
p_c_thread
[
11
]),
"v"
(
p_b_thread
[
bindex
+
4
]),
"=v"
(
p_c_thread
[
12
]),
"v"
(
p_b_thread
[
bindex
+
5
]),
"=v"
(
p_c_thread
[
13
]),
"v"
(
p_b_thread
[
bindex
+
6
]),
"=v"
(
p_c_thread
[
14
]),
"v"
(
p_b_thread
[
bindex
+
7
]),
"=v"
(
p_c_thread
[
15
]),
"0"
(
p_c_thread
[
cindex
+
0
]),
"=v"
(
p_c_thread
[
16
]),
"1"
(
p_c_thread
[
cindex
+
1
]),
"=v"
(
p_c_thread
[
17
]),
"2"
(
p_c_thread
[
cindex
+
2
]),
"=v"
(
p_c_thread
[
18
]),
"3"
(
p_c_thread
[
cindex
+
3
]),
"=v"
(
p_c_thread
[
19
]),
"4"
(
p_c_thread
[
cindex
+
4
]),
"=v"
(
p_c_thread
[
20
]),
"5"
(
p_c_thread
[
cindex
+
5
]),
"=v"
(
p_c_thread
[
21
]),
"6"
(
p_c_thread
[
cindex
+
6
]),
"=v"
(
p_c_thread
[
22
]),
"7"
(
p_c_thread
[
cindex
+
7
])
"=v"
(
p_c_thread
[
23
]),
"=v"
(
p_c_thread
[
24
]),
"=v"
(
p_c_thread
[
25
]),
"=v"
(
p_c_thread
[
26
]),
"=v"
(
p_c_thread
[
27
]),
"=v"
(
p_c_thread
[
28
]),
"=v"
(
p_c_thread
[
29
]),
"=v"
(
p_c_thread
[
30
]),
"=v"
(
p_c_thread
[
31
]),
"=v"
(
p_c_thread
[
32
]),
"=v"
(
p_c_thread
[
33
]),
"=v"
(
p_c_thread
[
34
]),
"=v"
(
p_c_thread
[
35
]),
"=v"
(
p_c_thread
[
36
]),
"=v"
(
p_c_thread
[
37
]),
"=v"
(
p_c_thread
[
38
]),
"=v"
(
p_c_thread
[
39
]),
"=v"
(
p_c_thread
[
40
]),
"=v"
(
p_c_thread
[
41
]),
"=v"
(
p_c_thread
[
42
]),
"=v"
(
p_c_thread
[
43
]),
"=v"
(
p_c_thread
[
44
]),
"=v"
(
p_c_thread
[
45
]),
"=v"
(
p_c_thread
[
46
]),
"=v"
(
p_c_thread
[
47
]),
"=v"
(
p_c_thread
[
48
]),
"=v"
(
p_c_thread
[
49
]),
"=v"
(
p_c_thread
[
50
]),
"=v"
(
p_c_thread
[
51
]),
"=v"
(
p_c_thread
[
52
]),
"=v"
(
p_c_thread
[
53
]),
"=v"
(
p_c_thread
[
54
]),
"=v"
(
p_c_thread
[
55
]),
"=v"
(
p_c_thread
[
56
]),
"=v"
(
p_c_thread
[
57
]),
"=v"
(
p_c_thread
[
58
]),
"=v"
(
p_c_thread
[
59
]),
"=v"
(
p_c_thread
[
60
]),
"=v"
(
p_c_thread
[
61
]),
"=v"
(
p_c_thread
[
62
]),
"=v"
(
p_c_thread
[
63
])
:
"v"
(
p_a_thread
[
0
]),
"v"
(
p_a_thread
[
1
]),
"v"
(
p_a_thread
[
2
]),
"v"
(
p_a_thread
[
3
]),
"v"
(
p_a_thread
[
4
]),
"v"
(
p_a_thread
[
5
]),
"v"
(
p_a_thread
[
6
]),
"v"
(
p_a_thread
[
7
]),
"v"
(
p_b_thread
[
0
]),
"v"
(
p_b_thread
[
1
]),
"v"
(
p_b_thread
[
2
]),
"v"
(
p_b_thread
[
3
]),
"v"
(
p_b_thread
[
4
]),
"v"
(
p_b_thread
[
5
]),
"v"
(
p_b_thread
[
6
]),
"v"
(
p_b_thread
[
7
]),
"0"
(
p_c_thread
[
0
]),
"1"
(
p_c_thread
[
1
]),
"2"
(
p_c_thread
[
2
]),
"3"
(
p_c_thread
[
3
]),
"4"
(
p_c_thread
[
4
]),
"5"
(
p_c_thread
[
5
]),
"6"
(
p_c_thread
[
6
]),
"7"
(
p_c_thread
[
7
]),
"8"
(
p_c_thread
[
8
]),
"9"
(
p_c_thread
[
9
]),
"10"
(
p_c_thread
[
10
]),
"11"
(
p_c_thread
[
11
]),
"12"
(
p_c_thread
[
12
]),
"13"
(
p_c_thread
[
13
]),
"14"
(
p_c_thread
[
14
]),
"15"
(
p_c_thread
[
15
]),
"16"
(
p_c_thread
[
16
]),
"17"
(
p_c_thread
[
17
]),
"18"
(
p_c_thread
[
18
]),
"19"
(
p_c_thread
[
19
]),
"20"
(
p_c_thread
[
20
]),
"21"
(
p_c_thread
[
21
]),
"22"
(
p_c_thread
[
22
]),
"23"
(
p_c_thread
[
23
]),
"24"
(
p_c_thread
[
24
]),
"25"
(
p_c_thread
[
25
]),
"26"
(
p_c_thread
[
26
]),
"27"
(
p_c_thread
[
27
]),
"28"
(
p_c_thread
[
28
]),
"29"
(
p_c_thread
[
29
]),
"30"
(
p_c_thread
[
30
]),
"31"
(
p_c_thread
[
31
]),
"32"
(
p_c_thread
[
32
]),
"33"
(
p_c_thread
[
33
]),
"34"
(
p_c_thread
[
34
]),
"35"
(
p_c_thread
[
35
]),
"36"
(
p_c_thread
[
36
]),
"37"
(
p_c_thread
[
37
]),
"38"
(
p_c_thread
[
38
]),
"39"
(
p_c_thread
[
39
]),
"40"
(
p_c_thread
[
40
]),
"41"
(
p_c_thread
[
41
]),
"42"
(
p_c_thread
[
42
]),
"43"
(
p_c_thread
[
43
]),
"44"
(
p_c_thread
[
44
]),
"45"
(
p_c_thread
[
45
]),
"46"
(
p_c_thread
[
46
]),
"47"
(
p_c_thread
[
47
]),
"48"
(
p_c_thread
[
48
]),
"49"
(
p_c_thread
[
49
]),
"50"
(
p_c_thread
[
50
]),
"51"
(
p_c_thread
[
51
]),
"52"
(
p_c_thread
[
52
]),
"53"
(
p_c_thread
[
53
]),
"54"
(
p_c_thread
[
54
]),
"55"
(
p_c_thread
[
55
]),
"56"
(
p_c_thread
[
56
]),
"57"
(
p_c_thread
[
57
]),
"58"
(
p_c_thread
[
58
]),
"59"
(
p_c_thread
[
59
]),
"60"
(
p_c_thread
[
60
]),
"61"
(
p_c_thread
[
61
]),
"62"
(
p_c_thread
[
62
]),
"63"
(
p_c_thread
[
63
])
);
);
}
}
}
#endif
#endif
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment