Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
fe6f3744
Commit
fe6f3744
authored
Apr 26, 2019
by
Jing Zhang
Browse files
disable asm for debug
parent
46a0aec1
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
48 additions
and
25 deletions
+48
-25
driver/driver.hip.cpp
driver/driver.hip.cpp
+2
-2
src/include/amd_inline_asm.hip.hpp
src/include/amd_inline_asm.hip.hpp
+12
-2
src/include/blockwise_batched_gemm.hip.hpp
src/include/blockwise_batched_gemm.hip.hpp
+32
-19
src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
...ise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
+2
-2
No files found.
driver/driver.hip.cpp
View file @
fe6f3744
...
...
@@ -413,8 +413,8 @@ int main(int argc, char* argv[])
{
#if 1
// 3x3, 34x34
constexpr
index_t
N
=
6
4
;
constexpr
index_t
C
=
256
;
constexpr
index_t
N
=
1
6
;
constexpr
index_t
C
=
8
;
constexpr
index_t
HI
=
34
;
constexpr
index_t
WI
=
34
;
constexpr
index_t
K
=
128
;
...
...
src/include/amd_inline_asm.hip.hpp
View file @
fe6f3744
...
...
@@ -46,7 +46,7 @@ __device__ void vmcnt(index_t cnt)
__device__
void
lgkmcnt
(
index_t
cnt
)
{
#if
!NO_LGKM_WAIT
#if
0
if(cnt == 0)
{
asm volatile("\n \
...
...
@@ -86,6 +86,7 @@ __device__ void lgkmcnt(index_t cnt)
__device__
void
outerProduct1x4
(
const
float
*
a
,
const
float
*
b
,
float
*
c
)
{
#if 0
asm volatile("\n \
v_mac_f32 %0, %4, %5 \n \
v_mac_f32 %1, %4, %6 \n \
...
...
@@ -102,6 +103,12 @@ __device__ void outerProduct1x4(const float* a, const float* b, float* c)
"1"(c[1]),
"2"(c[2]),
"3"(c[3]));
#else
c
[
0
]
+=
a
[
0
]
*
b
[
0
];
c
[
1
]
+=
a
[
0
]
*
b
[
1
];
c
[
2
]
+=
a
[
0
]
*
b
[
2
];
c
[
3
]
+=
a
[
0
]
*
b
[
3
];
#endif
}
__device__
void
outerProduct1x4
(
const
float
&
a
,
...
...
@@ -197,7 +204,7 @@ __device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
__device__
void
ds_read_b128
(
vector_type
<
float
,
4
>::
MemoryType
&
r
,
void
*
lds
,
index_t
offset
=
0
)
{
#if
!NO_DS_READ
#if
0
if(offset == 0)
{
asm volatile("\n \
...
...
@@ -418,6 +425,9 @@ __device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, in
{
assert(false);
}
#else
using
Float4
=
vector_type
<
float
,
4
>::
MemoryType
;
r
=
((
Float4
*
)
lds
)[
offset
];
#endif
}
...
...
src/include/blockwise_batched_gemm.hip.hpp
View file @
fe6f3744
...
...
@@ -435,34 +435,47 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
constexpr
index_t
a_lds_cluster_col_stride
=
sizeof
(
Float
)
*
MPerLevel1Cluster
;
constexpr
index_t
b_lds_cluster_col_stride
=
sizeof
(
Float
)
*
NPerLevel1Cluster
;
ds_read_b128
(
reg_a
[
0
],
a_lds_loc
,
0
);
ds_read_b128
(
reg_b
[
0
],
b_lds_loc
,
0
);
ds_read_b128
(
reg_b
[
1
],
b_lds_loc
,
b_lds_cluster_col_stride
);
ds_read_b128
(
reg_a
[
1
],
a_lds_loc
,
a_lds_cluster_col_stride
);
lgkmcnt
(
2
);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
0
],
reg_c
[
0
],
reg_c
[
2
],
reg_c
[
4
],
reg_c
[
6
]);
lgkmcnt
(
1
);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
1
],
reg_c
[
1
],
reg_c
[
3
],
reg_c
[
5
],
reg_c
[
7
]);
#pragma unroll
for
(
index_t
k
=
1
;
k
<
K
;
++
k
)
for
(
index_t
k
=
0
;
k
<
K
;
k
++
)
{
ds_read_b128
(
reg_a
[
0
],
a_lds_loc
,
k
*
a_lds_row_stride
);
lgkmcnt
(
1
);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
0
],
reg_c
[
8
],
reg_c
[
10
],
reg_c
[
12
],
reg_c
[
14
]);
ds_read_b128
(
reg_b
[
0
],
b_lds_loc
,
k
*
b_lds_row_stride
);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
1
],
reg_c
[
9
],
reg_c
[
11
],
reg_c
[
13
],
reg_c
[
15
]);
ds_read_b128
(
reg_b
[
1
],
b_lds_loc
,
b_lds_cluster_col_stride
+
k
*
b_lds_row_stride
);
ds_read_b128
(
reg_a
[
1
],
a_lds_loc
,
a_lds_cluster_col_stride
+
k
*
a_lds_row_stride
);
lgkmcnt
(
2
);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
0
],
reg_c
[
0
],
reg_c
[
2
],
reg_c
[
4
],
reg_c
[
6
]);
lgkmcnt
(
1
);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
1
],
reg_c
[
1
],
reg_c
[
3
],
reg_c
[
5
],
reg_c
[
7
]);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
0
],
reg_c
[
8
],
reg_c
[
10
],
reg_c
[
12
],
reg_c
[
14
]);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
1
],
reg_c
[
9
],
reg_c
[
11
],
reg_c
[
13
],
reg_c
[
15
]);
}
lgkmcnt
(
0
);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
0
],
reg_c
[
8
],
reg_c
[
10
],
reg_c
[
12
],
reg_c
[
14
]);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
1
],
reg_c
[
9
],
reg_c
[
11
],
reg_c
[
13
],
reg_c
[
15
]);
//ds_read_b128(reg_a[0], a_lds_loc, 0);
//ds_read_b128(reg_b[0], b_lds_loc, 0);
//ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride);
//ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride);
//lgkmcnt(2);
//outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
//lgkmcnt(1);
//outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
//#pragma unroll
//for(index_t k = 1; k < K; ++k)
//{
//ds_read_b128(reg_a[0], a_lds_loc, k * a_lds_row_stride);
//lgkmcnt(1);
//outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
//ds_read_b128(reg_b[0], b_lds_loc, k * b_lds_row_stride);
//outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
//ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride + k * b_lds_row_stride);
//ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride + k * a_lds_row_stride);
//lgkmcnt(2);
//outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
//lgkmcnt(1);
//outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
//}
//lgkmcnt(0);
//outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
//outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
}
#endif
...
...
src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
View file @
fe6f3744
...
...
@@ -273,11 +273,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
__syncthreads
();
#if
1
#if
0
blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
#elif
0
blockwise_batch_gemm
.
Run_asm
(
p_wei_block
,
p_in_block
,
p_out_thread
);
#elif
0
#elif
1
blockwise_batch_gemm
.
Run_asm_v2
(
p_wei_block
,
p_in_block
,
p_out_thread
);
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment