Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
18328e2f
Commit
18328e2f
authored
Mar 25, 2019
by
Chao Liu
Browse files
experimenting lds read
parent
4facbe99
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
42 additions
and
26 deletions
+42
-26
driver/driver.hip.cpp
driver/driver.hip.cpp
+9
-3
src/include/blockwise_gemm.hip.hpp
src/include/blockwise_gemm.hip.hpp
+16
-7
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
...idwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
+8
-8
src/include/threadwise_gemm.hip.hpp
src/include/threadwise_gemm.hip.hpp
+9
-8
No files found.
driver/driver.hip.cpp
View file @
18328e2f
...
...
@@ -593,9 +593,9 @@ int main(int argc, char* argv[])
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 1
// 1x1 filter, 14x14 image, C =
256
// 1x1 filter, 14x14 image, C =
128
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
256
;
constexpr
index_t
C
=
128
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
512
;
...
...
@@ -638,10 +638,16 @@ int main(int argc, char* argv[])
if
(
do_verification
)
{
#if
1
#if
0
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif
0
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
#elif 0
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
#elif 1
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
#elif 0
...
...
src/include/blockwise_gemm.hip.hpp
View file @
18328e2f
...
...
@@ -336,7 +336,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
__device__
void
Run
(
const
FloatA
*
__restrict__
p_a_block
,
const
FloatB
*
__restrict__
p_b_block
,
FloatC
*
__restrict__
p_c_thread
,
Accumulator
f_accum
)
const
Accumulator
f_accum
,
const
float
*
const
p_lds_begin
)
const
{
constexpr
auto
True
=
integral_constant
<
bool
,
true
>
{};
constexpr
auto
False
=
integral_constant
<
bool
,
false
>
{};
...
...
@@ -383,28 +384,36 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
// copy A-sub to form A
for
(
index_t
m_repeat
=
0
;
m_repeat
<
MRepeat
;
++
m_repeat
)
{
threadwise_matrix_copy
(
threadwise_matrix_copy
_v2
(
a_block_mtx
,
p_a_block
+
a_block_mtx
.
Get1dIndex
(
k_begin
,
m_repeat
*
MPerLevel1Cluster
)
+
mMyThreadOffsetA
,
a_thread_mtx
,
p_a_thread
+
a_thread_mtx
.
Get1dIndex
(
0
,
m_repeat
*
MPerThreadSubC
),
a_thread_sub_mtx
.
GetLengths
());
a_thread_sub_mtx
.
GetLengths
(),
p_lds_begin
);
}
#pragma unroll
// copy B-sub to form B
for
(
index_t
n_repeat
=
0
;
n_repeat
<
NRepeat
;
++
n_repeat
)
{
threadwise_matrix_copy
(
threadwise_matrix_copy
_v2
(
b_block_mtx
,
p_b_block
+
b_block_mtx
.
Get1dIndex
(
k_begin
,
n_repeat
*
NPerLevel1Cluster
)
+
mMyThreadOffsetB
,
b_thread_mtx
,
p_b_thread
+
b_thread_mtx
.
Get1dIndex
(
0
,
n_repeat
*
NPerThreadSubC
),
b_thread_sub_mtx
.
GetLengths
());
b_thread_sub_mtx
.
GetLengths
(),
p_lds_begin
);
}
#if 0
asm volatile("\n \
s_waitcnt lgkmcnt(0) \n \
" ::);
#endif
// C = A * B
threadwise_gemm
(
a_thread_mtx
,
True
,
...
...
@@ -564,7 +573,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
FloatB
*
const
p_b_block
,
FloatC
*
p_c_thread
,
Accumulator
f_accum
,
float
*
p_lds_begin
)
const
const
float
*
const
p_lds_begin
)
const
{
constexpr
auto
True
=
integral_constant
<
bool
,
true
>
{};
constexpr
auto
False
=
integral_constant
<
bool
,
false
>
{};
...
...
@@ -669,7 +678,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
p_lds_begin
);
}
#if
1
#if
0
asm volatile("\n \
s_waitcnt lgkmcnt(0) \n \
" ::);
...
...
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
View file @
18328e2f
...
...
@@ -207,7 +207,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
__shared__
Float
p_wei_block
[
max_align
*
((
wei_block_size
+
max_align
-
1
)
/
max_align
)];
#if 1
const
expr
Float
*
p_lds_begin
=
p_wei_block
;
const
Float
*
p_lds_begin
=
p_in_block
<
p_wei_block
?
p_in_block
:
p_wei_block
;
#endif
const
Float
*
p_in_global_block_offset
=
...
...
@@ -240,11 +240,11 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
#if
0
#if
1
blockwise_gemm
.
Run
#elif 0
blockwise_gemm
.
Run_asm
#elif
1
#elif
0
blockwise_gemm
.
Run_RegisterDoubleBuffer
#endif
(
p_wei_block
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
...
...
src/include/threadwise_gemm.hip.hpp
View file @
18328e2f
...
...
@@ -28,12 +28,12 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
DstMatrix
,
Float
*
__restrict__
p_dst
,
Sequence
<
NRow
,
NCol
>
,
const
float
*
p_lds_begin
)
const
float
*
const
p_lds_begin
)
{
constexpr
auto
src_mtx
=
SrcMatrix
{};
constexpr
auto
dst_mtx
=
DstMatrix
{};
#if
1
#if
0
for(index_t i = 0; i < NRow; ++i)
{
for(index_t j = 0; j < NCol; ++j)
...
...
@@ -48,11 +48,11 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
ds_read_b32 %0, %1 \n \
"
: "=v"(p_dst[dst_index])
:
"v"
((
uint32_t
)((
uintptr_t
)((
p_src
+
src_index
)
-
p_lds_begin
))));
: "v"((uint32_t)(
sizeof(Float) *
(uintptr_t)((p_src + src_index) - p_lds_begin))));
#endif
}
}
#elif
0
#elif
1
static_assert
(
NCol
==
4
,
"only for NCol == 4"
);
using
vector_t
=
typename
vector_type
<
Float
,
4
>::
MemoryType
;
...
...
@@ -66,11 +66,12 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
dst_index
))
=
*
(
reinterpret_cast
<
const
vector_t
*>
(
p_src
+
src_index
));
#elif 1
asm
volatile
(
"
\n
\
ds_read_b128 %0, %1, offset:0
\n
\
asm
volatile
(
"
\n
\
ds_read_b128 %0, %1
\n
\
"
:
"=v"
(
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
dst_index
)))
:
"v"
((
uint32_t
)((
uintptr_t
)(
p_src
+
src_index
-
p_lds_begin
))));
:
"v"
((
uint32_t
)(
sizeof
(
Float
)
*
(
uintptr_t
)(
(
p_src
+
src_index
)
-
p_lds_begin
))));
#endif
}
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment