Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
15c47cfd
Commit
15c47cfd
authored
Apr 09, 2019
by
Jing Zhang
Browse files
clean asm
parent
c075d3f7
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
64 additions
and
102 deletions
+64
-102
driver/driver.hip.cpp
driver/driver.hip.cpp
+2
-2
src/include/amd_inline_asm.hip.hpp
src/include/amd_inline_asm.hip.hpp
+49
-88
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
...implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
+13
-12
No files found.
driver/driver.hip.cpp
View file @
15c47cfd
...
@@ -661,9 +661,9 @@ int main(int argc, char* argv[])
...
@@ -661,9 +661,9 @@ int main(int argc, char* argv[])
device_direct_convolution_2_nchw_kcyx_nkhw
device_direct_convolution_2_nchw_kcyx_nkhw
#elif 0
#elif 0
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
#elif 1
device_implicit_gemm_convolution_1_chwn_cyxk_khwn
#elif 0
#elif 0
device_implicit_gemm_convolution_1_chwn_cyxk_khwn
#elif 1
device_implicit_gemm_convolution_2_chwn_cyxk_khwn
device_implicit_gemm_convolution_2_chwn_cyxk_khwn
#endif
#endif
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
...
...
src/include/amd_inline_asm.hip.hpp
View file @
15c47cfd
...
@@ -10,7 +10,11 @@
...
@@ -10,7 +10,11 @@
// cast a pointer of LDS to its address
// cast a pointer of LDS to its address
extern
"C"
__attribute__
((
address_space
(
3
)))
void
*
__to_local
(
void
*
p
)[[
hc
]];
extern
"C"
__attribute__
((
address_space
(
3
)))
void
*
__to_local
(
void
*
p
)[[
hc
]];
__device__
void
vmcnt
(
index_t
cnt
)
#define data4_t vector_type<float, 4>::MemoryType
#define data_t float
template
<
unsigned
cnt
>
inline
__device__
void
vmcnt
()
{
{
#if !NO_VM_WAIT
#if !NO_VM_WAIT
if
(
cnt
==
0
)
if
(
cnt
==
0
)
...
@@ -31,10 +35,16 @@ __device__ void vmcnt(index_t cnt)
...
@@ -31,10 +35,16 @@ __device__ void vmcnt(index_t cnt)
s_waitcnt vmcnt(2)
\n
\
s_waitcnt vmcnt(2)
\n
\
"
::
);
"
::
);
}
}
else
if
(
cnt
==
3
)
{
asm
volatile
(
"
\n
\
s_waitcnt vmcnt(3)
\n
\
"
::
);
}
else
if
(
cnt
==
4
)
else
if
(
cnt
==
4
)
{
{
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
s_waitcnt vmcnt(
2
)
\n
\
s_waitcnt vmcnt(
4
)
\n
\
"
::
);
"
::
);
}
}
else
else
...
@@ -44,7 +54,7 @@ __device__ void vmcnt(index_t cnt)
...
@@ -44,7 +54,7 @@ __device__ void vmcnt(index_t cnt)
#endif
#endif
}
}
__device__
void
lgkmcnt
(
index_t
cnt
)
inline
__device__
void
lgkmcnt
(
index_t
cnt
)
{
{
#if !NO_LGKM_WAIT
#if !NO_LGKM_WAIT
if
(
cnt
==
0
)
if
(
cnt
==
0
)
...
@@ -84,31 +94,11 @@ __device__ void lgkmcnt(index_t cnt)
...
@@ -84,31 +94,11 @@ __device__ void lgkmcnt(index_t cnt)
#endif
#endif
}
}
__device__
void
outerProduct1x4
(
const
float
*
a
,
const
float
*
b
,
float
*
c
)
inline
__device__
void
outerProduct1x4
(
const
data_t
&
a
,
const
data4_t
&
b
,
data4_t
&
c
)
{
{
asm
volatile
(
"
\n
\
#if 0
v_mac_f32 %0, %4, %5
\n
\
v_mac_f32 %1, %4, %6
\n
\
v_mac_f32 %2, %4, %7
\n
\
v_mac_f32 %3, %4, %8
\n
\
"
:
"=v"
(
c
[
0
]),
"=v"
(
c
[
1
]),
"=v"
(
c
[
2
]),
"=v"
(
c
[
3
])
:
"v"
(
a
[
0
]),
"v"
(
b
[
0
]),
"v"
(
b
[
1
]),
"v"
(
b
[
2
]),
"v"
(
b
[
3
]),
"0"
(
c
[
0
]),
"1"
(
c
[
1
]),
"2"
(
c
[
2
]),
"3"
(
c
[
3
]));
}
__device__
void
outerProduct1x4
(
const
float
&
a
,
const
vector_type
<
float
,
4
>::
MemoryType
&
b
,
vector_type
<
float
,
4
>::
MemoryType
&
c
)
{
#if 0
asm volatile(
asm volatile(
"\n \
"\n \
v_mac_f32 %0, %4, %5 \n \
v_mac_f32 %0, %4, %5 \n \
...
@@ -121,73 +111,44 @@ __device__ void outerProduct1x4(const float& a,
...
@@ -121,73 +111,44 @@ __device__ void outerProduct1x4(const float& a,
"v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
"v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
);
);
#else
#else
outerProduct1x4
(
&
a
,
(
float
*
)
&
b
,
(
float
*
)
&
c
);
//hijack here due to a compiler issue that cannot perform proper register
#endif
//mapping for float4 c
}
data_t
*
c_p
=
(
data_t
*
)
&
c
;
asm
volatile
(
"
\n
\
__device__
void
outerProduct4x4
(
const
vector_type
<
float
,
4
>::
MemoryType
&
a
,
const
vector_type
<
float
,
4
>::
MemoryType
&
b
,
vector_type
<
float
,
4
>::
MemoryType
&
c0
,
vector_type
<
float
,
4
>::
MemoryType
&
c1
,
vector_type
<
float
,
4
>::
MemoryType
&
c2
,
vector_type
<
float
,
4
>::
MemoryType
&
c3
)
{
#if 0
asm volatile(
"\n \
v_mac_f32 %0, %4, %5 \n \
v_mac_f32 %1, %4, %6 \n \
v_mac_f32 %2, %4, %7 \n \
v_mac_f32 %3, %4, %8 \n \
"
:
:"v"(c0.x),"v"(c0.y),"v"(c0.z),"v"(c0.w), \
"v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
);
asm volatile(
"\n \
v_mac_f32 %0, %4, %5 \n \
v_mac_f32 %1, %4, %6 \n \
v_mac_f32 %2, %4, %7 \n \
v_mac_f32 %3, %4, %8 \n \
"
:
:"v"(c1.x),"v"(c1.y),"v"(c1.z),"v"(c1.w), \
"v"(a.y),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
);
asm volatile(
"\n \
v_mac_f32 %0, %4, %5 \n \
v_mac_f32 %1, %4, %6 \n \
v_mac_f32 %2, %4, %7 \n \
v_mac_f32 %3, %4, %8 \n \
"
:
:"v"(c2.x),"v"(c2.y),"v"(c2.z),"v"(c2.w), \
"v"(a.z),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
);
asm volatile(
"\n \
v_mac_f32 %0, %4, %5
\n
\
v_mac_f32 %0, %4, %5
\n
\
v_mac_f32 %1, %4, %6
\n
\
v_mac_f32 %1, %4, %6
\n
\
v_mac_f32 %2, %4, %7
\n
\
v_mac_f32 %2, %4, %7
\n
\
v_mac_f32 %3, %4, %8
\n
\
v_mac_f32 %3, %4, %8
\n
\
"
"
:
:
"=v"
(
c
[
0
]),
"=v"
(
c
[
1
]),
"=v"
(
c
[
2
]),
"=v"
(
c
[
3
])
:"v"(c3.x),"v"(c3.y),"v"(c3.z),"v"(c3.w), \
:
"v"
(
a
),
"v"(a.w),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
"v"
(
b
.
x
),
);
"v"
(
b
.
y
),
#else
"v"
(
b
.
z
),
"v"
(
b
.
w
),
"0"
(
c
[
0
]),
"1"
(
c
[
1
]),
"2"
(
c
[
2
]),
"3"
(
c
[
3
]));
#endif
}
inline
__device__
void
outerProduct4x4
(
const
data4_t
&
a
,
const
data4_t
&
b
,
data4_t
&
c0
,
data4_t
&
c1
,
data4_t
&
c2
,
data4_t
&
c3
)
{
outerProduct1x4
(
a
.
x
,
b
,
c0
);
outerProduct1x4
(
a
.
x
,
b
,
c0
);
outerProduct1x4
(
a
.
y
,
b
,
c1
);
outerProduct1x4
(
a
.
y
,
b
,
c1
);
outerProduct1x4
(
a
.
z
,
b
,
c2
);
outerProduct1x4
(
a
.
z
,
b
,
c2
);
outerProduct1x4
(
a
.
w
,
b
,
c3
);
outerProduct1x4
(
a
.
w
,
b
,
c3
);
#endif
}
}
__device__
void
outerProduct8x8
(
const
vector_type
<
float
,
4
>::
MemoryType
*
a
,
inline
__device__
void
outerProduct8x8
(
const
data4_t
*
a
,
const
vector_type
<
float
,
4
>::
MemoryType
*
b
,
const
data4_t
*
b
,
vector_type
<
float
,
4
>::
MemoryType
*
c
)
data4_t
*
c
)
{
{
outerProduct4x4
(
a
[
0
],
b
[
0
],
c
[
0
],
c
[
2
],
c
[
4
],
c
[
6
]);
outerProduct4x4
(
a
[
0
],
b
[
0
],
c
[
0
],
c
[
2
],
c
[
4
],
c
[
6
]);
outerProduct4x4
(
a
[
0
],
b
[
1
],
c
[
1
],
c
[
3
],
c
[
5
],
c
[
7
]);
outerProduct4x4
(
a
[
0
],
b
[
1
],
c
[
1
],
c
[
3
],
c
[
5
],
c
[
7
]);
...
@@ -195,7 +156,7 @@ __device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
...
@@ -195,7 +156,7 @@ __device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
outerProduct4x4
(
a
[
1
],
b
[
1
],
c
[
9
],
c
[
11
],
c
[
13
],
c
[
15
]);
outerProduct4x4
(
a
[
1
],
b
[
1
],
c
[
9
],
c
[
11
],
c
[
13
],
c
[
15
]);
}
}
__device__
void
ds_read_b128
(
vector_type
<
float
,
4
>::
MemoryType
&
r
,
void
*
lds
,
index_t
offset
=
0
)
inline
__device__
void
ds_read_b128
(
data4_t
&
r
,
void
*
lds
,
index_t
offset
=
0
)
{
{
#if !NO_DS_READ
#if !NO_DS_READ
if
(
offset
==
0
)
if
(
offset
==
0
)
...
@@ -421,8 +382,8 @@ __device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, in
...
@@ -421,8 +382,8 @@ __device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, in
#endif
#endif
}
}
__device__
void
global_load
(
vector_type
<
float
,
4
>::
MemoryType
&
r
,
inline
__device__
void
global_load
(
data4_t
&
r
,
const
vector_type
<
float
,
4
>::
MemoryType
*
ptr
,
const
data4_t
*
ptr
,
index_t
offset
=
0
)
index_t
offset
=
0
)
{
{
#if !NO_GLB_READ
#if !NO_GLB_READ
...
@@ -441,8 +402,8 @@ __device__ void global_load(vector_type<float, 4>::MemoryType& r,
...
@@ -441,8 +402,8 @@ __device__ void global_load(vector_type<float, 4>::MemoryType& r,
#endif
#endif
}
}
__device__
void
inline
__device__
void
ds_write_b128
(
const
vector_type
<
float
,
4
>::
MemoryType
&
r
,
void
*
lds
,
index_t
offset
=
0
)
ds_write_b128
(
const
data4_t
&
r
,
void
*
lds
,
index_t
offset
=
0
)
{
{
#if !NO_DS_WRITE
#if !NO_DS_WRITE
if
(
offset
==
0
)
if
(
offset
==
0
)
...
...
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
View file @
15c47cfd
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
#include "threadwise_2d_tensor_op.hip.hpp"
#include "threadwise_2d_tensor_op.hip.hpp"
#include "threadwise_nd_tensor_op.hip.hpp"
#include "threadwise_nd_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"
#include "blockwise_gemm.hip.hpp"
#include "gridwise_ops.hip.hpp"
// define B = flatten(N, Hi, Wi)
// define B = flatten(N, Hi, Wi)
template
<
index_t
GridSize
,
template
<
index_t
GridSize
,
...
@@ -208,12 +209,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -208,12 +209,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
blockwise_wei_copy
.
RunLoadRegisterClipboard
(
p_wei_global_block_offset
,
blockwise_wei_copy
.
RunLoadRegisterClipboard
(
p_wei_global_block_offset
,
p_wei_register_clipboard
);
p_wei_register_clipboard
);
#if
1
#if
0
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double);
p_wei_block_double);
#else
#else
vmcnt
(
0
);
global_load_waitall
(
);
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
p_in_block_double
);
p_in_block_double
);
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
p_wei_register_clipboard
,
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
p_wei_register_clipboard
,
...
@@ -266,11 +267,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -266,11 +267,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
{
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
{
#if
1
#if
0
blockwise_gemm.Run
blockwise_gemm.Run
#elif
0
#elif
0
blockwise_gemm
.
Run_RegisterDoubleBuffer
blockwise_gemm
.
Run_RegisterDoubleBuffer
#elif
0
#elif
1
blockwise_gemm
.
Run_asm
blockwise_gemm
.
Run_asm
#endif
#endif
(
p_wei_block_now
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
(
p_wei_block_now
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
...
@@ -279,13 +280,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -279,13 +280,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
}
}
}
}
#if
1
#if
0
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_next);
p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_next);
p_wei_block_next);
#else
#else
vmcnt
(
0
);
global_load_waitall
(
);
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
p_in_block_next
);
p_in_block_next
);
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
p_wei_register_clipboard
,
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
p_wei_register_clipboard
,
...
@@ -315,11 +316,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -315,11 +316,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
{
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
{
#if
1
#if
0
blockwise_gemm.Run
blockwise_gemm.Run
#elif
0
#elif
0
blockwise_gemm
.
Run_RegisterDoubleBuffer
blockwise_gemm
.
Run_RegisterDoubleBuffer
#elif
0
#elif
1
blockwise_gemm
.
Run_asm
blockwise_gemm
.
Run_asm
#endif
#endif
(
p_wei_block_double
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
(
p_wei_block_double
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
...
@@ -328,14 +329,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -328,14 +329,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
}
}
}
}
#if
1
#if
0
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_double + in_block_space);
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double + wei_block_space);
p_wei_block_double + wei_block_space);
#else
#else
vmcnt
(
0
);
global_load_waitall
(
);
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
p_in_block_double
+
in_block_space
);
p_in_block_double
+
in_block_space
);
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
p_wei_register_clipboard
,
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
p_wei_register_clipboard
,
...
@@ -349,11 +350,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -349,11 +350,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
{
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
{
#if
1
#if
0
blockwise_gemm.Run
blockwise_gemm.Run
#elif
0
#elif
0
blockwise_gemm
.
Run_RegisterDoubleBuffer
blockwise_gemm
.
Run_RegisterDoubleBuffer
#elif
0
#elif
1
blockwise_gemm
.
Run_asm
blockwise_gemm
.
Run_asm
#endif
#endif
(
p_wei_block_double
+
wei_block_space
+
(
p_wei_block_double
+
wei_block_space
+
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment