gaoqiong / composable_kernel / Commits

Commit b3108646, authored Jun 12, 2019 by Jing Zhang
add asm
parent 5be6a5f9

Showing 3 changed files with 257 additions and 11 deletions:

src/include/amd_inline_asm.hip.hpp  (+122, -11)
src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp  (+3, -0)
src/include/threadwise_generic_tensor_slice_op.hip.hpp  (+132, -0)
src/include/amd_inline_asm.hip.hpp

@@ -8,7 +8,7 @@
 #define NO_GLB_READ 0

 // cast a pointer of LDS to its address
-extern "C" __attribute__((address_space(3))) void* __to_local(void* p) [[hc]];
+extern "C" __attribute__((address_space(3))) void* __to_local(const void* p) [[hc]];

 __device__ void vmcnt(index_t cnt)
 {
...
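On AMDGPU, address_space(3) is the LDS (local data share) address space, and __to_local casts a generic pointer into it so it can serve as the address operand of ds_* instructions; this hunk loosens the parameter to const void* so the rewritten DS-write wrappers below can accept const staging pointers. A minimal sketch of how such a cast gets consumed (ds_write_b32 is defined later in this file; the function and buffer names here are illustrative only, not part of the commit):

    // Illustrative only: route an LDS pointer through the DS-write wrapper,
    // which internally applies __to_local before the inline asm.
    __device__ void example_write_lds(const float* r)
    {
        __shared__ float lds_buf[256];
        ds_write_b32(&lds_buf[0], r); // emits ds_write_b32 with __to_local(lds) as the address
    }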
@@ -721,18 +721,54 @@ __device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, in
 #endif
 }

-__device__ void global_load(vector_type<float, 4>::MemoryType& r,
-                            const vector_type<float, 4>::MemoryType* ptr,
-                            index_t offset = 0)
+__device__ void global_loadx4(void* r, const void* ptr, index_t offset = 0)
 {
 #if !NO_GLB_READ
     if(offset == 0)
     {
         //*(vector_type<float, 4>::MemoryType*)(r) = *(vector_type<float, 4>::MemoryType*)(ptr);
         asm volatile("\n \
         global_load_dwordx4 %0, %1, off \n \
         "
-                     : "=v"(r)
-                     : "v"(ptr));
+                     : "=v"(*(vector_type<float, 4>::MemoryType*)(r))
+                     : "r"(ptr));
     }
     else
     {
         assert(false);
     }
 #endif
 }

+__device__ void global_loadx2(void* r, const void* ptr, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        asm volatile("\n \
+        global_load_dwordx2 %0, %1, off \n \
+        "
+                     : "=v"(*(vector_type<float, 2>::MemoryType*)(r))
+                     : "r"(ptr));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
+
+__device__ void global_loadx1(void* r, const void* ptr, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        //*(float*)(r) = *(float*)(ptr);
+        asm volatile("\n \
+        global_load_dword %0, %1, off \n \
+        "
+                     : "=v"(*(float*)(r))
+                     : "r"(ptr));
+    }
+    else
+    {
...
@@ -741,17 +777,92 @@ __device__ void global_load(vector_type<float, 4>::MemoryType& r,
 #endif
 }

-__device__ void ds_write_b128(const vector_type<float, 4>::MemoryType& r,
-                              void* lds,
-                              index_t offset = 0)
+__device__ void global_storex4(const void* ptr, const void* r, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        //*(vector_type<float, 4>::MemoryType*)(ptr) = *(vector_type<float, 4>::MemoryType*)(r);
+        asm volatile("\n \
+        global_store_dwordx4 %0, %1, off \n \
+        "
+                     :
+                     : "r"(ptr), "v"(*(vector_type<float, 4>::MemoryType*)(r)));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
+
+__device__ void global_storex2(const void* ptr, const void* r, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        asm volatile("\n \
+        global_store_dwordx2 %0, %1, off \n \
+        "
+                     :
+                     : "r"(ptr), "v"(*(vector_type<float, 2>::MemoryType*)(r)));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
+
+__device__ void global_storex1(const void* ptr, const void* r, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        //*(float*)(ptr) = *(float*)(r);
+        asm volatile("\n \
+        global_store_dword %0, %1, off \n \
+        "
+                     :
+                     : "r"(ptr), "v"(*(float*)(r)));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
+
+__device__ void ds_write_b128(const void* lds, const void* r, index_t offset = 0)
+{
+#if !NO_DS_WRITE
+    if(offset == 0)
+    {
+        //*(vector_type<float, 4>::MemoryType*)(lds) = *(vector_type<float, 4>::MemoryType*)(r);
+        asm volatile("\n \
+        ds_write_b128 %0, %1 \n \
+        "
+                     :
+                     : "v"(__to_local(lds)), "v"(*(vector_type<float, 4>::MemoryType*)(r)));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}

 __device__ void ds_write_b32(const void* lds, const void* r, index_t offset = 0)
 {
 #if !NO_DS_WRITE
     if(offset == 0)
     {
         //*(float*)(lds) = *(float*)(r);
         asm volatile("\n \
         ds_write_b32 %0, %1 \n \
         "
                      :
-                     : "v"(__to_local(lds)), "v"(r));
+                     : "v"(__to_local(lds)), "v"(*(float*)(r)));
     }
     else
     {
...
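Taken together, these wrappers give hand-written global_load_dword[x2|x4] / global_store_dword[x2|x4] / ds_write_b32|b128 building blocks for the global-to-LDS staging path used by the double-buffered kernel below. A minimal usage sketch, assuming a 16-byte-aligned source pointer (the function and buffer names here are illustrative, not part of the commit):

    // Illustrative only: stage one float4 from global memory into LDS
    // via registers, using the wrappers defined above.
    __device__ void stage_float4(const float* p_global, void* p_lds)
    {
        vector_type<float, 4>::MemoryType tmp;

        global_loadx4(&tmp, p_global); // global_load_dwordx4 into four VGPRs
        vmcnt(0);                      // wait for the outstanding global load
        ds_write_b128(p_lds, &tmp);    // ds_write_b128 from those VGPRs
    }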
src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp

@@ -460,6 +460,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kc1x1_nkhw
         blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, p_wei_register_clipboard);

+        // vmcnt(0);
         blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
         blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_double);
...

@@ -504,6 +505,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kc1x1_nkhw
             // LDS double buffer: GEMM on current data
             run_blockwise_gemm(p_wei_block_now, p_in_block_now, p_out_thread);

+            // vmcnt(0);
             // LDS double buffer: store next data to LDS
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next);
...

@@ -535,6 +537,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kc1x1_nkhw
         // LDS double buffer: GEMM on current data
         run_blockwise_gemm(p_wei_block_double, p_in_block_double, p_out_thread);

+        // vmcnt(0);
         // LDS double buffer: store next data to LDS
         blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double + in_block_space);
...
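Each hunk adds the same commented-out marker at the point in the LDS double-buffer pipeline where outstanding global loads must be complete before the register clipboard is flushed to LDS. The vmcnt(index_t) helper in amd_inline_asm.hip.hpp presumably wraps s_waitcnt vmcnt(<cnt>); enabled, vmcnt(0) would block until every issued global load has landed in its VGPRs. A condensed sketch of the loop shape these hunks annotate (p_in_global is a hypothetical name; the other names come from this file):

    // One iteration of the double-buffer main loop, condensed:
    blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard); // global -> VGPRs
    run_blockwise_gemm(p_wei_block_now, p_in_block_now, p_out_thread);                // GEMM overlaps the loads
    // vmcnt(0);                                                                      // would fence the loads here
    blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); // VGPRs -> LDS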
src/include/threadwise_generic_tensor_slice_op.hip.hpp

@@ -97,3 +97,135 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
     });
 #endif
 }
+
+template <class Float,
+          class SrcDesc,
+          class DstDesc,
+          class SliceLengths,
+          class DimAccessOrder,
+          index_t DataPerAccess,
+          index_t OpType>
+__device__ void threadwise_generic_tensor_slice_copy_v1_asm(
+    SrcDesc,
+    const Float* __restrict__ p_src,
+    Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
+    DstDesc,
+    Float* __restrict__ p_dst,
+    Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
+    SliceLengths,
+    DimAccessOrder,
+    Number<DataPerAccess>,
+    Number<OpType>)
+{
+    constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+
+    static_assert(nDim == SrcDesc::GetNumOfDimension() &&
+                      nDim == DstDesc::GetNumOfDimension() &&
+                      nDim == SliceLengths::GetSize() &&
+                      nDim == DimAccessOrder::GetSize(),
+                  "wrong! # of dimensions not the same");
+
+    static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
+
+#if 0
+    // doesn't compile, because merged-tensor reordering is not implemented
+    // TODO: implement tensor desc ops for merged-tensor
+    constexpr auto src_strides_in_access_order =
+        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
+
+    constexpr auto dst_strides_in_access_order =
+        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
+
+    // check src/dst stride on the lowest access dimension
+    static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) &&
+                      (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1),
+                  "wrong! src/dst stride on the lowest access dimension needs to be 1 for "
+                  "vectorized read/write");
+#endif
+
+    constexpr auto slice_lengths_in_access_order =
+        SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
+
+    // check slice length on the lowest access dimension
+    static_assert(slice_lengths_in_access_order.Back() % DataPerAccess == 0,
+                  "wrong! slice length on the lowest access dimension should be evenly divided by "
+                  "DataPerAccess");
+
+    constexpr index_t num_access_on_lowest_access_dimension =
+        slice_lengths_in_access_order.Back() / DataPerAccess;
+
+    constexpr auto access_lengths = slice_lengths_in_access_order.Modify(
+        Number<nDim - 1>{}, Number<num_access_on_lowest_access_dimension>{});
+
+    using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;
+
+#if 1
+    ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
+        auto data_multi_id_in_access_order      = access_multi_id;
+        data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess;
+
+        const auto data_multi_id =
+            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
+
+        const index_t src_index =
+            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
+
+        const index_t dst_index =
+            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
+
+        static_assert(DataPerAccess == 1 || DataPerAccess == 4, "unsupported DataPerAccess");
+        static_assert(OpType == 1, "unsupported OpType");
+
+        if(DataPerAccess == 4)
+        {
+            if(OpType == 1)
+            {
+                global_loadx4(&p_dst[dst_index], &p_src[src_index]);
+            }
+            else if(OpType == 2)
+            {
+                global_storex4(&p_dst[dst_index], &p_src[src_index]);
+            }
+            else
+            {
+                ds_write_b128(&p_dst[dst_index], &p_src[src_index]);
+            }
+        }
+
+        if(DataPerAccess == 1)
+        {
+            if(OpType == 1)
+            {
+                global_loadx1(&p_dst[dst_index], &p_src[src_index]);
+            }
+            else if(OpType == 2)
+            {
+                global_storex1(&p_dst[dst_index], &p_src[src_index]);
+            }
+            else
+            {
+                ds_write_b32(&p_dst[dst_index], &p_src[src_index]);
+            }
+        }
+
+        //*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+        //    *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+    });
+#else
+    static_ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
+        constexpr index_t itmp = access_multi_id.Back() * DataPerAccess;
+
+        constexpr auto data_multi_id_in_access_order =
+            access_multi_id.Modify(Number<nDim - 1>{}, Number<itmp>{});
+
+        constexpr auto data_multi_id = reorder_array_given_old2new(
+            sequence2array(data_multi_id_in_access_order), DimAccessOrder{});
+
+        const index_t src_index =
+            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
+
+        const index_t dst_index =
+            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
+
+        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+    });
+#endif
+}
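threadwise_generic_tensor_slice_copy_v1_asm mirrors threadwise_generic_tensor_slice_copy_v1 above but routes every access through the new asm wrappers, dispatching on DataPerAccess (1 or 4) and OpType; note the static_assert currently admits only OpType == 1 (global load), so the store and ds_write branches are dead code at this point. A minimal call sketch, assuming 2-D src/dst tensor descriptors as used elsewhere in the repo (the descriptor types, slice shape, and zero begin-index construction are hypothetical placeholders):

    // Hypothetical instantiation: copy a 4x16 float slice with float4
    // accesses, lowest dimension accessed last, OpType 1 = global load.
    threadwise_generic_tensor_slice_copy_v1_asm(SrcDesc{},
                                                p_src,
                                                Array<index_t, 2>{{0, 0}},
                                                DstDesc{},
                                                p_dst,
                                                Array<index_t, 2>{{0, 0}},
                                                Sequence<4, 16>{},
                                                Sequence<0, 1>{},
                                                Number<4>{},
                                                Number<1>{});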