gaoqiong / composable_kernel / Commits

Commit 7cdb665a
authored Jun 01, 2019 by Chao Liu

refactor

parent 97ba755f
Showing 5 changed files with 30 additions and 284 deletions:

    src/include/conv_common.hip.hpp                                                              +26  -3
    src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp                     +2   -3
    src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp   +2   -3
    src/include/threadwise_2d_tensor_op.hip.hpp                                                  +0   -137
    src/include/threadwise_4d_tensor_op.hip.hpp                                                  +0   -138
src/include/conv_common.hip.hpp

@@ -3,7 +3,7 @@
 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>
-__host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc,
+constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc,
                                                                    WeiDesc)
 {
     constexpr auto in_desc = InDesc{};
@@ -34,7 +34,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
 }
 
 template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
-__host__ __device__ constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
+constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
     InDesc, WeiDesc, LowerPads, UpperPads)
 {
     constexpr auto in_desc = InDesc{};
@@ -71,7 +71,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
 }
 
 template <class InDesc, class WeiDesc, class OutDesc>
-__host__ __device__ constexpr std::size_t calculate_convolution_flops(InDesc, WeiDesc, OutDesc)
+constexpr std::size_t calculate_convolution_flops(InDesc, WeiDesc, OutDesc)
 {
     constexpr auto wei_desc = WeiDesc{};
     constexpr auto out_desc = OutDesc{};
@@ -92,3 +92,26 @@ __host__ __device__ constexpr std::size_t calculate_convolution_flops(InDesc, We
     return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 }
+
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
+constexpr std::size_t calculate_convolution_memory_size(Float, InDesc, WeiDesc, OutDesc)
+{
+    constexpr auto wei_desc = WeiDesc{};
+    constexpr auto out_desc = OutDesc{};
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr index_t N  = out_desc.GetLength(I0);
+    constexpr index_t K  = out_desc.GetLength(I1);
+    constexpr index_t Ho = out_desc.GetLength(I2);
+    constexpr index_t Wo = out_desc.GetLength(I3);
+
+    constexpr index_t C = wei_desc.GetLength(I1);
+    constexpr index_t Y = wei_desc.GetLength(I2);
+    constexpr index_t X = wei_desc.GetLength(I3);
+
+    return sizeof(Float) * (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() +
+                            OutDesc::GetElementSpace());
+}
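The hunk above adds calculate_convolution_memory_size alongside calculate_convolution_flops: each output element costs C * Y * X multiply-accumulates (2 * C * Y * X flops, hence the 2 * N * K * Ho * Wo * C * Y * X total), and the memory size is sizeof(Float) times the combined element space of the input, weight, and output tensors. A minimal host-side usage sketch follows; the concrete shapes and the make_ConstantTensorDescriptor / Sequence spelling are assumptions drawn from identifiers visible elsewhere in this commit, not code from it.

// Hypothetical usage sketch (assumed shapes; the "default" output descriptor
// presumably implies unit stride and no padding):
constexpr auto in_desc  = make_ConstantTensorDescriptor(Sequence<64, 256, 14, 14>{}); // N, C, Hi, Wi
constexpr auto wei_desc = make_ConstantTensorDescriptor(Sequence<512, 256, 3, 3>{});  // K, C, Y, X

constexpr auto out_desc = get_convolution_output_default_4d_tensor_descriptor(in_desc, wei_desc);

constexpr std::size_t flops = calculate_convolution_flops(in_desc, wei_desc, out_desc);
constexpr std::size_t bytes = calculate_convolution_memory_size(float{}, in_desc, wei_desc, out_desc);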
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp

@@ -4,7 +4,6 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_4d_tensor_op.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
-#include "threadwise_2d_tensor_op.hip.hpp"
 #include "blockwise_gemm.hip.hpp"
 
 // define B = flatten(N, Hi, Wi)
@@ -202,8 +201,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
         // register
         Float p_out_thread[out_kb_thread_desc.GetElementSpace()];
 
-        // set threadwise output tensor to 0
-        threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
+        // set threadwise output to 0
+        threadwise_matrix_set_zero(c_kxb_thread_mtx_desc, p_out_thread);
 
         for(index_t c_block_data_begin = 0; c_block_data_begin < C;
             c_block_data_begin += CPerBlock,
             p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
             ...
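Both gridwise kernels now zero their thread-private accumulator through the matrix abstraction (threadwise_matrix_set_zero on the c_kxb_thread_mtx_desc GEMM descriptor) rather than through the 2d-tensor ops, which is what lets the threadwise_2d_tensor_op.hip.hpp include, and further below the file itself, be dropped. A sketch of what such a routine could plausibly look like; the NRow/NCol/RowStride accessors are assumptions about ConstantMatrixDescriptor, not confirmed by this diff:

// Sketch only (assumed descriptor API): zero a thread-private row-major matrix.
template <class MtxDesc, class Float>
__device__ void threadwise_matrix_set_zero_sketch(MtxDesc, Float* __restrict__ p)
{
    constexpr auto desc = MtxDesc{};

    for(index_t i = 0; i < desc.NRow(); ++i)
    {
        for(index_t j = 0; j < desc.NCol(); ++j)
        {
            p[i * desc.RowStride() + j] = Float(0); // elements contiguous within a row
        }
    }
}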
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp

@@ -4,7 +4,6 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_4d_tensor_op.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
-#include "threadwise_2d_tensor_op.hip.hpp"
 #include "threadwise_tensor_slice_op.hip.hpp"
 #include "blockwise_gemm.hip.hpp"
@@ -222,8 +221,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
         // register
         Float p_out_thread[out_kb_thread_desc.GetElementSpace()];
 
-        // set threadwise output tensor to 0
-        threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
+        // set threadwise output to 0
+        threadwise_matrix_set_zero(c_kxb_thread_mtx_desc, p_out_thread);
 
         for(index_t c_block_data_begin = 0; c_block_data_begin + 2 * CPerBlock < C;
             c_block_data_begin += 2 * CPerBlock)
             ...
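The double-buffer variant receives the same set-zero change, but note its loop shape: it advances by 2 * CPerBlock and runs while c_block_data_begin + 2 * CPerBlock < C, so each trip consumes two CPerBlock slices of the C dimension (presumably ping-ponging between the two LDS buffers), and the final slice pair is drained after the loop. A self-contained host-side demo of that bound arithmetic; the example sizes are assumptions:

#include <cstdio>

int main()
{
    constexpr int C = 64, CPerBlock = 8; // assumed example sizes

    int c = 0;
    // same bound as the kernel's main loop: two slices per trip
    for(; c + 2 * CPerBlock < C; c += 2 * CPerBlock)
    {
        std::printf("main loop: slices [%2d,%2d) and [%2d,%2d)\n",
                    c, c + CPerBlock, c + CPerBlock, c + 2 * CPerBlock);
    }
    std::printf("tail begins at c = %d; the last two slices are handled after the loop\n", c);
    return 0;
}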
src/include/threadwise_2d_tensor_op.hip.hpp  (deleted, 100644 → 0)

#pragma once
#include "ConstantTensorDescriptor.hip.hpp"

template <class Float, class Desc, class F>
__device__ void threadwise_2d_tensor_pointwise_operation_unary(Desc, Float* __restrict__ p, F f)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    constexpr auto desc = Desc{};

#if 0
    if(get_thread_local_1d_id() == 0)
    {
        print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_op_unary: ");
    }
#endif

    for(index_t did0 = 0; did0 < desc.GetLength(I0); ++did0)
    {
        for(index_t did1 = 0; did1 < desc.GetLength(I1); ++did1)
        {
            const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1);

            f(p[dindex]);
        }
    }
}
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
template <class Float,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
          class MapDst2Src,
          class F>
__device__ void threadwise_2d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
    SrcDesc,
    Float* const __restrict__ p_src,
    DstDesc,
    Float* __restrict__ p_dst,
    SrcOpLengths,
    MapDst2Src,
    F f)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    constexpr index_t IR0 = MapDst2Src{}.Get(I0);
    constexpr index_t IR1 = MapDst2Src{}.Get(I1);

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});

    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
    {
        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
        {
            const index_t aindex = src_desc.GetOffsetFromMultiIndex(did0, did1);

            const index_t did[2] = {did0, did1};

            const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1]);

            f(p_src[aindex], p_dst[bindex]);
        }
    }
}

template <class Float, class Desc>
__device__ void threadwise_2d_tensor_set_zero(Desc, Float* __restrict__ p)
{
    auto f_set_zero = [](Float& v) { v = Float(0); };

    threadwise_2d_tensor_pointwise_operation_unary<Float, Desc, decltype(f_set_zero)>(
        Desc{}, p, f_set_zero);
}

template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, class MapDst2Src>
__device__ void threadwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
                                                                      Float* const __restrict__ p_src,
                                                                      DstDesc,
                                                                      Float* __restrict__ p_dst,
                                                                      SrcOpLengths,
                                                                      MapDst2Src)
{
    auto f_copy = [](const Float& src, Float& dst) { dst = src; };

    threadwise_2d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
}
#if 0 // replaced threadwise_tensor_slice_copy
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
__device__ void threadwise_2d_tensor_copy(
SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths)
{
auto dst_from_src_reorder = Sequence<0, 1>{};
threadwise_2d_tensor_copy_reorder_by_get_dst_from_src(
SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
}
#endif
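The reorder above walks the source in its natural index order and scatters through MapDst2Src: destination dimension k is addressed with the source loop index did[MapDst2Src[k]]. A self-contained host-side illustration with plain arrays in place of ConstantTensorDescriptor (a sketch of the idea, not the deleted device code):

#include <cstdio>

int main()
{
    constexpr int L0 = 2, L1 = 3; // src op lengths

    float src[L0][L1] = {{0, 1, 2}, {3, 4, 5}};
    float dst[L1][L0] = {};

    constexpr int map_dst2src[2] = {1, 0}; // dst dim 0 reads src dim 1: a transpose

    for(int did0 = 0; did0 < L0; ++did0)
    {
        for(int did1 = 0; did1 < L1; ++did1)
        {
            const int did[2] = {did0, did1};
            dst[did[map_dst2src[0]]][did[map_dst2src[1]]] = src[did0][did1];
        }
    }

    std::printf("dst[2][1] = %g (expect src[1][2] = 5)\n", dst[2][1]);
    return 0;
}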
template <class Float, class Desc, class IDim, class NShift>
__device__ void threadwise_2d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    constexpr auto desc = Desc{};

#if 0
    if(get_thread_local_1d_id() == 0)
    {
        print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_shift_down: ");
    }
#endif

    constexpr index_t nshift = NShift::mValue;

    constexpr index_t did0_end =
        is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0);

    constexpr index_t did1_end =
        is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1);

    for(index_t did0 = 0; did0 < did0_end; ++did0)
    {
        for(index_t did1 = 0; did1 < did1_end; ++did1)
        {
            const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1);

            const index_t sindex = dindex + nshift * desc.GetStride(IDim{});

            p[dindex] = p[sindex];
        }
    }
}
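threadwise_2d_tensor_shift_down overwrites each element with the one NShift positions further along dimension IDim (p[dindex] = p[dindex + nshift * stride]); the trailing NShift positions along that dimension keep their old values. A 1-D host-side illustration of the same semantics, assuming stride 1:

#include <cstdio>

int main()
{
    constexpr int len = 6, nshift = 2;
    int p[len] = {0, 1, 2, 3, 4, 5};

    // shift down by nshift along the (only) dimension, stride 1
    for(int i = 0; i < len - nshift; ++i)
        p[i] = p[i + nshift];

    for(int i = 0; i < len; ++i)
        std::printf("%d ", p[i]); // prints: 2 3 4 5 4 5
    std::printf("\n");
    return 0;
}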
src/include/threadwise_4d_tensor_op.hip.hpp

#pragma once
#include "ConstantTensorDescriptor.hip.hpp"

template <class Float, class Desc, class F>
__device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __restrict__ p, F f)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto desc = Desc{};

#if 0
    if(get_thread_local_1d_id() == 0)
    {
        print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_op_unary: ");
    }
#endif

    for(index_t did0 = 0; did0 < desc.GetLength(I0); ++did0)
    {
        for(index_t did1 = 0; did1 < desc.GetLength(I1); ++did1)
        {
            for(index_t did2 = 0; did2 < desc.GetLength(I2); ++did2)
            {
                for(index_t did3 = 0; did3 < desc.GetLength(I3); ++did3)
                {
                    const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);

                    f(p[dindex]);
                }
            }
        }
    }
}
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
template <class SrcData,
          class DstData,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
          class MapDst2Src,
          class F>
__device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_given_dst2src(
    SrcDesc,
    const SrcData* __restrict__ p_src,
    DstDesc,
    DstData* __restrict__ p_dst,
    SrcOpLengths,
    MapDst2Src,
    F f)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr index_t IR0 = MapDst2Src{}.Get(I0);
    constexpr index_t IR1 = MapDst2Src{}.Get(I1);
    constexpr index_t IR2 = MapDst2Src{}.Get(I2);
    constexpr index_t IR3 = MapDst2Src{}.Get(I3);

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});

    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
    {
        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
        {
            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
            {
                for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
                {
                    const index_t aindex = src_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);

                    const index_t did[4] = {did0, did1, did2, did3};

                    const index_t bindex =
                        dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]);

                    f(p_src[aindex], p_dst[bindex]);

#if 0
                    if(get_block_1d_id() == 0)
                    {
                        printf("tid %5u, "
                               "src did %u %u %u %u, "
                               "dst did %u %u %u %u, "
                               "aindex %5u, "
                               "bindex %5u\n",
                               get_thread_local_1d_id(),
                               did0,
                               did1,
                               did2,
                               did3,
                               did[IR0],
                               did[IR1],
                               did[IR2],
                               did[IR3],
                               aindex,
                               bindex);
                    }
#endif
                }
            }
        }
    }
}
template <class Data, class Desc>
__device__ void threadwise_4d_tensor_set_zero(Desc, Data* __restrict__ p)
{
    auto f_set_zero = [](Data& v) { v = Data(0); };

    threadwise_4d_tensor_pointwise_operation_unary<Data, Desc, decltype(f_set_zero)>(
        Desc{}, p, f_set_zero);
}

template <class SrcData,
          class DstData,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
          class MapDst2Src>
__device__ void threadwise_4d_tensor_copy_reorder_given_dst2src(SrcDesc,
                                                                const SrcData* __restrict__ p_src,
                                                                DstDesc,
                                                                DstData* __restrict__ p_dst,
                                                                SrcOpLengths,
                                                                MapDst2Src)
{
    auto f_copy = [](const SrcData& src, DstData& dst) { dst = static_cast<DstData>(src); };

    threadwise_4d_tensor_pointwise_operation_binary_reorder_given_dst2src(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
}
template <class Float, class Desc, class IDim, class NShift>
__device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
{
    ...