Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
97ba755f
Commit
97ba755f
authored
May 31, 2019
by
Chao Liu
Browse files
refactor
parent
8d460740
Changes
22
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
155 additions
and
528 deletions
+155
-528
driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
...er/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+3
-4
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
...er/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
+2
-3
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
...er/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+1
-1
driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
...er/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+1
-1
driver/driver.hip.cpp
driver/driver.hip.cpp
+4
-4
src/include/ConstantMergedTensorDescriptor.hip.hpp
src/include/ConstantMergedTensorDescriptor.hip.hpp
+3
-3
src/include/ConstantTensorDescriptor.hip.hpp
src/include/ConstantTensorDescriptor.hip.hpp
+76
-433
src/include/blockwise_4d_tensor_op.hip.hpp
src/include/blockwise_4d_tensor_op.hip.hpp
+7
-8
src/include/blockwise_generic_tensor_slice_op.hip.hpp
src/include/blockwise_generic_tensor_slice_op.hip.hpp
+6
-6
src/include/blockwise_tensor_slice_op.hip.hpp
src/include/blockwise_tensor_slice_op.hip.hpp
+4
-4
src/include/conv_common.hip.hpp
src/include/conv_common.hip.hpp
+2
-2
src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp
...ude/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp
+8
-8
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
...plicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
+4
-4
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
...plicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
+7
-7
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
...plicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
+4
-4
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
...ise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
+5
-5
src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
...implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
+5
-6
src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
...dwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
+5
-6
src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp
...implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp
+4
-15
src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp
...dwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp
+4
-4
No files found.
driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
View file @
97ba755f
...
@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
...
@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
// reorder weight
// reorder weight
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
...
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
...
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
std
::
thread
::
hardware_concurrency
());
std
::
thread
::
hardware_concurrency
());
// reorder input
// reorder input
auto
in_chwn_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
C
,
Hi
,
Wi
,
N
>
{});
auto
in_chwn_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
C
,
Hi
,
Wi
,
N
>
{});
ostream_ConstantTensorDescriptor
(
in_chwn_desc
,
std
::
cout
<<
"in_chwn_desc: "
);
ostream_ConstantTensorDescriptor
(
in_chwn_desc
,
std
::
cout
<<
"in_chwn_desc: "
);
Tensor
<
T
>
in_chwn
(
make_TensorDescriptor
(
in_chwn_desc
));
Tensor
<
T
>
in_chwn
(
make_TensorDescriptor
(
in_chwn_desc
));
...
@@ -64,8 +64,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
...
@@ -64,8 +64,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
std
::
thread
::
hardware_concurrency
());
std
::
thread
::
hardware_concurrency
());
// output
// output
auto
out_khwn_desc
=
auto
out_khwn_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
K
,
Ho
,
Wo
,
N
>
{});
make_ConstantTensorDescriptor_default_rank_packed
(
Sequence
<
K
,
Ho
,
Wo
,
N
>
{});
ostream_ConstantTensorDescriptor
(
out_khwn_desc
,
std
::
cout
<<
"out_khwn_desc: "
);
ostream_ConstantTensorDescriptor
(
out_khwn_desc
,
std
::
cout
<<
"out_khwn_desc: "
);
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
...
...
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
View file @
97ba755f
...
@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
...
@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
// reorder weight
// reorder weight
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
...
@@ -50,8 +50,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
...
@@ -50,8 +50,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
std
::
thread
::
hardware_concurrency
());
std
::
thread
::
hardware_concurrency
());
// output
// output
auto
out_khwn_desc
=
auto
out_khwn_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
K
,
Ho
,
Wo
,
N
>
{});
make_ConstantTensorDescriptor_default_rank_packed
(
Sequence
<
K
,
Ho
,
Wo
,
N
>
{});
ostream_ConstantTensorDescriptor
(
out_khwn_desc
,
std
::
cout
<<
"out_khwn_desc: "
);
ostream_ConstantTensorDescriptor
(
out_khwn_desc
,
std
::
cout
<<
"out_khwn_desc: "
);
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
...
...
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
View file @
97ba755f
...
@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
...
@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
// reorder weight
// reorder weight
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
...
...
driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
View file @
97ba755f
...
@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
...
@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
// reorder weight
// reorder weight
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
...
...
driver/driver.hip.cpp
View file @
97ba755f
...
@@ -443,7 +443,7 @@ int main(int argc, char* argv[])
...
@@ -443,7 +443,7 @@ int main(int argc, char* argv[])
constexpr
index_t
HPad
=
0
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif
0
#elif
1
// 3x3 filter, 28x28 image
// 3x3 filter, 28x28 image
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
256
;
constexpr
index_t
C
=
256
;
...
@@ -455,7 +455,7 @@ int main(int argc, char* argv[])
...
@@ -455,7 +455,7 @@ int main(int argc, char* argv[])
constexpr
index_t
HPad
=
0
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif
1
#elif
0
// 1x1 filter, 28x28 image
// 1x1 filter, 28x28 image
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
512
;
constexpr
index_t
C
=
512
;
...
@@ -568,8 +568,8 @@ int main(int argc, char* argv[])
...
@@ -568,8 +568,8 @@ int main(int argc, char* argv[])
auto
lower_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
lower_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
upper_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
upper_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
in_nchw_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
in_nchw_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
wei_kcyx_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
auto
wei_kcyx_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
auto
out_nkhw_desc
=
get_convolution_with_padding_output_default_4d_tensor_descriptor
(
auto
out_nkhw_desc
=
get_convolution_with_padding_output_default_4d_tensor_descriptor
(
in_nchw_desc
,
wei_kcyx_desc
,
lower_pads
,
upper_pads
);
in_nchw_desc
,
wei_kcyx_desc
,
lower_pads
,
upper_pads
);
...
...
src/include/ConstantMergedTensorDescriptor.hip.hpp
View file @
97ba755f
...
@@ -114,7 +114,7 @@ struct ConstantMergedTensorDescriptor
...
@@ -114,7 +114,7 @@ struct ConstantMergedTensorDescriptor
__host__
__device__
static
Array
<
index_t
,
nDim
>
GetMultiIndexFrom1dIndex
(
index_t
id
)
__host__
__device__
static
Array
<
index_t
,
nDim
>
GetMultiIndexFrom1dIndex
(
index_t
id
)
{
{
constexpr
auto
dummy_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
GetLengths
());
constexpr
auto
dummy_desc
=
make_ConstantTensorDescriptor_packed
(
GetLengths
());
return
dummy_desc
.
GetMultiIndexFrom1dIndex
(
id
);
return
dummy_desc
.
GetMultiIndexFrom1dIndex
(
id
);
}
}
...
@@ -128,7 +128,7 @@ __host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalT
...
@@ -128,7 +128,7 @@ __host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalT
}
}
template
<
class
TDesc
>
template
<
class
TDesc
>
__host__
__device__
void
print_ConstantMergedTensorDescriptor
(
TDesc
,
const
char
*
s
)
__host__
__device__
void
print_ConstantMergedTensorDescriptor
(
const
char
*
s
,
TDesc
)
{
{
print_ConstantTensorDescriptor
(
TDesc
::
GetOriginalTensorDescriptor
()
,
s
);
print_ConstantTensorDescriptor
(
s
,
TDesc
::
GetOriginalTensorDescriptor
());
}
}
src/include/ConstantTensorDescriptor.hip.hpp
View file @
97ba755f
...
@@ -2,25 +2,23 @@
...
@@ -2,25 +2,23 @@
#include "common.hip.hpp"
#include "common.hip.hpp"
template
<
class
Lengths
>
template
<
class
Lengths
>
__host__
__device__
constexpr
auto
calculate_tensor_strides_
default_rank_
packed
(
Lengths
)
__host__
__device__
constexpr
auto
calculate_tensor_strides_packed
(
Lengths
)
{
{
return
reverse_inclusive_scan_sequence
(
Lengths
{}.
PopFront
(),
mod_conv
::
multiplies
<
index_t
>
{})
return
reverse_inclusive_scan_sequence
(
Lengths
{}.
PopFront
(),
mod_conv
::
multiplies
<
index_t
>
{})
.
PushBack
(
Number
<
1
>
{});
.
PushBack
(
Number
<
1
>
{});
}
}
template
<
class
Lengths
,
index_t
Align
>
template
<
class
Lengths
,
index_t
Align
>
__host__
__device__
constexpr
auto
calculate_tensor_strides_default_rank_aligned
(
Lengths
,
__host__
__device__
constexpr
auto
calculate_tensor_strides_aligned
(
Lengths
,
Number
<
Align
>
)
Number
<
Align
>
)
{
{
constexpr
index_t
L_back_align
=
constexpr
index_t
L_back_align
=
Align
*
mod_conv
::
integer_divide_ceiler
<
index_t
>
{}(
Lengths
{}.
Back
(),
Align
);
Align
*
mod_conv
::
integer_divide_ceiler
<
index_t
>
{}(
Lengths
{}.
Back
(),
Align
);
return
calculate_tensor_strides_
default_rank_
packed
(
return
calculate_tensor_strides_packed
(
Lengths
{}.
Modify
(
Number
<
Lengths
{}.
GetSize
()
-
1
>
{},
Number
<
L_back_align
>
{}));
Lengths
{}.
Modify
(
Number
<
Lengths
{}.
GetSize
()
-
1
>
{},
Number
<
L_back_align
>
{}));
}
}
// MemoryRanks of dimensions is for conversion from offset to multi-index
template
<
class
Lengths
,
class
Strides
>
template
<
class
Lengths
,
class
Strides
,
class
MemoryRanks
>
struct
ConstantTensorDescriptor
struct
ConstantTensorDescriptor
{
{
using
Type
=
ConstantTensorDescriptor
;
using
Type
=
ConstantTensorDescriptor
;
...
@@ -29,15 +27,7 @@ struct ConstantTensorDescriptor
...
@@ -29,15 +27,7 @@ struct ConstantTensorDescriptor
__host__
__device__
constexpr
ConstantTensorDescriptor
()
__host__
__device__
constexpr
ConstantTensorDescriptor
()
{
{
static_assert
(
Lengths
::
GetSize
()
==
Strides
::
GetSize
()
&&
static_assert
(
Lengths
::
GetSize
()
==
Strides
::
GetSize
(),
"nDim not consistent"
);
Lengths
::
GetSize
()
==
MemoryRanks
::
GetSize
(),
"nDim not consistent"
);
#if 0 // require sequence_sort, but it's not implemented yet
static_assert(is_same<typename sequence_sort<MemoryRanks>::SortedSeqType,
typename arithmetic_sequence_gen<0, nDim, 1>::SeqType>::value,
"wrong! invalid MemoryRanks");
#endif
}
}
__host__
__device__
static
constexpr
auto
GetOriginalTensorDescriptor
()
{
return
Type
{};
}
__host__
__device__
static
constexpr
auto
GetOriginalTensorDescriptor
()
{
return
Type
{};
}
...
@@ -54,8 +44,6 @@ struct ConstantTensorDescriptor
...
@@ -54,8 +44,6 @@ struct ConstantTensorDescriptor
__host__
__device__
static
constexpr
auto
GetStrides
()
{
return
Strides
{};
}
__host__
__device__
static
constexpr
auto
GetStrides
()
{
return
Strides
{};
}
__host__
__device__
static
constexpr
auto
GetMemoryRanks
()
{
return
MemoryRanks
{};
}
template
<
index_t
I
>
template
<
index_t
I
>
__host__
__device__
static
constexpr
index_t
GetLength
(
Number
<
I
>
)
__host__
__device__
static
constexpr
index_t
GetLength
(
Number
<
I
>
)
{
{
...
@@ -68,12 +56,6 @@ struct ConstantTensorDescriptor
...
@@ -68,12 +56,6 @@ struct ConstantTensorDescriptor
return
Strides
{}.
Get
(
Number
<
I
>
{});
return
Strides
{}.
Get
(
Number
<
I
>
{});
}
}
template
<
index_t
I
>
__host__
__device__
static
constexpr
index_t
GetMemoryRank
(
Number
<
I
>
)
{
return
MemoryRanks
{}.
Get
(
Number
<
I
>
{});
}
__host__
__device__
static
constexpr
bool
AreStridesNonAscending
()
__host__
__device__
static
constexpr
bool
AreStridesNonAscending
()
{
{
bool
flag
=
true
;
bool
flag
=
true
;
...
@@ -98,20 +80,13 @@ struct ConstantTensorDescriptor
...
@@ -98,20 +80,13 @@ struct ConstantTensorDescriptor
return
accumulate_on_sequence
(
Lengths
{},
mod_conv
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
return
accumulate_on_sequence
(
Lengths
{},
mod_conv
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
}
}
// WRONG! ReorderGivenOld2New is broken
template
<
class
Align
=
Number
<
1
>
>
template
<
class
Align
=
Number
<
1
>
>
__host__
__device__
static
constexpr
index_t
GetElementSpace
(
Align
align
=
Align
{})
__host__
__device__
static
constexpr
index_t
GetElementSpace
(
Align
align
=
Align
{})
{
{
#if 0
// This is WRONG! align shouldbe applied to the last memory rank, not the last tensor
constexpr auto lengths_in_rank = GetLengths().ReorderGivenOld2New(MemoryRank{});
// dimension
constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{});
constexpr index_t element_space_unaligned = accumulate_on_sequence(
(lengths_in_rank - Number<1>{}) * strides_in_rank, mod_conv::plus<index_t>{}, Number<1>{});
#else
// WRONG! align shouldbe applied to the last memory rank, not the last tensor dimension
constexpr
index_t
element_space_unaligned
=
accumulate_on_sequence
(
constexpr
index_t
element_space_unaligned
=
accumulate_on_sequence
(
(
GetLengths
()
-
Number
<
1
>
{})
*
GetStrides
(),
mod_conv
::
plus
<
index_t
>
{},
Number
<
1
>
{});
(
GetLengths
()
-
Number
<
1
>
{})
*
GetStrides
(),
mod_conv
::
plus
<
index_t
>
{},
Number
<
1
>
{});
#endif
return
align
.
Get
()
*
((
element_space_unaligned
+
align
.
Get
()
-
1
)
/
align
.
Get
());
return
align
.
Get
()
*
((
element_space_unaligned
+
align
.
Get
()
-
1
)
/
align
.
Get
());
}
}
...
@@ -148,35 +123,13 @@ struct ConstantTensorDescriptor
...
@@ -148,35 +123,13 @@ struct ConstantTensorDescriptor
multi_id
*
GetStrides
(),
mod_conv
::
plus
<
index_t
>
{},
Number
<
0
>
{});
multi_id
*
GetStrides
(),
mod_conv
::
plus
<
index_t
>
{},
Number
<
0
>
{});
}
}
#if 0 // ReorderGivenOld2new is broken
__host__ __device__ static Array<index_t, nDim> GetMultiIndexFromOffset(index_t offset)
{
Array<index_t, nDim> ranked_multi_id;
constexpr auto ranked_strides =
GetStrides().ReorderGivenOld2new(MemoryRanks{}); // check this
// calculate index in each of the dimensions in the order of their rank (not dimension)
static_for<0, nDim - 1, 1>{}([&](auto IDim) {
constexpr index_t idim = IDim.Get();
constexpr index_t stride = ranked_strides.Get(Number<idim>{});
ranked_multi_id[idim] = offset / stride;
offset -= ranked_multi_id[idim] * stride;
});
ranked_multi_id[nDim - 1] = offset / ranked_strides.Get(Number<nDim - 1>{});
return reorder_array_given_new2old(ranked_multi_id, MemoryRanks{}); // check this
}
#endif
__host__
__device__
static
Array
<
index_t
,
nDim
>
GetMultiIndexFrom1dIndex
(
index_t
id
)
__host__
__device__
static
Array
<
index_t
,
nDim
>
GetMultiIndexFrom1dIndex
(
index_t
id
)
{
{
Array
<
index_t
,
nDim
>
multi_id
;
Array
<
index_t
,
nDim
>
multi_id
;
constexpr
auto
dummy_strides
=
calculate_tensor_strides_
default_rank_
packed
(
GetLengths
());
constexpr
auto
dummy_strides
=
calculate_tensor_strides_packed
(
GetLengths
());
// calculate index in each of the dimensions in the order of their dimension
(not rank)
// calculate index in each of the dimensions in the order of their dimension
static_for
<
0
,
nDim
-
1
,
1
>
{}([
&
](
auto
IDim
)
{
static_for
<
0
,
nDim
-
1
,
1
>
{}([
&
](
auto
IDim
)
{
constexpr
index_t
idim
=
IDim
.
Get
();
constexpr
index_t
idim
=
IDim
.
Get
();
constexpr
index_t
stride
=
dummy_strides
.
Get
(
Number
<
idim
>
{});
constexpr
index_t
stride
=
dummy_strides
.
Get
(
Number
<
idim
>
{});
...
@@ -267,24 +220,16 @@ struct ConstantTensorDescriptor
...
@@ -267,24 +220,16 @@ struct ConstantTensorDescriptor
return
new_multi_id
;
return
new_multi_id
;
}
}
// WRONG! Ranks is broken
template
<
index_t
...
IDims
>
template
<
index_t
...
IDims
>
__host__
__device__
static
constexpr
auto
Extract
(
Number
<
IDims
>
...
extract_dims
)
__host__
__device__
static
constexpr
auto
Extract
(
Number
<
IDims
>
...
extract_dims
)
{
{
static_assert
(
sizeof
...(
IDims
)
<=
GetNumOfDimension
(),
static_assert
(
sizeof
...(
IDims
)
<=
GetNumOfDimension
(),
"wrong! too many number of dimensions to be extracted"
);
"wrong! too many number of dimensions to be extracted"
);
using
extract_lengths
=
decltype
(
Lengths
{}.
Extract
(
extract_dims
...));
using
extract_lengths
=
decltype
(
Lengths
::
Extract
(
extract_dims
...));
using
extract_strides
=
decltype
(
Strides
{}.
Extract
(
extract_dims
...));
using
extract_strides
=
decltype
(
Strides
::
Extract
(
extract_dims
...));
using
extract_ranks
=
decltype
(
MemoryRanks
{}.
Extract
(
extract_dims
...));
#if 0
using new_ranks = typename sequence_sort<extract_ranks>::Original2SortedType;
#else
// WRONG! TODO:: implement sequence_sort
using
new_ranks
=
typename
arithmetic_sequence_gen
<
0
,
sizeof
...(
IDims
),
1
>::
SeqType
;
#endif
return
ConstantTensorDescriptor
<
extract_lengths
,
extract_strides
,
new_ranks
>
{};
return
ConstantTensorDescriptor
<
extract_lengths
,
extract_strides
>
{};
}
}
template
<
index_t
...
IDims
>
template
<
index_t
...
IDims
>
...
@@ -298,12 +243,8 @@ struct ConstantTensorDescriptor
...
@@ -298,12 +243,8 @@ struct ConstantTensorDescriptor
{
{
using
leaf_tensor
=
ConstantTensorDescriptor
<
Ts
...
>
;
using
leaf_tensor
=
ConstantTensorDescriptor
<
Ts
...
>
;
// memory rank is broken
// TODO: remove memory rank info from tensor descritpor
return
ConstantTensorDescriptor
<
decltype
(
GetLengths
().
Append
(
leaf_tensor
::
GetLengths
())),
return
ConstantTensorDescriptor
<
decltype
(
GetLengths
().
Append
(
leaf_tensor
::
GetLengths
())),
decltype
(
GetStrides
().
Append
(
leaf_tensor
::
GetStrides
())),
decltype
(
GetStrides
().
Append
(
leaf_tensor
::
GetStrides
()))
>
{};
decltype
(
GetMemoryRanks
().
Append
(
leaf_tensor
::
GetMemoryRanks
()))
>
{};
}
}
template
<
index_t
IDim
,
index_t
SliceLen
>
template
<
index_t
IDim
,
index_t
SliceLen
>
...
@@ -311,18 +252,9 @@ struct ConstantTensorDescriptor
...
@@ -311,18 +252,9 @@ struct ConstantTensorDescriptor
{
{
using
slice_lengths
=
decltype
(
Lengths
{}.
Modify
(
Number
<
IDim
>
{},
Number
<
SliceLen
>
{}));
using
slice_lengths
=
decltype
(
Lengths
{}.
Modify
(
Number
<
IDim
>
{},
Number
<
SliceLen
>
{}));
return
ConstantTensorDescriptor
<
slice_lengths
,
Strides
,
MemoryRanks
>
{};
return
ConstantTensorDescriptor
<
slice_lengths
,
Strides
>
{};
}
}
template
<
index_t
Threashold
,
index_t
Delta
>
struct
f_fold_impl
{
__host__
__device__
constexpr
index_t
operator
()(
index_t
x
)
const
{
return
x
>
Threashold
?
x
+
Delta
:
x
;
}
};
template
<
index_t
IDim
,
index_t
...
FoldIntervals
>
template
<
index_t
IDim
,
index_t
...
FoldIntervals
>
__host__
__device__
static
constexpr
auto
Fold
(
Number
<
IDim
>
,
Number
<
FoldIntervals
>
...)
__host__
__device__
static
constexpr
auto
Fold
(
Number
<
IDim
>
,
Number
<
FoldIntervals
>
...)
{
{
...
@@ -333,7 +265,6 @@ struct ConstantTensorDescriptor
...
@@ -333,7 +265,6 @@ struct ConstantTensorDescriptor
constexpr
auto
unfold_length
=
GetLength
(
Number
<
IDim
>
{});
constexpr
auto
unfold_length
=
GetLength
(
Number
<
IDim
>
{});
constexpr
auto
unfold_stride
=
GetStride
(
Number
<
IDim
>
{});
constexpr
auto
unfold_stride
=
GetStride
(
Number
<
IDim
>
{});
constexpr
auto
unfold_rank
=
GetMemoryRank
(
Number
<
IDim
>
{});
// length of the dimension to be folded needs to be dividable by fold_interval_product,
// length of the dimension to be folded needs to be dividable by fold_interval_product,
// otherwise, folding is invalid
// otherwise, folding is invalid
...
@@ -350,16 +281,6 @@ struct ConstantTensorDescriptor
...
@@ -350,16 +281,6 @@ struct ConstantTensorDescriptor
reverse_inclusive_scan_sequence
(
fold_intervals
.
PushBack
(
Number
<
1
>
{}),
reverse_inclusive_scan_sequence
(
fold_intervals
.
PushBack
(
Number
<
1
>
{}),
mod_conv
::
multiplies
<
index_t
>
{});
mod_conv
::
multiplies
<
index_t
>
{});
// folded_ranks
constexpr
auto
fold_ranks
=
typename
arithmetic_sequence_gen
<
unfold_rank
,
unfold_rank
+
fold_intervals
.
GetSize
()
+
1
,
1
>::
SeqType
{};
// increase the ranks that are larger than unfold_rank
constexpr
auto
tmp_ranks
=
transform_sequences
(
f_fold_impl
<
unfold_rank
,
fold_intervals
.
GetSize
()
>
{},
GetMemoryRanks
());
// left and right
// left and right
constexpr
auto
left
=
typename
arithmetic_sequence_gen
<
0
,
IDim
,
1
>::
SeqType
{};
constexpr
auto
left
=
typename
arithmetic_sequence_gen
<
0
,
IDim
,
1
>::
SeqType
{};
constexpr
auto
right
=
constexpr
auto
right
=
...
@@ -369,15 +290,8 @@ struct ConstantTensorDescriptor
...
@@ -369,15 +290,8 @@ struct ConstantTensorDescriptor
GetLengths
().
Extract
(
left
).
Append
(
fold_lengths
).
Append
(
GetLengths
().
Extract
(
right
));
GetLengths
().
Extract
(
left
).
Append
(
fold_lengths
).
Append
(
GetLengths
().
Extract
(
right
));
constexpr
auto
new_strides
=
constexpr
auto
new_strides
=
GetStrides
().
Extract
(
left
).
Append
(
fold_strides
).
Append
(
GetStrides
().
Extract
(
right
));
GetStrides
().
Extract
(
left
).
Append
(
fold_strides
).
Append
(
GetStrides
().
Extract
(
right
));
constexpr
auto
new_ranks
=
tmp_ranks
.
Extract
(
left
).
Append
(
fold_ranks
).
Append
(
tmp_ranks
.
Extract
(
right
));
static_assert
(
new_ranks
.
GetSize
()
==
new_lengths
.
GetSize
(),
"wrong!"
);
static_assert
(
fold_ranks
.
GetSize
()
==
fold_lengths
.
GetSize
(),
"wrong!"
);
return
ConstantTensorDescriptor
<
decltype
(
new_lengths
),
return
ConstantTensorDescriptor
<
decltype
(
new_lengths
),
decltype
(
new_strides
)
>
{};
decltype
(
new_strides
),
decltype
(
new_ranks
)
>
{};
}
}
template
<
index_t
Threashold
,
index_t
Delta
>
template
<
index_t
Threashold
,
index_t
Delta
>
...
@@ -411,11 +325,6 @@ struct ConstantTensorDescriptor
...
@@ -411,11 +325,6 @@ struct ConstantTensorDescriptor
// check if packed
// check if packed
static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
"wrong! dimensions to be unfolded need to be packed");
"wrong! dimensions to be unfolded need to be packed");
// check ranks
static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
"wrong! ranks of dimensions to be unfolded need to be in increasing and "
"continuous ranks");
});
});
#endif
#endif
...
@@ -426,21 +335,13 @@ struct ConstantTensorDescriptor
...
@@ -426,21 +335,13 @@ struct ConstantTensorDescriptor
constexpr
auto
right
=
constexpr
auto
right
=
typename
arithmetic_sequence_gen
<
LastUnfoldDim
+
1
,
GetNumOfDimension
(),
1
>::
SeqType
{};
typename
arithmetic_sequence_gen
<
LastUnfoldDim
+
1
,
GetNumOfDimension
(),
1
>::
SeqType
{};
// unfolded length, stride
and rank
// unfolded length, stride
constexpr
index_t
unfold_length
=
accumulate_on_sequence
(
constexpr
index_t
unfold_length
=
accumulate_on_sequence
(
GetLengths
().
Extract
(
middle
),
mod_conv
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
GetLengths
().
Extract
(
middle
),
mod_conv
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
constexpr
index_t
unfold_stride
=
GetStride
(
Number
<
LastUnfoldDim
>
{});
constexpr
index_t
unfold_stride
=
GetStride
(
Number
<
LastUnfoldDim
>
{});
constexpr
index_t
unfold_rank
=
GetMemoryRank
(
Number
<
FirstUnfoldDim
>
{});
// new lengths, strides
// decrease the ranks that are larger than the rank of LastUnfoldDim
constexpr
auto
tmp_ranks
=
transform_sequences
(
f_unfold_impl
<
GetMemoryRank
(
Number
<
LastUnfoldDim
>
{}),
LastUnfoldDim
-
FirstUnfoldDim
+
1
>
{},
GetMemoryRanks
());
// new lengths, strides and ranks
constexpr
auto
new_lengths
=
GetLengths
()
constexpr
auto
new_lengths
=
GetLengths
()
.
Extract
(
left
)
.
Extract
(
left
)
.
PushBack
(
Number
<
unfold_length
>
{})
.
PushBack
(
Number
<
unfold_length
>
{})
...
@@ -451,22 +352,14 @@ struct ConstantTensorDescriptor
...
@@ -451,22 +352,14 @@ struct ConstantTensorDescriptor
.
PushBack
(
Number
<
unfold_stride
>
{})
.
PushBack
(
Number
<
unfold_stride
>
{})
.
Append
(
GetStrides
().
Extract
(
right
));
.
Append
(
GetStrides
().
Extract
(
right
));
constexpr
auto
new_ranks
=
tmp_ranks
.
Extract
(
left
)
return
ConstantTensorDescriptor
<
decltype
(
new_lengths
),
decltype
(
new_strides
)
>
{};
.
PushBack
(
Number
<
unfold_rank
>
{})
.
Append
(
tmp_ranks
.
Extract
(
right
));
return
ConstantTensorDescriptor
<
decltype
(
new_lengths
),
decltype
(
new_strides
),
decltype
(
new_ranks
)
>
{};
}
}
template
<
class
MapNew2Old
>
template
<
class
MapNew2Old
>
__host__
__device__
static
constexpr
auto
ReorderGivenNew2Old
(
MapNew2Old
)
__host__
__device__
static
constexpr
auto
ReorderGivenNew2Old
(
MapNew2Old
)
{
{
return
ConstantTensorDescriptor
<
decltype
(
Lengths
{}.
ReorderGivenNew2Old
(
MapNew2Old
{})),
return
ConstantTensorDescriptor
<
decltype
(
Lengths
{}.
ReorderGivenNew2Old
(
MapNew2Old
{})),
decltype
(
Strides
{}.
ReorderGivenNew2Old
(
MapNew2Old
{})),
decltype
(
Strides
{}.
ReorderGivenNew2Old
(
MapNew2Old
{}))
>
{};
decltype
(
MemoryRanks
{}.
ReorderGivenNew2Old
(
MapNew2Old
{}))
>
{};
}
}
#if 0 // require sequence_sort, which is not implemented yet
#if 0 // require sequence_sort, which is not implemented yet
...
@@ -474,358 +367,108 @@ struct ConstantTensorDescriptor
...
@@ -474,358 +367,108 @@ struct ConstantTensorDescriptor
__host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New)
__host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New)
{
{
return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenOld2New(MapOld2New{})),
return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenOld2New(MapOld2New{})),
decltype(Strides{}.ReorderGivenOld2New(MapOld2New{})),
decltype(Strides{}.ReorderGivenOld2New(MapOld2New{}))>{}
decltype(
MemoryRanks{}.ReorderGivenOld2New(MapOld2New{}))>{};
}
}
#endif
#endif
};
};
template
<
class
Lengths
>
template
<
class
Lengths
>
__host__
__device__
constexpr
auto
make_ConstantTensorDescriptor_
default_rank_
packed
(
Lengths
)
__host__
__device__
constexpr
auto
make_ConstantTensorDescriptor_packed
(
Lengths
)
{
{
using
Strides
=
decltype
(
calculate_tensor_strides_default_rank_packed
(
Lengths
{}));
using
Strides
=
decltype
(
calculate_tensor_strides_packed
(
Lengths
{}));
using
MemoryRanks
=
typename
arithmetic_sequence_gen
<
0
,
Lengths
::
GetSize
(),
1
>::
SeqType
;
return
ConstantTensorDescriptor
<
Lengths
,
Strides
>
{};
return
ConstantTensorDescriptor
<
Lengths
,
Strides
,
MemoryRanks
>
{};
}
}
template
<
class
Lengths
,
class
Strides
>
template
<
class
Lengths
,
class
Strides
>
__host__
__device__
constexpr
auto
make_ConstantTensorDescriptor
_default_rank
(
Lengths
,
Strides
)
__host__
__device__
constexpr
auto
make_ConstantTensorDescriptor
(
Lengths
,
Strides
)
{
{
using
MemoryRanks
=
typename
arithmetic_sequence_gen
<
0
,
Lengths
::
GetSize
(),
1
>::
SeqType
;
return
ConstantTensorDescriptor
<
Lengths
,
Strides
>
{};
return
ConstantTensorDescriptor
<
Lengths
,
Strides
,
MemoryRanks
>
{};
}
}
template
<
class
Lengths
,
index_t
Align
>
template
<
class
Lengths
,
index_t
Align
>
__host__
__device__
constexpr
auto
make_ConstantTensorDescriptor_default_rank_aligned
(
Lengths
,
__host__
__device__
constexpr
auto
make_ConstantTensorDescriptor_aligned
(
Lengths
,
Number
<
Align
>
)
Number
<
Align
>
)
{
{
using
Strides
=
using
Strides
=
decltype
(
calculate_tensor_strides_aligned
(
Lengths
{},
Number
<
Align
>
{}));
decltype
(
calculate_tensor_strides_default_rank_aligned
(
Lengths
{},
Number
<
Align
>
{}));
return
ConstantTensorDescriptor
<
Lengths
,
Strides
>
{};
using
MemoryRanks
=
typename
arithmetic_sequence_gen
<
0
,
Lengths
::
GetSize
(),
1
>::
SeqType
;
return
ConstantTensorDescriptor
<
Lengths
,
Strides
,
MemoryRanks
>
{};
}
}
template
<
class
TDesc
>
template
<
index_t
...
Lengths
,
index_t
...
Strides
>
__host__
__device__
void
print_ConstantTensorDescriptor
(
TDesc
,
const
char
*
s
)
__host__
__device__
void
print_ConstantTensorDescriptor
(
const
char
*
s
,
ConstantTensorDescriptor
<
Sequence
<
Lengths
...
>
,
Sequence
<
Strides
...
>>
)
{
{
constexpr
index_t
ndim
=
TDesc
::
GetNumOfDimension
();
constexpr
index_t
ndim
=
sizeof
...(
Lengths
);
static_assert
(
ndim
>=
1
&&
ndim
<=
10
,
"wrong!"
);
static_if
<
ndim
==
1
>
{}([
&
](
auto
fwd
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
desc
=
fwd
(
TDesc
{}
);
static_assert
(
ndim
>
0
&&
ndim
<=
10
,
"wrong!"
);
printf
(
"%s dim %u, lengths {%u}, strides {%u}, ranks {%u}
\n
"
,
static_if
<
ndim
==
1
>
{}([
&
](
auto
)
{
s
,
printf
(
"%s dim %u, lengths {%u}, strides {%u}
\n
"
,
s
,
ndim
,
Lengths
...,
Strides
...);
desc
.
GetNumOfDimension
(),
desc
.
GetLength
(
I0
),
desc
.
GetStride
(
I0
),
desc
.
GetMemoryRank
(
I0
));
});
});
static_if
<
ndim
==
2
>
{}([
&
](
auto
fwd
)
{
static_if
<
ndim
==
2
>
{}([
&
](
auto
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
printf
(
"%s dim %u, lengths {%u %u}, strides {%u %u}
\n
"
,
s
,
ndim
,
Lengths
...,
Strides
...);
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
desc
=
fwd
(
TDesc
{});
printf
(
"%s dim %u, lengths {%u %u}, strides {%u %u}, ranks {%u %u}
\n
"
,
s
,
desc
.
GetNumOfDimension
(),
desc
.
GetLength
(
I0
),
desc
.
GetLength
(
I1
),
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetMemoryRank
(
I0
),
desc
.
GetMemoryRank
(
I1
));
});
});
static_if
<
ndim
==
3
>
{}([
&
](
auto
fwd
)
{
static_if
<
ndim
==
3
>
{}([
&
](
auto
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
printf
(
constexpr
auto
I1
=
Number
<
1
>
{};
"%s dim %u, lengths {%u %u %u}, strides {%u %u %u}
\n
"
,
s
,
ndim
,
Lengths
...,
Strides
...);
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
desc
=
fwd
(
TDesc
{});
printf
(
"%s dim %u, lengths {%u %u %u}, strides {%u %u %u}, ranks {%u %u %u}
\n
"
,
s
,
desc
.
GetNumOfDimension
(),
desc
.
GetLength
(
I0
),
desc
.
GetLength
(
I1
),
desc
.
GetLength
(
I2
),
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetMemoryRank
(
I0
),
desc
.
GetMemoryRank
(
I1
),
desc
.
GetMemoryRank
(
I2
));
});
});
static_if
<
ndim
==
4
>
{}([
&
](
auto
fwd
)
{
static_if
<
ndim
==
4
>
{}([
&
](
auto
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
printf
(
"%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}
\n
"
,
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
desc
=
fwd
(
TDesc
{});
printf
(
"%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}, ranks {%u %u %u %u}
\n
"
,
s
,
s
,
desc
.
GetNumOfDimension
(),
ndim
,
desc
.
GetLength
(
I0
),
Lengths
...,
desc
.
GetLength
(
I1
),
Strides
...);
desc
.
GetLength
(
I2
),
desc
.
GetLength
(
I3
),
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetStride
(
I3
),
desc
.
GetMemoryRank
(
I0
),
desc
.
GetMemoryRank
(
I1
),
desc
.
GetMemoryRank
(
I2
),
desc
.
GetMemoryRank
(
I3
));
});
});
static_if
<
ndim
==
5
>
{}([
&
](
auto
fwd
)
{
static_if
<
ndim
==
5
>
{}([
&
](
auto
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
printf
(
"%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}
\n
"
,
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
desc
=
fwd
(
TDesc
{});
printf
(
"%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}, ranks {%u %u %u %u "
"%u}
\n
"
,
s
,
s
,
desc
.
GetNumOfDimension
(),
ndim
,
desc
.
GetLength
(
I0
),
Lengths
...,
desc
.
GetLength
(
I1
),
Strides
...);
desc
.
GetLength
(
I2
),
desc
.
GetLength
(
I3
),
desc
.
GetLength
(
I4
),
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetStride
(
I3
),
desc
.
GetStride
(
I4
),
desc
.
GetMemoryRank
(
I0
),
desc
.
GetMemoryRank
(
I1
),
desc
.
GetMemoryRank
(
I2
),
desc
.
GetMemoryRank
(
I3
),
desc
.
GetMemoryRank
(
I4
));
});
});
static_if
<
ndim
==
6
>
{}([
&
](
auto
fwd
)
{
static_if
<
ndim
==
6
>
{}([
&
](
auto
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}
\n
"
,
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
desc
=
fwd
(
TDesc
{});
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}, ranks {%u %u "
"%u %u %u %u}
\n
"
,
s
,
s
,
desc
.
GetNumOfDimension
(),
ndim
,
desc
.
GetLength
(
I0
),
Lengths
...,
desc
.
GetLength
(
I1
),
Strides
...);
desc
.
GetLength
(
I2
),
desc
.
GetLength
(
I3
),
desc
.
GetLength
(
I4
),
desc
.
GetLength
(
I5
),
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetStride
(
I3
),
desc
.
GetStride
(
I4
),
desc
.
GetStride
(
I5
),
desc
.
GetMemoryRank
(
I0
),
desc
.
GetMemoryRank
(
I1
),
desc
.
GetMemoryRank
(
I2
),
desc
.
GetMemoryRank
(
I3
),
desc
.
GetMemoryRank
(
I4
),
desc
.
GetMemoryRank
(
I5
));
});
});
static_if
<
ndim
==
7
>
{}([
&
](
auto
fwd
)
{
static_if
<
ndim
==
7
>
{}([
&
](
auto
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}
\n
"
,
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
constexpr
auto
desc
=
fwd
(
TDesc
{});
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}, ranks "
"{%u %u %u %u %u %u %u}
\n
"
,
s
,
s
,
desc
.
GetNumOfDimension
(),
ndim
,
desc
.
GetLength
(
I0
),
Lengths
...,
desc
.
GetLength
(
I1
),
Strides
...);
desc
.
GetLength
(
I2
),
desc
.
GetLength
(
I3
),
desc
.
GetLength
(
I4
),
desc
.
GetLength
(
I5
),
desc
.
GetLength
(
I6
),
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetStride
(
I3
),
desc
.
GetStride
(
I4
),
desc
.
GetStride
(
I5
),
desc
.
GetStride
(
I6
),
desc
.
GetMemoryRank
(
I0
),
desc
.
GetMemoryRank
(
I1
),
desc
.
GetMemoryRank
(
I2
),
desc
.
GetMemoryRank
(
I3
),
desc
.
GetMemoryRank
(
I4
),
desc
.
GetMemoryRank
(
I5
),
desc
.
GetMemoryRank
(
I6
));
});
});
static_if
<
ndim
==
8
>
{}([
&
](
auto
fwd
)
{
static_if
<
ndim
==
8
>
{}([
&
](
auto
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}
\n
"
,
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
constexpr
auto
I7
=
Number
<
7
>
{};
constexpr
auto
desc
=
fwd
(
TDesc
{});
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}, "
"ranks {%u %u %u %u %u %u %u %u}
\n
"
,
s
,
s
,
desc
.
GetNumOfDimension
(),
ndim
,
desc
.
GetLength
(
I0
),
Lengths
...,
desc
.
GetLength
(
I1
),
Strides
...);
desc
.
GetLength
(
I2
),
desc
.
GetLength
(
I3
),
desc
.
GetLength
(
I4
),
desc
.
GetLength
(
I5
),
desc
.
GetLength
(
I6
),
desc
.
GetLength
(
I7
),
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetStride
(
I3
),
desc
.
GetStride
(
I4
),
desc
.
GetStride
(
I5
),
desc
.
GetStride
(
I6
),
desc
.
GetStride
(
I7
),
desc
.
GetMemoryRank
(
I0
),
desc
.
GetMemoryRank
(
I1
),
desc
.
GetMemoryRank
(
I2
),
desc
.
GetMemoryRank
(
I3
),
desc
.
GetMemoryRank
(
I4
),
desc
.
GetMemoryRank
(
I5
),
desc
.
GetMemoryRank
(
I6
),
desc
.
GetMemoryRank
(
I7
));
});
});
static_if
<
ndim
==
9
>
{}([
&
](
auto
fwd
)
{
static_if
<
ndim
==
9
>
{}([
&
](
auto
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
constexpr
auto
I7
=
Number
<
7
>
{};
constexpr
auto
I8
=
Number
<
8
>
{};
constexpr
auto
desc
=
fwd
(
TDesc
{});
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u "
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u "
"
%u}, ranks {%u %u %u %u %u %u %u %u
%u}
\n
"
,
"%u}
\n
"
,
s
,
s
,
desc
.
GetNumOfDimension
(),
ndim
,
desc
.
GetLength
(
I0
),
Lengths
...,
desc
.
GetLength
(
I1
),
Strides
...);
desc
.
GetLength
(
I2
),
desc
.
GetLength
(
I3
),
desc
.
GetLength
(
I4
),
desc
.
GetLength
(
I5
),
desc
.
GetLength
(
I6
),
desc
.
GetLength
(
I7
),
desc
.
GetLength
(
I8
),
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetStride
(
I3
),
desc
.
GetStride
(
I4
),
desc
.
GetStride
(
I5
),
desc
.
GetStride
(
I6
),
desc
.
GetStride
(
I7
),
desc
.
GetStride
(
I8
),
desc
.
GetMemoryRank
(
I0
),
desc
.
GetMemoryRank
(
I1
),
desc
.
GetMemoryRank
(
I2
),
desc
.
GetMemoryRank
(
I3
),
desc
.
GetMemoryRank
(
I4
),
desc
.
GetMemoryRank
(
I5
),
desc
.
GetMemoryRank
(
I6
),
desc
.
GetMemoryRank
(
I7
),
desc
.
GetMemoryRank
(
I8
));
});
});
static_if
<
ndim
==
10
>
{}([
&
](
auto
fwd
)
{
static_if
<
ndim
==
10
>
{}([
&
](
auto
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
constexpr
auto
I7
=
Number
<
7
>
{};
constexpr
auto
I8
=
Number
<
8
>
{};
constexpr
auto
I9
=
Number
<
9
>
{};
constexpr
auto
desc
=
fwd
(
TDesc
{});
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u "
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u "
"%u %u
%u}, ranks {%u %u %u %u %u %u %u %u %u
%u}
\n
"
,
"%u %u %u}
\n
"
,
s
,
s
,
desc
.
GetNumOfDimension
(),
ndim
,
desc
.
GetLength
(
I0
),
Lengths
...,
desc
.
GetLength
(
I1
),
Strides
...);
desc
.
GetLength
(
I2
),
desc
.
GetLength
(
I3
),
desc
.
GetLength
(
I4
),
desc
.
GetLength
(
I5
),
desc
.
GetLength
(
I6
),
desc
.
GetLength
(
I7
),
desc
.
GetLength
(
I8
),
desc
.
GetLength
(
I9
),
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetStride
(
I3
),
desc
.
GetStride
(
I4
),
desc
.
GetStride
(
I5
),
desc
.
GetStride
(
I6
),
desc
.
GetStride
(
I7
),
desc
.
GetStride
(
I8
),
desc
.
GetStride
(
I9
),
desc
.
GetMemoryRank
(
I0
),
desc
.
GetMemoryRank
(
I1
),
desc
.
GetMemoryRank
(
I2
),
desc
.
GetMemoryRank
(
I3
),
desc
.
GetMemoryRank
(
I4
),
desc
.
GetMemoryRank
(
I5
),
desc
.
GetMemoryRank
(
I6
),
desc
.
GetMemoryRank
(
I7
),
desc
.
GetMemoryRank
(
I8
),
desc
.
GetMemoryRank
(
I9
));
});
});
}
}
src/include/blockwise_4d_tensor_op.hip.hpp
View file @
97ba755f
...
@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
...
@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
dst_desc
.
GetLengths
());
constexpr
auto
desc
=
make_ConstantTensorDescriptor_packed
(
dst_desc
.
GetLengths
());
#if 0
#if 0
if(get_thread_local_1d_id() == 0)
if(get_thread_local_1d_id() == 0)
...
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
...
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
SrcOpLengths
{});
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor_packed
(
SrcOpLengths
{});
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
...
@@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1
...
@@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1
constexpr
index_t
read_per_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
DataPerRead
);
constexpr
index_t
read_per_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
DataPerRead
);
constexpr
auto
ref_desc
=
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
L0
,
L1
,
L2
,
read_per_d3
>
{});
make_ConstantTensorDescriptor_packed
(
Sequence
<
L0
,
L1
,
L2
,
read_per_d3
>
{});
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
...
@@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded
...
@@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
DstOpLengths
{});
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor_packed
(
DstOpLengths
{});
constexpr
auto
h_global_pad_low
=
GlobalLowerPads
{}.
Get
(
I0
);
constexpr
auto
h_global_pad_low
=
GlobalLowerPads
{}.
Get
(
I0
);
constexpr
auto
w_global_pad_low
=
GlobalLowerPads
{}.
Get
(
I1
);
constexpr
auto
w_global_pad_low
=
GlobalLowerPads
{}.
Get
(
I1
);
...
@@ -510,8 +510,7 @@ struct Blockwise4dTensorCopy3
...
@@ -510,8 +510,7 @@ struct Blockwise4dTensorCopy3
}
}
}
}
constexpr
auto
thread_cluster_desc
=
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_packed
(
ThreadPerDims
{});
make_ConstantTensorDescriptor_default_rank_packed
(
ThreadPerDims
{});
const
auto
thread_multi_id
=
const
auto
thread_multi_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
...
@@ -653,7 +652,7 @@ struct Blockwise4dTensorCopy3
...
@@ -653,7 +652,7 @@ struct Blockwise4dTensorCopy3
constexpr
index_t
nloop_d2
=
L2
/
thread_per_d2
;
constexpr
index_t
nloop_d2
=
L2
/
thread_per_d2
;
constexpr
index_t
nloop_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
thread_per_d3
*
DataPerRead
);
constexpr
index_t
nloop_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
thread_per_d3
*
DataPerRead
);
constexpr
auto
clipboard_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
constexpr
auto
clipboard_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
,
nloop_d3
*
DataPerRead
>
{});
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
,
nloop_d3
*
DataPerRead
>
{});
#pragma unroll
#pragma unroll
...
@@ -720,7 +719,7 @@ struct Blockwise4dTensorCopy3
...
@@ -720,7 +719,7 @@ struct Blockwise4dTensorCopy3
constexpr
index_t
nloop_d2
=
L2
/
thread_per_d2
;
constexpr
index_t
nloop_d2
=
L2
/
thread_per_d2
;
constexpr
index_t
nloop_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
thread_per_d3
*
DataPerRead
);
constexpr
index_t
nloop_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
thread_per_d3
*
DataPerRead
);
constexpr
auto
clipboard_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
constexpr
auto
clipboard_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
,
nloop_d3
*
DataPerRead
>
{});
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
,
nloop_d3
*
DataPerRead
>
{});
#pragma unroll
#pragma unroll
...
...
src/include/blockwise_generic_tensor_slice_op.hip.hpp
View file @
97ba755f
...
@@ -63,7 +63,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
...
@@ -63,7 +63,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
"wrong!"
);
"wrong!"
);
// thread cluster
// thread cluster
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_packed
(
DataClusterLengths
{}.
ReorderGivenNew2Old
(
ThreadClusterArrangeOrder
{}));
DataClusterLengths
{}.
ReorderGivenNew2Old
(
ThreadClusterArrangeOrder
{}));
// BlockSize
// BlockSize
...
@@ -185,7 +185,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
...
@@ -185,7 +185,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
DataClusterLengths
{});
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
DataClusterLengths
{});
constexpr
auto
thread_tensor_desc
=
constexpr
auto
thread_tensor_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
SubLengths
{}
*
repeat_lengths
);
make_ConstantTensorDescriptor_packed
(
SubLengths
{}
*
repeat_lengths
);
return
thread_tensor_desc
.
GetElementSpace
();
return
thread_tensor_desc
.
GetElementSpace
();
}
}
...
@@ -199,8 +199,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
...
@@ -199,8 +199,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
DataClusterLengths
{});
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
DataClusterLengths
{});
constexpr
auto
thread_tensor_desc
=
make_ConstantTensorDescriptor_default_rank_packed
(
constexpr
auto
thread_tensor_desc
=
thread_sub_tensor_lengths
*
repeat_lengths
);
make_ConstantTensorDescriptor_packed
(
thread_sub_tensor_lengths
*
repeat_lengths
);
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
constexpr
auto
repeat_multi_id
=
sequence2array
(
decltype
(
repeat_multi_id_
){});
constexpr
auto
repeat_multi_id
=
sequence2array
(
decltype
(
repeat_multi_id_
){});
...
@@ -237,8 +237,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
...
@@ -237,8 +237,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
DataClusterLengths
{});
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
DataClusterLengths
{});
constexpr
auto
thread_tensor_desc
=
make_ConstantTensorDescriptor_default_rank_packed
(
constexpr
auto
thread_tensor_desc
=
thread_sub_tensor_lengths
*
repeat_lengths
);
make_ConstantTensorDescriptor_packed
(
thread_sub_tensor_lengths
*
repeat_lengths
);
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
constexpr
auto
repeat_multi_id
=
sequence2array
(
decltype
(
repeat_multi_id_
){});
constexpr
auto
repeat_multi_id
=
sequence2array
(
decltype
(
repeat_multi_id_
){});
...
...
src/include/blockwise_tensor_slice_op.hip.hpp
View file @
97ba755f
...
@@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
src_cluster_lengths
.
ReorderGivenNew2Old
(
map_thread_cluster_2_src_cluster
);
src_cluster_lengths
.
ReorderGivenNew2Old
(
map_thread_cluster_2_src_cluster
);
constexpr
auto
thread_cluster_desc
=
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
thread_cluster_lengths
);
make_ConstantTensorDescriptor_packed
(
thread_cluster_lengths
);
// sanity check: data type
// sanity check: data type
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float for now!
\n
"
);
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float for now!
\n
"
);
...
@@ -175,7 +175,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -175,7 +175,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_desc
=
constexpr
auto
thread_tensor_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
thread_tensor_lengths
);
make_ConstantTensorDescriptor_packed
(
thread_tensor_lengths
);
return
thread_tensor_desc
.
GetElementSpace
();
return
thread_tensor_desc
.
GetElementSpace
();
}
}
...
@@ -196,7 +196,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -196,7 +196,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_desc
=
constexpr
auto
thread_tensor_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
thread_tensor_lengths
);
make_ConstantTensorDescriptor_packed
(
thread_tensor_lengths
);
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
...
@@ -234,7 +234,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -234,7 +234,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_desc
=
constexpr
auto
thread_tensor_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
thread_tensor_lengths
);
make_ConstantTensorDescriptor_packed
(
thread_tensor_lengths
);
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
...
...
src/include/conv_common.hip.hpp
View file @
97ba755f
...
@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
...
@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
constexpr
auto
HO
=
HI
+
1
-
Y
;
constexpr
auto
HO
=
HI
+
1
-
Y
;
constexpr
auto
WO
=
WI
+
1
-
X
;
constexpr
auto
WO
=
WI
+
1
-
X
;
return
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
N
,
K
,
HO
,
WO
>
{});
return
make_ConstantTensorDescriptor_packed
(
Sequence
<
N
,
K
,
HO
,
WO
>
{});
}
}
template
<
class
InDesc
,
class
WeiDesc
,
class
LowerPads
,
class
UpperPads
>
template
<
class
InDesc
,
class
WeiDesc
,
class
LowerPads
,
class
UpperPads
>
...
@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
...
@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
constexpr
auto
HO
=
HI
+
HPadLow
+
HPadUp
+
1
-
Y
;
constexpr
auto
HO
=
HI
+
HPadLow
+
HPadUp
+
1
-
Y
;
constexpr
auto
WO
=
WI
+
WPadLow
+
WPadUp
+
1
-
X
;
constexpr
auto
WO
=
WI
+
WPadLow
+
WPadUp
+
1
-
X
;
return
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
N
,
K
,
HO
,
WO
>
{});
return
make_ConstantTensorDescriptor_packed
(
Sequence
<
N
,
K
,
HO
,
WO
>
{});
}
}
template
<
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
...
...
src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp
View file @
97ba755f
...
@@ -45,22 +45,22 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
...
@@ -45,22 +45,22 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
constexpr
index_t
Y
=
wei_kcyx_global_desc
.
GetLength
(
I2
);
constexpr
index_t
Y
=
wei_kcyx_global_desc
.
GetLength
(
I2
);
constexpr
index_t
X
=
wei_kcyx_global_desc
.
GetLength
(
I3
);
constexpr
index_t
X
=
wei_kcyx_global_desc
.
GetLength
(
I3
);
constexpr
auto
wei_ke_global_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
constexpr
auto
wei_ke_global_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
K
,
C
*
Y
*
X
>
{});
// 2d view of wei for blockwise copy
Sequence
<
K
,
C
*
Y
*
X
>
{});
// 2d view of wei for blockwise copy
constexpr
index_t
HiPerBlock
=
HoPerBlock
+
Y
-
1
;
constexpr
index_t
HiPerBlock
=
HoPerBlock
+
Y
-
1
;
constexpr
index_t
WiPerBlock
=
WoPerBlock
+
X
-
1
;
constexpr
index_t
WiPerBlock
=
WoPerBlock
+
X
-
1
;
constexpr
auto
in_nchw_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
in_nchw_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
NPerBlock
,
CPerBlock
,
HiPerBlock
,
WiPerBlock
>
{},
Sequence
<
NPerBlock
,
CPerBlock
,
HiPerBlock
,
WiPerBlock
>
{},
Number
<
InBlockCopyDataPerRead
>
{});
Number
<
InBlockCopyDataPerRead
>
{});
constexpr
auto
wei_ke_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
wei_ke_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
KPerBlock
,
CPerBlock
*
Y
*
X
>
{},
Sequence
<
KPerBlock
,
CPerBlock
*
Y
*
X
>
{},
Number
<
WeiBlockCopyDataPerRead
>
{});
// 2d view of wei for blockwise copy
Number
<
WeiBlockCopyDataPerRead
>
{});
// 2d view of wei for blockwise copy
constexpr
auto
wei_kcyx_block_desc
=
make_ConstantTensorDescriptor_default_rank
(
constexpr
auto
wei_kcyx_block_desc
=
Sequence
<
KPerBlock
,
CPerBlock
,
Y
,
X
>
{},
make_ConstantTensorDescriptor
(
Sequence
<
KPerBlock
,
CPerBlock
,
Y
,
X
>
{},
Sequence
<
wei_ke_block_desc
.
GetStride
(
I0
),
Y
*
X
,
X
,
1
>
{});
Sequence
<
wei_ke_block_desc
.
GetStride
(
I0
),
Y
*
X
,
X
,
1
>
{});
// shared mem
// shared mem
...
@@ -82,11 +82,11 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
...
@@ -82,11 +82,11 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
constexpr
index_t
HiPerThread
=
HoPerThread
+
Y
-
1
;
constexpr
index_t
HiPerThread
=
HoPerThread
+
Y
-
1
;
constexpr
index_t
WiPerThread
=
WoPerThread
+
X
-
1
;
constexpr
index_t
WiPerThread
=
WoPerThread
+
X
-
1
;
constexpr
auto
in_nchw_thread_block_desc
=
make_ConstantTensorDescriptor
_default_rank
(
constexpr
auto
in_nchw_thread_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerThread
,
CPerThread
,
HiPerThread
,
WiPerThread
>
{},
Sequence
<
NPerThread
,
CPerThread
,
HiPerThread
,
WiPerThread
>
{},
in_nchw_block_desc
.
GetStrides
());
in_nchw_block_desc
.
GetStrides
());
constexpr
auto
wei_kcyx_thread_block_desc
=
make_ConstantTensorDescriptor
_default_rank
(
constexpr
auto
wei_kcyx_thread_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
CPerThread
,
Y
,
X
>
{},
wei_kcyx_block_desc
.
GetStrides
());
Sequence
<
KPerThread
,
CPerThread
,
Y
,
X
>
{},
wei_kcyx_block_desc
.
GetStrides
());
constexpr
auto
out_nkhw_thread_desc
=
get_convolution_output_default_4d_tensor_descriptor
(
constexpr
auto
out_nkhw_thread_desc
=
get_convolution_output_default_4d_tensor_descriptor
(
...
...
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
View file @
97ba755f
...
@@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
...
@@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
constexpr
index_t
WBlockWork
=
mod_conv
::
integer_divide_ceil
(
Wo
,
WoPerBlock
);
constexpr
index_t
WBlockWork
=
mod_conv
::
integer_divide_ceil
(
Wo
,
WoPerBlock
);
constexpr
index_t
NBlockWork
=
mod_conv
::
integer_divide_ceil
(
N
,
NPerBlock
);
constexpr
index_t
NBlockWork
=
mod_conv
::
integer_divide_ceil
(
N
,
NPerBlock
);
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
KBlockWork
,
HBlockWork
,
WBlockWork
,
NBlockWork
>
{});
Sequence
<
KBlockWork
,
HBlockWork
,
WBlockWork
,
NBlockWork
>
{});
const
auto
block_work_multi_id
=
const
auto
block_work_multi_id
=
...
@@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
...
@@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
GemmDataPerReadA
,
GemmDataPerReadA
,
GemmDataPerReadB
);
GemmDataPerReadB
);
constexpr
auto
in_c_h_w_n_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
in_c_h_w_n_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
HoPerBlock
,
WoPerBlock
,
NPerBlock
>
{},
Sequence
<
CPerBlock
,
HoPerBlock
,
WoPerBlock
,
NPerBlock
>
{},
Number
<
InBlockCopyDataPerRead_N
>
{});
Number
<
InBlockCopyDataPerRead_N
>
{});
...
@@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
...
@@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
"GemmDataPerReadB alignment requirement is not meet"
);
"GemmDataPerReadB alignment requirement is not meet"
);
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerRead_K
,
GemmDataPerReadA
)
>
{});
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerRead_K
,
GemmDataPerReadA
)
>
{});
// tensor view of threadwise output in register
// tensor view of threadwise output in register
constexpr
auto
out_k_h_w_n_thread_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
constexpr
auto
out_k_h_w_n_thread_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
KPerThread
,
HoPerThread
,
WoPerThread
,
NPerThread
>
{});
Sequence
<
KPerThread
,
HoPerThread
,
WoPerThread
,
NPerThread
>
{});
// blockwise copy
// blockwise copy
...
...
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
View file @
97ba755f
...
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
...
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
constexpr
index_t
HBlockWork
=
mod_conv
::
integer_divide_ceil
(
Ho
,
HoPerBlock
);
constexpr
index_t
HBlockWork
=
mod_conv
::
integer_divide_ceil
(
Ho
,
HoPerBlock
);
constexpr
index_t
WBlockWork
=
mod_conv
::
integer_divide_ceil
(
Wo
,
WoPerBlock
);
constexpr
index_t
WBlockWork
=
mod_conv
::
integer_divide_ceil
(
Wo
,
WoPerBlock
);
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
NBlockWork
,
KBlockWork
,
HBlockWork
,
WBlockWork
>
{});
Sequence
<
NBlockWork
,
KBlockWork
,
HBlockWork
,
WBlockWork
>
{});
const
auto
block_work_multi_id
=
const
auto
block_work_multi_id
=
...
@@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
...
@@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
// global tensor view
// global tensor view
constexpr
auto
wei_c_k_global_desc
=
constexpr
auto
wei_c_k_global_desc
=
make_ConstantTensorDescriptor
_default_rank
(
Sequence
<
C
,
K
>
{},
Sequence
<
Y
*
X
*
K
,
1
>
{});
make_ConstantTensorDescriptor
(
Sequence
<
C
,
K
>
{},
Sequence
<
Y
*
X
*
K
,
1
>
{});
// LDS tensor view
// LDS tensor view
// be careful of alignment
// be careful of alignment
...
@@ -111,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
...
@@ -111,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
GemmDataPerReadA
,
GemmDataPerReadA
,
GemmDataPerReadB
);
GemmDataPerReadB
);
constexpr
auto
in_c_h_w_n_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
in_c_h_w_n_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
HoPerBlock
,
WoPerBlock
,
NPerBlock
>
{},
Sequence
<
CPerBlock
,
HoPerBlock
,
WoPerBlock
,
NPerBlock
>
{},
Number
<
InBlockReorderDataPerWrite_N
>
{});
Number
<
InBlockReorderDataPerWrite_N
>
{});
...
@@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
...
@@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
"GemmDataPerReadB alignment requirement is not meet"
);
"GemmDataPerReadB alignment requirement is not meet"
);
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerRead_K
,
GemmDataPerReadA
)
>
{});
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerRead_K
,
GemmDataPerReadA
)
>
{});
// tensor view of threadwise output in register
// tensor view of threadwise output in register
constexpr
auto
out_k_h_w_n_thread_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
constexpr
auto
out_k_h_w_n_thread_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
KPerThread
,
HoPerThread
,
WoPerThread
,
NPerThread
>
{});
Sequence
<
KPerThread
,
HoPerThread
,
WoPerThread
,
NPerThread
>
{});
// blockwise copy
// blockwise copy
...
@@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
...
@@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
constexpr
index_t
K1
=
KPerBlock
/
KPerThread
;
constexpr
index_t
K1
=
KPerBlock
/
KPerThread
;
#if 0
#if 0
constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_
default_rank_
packed(
constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_packed(
Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});
Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});
constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_
default_rank_
packed(
constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_packed(
Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
#else
#else
constexpr
auto
out_10d_global_desc
=
constexpr
auto
out_10d_global_desc
=
...
...
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
View file @
97ba755f
...
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
...
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
constexpr
index_t
HBlockWork
=
mod_conv
::
integer_divide_ceil
(
Ho
,
HoPerBlock
);
constexpr
index_t
HBlockWork
=
mod_conv
::
integer_divide_ceil
(
Ho
,
HoPerBlock
);
constexpr
index_t
WBlockWork
=
mod_conv
::
integer_divide_ceil
(
Wo
,
WoPerBlock
);
constexpr
index_t
WBlockWork
=
mod_conv
::
integer_divide_ceil
(
Wo
,
WoPerBlock
);
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
NBlockWork
,
KBlockWork
,
HBlockWork
,
WBlockWork
>
{});
Sequence
<
NBlockWork
,
KBlockWork
,
HBlockWork
,
WBlockWork
>
{});
const
auto
block_work_multi_id
=
const
auto
block_work_multi_id
=
...
@@ -110,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
...
@@ -110,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
GemmDataPerReadA
,
GemmDataPerReadA
,
GemmDataPerReadB
);
GemmDataPerReadB
);
constexpr
auto
in_c_h_w_n_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
in_c_h_w_n_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
HoPerBlock
,
WoPerBlock
,
NPerBlock
>
{},
Sequence
<
CPerBlock
,
HoPerBlock
,
WoPerBlock
,
NPerBlock
>
{},
Number
<
InBlockReorderDataPerWrite_N
>
{});
Number
<
InBlockReorderDataPerWrite_N
>
{});
...
@@ -119,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
...
@@ -119,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
"GemmDataPerReadB alignment requirement is not meet"
);
"GemmDataPerReadB alignment requirement is not meet"
);
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerRead_K
,
GemmDataPerReadA
)
>
{});
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerRead_K
,
GemmDataPerReadA
)
>
{});
// tensor view of threadwise output in register
// tensor view of threadwise output in register
constexpr
auto
out_k_h_w_n_thread_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
constexpr
auto
out_k_h_w_n_thread_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
KPerThread
,
HoPerThread
,
WoPerThread
,
NPerThread
>
{});
Sequence
<
KPerThread
,
HoPerThread
,
WoPerThread
,
NPerThread
>
{});
// blockwise copy
// blockwise copy
...
...
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
View file @
97ba755f
...
@@ -83,7 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
...
@@ -83,7 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
constexpr
index_t
HBlockWork
=
mod_conv
::
integer_divide_ceil
(
Ho
,
HoPerBlock
);
constexpr
index_t
HBlockWork
=
mod_conv
::
integer_divide_ceil
(
Ho
,
HoPerBlock
);
constexpr
index_t
WBlockWork
=
mod_conv
::
integer_divide_ceil
(
Wo
,
WoPerBlock
);
constexpr
index_t
WBlockWork
=
mod_conv
::
integer_divide_ceil
(
Wo
,
WoPerBlock
);
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
NBlockWork
,
KBlockWork
,
HBlockWork
,
WBlockWork
>
{});
Sequence
<
NBlockWork
,
KBlockWork
,
HBlockWork
,
WBlockWork
>
{});
const
auto
block_work_multi_id
=
const
auto
block_work_multi_id
=
...
@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
...
@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
// global tensor view
// global tensor view
constexpr
auto
wei_c_k_global_desc
=
constexpr
auto
wei_c_k_global_desc
=
make_ConstantTensorDescriptor
_default_rank
(
Sequence
<
C
,
K
>
{},
Sequence
<
Y
*
X
*
K
,
1
>
{});
make_ConstantTensorDescriptor
(
Sequence
<
C
,
K
>
{},
Sequence
<
Y
*
X
*
K
,
1
>
{});
// LDS tensor view
// LDS tensor view
// be careful of alignment
// be careful of alignment
...
@@ -108,7 +108,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
...
@@ -108,7 +108,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
GemmDataPerReadA
,
GemmDataPerReadA
,
GemmDataPerReadB
);
GemmDataPerReadB
);
constexpr
auto
in_c_h_w_n_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
in_c_h_w_n_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
HoPerBlock
,
WoPerBlock
,
NPerBlock
>
{},
Sequence
<
CPerBlock
,
HoPerBlock
,
WoPerBlock
,
NPerBlock
>
{},
Number
<
InBlockReorderDataPerWrite_N
>
{});
Number
<
InBlockReorderDataPerWrite_N
>
{});
...
@@ -117,12 +117,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
...
@@ -117,12 +117,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
"GemmDataPerReadB alignment requirement is not meet"
);
"GemmDataPerReadB alignment requirement is not meet"
);
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerRead_K
,
GemmDataPerReadA
)
>
{});
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerRead_K
,
GemmDataPerReadA
)
>
{});
// tensor view of threadwise output in register
// tensor view of threadwise output in register
constexpr
auto
out_k_h_w_n_thread_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
constexpr
auto
out_k_h_w_n_thread_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
KPerThread
,
HoPerThread
,
WoPerThread
,
NPerThread
>
{});
Sequence
<
KPerThread
,
HoPerThread
,
WoPerThread
,
NPerThread
>
{});
// blockwise copy
// blockwise copy
...
...
src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
View file @
97ba755f
...
@@ -88,7 +88,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
...
@@ -88,7 +88,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
constexpr
index_t
BBlockWork
=
B
/
BPerBlock
;
constexpr
index_t
BBlockWork
=
B
/
BPerBlock
;
constexpr
auto
block_work_desc
=
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
KBlockWork
,
BBlockWork
>
{});
make_ConstantTensorDescriptor_packed
(
Sequence
<
KBlockWork
,
BBlockWork
>
{});
const
auto
block_work_multi_id
=
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
...
@@ -111,8 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
...
@@ -111,8 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
// memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy
// memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
in_c_n1_b_n2_block_mem_desc
=
constexpr
auto
in_c_n1_b_n2_block_mem_desc
=
make_ConstantTensorDescriptor_aligned
(
make_ConstantTensorDescriptor_default_rank_aligned
(
Sequence
<
CPerBlock
,
N1
,
BPerBlock
,
N2
>
{},
Number
<
InBlockCopyDstDataPerWrite_N2
>
{});
Sequence
<
CPerBlock
,
N1
,
BPerBlock
,
N2
>
{},
Number
<
InBlockCopyDstDataPerWrite_N2
>
{});
// this check is ad-hoc
// this check is ad-hoc
...
@@ -143,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
...
@@ -143,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
// tensor descriptor in LDS, dst of blockwise copy
// tensor descriptor in LDS, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerAccess_K
,
GemmDataPerReadA
)
>
{});
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerAccess_K
,
GemmDataPerReadA
)
>
{});
...
@@ -367,7 +366,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
...
@@ -367,7 +366,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
// define tensor descriptor for threadwise copy
// define tensor descriptor for threadwise copy
// output memory layout descriptor in register
// output memory layout descriptor in register
constexpr
auto
out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc
=
constexpr
auto
out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
make_ConstantTensorDescriptor_packed
(
Sequence
<
KPerBlock
/
(
K1
*
K2
),
1
,
K2
,
N1
,
1
,
1
,
1
,
N2
>
{});
Sequence
<
KPerBlock
/
(
K1
*
K2
),
1
,
K2
,
N1
,
1
,
1
,
1
,
N2
>
{});
// output tensor descriptor in register, src of threadwise copy
// output tensor descriptor in register, src of threadwise copy
...
...
src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
View file @
97ba755f
...
@@ -91,7 +91,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
...
@@ -91,7 +91,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
constexpr
index_t
BBlockWork
=
B
/
BPerBlock
;
constexpr
index_t
BBlockWork
=
B
/
BPerBlock
;
constexpr
auto
block_work_desc
=
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
KBlockWork
,
BBlockWork
>
{});
make_ConstantTensorDescriptor_packed
(
Sequence
<
KBlockWork
,
BBlockWork
>
{});
const
auto
block_work_multi_id
=
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
...
@@ -114,8 +114,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
...
@@ -114,8 +114,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy
// memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
in_c_n1_b_n2_block_mem_desc
=
constexpr
auto
in_c_n1_b_n2_block_mem_desc
=
make_ConstantTensorDescriptor_aligned
(
make_ConstantTensorDescriptor_default_rank_aligned
(
Sequence
<
CPerBlock
,
N1
,
BPerBlock
,
N2
>
{},
Number
<
InBlockCopyDstDataPerWrite_N2
>
{});
Sequence
<
CPerBlock
,
N1
,
BPerBlock
,
N2
>
{},
Number
<
InBlockCopyDstDataPerWrite_N2
>
{});
// this check is ad-hoc
// this check is ad-hoc
...
@@ -146,7 +145,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
...
@@ -146,7 +145,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// tensor descriptor in LDS, dst of blockwise copy
// tensor descriptor in LDS, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerAccess_K
,
GemmDataPerReadA
)
>
{});
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerAccess_K
,
GemmDataPerReadA
)
>
{});
...
@@ -320,7 +319,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
...
@@ -320,7 +319,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// define tensor descriptor for threadwise copy
// define tensor descriptor for threadwise copy
// output memory layout descriptor in register
// output memory layout descriptor in register
constexpr
auto
out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc
=
constexpr
auto
out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
make_ConstantTensorDescriptor_packed
(
Sequence
<
KPerBlock
/
(
K1
*
K2
),
1
,
K2
,
N1
,
1
,
1
,
1
,
N2
>
{});
Sequence
<
KPerBlock
/
(
K1
*
K2
),
1
,
K2
,
N1
,
1
,
1
,
1
,
N2
>
{});
// output tensor descriptor in register, src of threadwise copy
// output tensor descriptor in register, src of threadwise copy
...
...
src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp
View file @
97ba755f
...
@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
...
@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
constexpr
index_t
BBlockWork
=
B
/
BPerBlock
;
constexpr
index_t
BBlockWork
=
B
/
BPerBlock
;
constexpr
auto
block_work_desc
=
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
KBlockWork
,
BBlockWork
>
{});
make_ConstantTensorDescriptor_packed
(
Sequence
<
KBlockWork
,
BBlockWork
>
{});
const
auto
block_work_multi_id
=
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
...
@@ -127,20 +127,9 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
...
@@ -127,20 +127,9 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
Sequence
<
3
,
6
,
7
>
{},
Sequence
<
3
,
6
,
7
>
{},
Sequence
<
5
>
{});
Sequence
<
5
>
{});
#if 0
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_n0_n1_n2_h_w_global_desc,
"in_n0_n1_n2_h_w_global_desc: ");
print_ConstantTensorDescriptor(in_c_y_x_global_desc, "in_c_y_x_global_desc: ");
print_ConstantMergedTensorDescriptor(in_e_n1_b_n2_global_merged_desc,
"in_e_n1_b_n2_global_merged_desc: ");
}
#endif
// memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
// memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
in_e_n1_b_n2_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
in_e_n1_b_n2_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
EPerBlock
,
N1
,
BPerBlock
,
N2
>
{},
Number
<
InBlockCopyDstDataPerWrite_N2
>
{});
Sequence
<
EPerBlock
,
N1
,
BPerBlock
,
N2
>
{},
Number
<
InBlockCopyDstDataPerWrite_N2
>
{});
// this check is ad-hoc
// this check is ad-hoc
...
@@ -174,7 +163,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
...
@@ -174,7 +163,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
// tensor descriptor in LDS, dst of blockwise copy
// tensor descriptor in LDS, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
wei_e_k_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
wei_e_k_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
EPerBlock
,
KPerBlock
>
{},
Sequence
<
EPerBlock
,
KPerBlock
>
{},
Number
<
mod_conv
::
max
(
WeiBlockCopyDstDataPerWrite_K
,
GemmDataPerReadA
)
>
{});
Number
<
mod_conv
::
max
(
WeiBlockCopyDstDataPerWrite_K
,
GemmDataPerReadA
)
>
{});
...
@@ -406,7 +395,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
...
@@ -406,7 +395,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
// define tensor descriptor for threadwise copy
// define tensor descriptor for threadwise copy
// output memory layout descriptor in register
// output memory layout descriptor in register
constexpr
auto
out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc
=
constexpr
auto
out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
make_ConstantTensorDescriptor_packed
(
Sequence
<
KPerBlock
/
(
K1
*
K2
),
1
,
K2
,
N1
,
1
,
1
,
1
,
N2
>
{});
Sequence
<
KPerBlock
/
(
K1
*
K2
),
1
,
K2
,
N1
,
1
,
1
,
1
,
N2
>
{});
// output tensor descriptor in register, src of threadwise copy
// output tensor descriptor in register, src of threadwise copy
...
...
src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp
View file @
97ba755f
...
@@ -93,7 +93,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
...
@@ -93,7 +93,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
constexpr
index_t
BBlockWork
=
B
/
BPerBlock
;
constexpr
index_t
BBlockWork
=
B
/
BPerBlock
;
constexpr
auto
block_work_desc
=
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
Sequence
<
KBlockWork
,
BBlockWork
>
{});
make_ConstantTensorDescriptor_packed
(
Sequence
<
KBlockWork
,
BBlockWork
>
{});
const
auto
block_work_multi_id
=
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
...
@@ -134,7 +134,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
...
@@ -134,7 +134,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
// memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
// memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
in_e_n1_b_n2_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
in_e_n1_b_n2_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
EPerBlock
,
N1
,
BPerBlock
,
N2
>
{},
Number
<
InBlockCopyDstDataPerWrite_N2
>
{});
Sequence
<
EPerBlock
,
N1
,
BPerBlock
,
N2
>
{},
Number
<
InBlockCopyDstDataPerWrite_N2
>
{});
// this check is ad-hoc
// this check is ad-hoc
...
@@ -167,7 +167,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
...
@@ -167,7 +167,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
// tensor descriptor in LDS, dst of blockwise copy
// tensor descriptor in LDS, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
wei_e_k_block_desc
=
make_ConstantTensorDescriptor_
default_rank_
aligned
(
constexpr
auto
wei_e_k_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
EPerBlock
,
KPerBlock
>
{},
Sequence
<
EPerBlock
,
KPerBlock
>
{},
Number
<
mod_conv
::
max
(
WeiBlockCopyDstDataPerWrite_K
,
GemmDataPerReadA
)
>
{});
Number
<
mod_conv
::
max
(
WeiBlockCopyDstDataPerWrite_K
,
GemmDataPerReadA
)
>
{});
...
@@ -288,7 +288,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
...
@@ -288,7 +288,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
// define tensor descriptor for threadwise copy
// define tensor descriptor for threadwise copy
// output memory layout descriptor in register
// output memory layout descriptor in register
constexpr
auto
out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc
=
constexpr
auto
out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc
=
make_ConstantTensorDescriptor_
default_rank_
packed
(
make_ConstantTensorDescriptor_packed
(
Sequence
<
KPerBlock
/
(
K1
*
K2
),
1
,
K2
,
N1
,
1
,
1
,
1
,
N2
>
{});
Sequence
<
KPerBlock
/
(
K1
*
K2
),
1
,
K2
,
N1
,
1
,
1
,
1
,
N2
>
{});
// output tensor descriptor in register, src of threadwise copy
// output tensor descriptor in register, src of threadwise copy
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment