Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
acd7082f
Commit
acd7082f
authored
May 21, 2019
by
Chao Liu
Browse files
adding ConstantMergedTensorDescriptor, refactering ConstantTensorDescriptor, Sequence
parent
cd29b09a
Changes
38
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
860 additions
and
469 deletions
+860
-469
driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
...er/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+3
-3
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
...er/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
+2
-2
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
...er/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+1
-1
driver/driver.hip.cpp
driver/driver.hip.cpp
+2
-2
src/include/Array.hip.hpp
src/include/Array.hip.hpp
+19
-0
src/include/ConstantMatrixDescriptor.hip.hpp
src/include/ConstantMatrixDescriptor.hip.hpp
+1
-1
src/include/ConstantMergedTensorDescriptor.hip.hpp
src/include/ConstantMergedTensorDescriptor.hip.hpp
+80
-56
src/include/ConstantTensorDescriptor.hip.hpp
src/include/ConstantTensorDescriptor.hip.hpp
+213
-94
src/include/Sequence.hip.hpp
src/include/Sequence.hip.hpp
+144
-58
src/include/blockwise_2d_tensor_op.hip.hpp
src/include/blockwise_2d_tensor_op.hip.hpp
+30
-26
src/include/blockwise_3d_tensor_op.hip.hpp
src/include/blockwise_3d_tensor_op.hip.hpp
+24
-21
src/include/blockwise_4d_tensor_op.hip.hpp
src/include/blockwise_4d_tensor_op.hip.hpp
+63
-57
src/include/blockwise_batched_gemm.hip.hpp
src/include/blockwise_batched_gemm.hip.hpp
+25
-17
src/include/blockwise_gemm.hip.hpp
src/include/blockwise_gemm.hip.hpp
+8
-6
src/include/blockwise_merged_tensor_slice_op.hip.hpp
src/include/blockwise_merged_tensor_slice_op.hip.hpp
+120
-13
src/include/blockwise_tensor_slice_op.hip.hpp
src/include/blockwise_tensor_slice_op.hip.hpp
+18
-13
src/include/conv_common.hip.hpp
src/include/conv_common.hip.hpp
+2
-2
src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp
...ude/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp
+34
-31
src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp
...ise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp
+27
-26
src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp
...ise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp
+44
-40
No files found.
driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
View file @
acd7082f
...
@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
...
@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
// reorder weight
// reorder weight
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
auto
wei_cyxk_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
...
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
...
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
std
::
thread
::
hardware_concurrency
());
std
::
thread
::
hardware_concurrency
());
// reorder input
// reorder input
auto
in_chwn_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Hi
,
Wi
,
N
>
{});
auto
in_chwn_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
C
,
Hi
,
Wi
,
N
>
{});
ostream_ConstantTensorDescriptor
(
in_chwn_desc
,
std
::
cout
<<
"in_chwn_desc: "
);
ostream_ConstantTensorDescriptor
(
in_chwn_desc
,
std
::
cout
<<
"in_chwn_desc: "
);
Tensor
<
T
>
in_chwn
(
make_TensorDescriptor
(
in_chwn_desc
));
Tensor
<
T
>
in_chwn
(
make_TensorDescriptor
(
in_chwn_desc
));
...
@@ -64,7 +64,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
...
@@ -64,7 +64,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
std
::
thread
::
hardware_concurrency
());
std
::
thread
::
hardware_concurrency
());
// output
// output
auto
out_khwn_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
Ho
,
Wo
,
N
>
{});
auto
out_khwn_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
K
,
Ho
,
Wo
,
N
>
{});
ostream_ConstantTensorDescriptor
(
out_khwn_desc
,
std
::
cout
<<
"out_khwn_desc: "
);
ostream_ConstantTensorDescriptor
(
out_khwn_desc
,
std
::
cout
<<
"out_khwn_desc: "
);
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
...
...
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
View file @
acd7082f
...
@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
...
@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
// reorder weight
// reorder weight
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
auto
wei_cyxk_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
...
@@ -50,7 +50,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
...
@@ -50,7 +50,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
std
::
thread
::
hardware_concurrency
());
std
::
thread
::
hardware_concurrency
());
// output
// output
auto
out_khwn_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
Ho
,
Wo
,
N
>
{});
auto
out_khwn_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
K
,
Ho
,
Wo
,
N
>
{});
ostream_ConstantTensorDescriptor
(
out_khwn_desc
,
std
::
cout
<<
"out_khwn_desc: "
);
ostream_ConstantTensorDescriptor
(
out_khwn_desc
,
std
::
cout
<<
"out_khwn_desc: "
);
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
...
...
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
View file @
acd7082f
...
@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
...
@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
// reorder weight
// reorder weight
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
auto
wei_cyxk_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
...
...
driver/driver.hip.cpp
View file @
acd7082f
...
@@ -548,8 +548,8 @@ int main(int argc, char* argv[])
...
@@ -548,8 +548,8 @@ int main(int argc, char* argv[])
auto
lower_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
lower_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
upper_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
upper_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
in_nchw_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
in_nchw_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
wei_kcyx_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
auto
wei_kcyx_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
auto
out_nkhw_desc
=
get_convolution_with_padding_output_default_4d_tensor_descriptor
(
auto
out_nkhw_desc
=
get_convolution_with_padding_output_default_4d_tensor_descriptor
(
in_nchw_desc
,
wei_kcyx_desc
,
lower_pads
,
upper_pads
);
in_nchw_desc
,
wei_kcyx_desc
,
lower_pads
,
upper_pads
);
...
...
src/include/Array.hip.hpp
View file @
acd7082f
...
@@ -16,6 +16,8 @@ struct Array
...
@@ -16,6 +16,8 @@ struct Array
{
{
}
}
__host__
__device__
constexpr
index_t
GetSize
()
const
{
return
NSize
;
}
__host__
__device__
const
TData
&
operator
[](
index_t
i
)
const
{
return
mData
[
i
];
}
__host__
__device__
const
TData
&
operator
[](
index_t
i
)
const
{
return
mData
[
i
];
}
__host__
__device__
TData
&
operator
[](
index_t
i
)
{
return
mData
[
i
];
}
__host__
__device__
TData
&
operator
[](
index_t
i
)
{
return
mData
[
i
];
}
...
@@ -67,6 +69,23 @@ __host__ __device__ auto reorder_array_given_old2new(const Array<TData, NSize>&
...
@@ -67,6 +69,23 @@ __host__ __device__ auto reorder_array_given_old2new(const Array<TData, NSize>&
return
new_array
;
return
new_array
;
}
}
template
<
class
TData
,
index_t
NSize
,
class
ExtractSeq
>
__host__
__device__
auto
extract_array
(
const
Array
<
TData
,
NSize
>&
old_array
,
ExtractSeq
)
{
Array
<
TData
,
ExtractSeq
::
GetSize
()
>
new_array
;
constexpr
index_t
new_size
=
ExtractSeq
::
GetSize
();
static_assert
(
new_size
<=
NSize
,
"wrong! too many extract"
);
static_for
<
0
,
new_size
,
1
>
{}([
&
](
auto
I
)
{
constexpr
index_t
i
=
I
.
Get
();
new_array
[
i
]
=
old_array
[
ExtractSeq
{}.
Get
(
I
)];
});
return
new_array
;
}
template
<
class
TData
,
index_t
NSize
>
template
<
class
TData
,
index_t
NSize
>
__host__
__device__
constexpr
auto
operator
+
(
const
Array
<
TData
,
NSize
>&
a
,
__host__
__device__
constexpr
auto
operator
+
(
const
Array
<
TData
,
NSize
>&
a
,
const
Array
<
TData
,
NSize
>&
b
)
const
Array
<
TData
,
NSize
>&
b
)
...
...
src/include/ConstantMatrixDescriptor.hip.hpp
View file @
acd7082f
...
@@ -21,7 +21,7 @@ struct ConstantMatrixDescriptor
...
@@ -21,7 +21,7 @@ struct ConstantMatrixDescriptor
__host__
__device__
constexpr
index_t
GetElementSpace
()
const
{
return
NRow_
*
RowStride_
;
}
__host__
__device__
constexpr
index_t
GetElementSpace
()
const
{
return
NRow_
*
RowStride_
;
}
__host__
__device__
index_t
Get
1d
Index
(
index_t
irow
,
index_t
icol
)
const
__host__
__device__
index_t
Get
OffsetFromMulti
Index
(
index_t
irow
,
index_t
icol
)
const
{
{
return
irow
*
RowStride_
+
icol
;
return
irow
*
RowStride_
+
icol
;
}
}
...
...
src/include/ConstantMergedTensorDescriptor.hip.hpp
View file @
acd7082f
...
@@ -2,94 +2,118 @@
...
@@ -2,94 +2,118 @@
#include "common.hip.hpp"
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
// TensorDesc: ConstantTensorDescriptor<...>
// OriginalTensorDesc : ConstantTensorDescriptor<...>
// MergedDimRanges: Sequence<FirstMergedDim, LastMergedDim>
// it's the tensor whose dimensions are to be merged
template
<
class
TensorDesc
,
class
...
MergedDimRanges
>
// OriginalDimMergeSeqs : Sequence<...>...
// each is a sequence of original dimensions (of OriginalTensorDesc) to be merged
template
<
class
OriginalTensorDesc
,
class
...
OriginalDimMergeSeqs
>
struct
ConstantMergedTensorDescriptor
struct
ConstantMergedTensorDescriptor
{
{
static
constexpr
index_t
nOriginalDim
=
GetNumOfOriginalDimension
();
static
constexpr
auto
mOriginalDimMergeSeqs
=
std
::
tuple
<
OriginalDimMergeSeqs
...
>
{};
static
constexpr
index_t
nDim
=
GetNumOfDimension
();
static
constexpr
index_t
nDim
=
std
::
tuple_size
<
mOriginalDimMergeSeqs
>::
value
;
static
constexpr
index_t
nOriginalDim
=
OriginalDesc
::
GetNumOfDimension
();
template
<
class
...
Is
>
__host__
__device__
constexpr
ConstantMergedTensorDescriptor
()
__host__
__device__
constexpr
ConstantMergedTensorDescriptor
()
{
{
constexpr
auto
merged_dim_ranges
=
std
::
make_tuple
(
MergedDimRanges
{}...);
static_assert
(
nDim
<=
nOriginalDim
,
"wrong!"
);
static_for
<
0
,
sizeof
...(
MergedDimRanges
),
1
>
{}([
&
](
auto
I
)
{
// TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most
constexpr
index_t
i
=
I
.
Get
();
// OriginalTensorDesc::nDim number of dimensions
constexpr
auto
merged_dim_range
=
std
::
get
<
i
>
(
merged_dim_ranges
);
// TODO: check there is no duplication in OriginalDimMergeSeqs
static_assert
(
merged_dim_range
.
GetSize
()
==
2
,
"wrong! should specify first and last dimension to be merged"
);
// TODO: check OriginalDimMergeSeqs contains all original dimensions
static_assert
(
merged_dim_range
.
Get
(
Number
<
0
>
{})
<
GetNumOfUnmergedDimension
(),
"wrong!"
);
static_assert
(
merged_dim_range
.
Get
(
Number
<
1
>
{})
<
GetNumOfUnmergedDimension
(),
"wrong!"
);
static_assert
(
merged_dim_range
.
Get
(
Number
<
0
>
{})
<=
merged_dim_range
.
Get
(
Number
<
1
>
{}),
"wrong!"
);
});
}
}
__host__
__device__
static
constexpr
index_t
GetNumOfDimension
()
__host__
__device__
static
constexpr
index_t
GetNumOfDimension
()
{
return
nDim
;
}
{
constexpr
auto
merged_dim_ranges
=
std
::
make_tuple
(
MergedDimRanges
...);
__host__
__device__
static
constexpr
index_t
GetNumOfOriginalDimension
()
{
return
nOriginalDim
}
struct
f_calculate_num_of_lost_dim
template
<
index_t
IDim
>
__host__
__device__
static
constexpr
bool
ContainMultipleOriginalDimensions
(
Number
<
IDim
>
)
{
{
__host__
__device__
constexpr
index_t
operator
()(
auto
I
)
const
return
(
std
::
Get
<
IDIM
>
(
mOriginalDimMergeSeqs
).
GetSize
()
>
1
);
}
template
<
index_t
IDim
>
__host__
__device__
static
constexpr
index_t
GetLength
(
Number
<
IDim
>
)
{
{
constexpr
index_t
i
=
I
.
Get
();
constexpr
auto
original_dims_partial
=
std
::
Get
<
IDim
>
(
mOriginalDimMergeSeqs
);
constexpr
auto
merged_dim_range
=
std
::
get
<
i
>
(
merged_dim_ranges
);
return
merged_dim_range
.
Get
(
Number
<
1
>
{})
-
merged_dim_range
.
Get
(
Number
<
0
>
{}
);
return
OriginalTensorDesc
::
Extract
(
original_dims_partial
).
GetElementSize
(
);
}
}
};
constexpr
index_t
num_lost_dim
=
static_const_reduce_n
<
sizeof
...(
MergedDimRanges
)
>
{}(
template
<
index_t
IDim
>
f_calculate_num_of_lost_dim
,
std
::
plus
<
index_t
>
{});
__host__
__device__
static
constexpr
index_t
GetStride
(
Number
<
IDim
>
)
{
static_assert
(
!
ContainMultipleOriginalDimensions
(
Number
<
IDim
>
{}),
"wrong! stride of a merged dimension is undefined"
);
constexpr
auto
idim_original
=
std
::
Get
<
IDim
>
(
mOriginalDimMergeSeqs
).
Front
();
return
TensorDesc
::
Get
NumOfDimension
()
-
num_lost_dim
;
return
Original
TensorDesc
::
Get
Stride
(
Number
<
idim_original
>
{})
;
}
}
__host__
__device__
static
constexpr
index_t
GetNumOfOriginalDimension
()
__host__
__device__
static
constexpr
auto
GetLengths
()
{
{
return
TensorDesc
::
GetNumOfDimension
()
;
return
Sequence
<
OriginalTensorDesc
::
Extract
(
OriginalDimMergeSeqs
).
GetElementSize
()...
>
{}
;
}
}
template
<
index_t
IDim
>
__host__
__device__
static
constexpr
index_t
GetElementSize
()
__host__
__device__
static
constexpr
bool
IsMergedDimension
(
Number
<
IDim
>
)
{
{
// not implemented
return
OriginalTensorDesc
::
GetElementSize
();
}
}
template
<
index_t
IDim
>
__host__
__device__
static
auto
__host__
__device__
static
constexpr
bool
GetLength
(
Number
<
IDim
>
)
GetOriginalMultiIndexFromMultiIndex
(
Array
<
index_t
,
nDim
>
multi_id
)
{
{
// not implemented
Array
<
index_t
,
nOriginalDim
>
original_multi_id
;
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
constexpr
index_t
idim
=
IDim
.
Get
();
constexpr
auto
original_dims_partial
=
std
::
get
<
idim
>
(
mOriginalDimMergeSeqs
);
// get partial original-multi-id corresponding to this merged dimension
constexpr
auto
original_multi_id_partial
=
OriginalTensorDesc
::
Extract
(
original_dims_partial
)
.
GetMultiIndexFrom1dIndex
(
multi_id
[
idim
]);
// make sure compiler unroll this loop and propagate all the constants
for
(
index_t
i
=
0
;
i
<
original_dims_partial
.
GetSize
();
++
i
)
{
index_t
idim_original
=
original_dims_partial
[
i
];
original_multi_id
[
idim_original
]
=
original_multi_id_partial
[
i
]
}
}
});
template
<
index_t
IDim
>
return
original_multi_id
;
__host__
__device__
static
constexpr
bool
GetStride
(
Number
<
IDim
>
)
}
__host__
__device__
static
index_t
GetOffsetFromMultiIndex
(
Array
<
index_t
,
nDim
>
multi_id
)
{
{
static_assert
(
!
IsMergedDimension
(
Number
<
IDim
>
{},
"wrong! stride of a merged dimension is undefined"
)
const
auto
original_multi_id
=
GetOriginalMultiIndexFromMultiIndex
(
multi_id
);
// not implemented
return
OriginalTensorDesc
::
GetOffsetFromMultiIndex
(
orginal_multi_id
);
}
}
template
<
class
...
Is
>
template
<
index_t
...
Is
>
__host__
__device__
auto
MultiIndex2Original
MultiIndex
(
Is
...
is
)
const
__host__
__device__
static
index_t
GetOffsetFrom
MultiIndex
(
Is
...
is
)
{
{
// not implemented
return
GetOffsetFromMultiIndex
(
Array
<
index_t
,
nDim
>
{
is
...});
}
}
template
<
class
...
Is
>
__host__
__device__
static
Array
<
index_t
,
nDim
>
GetMultiIndexFrom1dIndex
(
index_t
id
)
__host__
__device__
auto
OriginalMultiIndex2MultiIndex
(
Is
...
is
)
const
{
{
// not implemented
constexpr
auto
dummy_desc
=
make_packed_ConstantTensorDescriptor
(
GetLengths
());
return
dummy_desc
.
GetMultiIndexFrom1dIndex
(
id
);
}
}
};
};
template
<
class
TensorDesc
,
class
...
MergedDimRange
s
>
template
<
class
Original
TensorDesc
,
class
...
OriginalDimMergeSeq
s
>
constexpr
auto
make_ConstantMergedTensorDescriptor
(
TensorDesc
,
MergedDimRange
s
...)
constexpr
auto
make_ConstantMergedTensorDescriptor
(
Original
TensorDesc
,
OriginalDimMergeSeq
s
...)
{
{
return
ConstantMergedTensorDescriptor
<
TensorDesc
,
MergedDimRange
s
...
>
{};
return
ConstantMergedTensorDescriptor
<
Original
TensorDesc
,
OriginalDimMergeSeq
s
...
>
{};
}
}
src/include/ConstantTensorDescriptor.hip.hpp
View file @
acd7082f
This diff is collapsed.
Click to expand it.
src/include/Sequence.hip.hpp
View file @
acd7082f
...
@@ -9,76 +9,100 @@ struct Sequence
...
@@ -9,76 +9,100 @@ struct Sequence
static
constexpr
index_t
mSize
=
sizeof
...(
Is
);
static
constexpr
index_t
mSize
=
sizeof
...(
Is
);
const
index_t
mData
[
mSize
+
1
]
=
{
Is
...,
0
};
// the last element is dummy, to prevent compiler complain on empty Sequence
__host__
__device__
static
constexpr
index_t
GetSize
()
{
return
mSize
;
}
__host__
__device__
static
constexpr
index_t
GetSize
()
{
return
mSize
;
}
template
<
index_t
I
>
template
<
index_t
I
>
__host__
__device__
constexpr
index_t
Get
(
Number
<
I
>
)
const
__host__
__device__
static
constexpr
index_t
Get
(
Number
<
I
>
)
{
{
static_assert
(
I
<
mSize
,
"wrong! I too large"
);
// the last dummy element is to prevent compiler complain about empty Sequence
const
index_t
mData
[
mSize
+
1
]
=
{
Is
...,
0
};
return
mData
[
I
];
return
mData
[
I
];
}
}
__host__
__device__
index_t
operator
[](
index_t
i
)
const
{
return
mData
[
i
];
}
__host__
__device__
index_t
operator
[](
index_t
i
)
const
{
const
index_t
mData
[
mSize
+
1
]
=
{
Is
...,
0
};
return
mData
[
i
];
}
template
<
index_t
...
IRs
>
template
<
index_t
...
IRs
>
__host__
__device__
constexpr
auto
ReorderGivenNew2Old
(
Sequence
<
IRs
...
>
/*new2old*/
)
const
__host__
__device__
static
constexpr
auto
ReorderGivenNew2Old
(
Sequence
<
IRs
...
>
/*new2old*/
)
{
{
static_assert
(
mSize
==
sizeof
...(
IRs
),
"mSize not consistent"
);
#if 0 // require sequence_sort, which is not implemented yet
static_assert(is_same<sequence_sort<Sequence<IRs...>>::SortedSeqType,
constexpr
auto
old
=
Type
{};
arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value,
"wrong! invalid new2old map");
#endif
return
Sequence
<
old
.
Get
(
Number
<
IRs
>
{})...
>
{};
return
Sequence
<
Type
{}
.
Get
(
Number
<
IRs
>
{})...
>
{};
}
}
template
<
index_t
...
IRs
>
#if 0 // require sequence_sort, which is not implemented yet
__host__
__device__
constexpr
auto
ReorderGivenOld2New
(
Sequence
<
IRs
...
>
/*old2new*/
)
const
template <class MapOld2New>
__host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New /*old2new*/)
{
{
// TODO: don't know how to implement this
static_assert(is_same<sequence_sort<MapOld2New>::SortedSeqType,
printf
(
"Sequence::ReorderGivenOld2New not implemented"
);
arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value,
assert
(
false
);
"wrong! invalid old2new map");
constexpr auto map_new2old = typename sequence_map_inverse<MapOld2New>::SeqMapType{};
return ReorderGivenNew2Old(map_new2old);
}
}
#endif
__host__
__device__
constexpr
auto
Reverse
()
const
;
__host__
__device__
static
constexpr
auto
Reverse
();
__host__
__device__
constexpr
index_t
Front
()
const
{
return
mData
[
0
];
}
__host__
__device__
static
constexpr
index_t
Front
()
{
const
index_t
mData
[
mSize
+
1
]
=
{
Is
...,
0
};
return
mData
[
0
];
}
__host__
__device__
constexpr
index_t
Back
()
const
{
return
mData
[
mSize
-
1
];
}
__host__
__device__
static
constexpr
index_t
Back
()
{
const
index_t
mData
[
mSize
+
1
]
=
{
Is
...,
0
};
return
mData
[
mSize
-
1
];
}
template
<
index_t
I
>
template
<
index_t
I
>
__host__
__device__
constexpr
auto
PushFront
(
Number
<
I
>
)
const
__host__
__device__
static
constexpr
auto
PushFront
(
Number
<
I
>
)
{
{
return
Sequence
<
I
,
Is
...
>
{};
return
Sequence
<
I
,
Is
...
>
{};
}
}
template
<
index_t
I
>
template
<
index_t
I
>
__host__
__device__
constexpr
auto
PushBack
(
Number
<
I
>
)
const
__host__
__device__
static
constexpr
auto
PushBack
(
Number
<
I
>
)
{
{
return
Sequence
<
Is
...,
I
>
{};
return
Sequence
<
Is
...,
I
>
{};
}
}
__host__
__device__
constexpr
auto
PopFront
()
const
;
__host__
__device__
static
constexpr
auto
PopFront
();
__host__
__device__
constexpr
auto
PopBack
()
const
;
__host__
__device__
static
constexpr
auto
PopBack
();
template
<
index_t
...
Xs
>
template
<
index_t
...
Xs
>
__host__
__device__
constexpr
auto
Append
(
Sequence
<
Xs
...
>
)
const
__host__
__device__
static
constexpr
auto
Append
(
Sequence
<
Xs
...
>
)
{
{
return
Sequence
<
Is
...,
Xs
...
>
{};
return
Sequence
<
Is
...,
Xs
...
>
{};
}
}
template
<
index_t
...
Ns
>
template
<
index_t
...
Ns
>
__host__
__device__
constexpr
auto
Extract
(
Number
<
Ns
>
...)
const
__host__
__device__
static
constexpr
auto
Extract
(
Number
<
Ns
>
...)
{
{
return
Sequence
<
Type
{}.
Get
(
Number
<
Ns
>
{})...
>
{};
return
Sequence
<
Type
{}.
Get
(
Number
<
Ns
>
{})...
>
{};
}
}
template
<
index_t
...
Ns
>
template
<
index_t
...
Ns
>
__host__
__device__
constexpr
auto
Extract
(
Sequence
<
Ns
...
>
)
const
__host__
__device__
static
constexpr
auto
Extract
(
Sequence
<
Ns
...
>
)
{
{
return
Sequence
<
Type
{}.
Get
(
Number
<
Ns
>
{})...
>
{};
return
Sequence
<
Type
{}.
Get
(
Number
<
Ns
>
{})...
>
{};
}
}
template
<
index_t
I
,
index_t
X
>
__host__
__device__
static
constexpr
auto
Modify
(
Number
<
I
>
,
Number
<
X
>
);
};
};
template
<
class
,
class
>
template
<
class
,
class
>
...
@@ -91,43 +115,36 @@ struct sequence_merge<Sequence<Xs...>, Sequence<Ys...>>
...
@@ -91,43 +115,36 @@ struct sequence_merge<Sequence<Xs...>, Sequence<Ys...>>
};
};
template
<
index_t
IBegin
,
index_t
NSize
,
index_t
Increment
>
template
<
index_t
IBegin
,
index_t
NSize
,
index_t
Increment
>
struct
increasing
_sequence_gen_impl
struct
arithmetic
_sequence_gen_impl
{
{
static
constexpr
index_t
NSizeLeft
=
NSize
/
2
;
static
constexpr
index_t
NSizeLeft
=
NSize
/
2
;
using
SeqType
=
typename
sequence_merge
<
using
SeqType
=
typename
sequence_merge
<
typename
increasing
_sequence_gen_impl
<
IBegin
,
NSizeLeft
,
Increment
>::
SeqType
,
typename
arithmetic
_sequence_gen_impl
<
IBegin
,
NSizeLeft
,
Increment
>::
SeqType
,
typename
increasing
_sequence_gen_impl
<
IBegin
+
NSizeLeft
*
Increment
,
typename
arithmetic
_sequence_gen_impl
<
IBegin
+
NSizeLeft
*
Increment
,
NSize
-
NSizeLeft
,
NSize
-
NSizeLeft
,
Increment
>::
SeqType
>::
SeqType
;
Increment
>::
SeqType
>::
SeqType
;
};
};
template
<
index_t
IBegin
,
index_t
Increment
>
template
<
index_t
IBegin
,
index_t
Increment
>
struct
increasing
_sequence_gen_impl
<
IBegin
,
1
,
Increment
>
struct
arithmetic
_sequence_gen_impl
<
IBegin
,
1
,
Increment
>
{
{
using
SeqType
=
Sequence
<
IBegin
>
;
using
SeqType
=
Sequence
<
IBegin
>
;
};
};
template
<
index_t
IBegin
,
index_t
Increment
>
template
<
index_t
IBegin
,
index_t
Increment
>
struct
increasing
_sequence_gen_impl
<
IBegin
,
0
,
Increment
>
struct
arithmetic
_sequence_gen_impl
<
IBegin
,
0
,
Increment
>
{
{
using
SeqType
=
Sequence
<>
;
using
SeqType
=
Sequence
<>
;
};
};
template
<
index_t
IBegin
,
index_t
IEnd
,
index_t
Increment
>
template
<
index_t
IBegin
,
index_t
IEnd
,
index_t
Increment
>
struct
increasing
_sequence_gen
struct
arithmetic
_sequence_gen
{
{
using
SeqType
=
using
SeqType
=
typename
increasing
_sequence_gen_impl
<
IBegin
,
IEnd
-
IBegin
,
Increment
>::
SeqType
;
typename
arithmetic
_sequence_gen_impl
<
IBegin
,
IEnd
-
IBegin
,
Increment
>::
SeqType
;
};
};
template
<
index_t
IBegin
,
index_t
IEnd
,
index_t
Increment
>
__host__
__device__
constexpr
auto
make_increasing_sequence
(
Number
<
IBegin
>
,
Number
<
IEnd
>
,
Number
<
Increment
>
)
{
return
typename
increasing_sequence_gen
<
IBegin
,
IEnd
,
Increment
>::
SeqType
{};
}
template
<
class
,
class
>
template
<
class
,
class
>
struct
sequence_reverse_inclusive_scan
;
struct
sequence_reverse_inclusive_scan
;
...
@@ -161,8 +178,8 @@ struct sequence_split
...
@@ -161,8 +178,8 @@ struct sequence_split
{
{
static
constexpr
index_t
NSize
=
Seq
{}.
GetSize
();
static
constexpr
index_t
NSize
=
Seq
{}.
GetSize
();
using
range0
=
typename
increasing
_sequence_gen
<
0
,
I
,
1
>::
SeqType
;
using
range0
=
typename
arithmetic
_sequence_gen
<
0
,
I
,
1
>::
SeqType
;
using
range1
=
typename
increasing
_sequence_gen
<
I
,
NSize
,
1
>::
SeqType
;
using
range1
=
typename
arithmetic
_sequence_gen
<
I
,
NSize
,
1
>::
SeqType
;
using
SeqType0
=
typename
sequence_extract
<
Seq
,
range0
>::
SeqType
;
using
SeqType0
=
typename
sequence_extract
<
Seq
,
range0
>::
SeqType
;
using
SeqType1
=
typename
sequence_extract
<
Seq
,
range1
>::
SeqType
;
using
SeqType1
=
typename
sequence_extract
<
Seq
,
range1
>::
SeqType
;
...
@@ -191,6 +208,63 @@ struct sequence_reverse<Sequence<I0, I1>>
...
@@ -191,6 +208,63 @@ struct sequence_reverse<Sequence<I0, I1>>
using
SeqType
=
Sequence
<
I1
,
I0
>
;
using
SeqType
=
Sequence
<
I1
,
I0
>
;
};
};
#if 0 // not fully implemented
template <class KeySeq0, class ValSeq0, class KeySeq1, class ValSeq1>
struct sequence_sort_merge_impl;
template <index_t Key0,
index_t... Keys0,
index_t Val0,
index_t... Vals0,
index_t Key1,
index_t... Keys1,
index_t Val0,
index_t... Vals1>
struct sequence_sort_merge_impl<Sequence<Key0, Keys0...>,
Sequence<Val0, Vals0...>,
Sequence<Key1, Keys1...>,
Sequence<Val1, Vals1...>>
{
};
template <class>
struct sequence_sort;
template <index_t... Is>
struct sequence_sort<Sequence<Is...>>
{
using OriginalSeqType = Sequence<Is...>;
using SortedSeqType = xxxxx;
using MapSorted2OriginalType = xxx;
};
template <class Seq, class IsValidSeqMap>
struct sequence_map_inverse_impl;
// impl for valid map, no impl for invalid map
template <index_t... Is>
struct sequence_map_inverse_impl<Sequence<Is...>, true>
{
using SeqMapType = sequence_sort<Sequence<Is...>>::MapSorted2OriginalType;
};
template <class>
struct sequence_map_inverse;
template <class Is...>
struct sequence_map_inverse<Sequence<Is...>>
{
// TODO: make sure the map to be inversed is valid: [0, sizeof...(Is))
static constexpr bool is_valid_sequence_map =
is_same<typename sequence_sort<Sequence<Is...>>::SortedSeqType,
typename arithmetic_sequence_gen<0, sizeof...(Is), 1>::SeqType>::value;
// make compiler fails, if is_valid_map != true
using SeqMapType =
typename sequence_map_inverse_impl<Sequence<Is...>, is_valid_map>::SeqMapType;
};
#endif
template
<
index_t
...
Xs
,
index_t
...
Ys
>
template
<
index_t
...
Xs
,
index_t
...
Ys
>
__host__
__device__
constexpr
auto
operator
+
(
Sequence
<
Xs
...
>
,
Sequence
<
Ys
...
>
)
__host__
__device__
constexpr
auto
operator
+
(
Sequence
<
Xs
...
>
,
Sequence
<
Ys
...
>
)
{
{
...
@@ -243,7 +317,7 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Number<Y>)
...
@@ -243,7 +317,7 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Number<Y>)
template
<
index_t
...
Xs
,
index_t
Y
>
template
<
index_t
...
Xs
,
index_t
Y
>
__host__
__device__
constexpr
auto
operator
-
(
Sequence
<
Xs
...
>
,
Number
<
Y
>
)
__host__
__device__
constexpr
auto
operator
-
(
Sequence
<
Xs
...
>
,
Number
<
Y
>
)
{
{
#if 0 //
d
oesn't compile
#if 0 //
TODO: turn it on. D
oesn't compile
constexpr auto seq_x = Sequence<Xs...>{};
constexpr auto seq_x = Sequence<Xs...>{};
static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) {
static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) {
...
@@ -313,14 +387,13 @@ __host__ __device__ constexpr auto operator%(Number<Y>, Sequence<Xs...>)
...
@@ -313,14 +387,13 @@ __host__ __device__ constexpr auto operator%(Number<Y>, Sequence<Xs...>)
template
<
index_t
I
,
index_t
...
Is
>
template
<
index_t
I
,
index_t
...
Is
>
__host__
__device__
constexpr
auto
sequence_pop_front
(
Sequence
<
I
,
Is
...
>
)
__host__
__device__
constexpr
auto
sequence_pop_front
(
Sequence
<
I
,
Is
...
>
)
{
{
static_assert
(
sizeof
...(
Is
)
>
0
,
"empty Sequence!"
);
return
Sequence
<
Is
...
>
{};
return
Sequence
<
Is
...
>
{};
}
}
template
<
class
Seq
>
template
<
class
Seq
>
__host__
__device__
constexpr
auto
sequence_pop_back
(
Seq
)
__host__
__device__
constexpr
auto
sequence_pop_back
(
Seq
)
{
{
static_assert
(
Seq
{}.
GetSize
()
>
0
,
"empty Sequence!"
);
static_assert
(
Seq
{}.
GetSize
()
>
0
,
"
wrong! cannot pop an
empty Sequence!"
);
return
sequence_pop_front
(
Seq
{}.
Reverse
()).
Reverse
();
return
sequence_pop_front
(
Seq
{}.
Reverse
()).
Reverse
();
}
}
...
@@ -349,16 +422,16 @@ transform_sequences(F f, Sequence<Xs...>, Sequence<Ys...>, Sequence<Zs...>)
...
@@ -349,16 +422,16 @@ transform_sequences(F f, Sequence<Xs...>, Sequence<Ys...>, Sequence<Zs...>)
return
Sequence
<
f
(
Xs
,
Ys
,
Zs
)...
>
{};
return
Sequence
<
f
(
Xs
,
Ys
,
Zs
)...
>
{};
}
}
template
<
index_t
...
Is
>
template
<
class
Seq
,
class
Reduce
>
__host__
__device__
constexpr
auto
Sequence
<
Is
...
>::
PopFront
()
const
__host__
__device__
constexpr
auto
reverse_inclusive_scan_sequence
(
Seq
,
Reduce
)
{
{
return
sequence_pop_front
(
Type
{}
)
;
return
typename
sequence_reverse_inclusive_scan
<
Seq
,
Reduce
>::
Seq
Type
{};
}
}
template
<
index_t
...
Is
>
template
<
class
Seq
,
class
Reduce
>
__host__
__device__
constexpr
auto
Sequence
<
Is
...
>::
PopBack
()
const
__host__
__device__
constexpr
auto
inclusive_scan_sequence
(
Seq
,
Reduce
)
{
{
return
sequence_pop_back
(
Type
{}
);
return
reverse_inclusive_scan_sequence
(
Seq
{}.
Reverse
(),
Reduce
{}).
Reverse
(
);
}
}
template
<
class
Seq
>
template
<
class
Seq
>
...
@@ -381,19 +454,32 @@ __host__ __device__ constexpr index_t
...
@@ -381,19 +454,32 @@ __host__ __device__ constexpr index_t
}
}
template
<
index_t
...
Is
>
template
<
index_t
...
Is
>
__host__
__device__
constexpr
auto
Sequence
<
Is
...
>::
Reverse
()
const
__host__
__device__
constexpr
auto
Sequence
<
Is
...
>::
PopFront
()
{
{
return
typename
sequence_reverse
<
Sequence
<
Is
...
>>::
Seq
Type
{};
return
sequence_pop_front
(
Type
{}
)
;
}
}
template
<
class
Seq
,
class
Reduce
>
template
<
index_t
...
Is
>
__host__
__device__
constexpr
auto
reverse_inclusive_scan_sequence
(
Seq
,
Reduce
)
__host__
__device__
constexpr
auto
Sequence
<
Is
...
>::
PopBack
(
)
{
{
return
typename
sequence_reverse_inclusive_scan
<
Seq
,
Reduce
>::
Seq
Type
{};
return
sequence_pop_back
(
Type
{}
)
;
}
}
template
<
class
Seq
,
class
Reduce
>
template
<
index_t
...
Is
>
__host__
__device__
constexpr
auto
inclusive_scan_sequence
(
Seq
,
Reduce
)
__host__
__device__
constexpr
auto
Sequence
<
Is
...
>::
Reverse
(
)
{
{
return
reverse_inclusive_scan_sequence
(
Seq
{}.
Reverse
(),
Reduce
{}).
Reverse
();
return
typename
sequence_reverse
<
Sequence
<
Is
...
>>::
SeqType
{};
}
template
<
index_t
...
Is
>
template
<
index_t
I
,
index_t
X
>
__host__
__device__
constexpr
auto
Sequence
<
Is
...
>::
Modify
(
Number
<
I
>
,
Number
<
X
>
)
{
static_assert
(
I
<
GetSize
(),
"wrong!"
);
using
seq_split
=
sequence_split
<
Type
,
I
>
;
constexpr
auto
seq_left
=
typename
seq_split
::
SeqType0
{};
constexpr
auto
seq_right
=
typename
seq_split
::
SeqType1
{}.
PopFront
();
return
seq_left
.
PushBack
(
Number
<
X
>
{}).
Append
(
seq_right
);
}
}
src/include/blockwise_2d_tensor_op.hip.hpp
View file @
acd7082f
...
@@ -33,7 +33,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
...
@@ -33,7 +33,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
const
index_t
did1
=
is
/
desc
.
GetStride
(
I1
);
const
index_t
did1
=
is
/
desc
.
GetStride
(
I1
);
const
index_t
dindex
=
dst_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
f
(
p_dst
[
dindex
]);
f
(
p_dst
[
dindex
]);
}
}
...
@@ -52,7 +52,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
...
@@ -52,7 +52,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
const
index_t
did1
=
is
/
desc
.
GetStride
(
I1
);
const
index_t
did1
=
is
/
desc
.
GetStride
(
I1
);
const
index_t
dindex
=
dst_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
f
(
p_dst
[
dindex
]);
f
(
p_dst
[
dindex
]);
}
}
...
@@ -102,9 +102,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
...
@@ -102,9 +102,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
did
[
1
]
=
is
/
ref_desc
.
GetStride
(
I1
);
did
[
1
]
=
is
/
ref_desc
.
GetStride
(
I1
);
const
index_t
aindex
=
src_desc
.
Get
1d
Index
(
did
[
0
],
did
[
1
]);
const
index_t
aindex
=
src_desc
.
Get
OffsetFromMulti
Index
(
did
[
0
],
did
[
1
]);
const
index_t
bindex
=
dst_desc
.
Get
1d
Index
(
did
[
IR0
],
did
[
IR1
]);
const
index_t
bindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did
[
IR0
],
did
[
IR1
]);
f
(
p_src
[
aindex
],
p_dst
[
bindex
]);
f
(
p_src
[
aindex
],
p_dst
[
bindex
]);
}
}
...
@@ -125,9 +125,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
...
@@ -125,9 +125,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
did
[
1
]
=
is
/
ref_desc
.
GetStride
(
I1
);
did
[
1
]
=
is
/
ref_desc
.
GetStride
(
I1
);
const
index_t
aindex
=
src_desc
.
Get
1d
Index
(
did
[
0
],
did
[
1
]);
const
index_t
aindex
=
src_desc
.
Get
OffsetFromMulti
Index
(
did
[
0
],
did
[
1
]);
const
index_t
bindex
=
dst_desc
.
Get
1d
Index
(
did
[
IR0
],
did
[
IR1
]);
const
index_t
bindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did
[
IR0
],
did
[
IR1
]);
f
(
p_src
[
aindex
],
p_dst
[
bindex
]);
f
(
p_src
[
aindex
],
p_dst
[
bindex
]);
}
}
...
@@ -224,8 +224,10 @@ struct Blockwise2dTensorCopy1
...
@@ -224,8 +224,10 @@ struct Blockwise2dTensorCopy1
did
[
1
]
=
is
/
ref_desc
.
GetStride
(
I1
);
did
[
1
]
=
is
/
ref_desc
.
GetStride
(
I1
);
const
index_t
src_index
=
src_desc
.
Get1dIndex
(
did
[
0
],
did
[
1
]
*
DataPerRead
);
const
index_t
src_index
=
const
index_t
dst_index
=
dst_desc
.
Get1dIndex
(
did
[
0
],
did
[
1
]
*
DataPerRead
);
src_desc
.
GetOffsetFromMultiIndex
(
did
[
0
],
did
[
1
]
*
DataPerRead
);
const
index_t
dst_index
=
dst_desc
.
GetOffsetFromMultiIndex
(
did
[
0
],
did
[
1
]
*
DataPerRead
);
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
dst_index
))
=
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
dst_index
))
=
*
(
reinterpret_cast
<
const
vector_t
*>
(
p_src
+
src_index
));
*
(
reinterpret_cast
<
const
vector_t
*>
(
p_src
+
src_index
));
...
@@ -328,8 +330,8 @@ struct Blockwise2dTensorCopy2
...
@@ -328,8 +330,8 @@ struct Blockwise2dTensorCopy2
{
{
index_t
did1
=
d1v4loop
*
4
*
ThreadPerDim1
+
4
*
mThreadId1
;
index_t
did1
=
d1v4loop
*
4
*
ThreadPerDim1
+
4
*
mThreadId1
;
const
index_t
sindex
=
src_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
sindex
=
src_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
*
(
reinterpret_cast
<
Float4
*>
(
p_dst
+
dindex
))
=
*
(
reinterpret_cast
<
Float4
*>
(
p_dst
+
dindex
))
=
*
(
reinterpret_cast
<
const
Float4
*>
(
p_src
+
sindex
));
*
(
reinterpret_cast
<
const
Float4
*>
(
p_src
+
sindex
));
...
@@ -341,8 +343,8 @@ struct Blockwise2dTensorCopy2
...
@@ -341,8 +343,8 @@ struct Blockwise2dTensorCopy2
index_t
did1
=
index_t
did1
=
Dim1V4Loop
*
4
*
ThreadPerDim1
+
d1v2loop
*
2
*
ThreadPerDim1
+
2
*
mThreadId1
;
Dim1V4Loop
*
4
*
ThreadPerDim1
+
d1v2loop
*
2
*
ThreadPerDim1
+
2
*
mThreadId1
;
const
index_t
sindex
=
src_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
sindex
=
src_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
*
(
reinterpret_cast
<
Float2
*>
(
p_dst
+
dindex
))
=
*
(
reinterpret_cast
<
Float2
*>
(
p_dst
+
dindex
))
=
*
(
reinterpret_cast
<
const
Float2
*>
(
p_src
+
sindex
));
*
(
reinterpret_cast
<
const
Float2
*>
(
p_src
+
sindex
));
...
@@ -354,8 +356,8 @@ struct Blockwise2dTensorCopy2
...
@@ -354,8 +356,8 @@ struct Blockwise2dTensorCopy2
index_t
did1
=
Dim1V4Loop
*
4
*
ThreadPerDim1
+
Dim1V2Loop
*
2
*
ThreadPerDim1
+
index_t
did1
=
Dim1V4Loop
*
4
*
ThreadPerDim1
+
Dim1V2Loop
*
2
*
ThreadPerDim1
+
d1v1loop
*
ThreadPerDim1
+
mThreadId1
;
d1v1loop
*
ThreadPerDim1
+
mThreadId1
;
const
index_t
sindex
=
src_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
sindex
=
src_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
p_dst
[
dindex
]
=
p_src
[
sindex
];
p_dst
[
dindex
]
=
p_src
[
sindex
];
}
}
...
@@ -368,8 +370,8 @@ struct Blockwise2dTensorCopy2
...
@@ -368,8 +370,8 @@ struct Blockwise2dTensorCopy2
if
(
did1
<
L1
)
if
(
did1
<
L1
)
{
{
const
index_t
sindex
=
src_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
sindex
=
src_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
p_dst
[
dindex
]
=
p_src
[
sindex
];
p_dst
[
dindex
]
=
p_src
[
sindex
];
}
}
...
@@ -389,8 +391,8 @@ struct Blockwise2dTensorCopy2
...
@@ -389,8 +391,8 @@ struct Blockwise2dTensorCopy2
{
{
index_t
did1
=
d1v4loop
*
4
*
ThreadPerDim1
+
4
*
mThreadId1
;
index_t
did1
=
d1v4loop
*
4
*
ThreadPerDim1
+
4
*
mThreadId1
;
const
index_t
sindex
=
src_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
sindex
=
src_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
*
(
reinterpret_cast
<
Float4
*>
(
p_dst
+
dindex
))
=
*
(
reinterpret_cast
<
Float4
*>
(
p_dst
+
dindex
))
=
*
(
reinterpret_cast
<
const
Float4
*>
(
p_src
+
sindex
));
*
(
reinterpret_cast
<
const
Float4
*>
(
p_src
+
sindex
));
...
@@ -402,8 +404,8 @@ struct Blockwise2dTensorCopy2
...
@@ -402,8 +404,8 @@ struct Blockwise2dTensorCopy2
index_t
did1
=
Dim1V4Loop
*
4
*
ThreadPerDim1
+
d1v2loop
*
2
*
ThreadPerDim1
+
index_t
did1
=
Dim1V4Loop
*
4
*
ThreadPerDim1
+
d1v2loop
*
2
*
ThreadPerDim1
+
2
*
mThreadId1
;
2
*
mThreadId1
;
const
index_t
sindex
=
src_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
sindex
=
src_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
*
(
reinterpret_cast
<
Float2
*>
(
p_dst
+
dindex
))
=
*
(
reinterpret_cast
<
Float2
*>
(
p_dst
+
dindex
))
=
*
(
reinterpret_cast
<
const
Float2
*>
(
p_src
+
sindex
));
*
(
reinterpret_cast
<
const
Float2
*>
(
p_src
+
sindex
));
...
@@ -415,8 +417,8 @@ struct Blockwise2dTensorCopy2
...
@@ -415,8 +417,8 @@ struct Blockwise2dTensorCopy2
index_t
did1
=
Dim1V4Loop
*
4
*
ThreadPerDim1
+
Dim1V2Loop
*
2
*
ThreadPerDim1
+
index_t
did1
=
Dim1V4Loop
*
4
*
ThreadPerDim1
+
Dim1V2Loop
*
2
*
ThreadPerDim1
+
d1v1loop
*
ThreadPerDim1
+
mThreadId1
;
d1v1loop
*
ThreadPerDim1
+
mThreadId1
;
const
index_t
sindex
=
src_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
sindex
=
src_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
p_dst
[
dindex
]
=
p_src
[
sindex
];
p_dst
[
dindex
]
=
p_src
[
sindex
];
}
}
...
@@ -429,8 +431,8 @@ struct Blockwise2dTensorCopy2
...
@@ -429,8 +431,8 @@ struct Blockwise2dTensorCopy2
if
(
did1
<
L1
)
if
(
did1
<
L1
)
{
{
const
index_t
sindex
=
src_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
sindex
=
src_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
dindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
p_dst
[
dindex
]
=
p_src
[
sindex
];
p_dst
[
dindex
]
=
p_src
[
sindex
];
}
}
...
@@ -497,8 +499,10 @@ struct Blockwise2dTensorCopy3
...
@@ -497,8 +499,10 @@ struct Blockwise2dTensorCopy3
const
index_t
thread_id_d0
=
get_thread_local_1d_id
()
/
thread_per_d1
;
const
index_t
thread_id_d0
=
get_thread_local_1d_id
()
/
thread_per_d1
;
const
index_t
thread_id_d1
=
get_thread_local_1d_id
()
-
thread_id_d0
*
thread_per_d1
;
const
index_t
thread_id_d1
=
get_thread_local_1d_id
()
-
thread_id_d0
*
thread_per_d1
;
mSrcMyThreadOffset
=
SrcDesc
{}.
Get1dIndex
(
thread_id_d0
,
thread_id_d1
*
DataPerRead
);
mSrcMyThreadOffset
=
mDstMyThreadOffset
=
DstDesc
{}.
Get1dIndex
(
thread_id_d0
,
thread_id_d1
*
DataPerRead
);
SrcDesc
{}.
GetOffsetFromMultiIndex
(
thread_id_d0
,
thread_id_d1
*
DataPerRead
);
mDstMyThreadOffset
=
DstDesc
{}.
GetOffsetFromMultiIndex
(
thread_id_d0
,
thread_id_d1
*
DataPerRead
);
}
}
__device__
void
Run
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
__device__
void
Run
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
...
...
src/include/blockwise_3d_tensor_op.hip.hpp
View file @
acd7082f
...
@@ -71,8 +71,10 @@ struct Blockwise3dTensorCopy1
...
@@ -71,8 +71,10 @@ struct Blockwise3dTensorCopy1
did
[
2
]
=
is
/
ref_desc
.
GetStride
(
I2
);
did
[
2
]
=
is
/
ref_desc
.
GetStride
(
I2
);
const
index_t
src_index
=
src_desc
.
Get1dIndex
(
did
[
0
],
did
[
1
],
did
[
2
]
*
DataPerRead
);
const
index_t
src_index
=
const
index_t
dst_index
=
dst_desc
.
Get1dIndex
(
did
[
0
],
did
[
1
],
did
[
2
]
*
DataPerRead
);
src_desc
.
GetOffsetFromMultiIndex
(
did
[
0
],
did
[
1
],
did
[
2
]
*
DataPerRead
);
const
index_t
dst_index
=
dst_desc
.
GetOffsetFromMultiIndex
(
did
[
0
],
did
[
1
],
did
[
2
]
*
DataPerRead
);
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
dst_index
))
=
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
dst_index
))
=
*
(
reinterpret_cast
<
const
vector_t
*>
(
p_src
+
src_index
));
*
(
reinterpret_cast
<
const
vector_t
*>
(
p_src
+
src_index
));
...
@@ -167,12 +169,13 @@ struct Blockwise3dTensorCopy3
...
@@ -167,12 +169,13 @@ struct Blockwise3dTensorCopy3
}
}
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor
(
ThreadPerDims
{});
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor
(
ThreadPerDims
{});
const
auto
thread_multi_id
=
thread_cluster_desc
.
GetMultiIndex
(
get_thread_local_1d_id
());
const
auto
thread_multi_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
mSrcMyThreadOffset
=
SrcDesc
{}.
Get
1d
Index
(
mSrcMyThreadOffset
=
SrcDesc
{}.
Get
OffsetFromMulti
Index
(
thread_multi_id
[
0
],
thread_multi_id
[
1
],
thread_multi_id
[
2
]
*
DataPerRead
);
thread_multi_id
[
0
],
thread_multi_id
[
1
],
thread_multi_id
[
2
]
*
DataPerRead
);
mDstMyThreadOffset
=
DstDesc
{}.
Get
1d
Index
(
mDstMyThreadOffset
=
DstDesc
{}.
Get
OffsetFromMulti
Index
(
thread_multi_id
[
0
],
thread_multi_id
[
1
],
thread_multi_id
[
2
]
*
DataPerRead
);
thread_multi_id
[
0
],
thread_multi_id
[
1
],
thread_multi_id
[
2
]
*
DataPerRead
);
}
}
...
@@ -214,12 +217,12 @@ struct Blockwise3dTensorCopy3
...
@@ -214,12 +217,12 @@ struct Blockwise3dTensorCopy3
for
(
index_t
iloop_d2
=
0
;
iloop_d2
<
nloop_d2
;
++
iloop_d2
)
for
(
index_t
iloop_d2
=
0
;
iloop_d2
<
nloop_d2
;
++
iloop_d2
)
{
{
const
index_t
src_offset
=
const
index_t
src_offset
=
SrcDesc
{}.
Get
1d
Index
(
iloop_d0
*
thread_per_d0
,
SrcDesc
{}.
Get
OffsetFromMulti
Index
(
iloop_d0
*
thread_per_d0
,
iloop_d1
*
thread_per_d1
,
iloop_d1
*
thread_per_d1
,
iloop_d2
*
thread_per_d2
*
DataPerRead
);
iloop_d2
*
thread_per_d2
*
DataPerRead
);
const
index_t
dst_offset
=
const
index_t
dst_offset
=
DstDesc
{}.
Get
1d
Index
(
iloop_d0
*
thread_per_d0
,
DstDesc
{}.
Get
OffsetFromMulti
Index
(
iloop_d0
*
thread_per_d0
,
iloop_d1
*
thread_per_d1
,
iloop_d1
*
thread_per_d1
,
iloop_d2
*
thread_per_d2
*
DataPerRead
);
iloop_d2
*
thread_per_d2
*
DataPerRead
);
...
@@ -295,12 +298,12 @@ struct Blockwise3dTensorCopy3
...
@@ -295,12 +298,12 @@ struct Blockwise3dTensorCopy3
for
(
index_t
iloop_d2
=
0
;
iloop_d2
<
nloop_d2
;
++
iloop_d2
)
for
(
index_t
iloop_d2
=
0
;
iloop_d2
<
nloop_d2
;
++
iloop_d2
)
{
{
const
index_t
src_offset
=
const
index_t
src_offset
=
SrcDesc
{}.
Get
1d
Index
(
iloop_d0
*
thread_per_d0
,
SrcDesc
{}.
Get
OffsetFromMulti
Index
(
iloop_d0
*
thread_per_d0
,
iloop_d1
*
thread_per_d1
,
iloop_d1
*
thread_per_d1
,
iloop_d2
*
thread_per_d2
*
DataPerRead
);
iloop_d2
*
thread_per_d2
*
DataPerRead
);
const
index_t
clipboard_offset
=
const
index_t
clipboard_offset
=
clipboard_desc
.
GetOffsetFromMultiIndex
(
clipboard_desc
.
Get1dIndex
(
iloop_d0
,
iloop_d1
,
iloop_d2
*
DataPerRead
);
iloop_d0
,
iloop_d1
,
iloop_d2
*
DataPerRead
);
*
(
reinterpret_cast
<
vector_t
*>
(
&
p_clipboard
[
clipboard_offset
]))
=
*
(
*
(
reinterpret_cast
<
vector_t
*>
(
&
p_clipboard
[
clipboard_offset
]))
=
*
(
reinterpret_cast
<
const
vector_t
*>
(
&
p_src
[
src_offset
+
mSrcMyThreadOffset
]));
reinterpret_cast
<
const
vector_t
*>
(
&
p_src
[
src_offset
+
mSrcMyThreadOffset
]));
...
@@ -350,11 +353,11 @@ struct Blockwise3dTensorCopy3
...
@@ -350,11 +353,11 @@ struct Blockwise3dTensorCopy3
#pragma unroll
#pragma unroll
for
(
index_t
iloop_d2
=
0
;
iloop_d2
<
nloop_d2
;
++
iloop_d2
)
for
(
index_t
iloop_d2
=
0
;
iloop_d2
<
nloop_d2
;
++
iloop_d2
)
{
{
const
index_t
clipboard_offset
=
const
index_t
clipboard_offset
=
clipboard_desc
.
GetOffsetFromMultiIndex
(
clipboard_desc
.
Get1dIndex
(
iloop_d0
,
iloop_d1
,
iloop_d2
*
DataPerRead
);
iloop_d0
,
iloop_d1
,
iloop_d2
*
DataPerRead
);
const
index_t
dst_offset
=
const
index_t
dst_offset
=
DstDesc
{}.
Get
1d
Index
(
iloop_d0
*
thread_per_d0
,
DstDesc
{}.
Get
OffsetFromMulti
Index
(
iloop_d0
*
thread_per_d0
,
iloop_d1
*
thread_per_d1
,
iloop_d1
*
thread_per_d1
,
iloop_d2
*
thread_per_d2
*
DataPerRead
);
iloop_d2
*
thread_per_d2
*
DataPerRead
);
...
...
src/include/blockwise_4d_tensor_op.hip.hpp
View file @
acd7082f
...
@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
...
@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
desc
=
make_ConstantTensorDescriptor
(
dst_desc
.
GetLengths
());
constexpr
auto
desc
=
make_
packed_
ConstantTensorDescriptor
(
dst_desc
.
GetLengths
());
#if 0
#if 0
if(get_thread_local_1d_id() == 0)
if(get_thread_local_1d_id() == 0)
...
@@ -43,7 +43,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
...
@@ -43,7 +43,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
const
index_t
did3
=
is
/
desc
.
GetStride
(
I3
);
const
index_t
did3
=
is
/
desc
.
GetStride
(
I3
);
const
index_t
dindex
=
dst_desc
.
Get
1d
Index
(
did0
,
did1
,
did2
,
did3
);
const
index_t
dindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
,
did2
,
did3
);
f
(
p_dst
[
dindex
]);
f
(
p_dst
[
dindex
]);
}
}
...
@@ -70,7 +70,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
...
@@ -70,7 +70,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
const
index_t
did3
=
is
/
desc
.
GetStride
(
I3
);
const
index_t
did3
=
is
/
desc
.
GetStride
(
I3
);
const
index_t
dindex
=
dst_desc
.
Get
1d
Index
(
did0
,
did1
,
did2
,
did3
);
const
index_t
dindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
,
did2
,
did3
);
f
(
p_dst
[
dindex
]);
f
(
p_dst
[
dindex
]);
}
}
...
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
...
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor
(
SrcOpLengths
{});
constexpr
auto
ref_desc
=
make_
packed_
ConstantTensorDescriptor
(
SrcOpLengths
{});
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
...
@@ -132,9 +132,10 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
...
@@ -132,9 +132,10 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
did
[
3
]
=
is
/
ref_desc
.
GetStride
(
I3
);
did
[
3
]
=
is
/
ref_desc
.
GetStride
(
I3
);
const
index_t
src_index
=
src_desc
.
Get
1d
Index
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
]);
const
index_t
src_index
=
src_desc
.
Get
OffsetFromMulti
Index
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
]);
const
index_t
dst_index
=
dst_desc
.
Get1dIndex
(
did
[
IR0
],
did
[
IR1
],
did
[
IR2
],
did
[
IR3
]);
const
index_t
dst_index
=
dst_desc
.
GetOffsetFromMultiIndex
(
did
[
IR0
],
did
[
IR1
],
did
[
IR2
],
did
[
IR3
]);
f
(
p_src
[
src_index
],
p_dst
[
dst_index
]);
f
(
p_src
[
src_index
],
p_dst
[
dst_index
]);
}
}
...
@@ -163,9 +164,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
...
@@ -163,9 +164,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
did
[
3
]
=
is
/
ref_desc
.
GetStride
(
I3
);
did
[
3
]
=
is
/
ref_desc
.
GetStride
(
I3
);
const
index_t
src_index
=
src_desc
.
Get1dIndex
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
]);
const
index_t
src_index
=
src_desc
.
GetOffsetFromMultiIndex
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
]);
const
index_t
dst_index
=
dst_desc
.
Get1dIndex
(
did
[
IR0
],
did
[
IR1
],
did
[
IR2
],
did
[
IR3
]);
const
index_t
dst_index
=
dst_desc
.
GetOffsetFromMultiIndex
(
did
[
IR0
],
did
[
IR1
],
did
[
IR2
],
did
[
IR3
]);
f
(
p_src
[
src_index
],
p_dst
[
dst_index
]);
f
(
p_src
[
src_index
],
p_dst
[
dst_index
]);
}
}
...
@@ -256,7 +259,7 @@ struct Blockwise4dTensorCopy1
...
@@ -256,7 +259,7 @@ struct Blockwise4dTensorCopy1
constexpr
index_t
read_per_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
DataPerRead
);
constexpr
index_t
read_per_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
DataPerRead
);
constexpr
auto
ref_desc
=
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
L0
,
L1
,
L2
,
read_per_d3
>
{});
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
L0
,
L1
,
L2
,
read_per_d3
>
{});
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
...
@@ -278,9 +281,9 @@ struct Blockwise4dTensorCopy1
...
@@ -278,9 +281,9 @@ struct Blockwise4dTensorCopy1
did
[
3
]
=
is
/
ref_desc
.
GetStride
(
I3
);
did
[
3
]
=
is
/
ref_desc
.
GetStride
(
I3
);
const
index_t
src_index
=
const
index_t
src_index
=
src_desc
.
Get
1d
Index
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
]
*
DataPerRead
);
src_desc
.
Get
OffsetFromMulti
Index
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
]
*
DataPerRead
);
const
index_t
dst_index
=
const
index_t
dst_index
=
dst_desc
.
Get
1d
Index
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
]
*
DataPerRead
);
dst_desc
.
Get
OffsetFromMulti
Index
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
]
*
DataPerRead
);
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
dst_index
))
=
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
dst_index
))
=
*
(
reinterpret_cast
<
const
vector_t
*>
(
p_src
+
src_index
));
*
(
reinterpret_cast
<
const
vector_t
*>
(
p_src
+
src_index
));
...
@@ -333,16 +336,16 @@ struct BlockwiseChwnTensorCopyPadded
...
@@ -333,16 +336,16 @@ struct BlockwiseChwnTensorCopyPadded
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor
(
DstOpLengths
{});
constexpr
auto
ref_desc
=
make_
packed_
ConstantTensorDescriptor
(
DstOpLengths
{});
constexpr
auto
h_global_pad_low
=
GlobalLowerPads
{}.
Get
(
I0
);
constexpr
auto
h_global_pad_low
=
GlobalLowerPads
{}.
Get
(
I0
);
constexpr
auto
w_global_pad_low
=
GlobalLowerPads
{}.
Get
(
I1
);
constexpr
auto
w_global_pad_low
=
GlobalLowerPads
{}.
Get
(
I1
);
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
const
Float
*
p_src_tmp
=
const
Float
*
p_src_tmp
=
p_src
+
p_src
+
src_desc
.
GetOffsetFromMultiIndex
(
src_desc
.
Get1dIndex
(
c_block_data_begin
,
c_block_data_begin
,
(
ho_block_data_begin
+
h_block_pad_low
)
-
h_global_pad_low
,
(
ho_block_data_begin
+
h_block_pad_low
)
-
h_global_pad_low
,
(
wo_block_data_begin
+
w_block_pad_low
)
-
w_global_pad_low
,
(
wo_block_data_begin
+
w_block_pad_low
)
-
w_global_pad_low
,
n_block_data_begin
);
n_block_data_begin
);
...
@@ -389,13 +392,13 @@ struct BlockwiseChwnTensorCopyPadded
...
@@ -389,13 +392,13 @@ struct BlockwiseChwnTensorCopyPadded
did
[
3
]
=
is
/
ref_desc
.
GetStride
(
I3
);
did
[
3
]
=
is
/
ref_desc
.
GetStride
(
I3
);
const
index_t
bindex
=
dst_desc
.
Get
1d
Index
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
]);
const
index_t
bindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
]);
p_dst
[
bindex
]
=
p_dst
[
bindex
]
=
(
did
[
1
]
<
h_block_pad_low
||
did
[
1
]
+
h_block_pad_up
>=
ref_desc
.
GetLength
(
I1
)
||
(
did
[
1
]
<
h_block_pad_low
||
did
[
1
]
+
h_block_pad_up
>=
ref_desc
.
GetLength
(
I1
)
||
did
[
2
]
<
w_block_pad_low
||
did
[
2
]
+
w_block_pad_up
>=
ref_desc
.
GetLength
(
I2
))
did
[
2
]
<
w_block_pad_low
||
did
[
2
]
+
w_block_pad_up
>=
ref_desc
.
GetLength
(
I2
))
?
Float
(
0
)
?
Float
(
0
)
:
p_src_tmp
[
src_desc
.
Get
1d
Index
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
])];
:
p_src_tmp
[
src_desc
.
Get
OffsetFromMulti
Index
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
])];
}
}
constexpr
bool
has_tail
=
(
ref_desc
.
GetElementSize
()
>
NLoop
*
BlockSize
);
constexpr
bool
has_tail
=
(
ref_desc
.
GetElementSize
()
>
NLoop
*
BlockSize
);
...
@@ -422,14 +425,16 @@ struct BlockwiseChwnTensorCopyPadded
...
@@ -422,14 +425,16 @@ struct BlockwiseChwnTensorCopyPadded
did
[
3
]
=
is
/
ref_desc
.
GetStride
(
I3
);
did
[
3
]
=
is
/
ref_desc
.
GetStride
(
I3
);
const
index_t
bindex
=
dst_desc
.
Get1dIndex
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
]);
const
index_t
bindex
=
dst_desc
.
GetOffsetFromMultiIndex
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
]);
p_dst
[
bindex
]
=
p_dst
[
bindex
]
=
(
did
[
1
]
<
h_block_pad_low
||
(
did
[
1
]
<
h_block_pad_low
||
did
[
1
]
+
h_block_pad_up
>=
ref_desc
.
GetLength
(
I1
)
||
did
[
1
]
+
h_block_pad_up
>=
ref_desc
.
GetLength
(
I1
)
||
did
[
2
]
<
w_block_pad_low
||
did
[
2
]
+
w_block_pad_up
>=
ref_desc
.
GetLength
(
I2
))
did
[
2
]
<
w_block_pad_low
||
did
[
2
]
+
w_block_pad_up
>=
ref_desc
.
GetLength
(
I2
))
?
Float
(
0
)
?
Float
(
0
)
:
p_src_tmp
[
src_desc
.
Get1dIndex
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
])];
:
p_src_tmp
[
src_desc
.
GetOffsetFromMultiIndex
(
did
[
0
],
did
[
1
],
did
[
2
],
did
[
3
])];
}
}
}
}
}
}
...
@@ -505,15 +510,16 @@ struct Blockwise4dTensorCopy3
...
@@ -505,15 +510,16 @@ struct Blockwise4dTensorCopy3
}
}
}
}
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor
(
ThreadPerDims
{});
constexpr
auto
thread_cluster_desc
=
make_packed_ConstantTensorDescriptor
(
ThreadPerDims
{});
const
auto
thread_multi_id
=
thread_cluster_desc
.
GetMultiIndex
(
get_thread_local_1d_id
());
const
auto
thread_multi_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
mSrcMyThreadOffset
=
SrcDesc
{}.
Get
1d
Index
(
thread_multi_id
[
0
],
mSrcMyThreadOffset
=
SrcDesc
{}.
Get
OffsetFromMulti
Index
(
thread_multi_id
[
0
],
thread_multi_id
[
1
],
thread_multi_id
[
1
],
thread_multi_id
[
2
],
thread_multi_id
[
2
],
thread_multi_id
[
3
]
*
DataPerRead
);
thread_multi_id
[
3
]
*
DataPerRead
);
mDstMyThreadOffset
=
DstDesc
{}.
Get
1d
Index
(
thread_multi_id
[
0
],
mDstMyThreadOffset
=
DstDesc
{}.
Get
OffsetFromMulti
Index
(
thread_multi_id
[
0
],
thread_multi_id
[
1
],
thread_multi_id
[
1
],
thread_multi_id
[
2
],
thread_multi_id
[
2
],
thread_multi_id
[
3
]
*
DataPerRead
);
thread_multi_id
[
3
]
*
DataPerRead
);
...
@@ -564,14 +570,14 @@ struct Blockwise4dTensorCopy3
...
@@ -564,14 +570,14 @@ struct Blockwise4dTensorCopy3
#pragma unroll
#pragma unroll
for
(
index_t
iloop_d3
=
0
;
iloop_d3
<
nloop_d3
;
++
iloop_d3
)
for
(
index_t
iloop_d3
=
0
;
iloop_d3
<
nloop_d3
;
++
iloop_d3
)
{
{
const
index_t
src_offset
=
const
index_t
src_offset
=
SrcDesc
{}.
GetOffsetFromMultiIndex
(
SrcDesc
{}.
Get1dIndex
(
iloop_d0
*
thread_per_d0
,
iloop_d0
*
thread_per_d0
,
iloop_d1
*
thread_per_d1
,
iloop_d1
*
thread_per_d1
,
iloop_d2
*
thread_per_d2
,
iloop_d2
*
thread_per_d2
,
iloop_d3
*
thread_per_d3
*
DataPerRead
);
iloop_d3
*
thread_per_d3
*
DataPerRead
);
const
index_t
dst_offset
=
const
index_t
dst_offset
=
DstDesc
{}.
GetOffsetFromMultiIndex
(
DstDesc
{}.
Get1dIndex
(
iloop_d0
*
thread_per_d0
,
iloop_d0
*
thread_per_d0
,
iloop_d1
*
thread_per_d1
,
iloop_d1
*
thread_per_d1
,
iloop_d2
*
thread_per_d2
,
iloop_d2
*
thread_per_d2
,
iloop_d3
*
thread_per_d3
*
DataPerRead
);
iloop_d3
*
thread_per_d3
*
DataPerRead
);
...
@@ -646,7 +652,7 @@ struct Blockwise4dTensorCopy3
...
@@ -646,7 +652,7 @@ struct Blockwise4dTensorCopy3
constexpr
index_t
nloop_d2
=
L2
/
thread_per_d2
;
constexpr
index_t
nloop_d2
=
L2
/
thread_per_d2
;
constexpr
index_t
nloop_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
thread_per_d3
*
DataPerRead
);
constexpr
index_t
nloop_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
thread_per_d3
*
DataPerRead
);
constexpr
auto
clipboard_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
clipboard_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
,
nloop_d3
*
DataPerRead
>
{});
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
,
nloop_d3
*
DataPerRead
>
{});
#pragma unroll
#pragma unroll
...
@@ -661,13 +667,13 @@ struct Blockwise4dTensorCopy3
...
@@ -661,13 +667,13 @@ struct Blockwise4dTensorCopy3
#pragma unroll
#pragma unroll
for
(
index_t
iloop_d3
=
0
;
iloop_d3
<
nloop_d3
;
++
iloop_d3
)
for
(
index_t
iloop_d3
=
0
;
iloop_d3
<
nloop_d3
;
++
iloop_d3
)
{
{
const
index_t
src_offset
=
const
index_t
src_offset
=
SrcDesc
{}.
GetOffsetFromMultiIndex
(
SrcDesc
{}.
Get1dIndex
(
iloop_d0
*
thread_per_d0
,
iloop_d0
*
thread_per_d0
,
iloop_d1
*
thread_per_d1
,
iloop_d1
*
thread_per_d1
,
iloop_d2
*
thread_per_d2
,
iloop_d2
*
thread_per_d2
,
iloop_d3
*
thread_per_d3
*
DataPerRead
);
iloop_d3
*
thread_per_d3
*
DataPerRead
);
const
index_t
clipboard_offset
=
clipboard_desc
.
Get
1d
Index
(
const
index_t
clipboard_offset
=
clipboard_desc
.
Get
OffsetFromMulti
Index
(
iloop_d0
,
iloop_d1
,
iloop_d2
,
iloop_d3
*
DataPerRead
);
iloop_d0
,
iloop_d1
,
iloop_d2
,
iloop_d3
*
DataPerRead
);
*
(
reinterpret_cast
<
vector_t
*>
(
&
p_clipboard
[
clipboard_offset
]))
=
*
(
reinterpret_cast
<
vector_t
*>
(
&
p_clipboard
[
clipboard_offset
]))
=
...
@@ -713,7 +719,7 @@ struct Blockwise4dTensorCopy3
...
@@ -713,7 +719,7 @@ struct Blockwise4dTensorCopy3
constexpr
index_t
nloop_d2
=
L2
/
thread_per_d2
;
constexpr
index_t
nloop_d2
=
L2
/
thread_per_d2
;
constexpr
index_t
nloop_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
thread_per_d3
*
DataPerRead
);
constexpr
index_t
nloop_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
thread_per_d3
*
DataPerRead
);
constexpr
auto
clipboard_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
clipboard_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
,
nloop_d3
*
DataPerRead
>
{});
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
,
nloop_d3
*
DataPerRead
>
{});
#pragma unroll
#pragma unroll
...
@@ -728,11 +734,11 @@ struct Blockwise4dTensorCopy3
...
@@ -728,11 +734,11 @@ struct Blockwise4dTensorCopy3
#pragma unroll
#pragma unroll
for
(
index_t
iloop_d3
=
0
;
iloop_d3
<
nloop_d3
;
++
iloop_d3
)
for
(
index_t
iloop_d3
=
0
;
iloop_d3
<
nloop_d3
;
++
iloop_d3
)
{
{
const
index_t
clipboard_offset
=
clipboard_desc
.
Get
1d
Index
(
const
index_t
clipboard_offset
=
clipboard_desc
.
Get
OffsetFromMulti
Index
(
iloop_d0
,
iloop_d1
,
iloop_d2
,
iloop_d3
*
DataPerRead
);
iloop_d0
,
iloop_d1
,
iloop_d2
,
iloop_d3
*
DataPerRead
);
const
index_t
dst_offset
=
const
index_t
dst_offset
=
DstDesc
{}.
GetOffsetFromMultiIndex
(
DstDesc
{}.
Get1dIndex
(
iloop_d0
*
thread_per_d0
,
iloop_d0
*
thread_per_d0
,
iloop_d1
*
thread_per_d1
,
iloop_d1
*
thread_per_d1
,
iloop_d2
*
thread_per_d2
,
iloop_d2
*
thread_per_d2
,
iloop_d3
*
thread_per_d3
*
DataPerRead
);
iloop_d3
*
thread_per_d3
*
DataPerRead
);
...
...
src/include/blockwise_batched_gemm.hip.hpp
View file @
acd7082f
...
@@ -87,10 +87,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
...
@@ -87,10 +87,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
const
auto
c_thread_mtx_index
=
GetBeginOfThreadMatrixC
(
get_thread_local_1d_id
());
const
auto
c_thread_mtx_index
=
GetBeginOfThreadMatrixC
(
get_thread_local_1d_id
());
mMyThreadOffsetA
=
c_thread_mtx_index
.
batch
*
BlockMatrixStrideA
+
mMyThreadOffsetA
=
c_thread_mtx_index
.
batch
*
BlockMatrixStrideA
+
a_block_mtx
.
Get
1d
Index
(
0
,
c_thread_mtx_index
.
row
);
a_block_mtx
.
Get
OffsetFromMulti
Index
(
0
,
c_thread_mtx_index
.
row
);
mMyThreadOffsetB
=
c_thread_mtx_index
.
batch
*
BlockMatrixStrideB
+
mMyThreadOffsetB
=
c_thread_mtx_index
.
batch
*
BlockMatrixStrideB
+
b_block_mtx
.
Get
1d
Index
(
0
,
c_thread_mtx_index
.
col
);
b_block_mtx
.
Get
OffsetFromMulti
Index
(
0
,
c_thread_mtx_index
.
col
);
#if 0
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
...
@@ -221,10 +221,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
...
@@ -221,10 +221,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
threadwise_matrix_copy
(
threadwise_matrix_copy
(
a_block_mtx
,
a_block_mtx
,
p_a_block
+
p_a_block
+
a_block_mtx
.
Get1dIndex
(
k_begin
,
m_repeat
*
MPerLevel1Cluster
)
+
a_block_mtx
.
GetOffsetFromMultiIndex
(
k_begin
,
m_repeat
*
MPerLevel1Cluster
)
+
ib
*
BlockMatrixStrideA
+
mMyThreadOffsetA
,
ib
*
BlockMatrixStrideA
+
mMyThreadOffsetA
,
a_thread_mtx
,
a_thread_mtx
,
p_a_thread
+
a_thread_mtx
.
Get1dIndex
(
0
,
m_repeat
*
MPerThreadSubC
),
p_a_thread
+
a_thread_mtx
.
GetOffsetFromMultiIndex
(
0
,
m_repeat
*
MPerThreadSubC
),
a_thread_sub_mtx
.
GetLengths
(),
a_thread_sub_mtx
.
GetLengths
(),
Number
<
DataPerReadA
>
{});
Number
<
DataPerReadA
>
{});
}
}
...
@@ -238,10 +240,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
...
@@ -238,10 +240,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
threadwise_matrix_copy
(
threadwise_matrix_copy
(
b_block_mtx
,
b_block_mtx
,
p_b_block
+
p_b_block
+
b_block_mtx
.
Get1dIndex
(
k_begin
,
n_repeat
*
NPerLevel1Cluster
)
+
b_block_mtx
.
GetOffsetFromMultiIndex
(
k_begin
,
n_repeat
*
NPerLevel1Cluster
)
+
ib
*
BlockMatrixStrideB
+
mMyThreadOffsetB
,
ib
*
BlockMatrixStrideB
+
mMyThreadOffsetB
,
b_thread_mtx
,
b_thread_mtx
,
p_b_thread
+
b_thread_mtx
.
Get1dIndex
(
0
,
n_repeat
*
NPerThreadSubC
),
p_b_thread
+
b_thread_mtx
.
GetOffsetFromMultiIndex
(
0
,
n_repeat
*
NPerThreadSubC
),
b_thread_sub_mtx
.
GetLengths
(),
b_thread_sub_mtx
.
GetLengths
(),
Number
<
DataPerReadB
>
{});
Number
<
DataPerReadB
>
{});
}
}
...
@@ -343,9 +347,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
...
@@ -343,9 +347,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
reg_a
[
0
]
=
*
reinterpret_cast
<
const
Float4
*>
(
&
p_a_block
[
mMyThreadOffsetA
]);
reg_a
[
0
]
=
*
reinterpret_cast
<
const
Float4
*>
(
&
p_a_block
[
mMyThreadOffsetA
]);
reg_b
[
0
]
=
*
reinterpret_cast
<
const
Float4
*>
(
&
p_b_block
[
mMyThreadOffsetB
]);
reg_b
[
0
]
=
*
reinterpret_cast
<
const
Float4
*>
(
&
p_b_block
[
mMyThreadOffsetB
]);
reg_b
[
1
]
=
*
reinterpret_cast
<
const
Float4
*>
(
reg_b
[
1
]
=
*
reinterpret_cast
<
const
Float4
*>
(
&
p_b_block
[
b_block_mtx
.
Get1dIndex
(
0
,
NPerLevel1Cluster
)
+
mMyThreadOffsetB
]);
&
p_b_block
[
b_block_mtx
.
GetOffsetFromMultiIndex
(
0
,
NPerLevel1Cluster
)
+
mMyThreadOffsetB
]);
reg_a
[
1
]
=
*
reinterpret_cast
<
const
Float4
*>
(
reg_a
[
1
]
=
*
reinterpret_cast
<
const
Float4
*>
(
&
p_a_block
[
a_block_mtx
.
Get1dIndex
(
0
,
MPerLevel1Cluster
)
+
mMyThreadOffsetA
]);
&
p_a_block
[
a_block_mtx
.
GetOffsetFromMultiIndex
(
0
,
MPerLevel1Cluster
)
+
mMyThreadOffsetA
]);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
0
],
reg_c
[
0
],
reg_c
[
2
],
reg_c
[
4
],
reg_c
[
6
]);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
0
],
reg_c
[
0
],
reg_c
[
2
],
reg_c
[
4
],
reg_c
[
6
]);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
1
],
reg_c
[
1
],
reg_c
[
3
],
reg_c
[
5
],
reg_c
[
7
]);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
1
],
reg_c
[
1
],
reg_c
[
3
],
reg_c
[
5
],
reg_c
[
7
]);
...
@@ -353,15 +359,17 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
...
@@ -353,15 +359,17 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
for
(
index_t
k
=
1
;
k
<
K
;
++
k
)
for
(
index_t
k
=
1
;
k
<
K
;
++
k
)
{
{
reg_a
[
0
]
=
*
reinterpret_cast
<
const
Float4
*>
(
reg_a
[
0
]
=
*
reinterpret_cast
<
const
Float4
*>
(
&
p_a_block
[
a_block_mtx
.
Get
1d
Index
(
k
,
0
)
+
mMyThreadOffsetA
]);
&
p_a_block
[
a_block_mtx
.
Get
OffsetFromMulti
Index
(
k
,
0
)
+
mMyThreadOffsetA
]);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
0
],
reg_c
[
8
],
reg_c
[
10
],
reg_c
[
12
],
reg_c
[
14
]);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
0
],
reg_c
[
8
],
reg_c
[
10
],
reg_c
[
12
],
reg_c
[
14
]);
reg_b
[
0
]
=
*
reinterpret_cast
<
const
Float4
*>
(
reg_b
[
0
]
=
*
reinterpret_cast
<
const
Float4
*>
(
&
p_b_block
[
b_block_mtx
.
Get
1d
Index
(
k
,
0
)
+
mMyThreadOffsetB
]);
&
p_b_block
[
b_block_mtx
.
Get
OffsetFromMulti
Index
(
k
,
0
)
+
mMyThreadOffsetB
]);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
1
],
reg_c
[
9
],
reg_c
[
11
],
reg_c
[
13
],
reg_c
[
15
]);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
1
],
reg_c
[
9
],
reg_c
[
11
],
reg_c
[
13
],
reg_c
[
15
]);
reg_b
[
1
]
=
*
reinterpret_cast
<
const
Float4
*>
(
reg_b
[
1
]
=
*
reinterpret_cast
<
const
Float4
*>
(
&
p_b_block
[
b_block_mtx
.
Get1dIndex
(
k
,
NPerLevel1Cluster
)
+
mMyThreadOffsetB
]);
&
p_b_block
[
b_block_mtx
.
GetOffsetFromMultiIndex
(
k
,
NPerLevel1Cluster
)
+
mMyThreadOffsetB
]);
reg_a
[
1
]
=
*
reinterpret_cast
<
const
Float4
*>
(
reg_a
[
1
]
=
*
reinterpret_cast
<
const
Float4
*>
(
&
p_a_block
[
a_block_mtx
.
Get1dIndex
(
k
,
MPerLevel1Cluster
)
+
mMyThreadOffsetA
]);
&
p_a_block
[
a_block_mtx
.
GetOffsetFromMultiIndex
(
k
,
MPerLevel1Cluster
)
+
mMyThreadOffsetA
]);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
0
],
reg_c
[
0
],
reg_c
[
2
],
reg_c
[
4
],
reg_c
[
6
]);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
0
],
reg_c
[
0
],
reg_c
[
2
],
reg_c
[
4
],
reg_c
[
6
]);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
1
],
reg_c
[
1
],
reg_c
[
3
],
reg_c
[
5
],
reg_c
[
7
]);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
1
],
reg_c
[
1
],
reg_c
[
3
],
reg_c
[
5
],
reg_c
[
7
]);
}
}
...
@@ -489,7 +497,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
...
@@ -489,7 +497,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
const
index_t
c_thread_offset
=
const
index_t
c_thread_offset
=
c_thread_mtx_begin
.
batch
*
BlockMatrixStrideC
+
c_thread_mtx_begin
.
batch
*
BlockMatrixStrideC
+
c_block_mtx
.
Get
1d
Index
(
c_thread_mtx_begin
.
row
,
c_thread_mtx_begin
.
col
);
c_block_mtx
.
Get
OffsetFromMulti
Index
(
c_thread_mtx_begin
.
row
,
c_thread_mtx_begin
.
col
);
for
(
index_t
m_repeat
=
0
;
m_repeat
<
MRepeat
;
++
m_repeat
)
for
(
index_t
m_repeat
=
0
;
m_repeat
<
MRepeat
;
++
m_repeat
)
{
{
...
@@ -498,11 +506,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
...
@@ -498,11 +506,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
threadwise_matrix_copy
(
threadwise_matrix_copy
(
c_thread_sub_mtx
,
c_thread_sub_mtx
,
p_c_thread
+
p_c_thread
+
c_thread_sub_mtx
.
Get
1d
Index
(
m_repeat
*
MPerLevel1Cluster
,
c_thread_sub_mtx
.
Get
OffsetFromMulti
Index
(
m_repeat
*
MPerLevel1Cluster
,
n_repeat
*
NPerLevel1Cluster
),
n_repeat
*
NPerLevel1Cluster
),
c_block_mtx
,
c_block_mtx
,
p_c_block
+
p_c_block
+
c_block_mtx
.
Get
1d
Index
(
m_repeat
*
MPerLevel1Cluster
,
c_block_mtx
.
Get
OffsetFromMulti
Index
(
m_repeat
*
MPerLevel1Cluster
,
n_repeat
*
NPerLevel1Cluster
)
+
n_repeat
*
NPerLevel1Cluster
)
+
c_thread_offset
,
c_thread_offset
,
c_thread_sub_mtx
.
GetLengths
());
c_thread_sub_mtx
.
GetLengths
());
...
...
src/include/blockwise_gemm.hip.hpp
View file @
acd7082f
...
@@ -51,8 +51,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
...
@@ -51,8 +51,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
auto
c_thread_mtx_index
=
GetBeginOfThreadMatrixC
(
get_thread_local_1d_id
());
auto
c_thread_mtx_index
=
GetBeginOfThreadMatrixC
(
get_thread_local_1d_id
());
mMyThreadOffsetA
=
BlockMatrixA
::
Get
1d
Index
(
0
,
c_thread_mtx_index
.
row
);
mMyThreadOffsetA
=
BlockMatrixA
::
Get
OffsetFromMulti
Index
(
0
,
c_thread_mtx_index
.
row
);
mMyThreadOffsetB
=
BlockMatrixB
::
Get
1d
Index
(
0
,
c_thread_mtx_index
.
col
);
mMyThreadOffsetB
=
BlockMatrixB
::
Get
OffsetFromMulti
Index
(
0
,
c_thread_mtx_index
.
col
);
}
}
__device__
static
auto
GetThreadMatrixCLengths
()
__device__
static
auto
GetThreadMatrixCLengths
()
...
@@ -248,10 +248,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
...
@@ -248,10 +248,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
{
{
threadwise_matrix_copy
(
threadwise_matrix_copy
(
a_block_mtx
,
a_block_mtx
,
p_a_block
+
a_block_mtx
.
Get1dIndex
(
k_begin
,
m_repeat
*
MPerLevel1Cluster
)
+
p_a_block
+
a_block_mtx
.
GetOffsetFromMultiIndex
(
k_begin
,
m_repeat
*
MPerLevel1Cluster
)
+
mMyThreadOffsetA
,
mMyThreadOffsetA
,
a_thread_mtx
,
a_thread_mtx
,
p_a_thread
+
a_thread_mtx
.
Get
1d
Index
(
0
,
m_repeat
*
MPerThreadSubC
),
p_a_thread
+
a_thread_mtx
.
Get
OffsetFromMulti
Index
(
0
,
m_repeat
*
MPerThreadSubC
),
a_thread_sub_mtx
.
GetLengths
(),
a_thread_sub_mtx
.
GetLengths
(),
Number
<
DataPerReadA
>
{});
Number
<
DataPerReadA
>
{});
}
}
...
@@ -262,10 +263,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
...
@@ -262,10 +263,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
{
{
threadwise_matrix_copy
(
threadwise_matrix_copy
(
b_block_mtx
,
b_block_mtx
,
p_b_block
+
b_block_mtx
.
Get1dIndex
(
k_begin
,
n_repeat
*
NPerLevel1Cluster
)
+
p_b_block
+
b_block_mtx
.
GetOffsetFromMultiIndex
(
k_begin
,
n_repeat
*
NPerLevel1Cluster
)
+
mMyThreadOffsetB
,
mMyThreadOffsetB
,
b_thread_mtx
,
b_thread_mtx
,
p_b_thread
+
b_thread_mtx
.
Get
1d
Index
(
0
,
n_repeat
*
NPerThreadSubC
),
p_b_thread
+
b_thread_mtx
.
Get
OffsetFromMulti
Index
(
0
,
n_repeat
*
NPerThreadSubC
),
b_thread_sub_mtx
.
GetLengths
(),
b_thread_sub_mtx
.
GetLengths
(),
Number
<
DataPerReadB
>
{});
Number
<
DataPerReadB
>
{});
}
}
...
...
src/include/blockwise_merged_tensor_slice_op.hip.hpp
View file @
acd7082f
...
@@ -11,7 +11,7 @@ template <index_t BlockSize,
...
@@ -11,7 +11,7 @@ template <index_t BlockSize,
class
SliceLengths
,
class
SliceLengths
,
class
SubLengths
,
class
SubLengths
,
class
ClusterLengths
,
class
ClusterLengths
,
class
ThreadArrangeOrder
,
class
Thread
Cluster
ArrangeOrder
,
class
SrcAccessOrder
,
class
SrcAccessOrder
,
class
DstAccessOrder
>
class
DstAccessOrder
>
struct
BlockwiseTensorSliceCopy_generic_v1
struct
BlockwiseTensorSliceCopy_generic_v1
...
@@ -21,28 +21,135 @@ struct BlockwiseTensorSliceCopy_generic_v1
...
@@ -21,28 +21,135 @@ struct BlockwiseTensorSliceCopy_generic_v1
index_t
mSrcMyThreadOffset
;
index_t
mSrcMyThreadOffset
;
index_t
mDstMyThreadOffset
;
index_t
mDstMyThreadOffset
;
__device__
BlockwiseTensorSliceCopy_generic_v1
(
Array
<
index_t
,
nDim
>
src_block_multi_
id_
offset
,
__device__
BlockwiseTensorSliceCopy_generic_v1
(
Array
<
index_t
,
nDim
>
src_block_multi_offset
,
Array
<
index_t
,
nDim
>
dst_block_multi_
id_
offset
)
Array
<
index_t
,
nDim
>
dst_block_multi_offset
)
{
{
// only support SrcSubLengths.GetLength() == 1 on merged dimension, for now
// check SrcDataPerRead should be 1, if last dimension is a merged dimension
// check NDim consistent
// check NDim consistent
static_assert
(
SrcDesc
::
GetNumOfDimension
()
==
DstDesc
::
GetNumOfDimension
(),
"wrong"
);
constexpr
auto
thread_cluster_desc
=
make_packed_ConstantTensorDescriptor
(
ClusterLengths
{}.
ReorderGivenNew2Old
(
ThreadClusterArrangeOrder
{}));
// BlockSize
static_assert
(
BlockSize
==
thread_cluster_desc
.
GetElementSize
(),
"wrong! BlockSize"
);
// divide work
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
static_assert
(
SliceLengths
{}.
Get
(
IDim
)
%
SubLenghs
{}.
Get
(
IDim
)
==
0
,
"wrong! cannot evenly divide sliced tensor into sub-tensor"
);
});
constexpr
auto
thread_work_desc
=
make_packed_ConstantTensorDescriptor
(
SliceLengths
{}
/
SliceSubLengths
{});
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
static_assert
(
thread_work_desc
.
GetLength
(
IDim
)
%
thread_cluster_desc
.
Get
(
IDim
)
==
0
,
"wrong! cannot evenly divide work to cluster"
);
});
// only support SubLengths.Get() == 1 on merged dimension, for now
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
static_if
<
(
SrcDesc
::
ContainMultipleOriginalDimensions
(
IDim
)
||
DstDesc
::
ContainMultipleOriginalDimensions
(
IDim
))
>
{}([
&
](
auto
fwd
)
{
static_assert
(
fwd
(
SubLengths
{}).
Get
(
IDim
)
==
1
,
"wrong! Sub-Lengths on merged dimension should be 1"
);
});
});
// calculate mSrcMyThreadOffset, mDstMyThreadOffset
const
auto
thread_cluster_multi_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
// calculate mSrcMyThreadOffset
const
auto
data_cluster_multi_id
=
// calculate mDstMyThreadOffset
reorder_array_given_old2new
(
thread_cluster_multi_id
,
ThreadClusterArrangeOrder
{});
const
auto
thread_data_multi_offset
=
data_cluster_multi_id
*
SubLengths
{};
mSrcMythreadOffset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
src_block_multi_offset
+
thread_data_multi_offset
);
mSrcMythreadOffset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_block_multi_offset
+
thread_data_multi_offset
);
}
}
__device__
static
constexpr
index_t
GetRegisterClipboardSize
()
{}
__device__
static
constexpr
index_t
GetRegisterClipboardSize
()
{
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ClusterLengths
{});
constexpr
auto
thread_tensor_desc
=
make_packed_ConstantTensorDescriptor
(
SubLengths
{}
*
repeat_lengths
);
return
thread_tensor_desc
.
GetElementSpaceSize
();
}
__device__
void
RunLoadRegisterClipboard
(
const
Float
*
__restrict__
p_src
,
__device__
void
RunLoadRegisterClipboard
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_clipboard
)
const
Float
*
__restrict__
p_clipboard
)
const
{
{
constexpr
auto
thread_sub_tensor_lengths
=
SubLengths
{};
constexpr
auto
data_per_cluster_per_dims
=
thread_sub_tensor_lengths
*
ClusterLengths
{};
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ClusterLengths
{});
constexpr
auto
thread_tensor_desc
=
make_packed_ConstantTensorDescriptor
(
thread_sub_tensor_lengths
*
repeat_lengths
);
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
constexpr
auto
src_data_multi_offset
=
repeat_multi_id
*
data_per_cluster_per_dims
;
constexpr
auto
clipboard_data_multi_offset
=
repeat_multi_id
*
thread_sub_tensor_lengths
;
constexpr
index_t
src_offset
=
SrcDesc
{}.
GetOffsetFromMultiIndex
(
src_data_multi_id
);
constexpr
index_t
clipboard_offset
=
thread_tensor_desc
.
GetOffsetFromMultiIndex
(
clipboard_data_multi_id
);
threadwise_tensor_slice_copy_generic
(
SrcDesc
{},
p_src
+
src_offset
+
mSrcMyThreadOffset
,
thread_tensor_desc
,
zero_array
<
index_t
,
nDim
>
{},
thread_tensor_desc
,
p_clipboard
+
clipboard_offset
,
zero_array
<
index_t
,
nDim
>
{},
thread_sub_tensor_lengths
,
SrcAccessOrder
{});
});
}
}
__device__
void
RunStoreRegisterClipboard
(
const
Float
*
__restrict__
p_clipboard
,
__device__
void
RunStoreRegisterClipboard
(
const
Float
*
__restrict__
p_clipboard
,
Float
*
__restrict__
p_dst
)
const
Float
*
__restrict__
p_dst
)
const
{
{
constexpr
auto
thread_sub_tensor_lengths
=
SubLengths
{};
constexpr
auto
data_per_cluster_per_dims
=
thread_sub_tensor_lengths
*
ClusterLengths
{};
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ClusterLengths
{});
constexpr
auto
thread_tensor_desc
=
make_packed_ConstantTensorDescriptor
(
thread_sub_tensor_lengths
*
repeat_lengths
);
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
constexpr
auto
clipboard_data_multi_offset
=
repeat_multi_id
*
thread_sub_tensor_lengths
;
constexpr
auto
dst_data_multi_offset
=
repeat_multi_id
*
data_per_cluster_per_dims
;
constexpr
index_t
clipboard_offset
=
thread_tensor_desc
.
GetOffsetFromMultiIndex
(
clipboard_data_multi_offset
);
constexpr
index_t
dst_offset
=
DstDesc
{}.
GetOffsetFromMultiIndex
(
dst_data_multi_offset
);
threadwise_tensor_slice_copy_generic
(
thread_tensor_desc
,
p_clipboard
+
clipboard_offset
,
zero_array
<
index_t
,
nDim
>
{},
DstDesc
{},
p_dst
+
dst_offset
+
mDstMyThreadOffset
,
zero_array
<
index_t
,
nDim
>
{},
thread_sub_tensor_lengths
,
DstAccessOrder
{});
}
}
__device__
void
Run
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
__device__
void
Run
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
...
@@ -52,4 +159,4 @@ struct BlockwiseTensorSliceCopy_generic_v1
...
@@ -52,4 +159,4 @@ struct BlockwiseTensorSliceCopy_generic_v1
RunLoadRegisterClipboard
(
p_src
,
p_clipboard
);
RunLoadRegisterClipboard
(
p_src
,
p_clipboard
);
RunStoreRegisterClipboard
(
p_clipboard
,
p_dst
);
RunStoreRegisterClipboard
(
p_clipboard
,
p_dst
);
}
}
};
};
src/include/blockwise_tensor_slice_op.hip.hpp
View file @
acd7082f
...
@@ -39,7 +39,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -39,7 +39,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
thread_cluster_lengths
=
constexpr
auto
thread_cluster_lengths
=
src_cluster_lengths
.
ReorderGivenNew2Old
(
map_thread_cluster_2_src_cluster
);
src_cluster_lengths
.
ReorderGivenNew2Old
(
map_thread_cluster_2_src_cluster
);
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor
(
thread_cluster_lengths
);
constexpr
auto
thread_cluster_desc
=
make_
packed_
ConstantTensorDescriptor
(
thread_cluster_lengths
);
// sanity check: data type
// sanity check: data type
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float for now!
\n
"
);
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float for now!
\n
"
);
...
@@ -105,7 +105,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -105,7 +105,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
}
}
}
}
const
auto
thread_multi_id
=
thread_cluster_desc
.
GetMultiIndex
(
get_thread_local_1d_id
());
const
auto
thread_multi_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
// compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
// compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
// regsiters, or only one copy???
// regsiters, or only one copy???
...
@@ -115,17 +116,21 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -115,17 +116,21 @@ struct BlockwiseTensorSliceReorderCopy_v3
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
constexpr
auto
I
=
decltype
(
IDim
){};
constexpr
auto
I
=
decltype
(
IDim
){};
constexpr
index_t
i
=
I
.
Get
();
constexpr
index_t
i
=
I
.
Get
();
// compiler: will it really compute index here, or be merged with Get1dIndex and
// compiler: will it really compute index here, or be merged with
// GetOffsetFromMultiIndex and
// optimized away???
// optimized away???
src_data_multi_id
[
i
]
*=
src_sub_lengths
.
Get
(
I
);
src_data_multi_id
[
i
]
*=
src_sub_lengths
.
Get
(
I
);
});
});
// compiler: will it really compute index here, or be merged with Get1dIndex and
// compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex
// and
// optimized away???
// optimized away???
const
auto
dst_data_multi_id
=
reorder_array_given_new2old
(
src_data_multi_id
,
map_dst2src
);
const
auto
dst_data_multi_id
=
reorder_array_given_new2old
(
src_data_multi_id
,
map_dst2src
);
mSrcMyThreadOffset
=
src_desc
.
Get1dIndex
(
src_data_multi_id
+
src_block_data_multi_id_begin
);
mSrcMyThreadOffset
=
mDstMyThreadOffset
=
dst_desc
.
Get1dIndex
(
dst_data_multi_id
+
dst_block_data_multi_id_begin
);
src_desc
.
GetOffsetFromMultiIndex
(
src_data_multi_id
+
src_block_data_multi_id_begin
);
mDstMyThreadOffset
=
dst_desc
.
GetOffsetFromMultiIndex
(
dst_data_multi_id
+
dst_block_data_multi_id_begin
);
}
}
__device__
static
constexpr
index_t
GetRegisterClipboardSize
()
__device__
static
constexpr
index_t
GetRegisterClipboardSize
()
...
@@ -142,7 +147,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -142,7 +147,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_desc
=
make_ConstantTensorDescriptor
(
thread_tensor_lengths
);
constexpr
auto
thread_tensor_desc
=
make_
packed_
ConstantTensorDescriptor
(
thread_tensor_lengths
);
return
thread_tensor_desc
.
GetElementSpace
();
return
thread_tensor_desc
.
GetElementSpace
();
}
}
...
@@ -162,7 +167,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -162,7 +167,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_desc
=
make_ConstantTensorDescriptor
(
thread_tensor_lengths
);
constexpr
auto
thread_tensor_desc
=
make_
packed_
ConstantTensorDescriptor
(
thread_tensor_lengths
);
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
...
@@ -171,9 +176,9 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -171,9 +176,9 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
clipboard_data_multi_id
=
repeat_multi_id
*
thread_sub_tensor_lengths
;
constexpr
auto
clipboard_data_multi_id
=
repeat_multi_id
*
thread_sub_tensor_lengths
;
constexpr
index_t
src_offset
=
SrcDesc
{}.
Get
1d
Index
(
src_data_multi_id
);
constexpr
index_t
src_offset
=
SrcDesc
{}.
Get
OffsetFromMulti
Index
(
src_data_multi_id
);
constexpr
index_t
clipboard_offset
=
constexpr
index_t
clipboard_offset
=
thread_tensor_desc
.
Get
1d
Index
(
clipboard_data_multi_id
);
thread_tensor_desc
.
Get
OffsetFromMulti
Index
(
clipboard_data_multi_id
);
threadwise_tensor_slice_copy
(
SrcDesc
{},
threadwise_tensor_slice_copy
(
SrcDesc
{},
p_src
+
src_offset
+
mSrcMyThreadOffset
,
p_src
+
src_offset
+
mSrcMyThreadOffset
,
...
@@ -199,7 +204,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -199,7 +204,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_desc
=
make_ConstantTensorDescriptor
(
thread_tensor_lengths
);
constexpr
auto
thread_tensor_desc
=
make_
packed_
ConstantTensorDescriptor
(
thread_tensor_lengths
);
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
...
@@ -212,9 +217,9 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -212,9 +217,9 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
dst_data_multi_id
=
src_data_multi_id
.
ReorderGivenNew2Old
(
MapDst2Src
{});
constexpr
auto
dst_data_multi_id
=
src_data_multi_id
.
ReorderGivenNew2Old
(
MapDst2Src
{});
constexpr
index_t
clipboard_offset
=
constexpr
index_t
clipboard_offset
=
thread_tensor_desc
.
Get
1d
Index
(
clipboard_data_multi_id
);
thread_tensor_desc
.
Get
OffsetFromMulti
Index
(
clipboard_data_multi_id
);
constexpr
index_t
dst_offset
=
DstDesc
{}.
Get
1d
Index
(
dst_data_multi_id
);
constexpr
index_t
dst_offset
=
DstDesc
{}.
Get
OffsetFromMulti
Index
(
dst_data_multi_id
);
// write in the order of dst
// write in the order of dst
#if 1
#if 1
...
...
src/include/conv_common.hip.hpp
View file @
acd7082f
...
@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
...
@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
constexpr
auto
HO
=
HI
+
1
-
Y
;
constexpr
auto
HO
=
HI
+
1
-
Y
;
constexpr
auto
WO
=
WI
+
1
-
X
;
constexpr
auto
WO
=
WI
+
1
-
X
;
return
make_ConstantTensorDescriptor
(
Sequence
<
N
,
K
,
HO
,
WO
>
{});
return
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
N
,
K
,
HO
,
WO
>
{});
}
}
template
<
class
InDesc
,
class
WeiDesc
,
class
LowerPads
,
class
UpperPads
>
template
<
class
InDesc
,
class
WeiDesc
,
class
LowerPads
,
class
UpperPads
>
...
@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
...
@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
constexpr
auto
HO
=
HI
+
HPadLow
+
HPadUp
+
1
-
Y
;
constexpr
auto
HO
=
HI
+
HPadLow
+
HPadUp
+
1
-
Y
;
constexpr
auto
WO
=
WI
+
WPadLow
+
WPadUp
+
1
-
X
;
constexpr
auto
WO
=
WI
+
WPadLow
+
WPadUp
+
1
-
X
;
return
make_ConstantTensorDescriptor
(
Sequence
<
N
,
K
,
HO
,
WO
>
{});
return
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
N
,
K
,
HO
,
WO
>
{});
}
}
template
<
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
...
...
src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp
View file @
acd7082f
...
@@ -180,17 +180,18 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
...
@@ -180,17 +180,18 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
c_block_data_begin
+=
CPerBlock
,
__syncthreads
())
c_block_data_begin
+=
CPerBlock
,
__syncthreads
())
{
{
// copy input tensor to LDS
// copy input tensor to LDS
blockwise_in_copy
.
Run
(
p_in_global
+
blockwise_in_copy
.
Run
(
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_in_global
+
in_nchw_global_desc
.
GetOffsetFromMultiIndex
(
n_block_data_begin
,
c_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
wi_block_data_begin
),
p_in_block
);
p_in_block
);
// copy weight tensor to LDS
// copy weight tensor to LDS
blockwise_wei_copy
.
Run
(
blockwise_wei_copy
.
Run
(
p_wei_global
+
p_wei_global
+
wei_kcyx_global_desc
.
GetOffsetFromMultiIndex
(
wei_kcyx_global_desc
.
Get1dIndex
(
k_block_data_begin
,
c_block_data_begin
,
0
,
0
),
k_block_data_begin
,
c_block_data_begin
,
0
,
0
),
p_wei_block
);
p_wei_block
);
__syncthreads
();
__syncthreads
();
...
@@ -202,26 +203,28 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
...
@@ -202,26 +203,28 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
threadwise_direct_convolution_2
(
threadwise_direct_convolution_2
(
in_nchw_thread_block_desc
,
in_nchw_thread_block_desc
,
p_in_block
+
p_in_block
+
in_nchw_block_desc
.
Get
1d
Index
(
n_thread_data_begin
,
in_nchw_block_desc
.
Get
OffsetFromMulti
Index
(
n_thread_data_begin
,
c_thread_data
,
c_thread_data
,
hi_thread_data_begin
,
hi_thread_data_begin
,
wi_thread_data_begin
),
wi_thread_data_begin
),
wei_kcyx_thread_block_desc
,
wei_kcyx_thread_block_desc
,
p_wei_block
+
p_wei_block
+
wei_kcyx_block_desc
.
Get1dIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
wei_kcyx_block_desc
.
GetOffsetFromMultiIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
out_nkhw_thread_desc
,
out_nkhw_thread_desc
,
p_out_thread
);
p_out_thread
);
#elif 0
#elif 0
threadwise_direct_convolution_3
(
threadwise_direct_convolution_3
(
in_nchw_thread_block_desc
,
in_nchw_thread_block_desc
,
p_in_block
+
p_in_block
+
in_nchw_block_desc
.
Get
1d
Index
(
n_thread_data_begin
,
in_nchw_block_desc
.
Get
OffsetFromMulti
Index
(
n_thread_data_begin
,
c_thread_data
,
c_thread_data
,
hi_thread_data_begin
,
hi_thread_data_begin
,
wi_thread_data_begin
),
wi_thread_data_begin
),
wei_kcyx_thread_block_desc
,
wei_kcyx_thread_block_desc
,
p_wei_block
+
p_wei_block
+
wei_kcyx_block_desc
.
Get1dIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
wei_kcyx_block_desc
.
GetOffsetFromMultiIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
out_nkhw_thread_desc
,
out_nkhw_thread_desc
,
p_out_thread
);
p_out_thread
);
#endif
#endif
...
@@ -229,12 +232,12 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
...
@@ -229,12 +232,12 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
}
}
// copy output tensor from register to global mem
// copy output tensor from register to global mem
threadwise_tensor_slice_copy
(
threadwise_tensor_slice_copy
(
out_nkhw_thread_desc
,
out_nkhw_thread_desc
,
p_out_thread
,
p_out_thread
,
out_nkhw_global_desc
,
out_nkhw_global_desc
,
p_out_global
+
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
out_nkhw_global_desc
.
GetOffsetFromMultiIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
wo_block_data_begin
+
wo_thread_data_begin
),
...
...
src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp
View file @
acd7082f
...
@@ -221,11 +221,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
...
@@ -221,11 +221,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
const
Float
*
p_in_global_block_offset
=
const
Float
*
p_in_global_block_offset
=
p_in_global
+
p_in_global
+
in_c_h_w_n_global_desc
.
Get
1d
Index
(
in_c_h_w_n_global_desc
.
Get
OffsetFromMulti
Index
(
0
,
hi_block_data_begin
,
wi_block_data_begin
,
n_block_data_begin
);
0
,
hi_block_data_begin
,
wi_block_data_begin
,
n_block_data_begin
);
const
Float
*
p_wei_global_block_offset
=
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
0
,
0
,
k_block_data_begin
);
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
p_in_global_block_offset
+=
CPerBlock
*
in_c_h_w_n_global_desc
.
GetStride
(
I0
),
p_in_global_block_offset
+=
CPerBlock
*
in_c_h_w_n_global_desc
.
GetStride
(
I0
),
...
@@ -261,8 +262,8 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
...
@@ -261,8 +262,8 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
#else
#else
blockwise_batch_gemm
.
Run_asm
blockwise_batch_gemm
.
Run_asm
#endif
#endif
(
p_wei_block
+
wei_c_y_x_k_block_desc
.
Get
1d
Index
(
0
,
y
,
x
,
0
),
(
p_wei_block
+
wei_c_y_x_k_block_desc
.
Get
OffsetFromMulti
Index
(
0
,
y
,
x
,
0
),
p_in_block
+
in_c_h_w_n_block_desc
.
Get
1d
Index
(
0
,
y
,
x
,
0
),
p_in_block
+
in_c_h_w_n_block_desc
.
Get
OffsetFromMulti
Index
(
0
,
y
,
x
,
0
),
p_out_thread
);
p_out_thread
);
}
}
}
}
...
@@ -325,12 +326,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
...
@@ -325,12 +326,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
}
}
#endif
#endif
threadwise_tensor_slice_copy
(
threadwise_tensor_slice_copy
(
out_10d_thread_desc
,
out_10d_thread_desc
,
p_out_thread
,
p_out_thread
,
out_10d_global_desc
,
out_10d_global_desc
,
p_out_global
+
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
n_block_data_begin
+
n_thread_data_begin
),
...
@@ -375,12 +376,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
...
@@ -375,12 +376,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
}
}
#endif
#endif
threadwise_tensor_slice_copy
(
threadwise_tensor_slice_copy
(
out_10d_thread_desc
,
out_10d_thread_desc
,
p_out_thread
,
p_out_thread
,
out_10d_global_desc
,
out_10d_global_desc
,
p_out_global
+
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
n_block_data_begin
+
n_thread_data_begin
),
...
...
src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp
View file @
acd7082f
...
@@ -230,11 +230,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
...
@@ -230,11 +230,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
#if 1
#if 1
const
Float
*
p_in_global_block_offset
=
const
Float
*
p_in_global_block_offset
=
p_in_global
+
p_in_global
+
in_c_h_w_n_global_desc
.
Get
1d
Index
(
in_c_h_w_n_global_desc
.
Get
OffsetFromMulti
Index
(
0
,
hi_block_data_begin
,
wi_block_data_begin
,
n_block_data_begin
);
0
,
hi_block_data_begin
,
wi_block_data_begin
,
n_block_data_begin
);
const
Float
*
p_wei_global_block_offset
=
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
0
,
0
,
k_block_data_begin
);
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
p_in_global_block_offset
+=
CPerBlock
*
in_c_h_w_n_global_desc
.
GetStride
(
I0
),
p_in_global_block_offset
+=
CPerBlock
*
in_c_h_w_n_global_desc
.
GetStride
(
I0
),
...
@@ -242,21 +243,23 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
...
@@ -242,21 +243,23 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
{
{
for
(
index_t
y
=
0
;
y
<
Y
;
++
y
)
for
(
index_t
y
=
0
;
y
<
Y
;
++
y
)
{
{
blockwise_in_copy
.
Run
(
p_in_global_block_offset
+
blockwise_in_copy
.
Run
(
in_c_h_w_n_global_desc
.
Get1dIndex
(
0
,
y
,
0
,
0
),
p_in_global_block_offset
+
in_c_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
0
,
0
),
p_in_block
);
p_in_block
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
+
blockwise_wei_copy
.
Run
(
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
y
,
0
,
0
),
p_wei_global_block_offset
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
0
,
0
),
p_wei_block
);
p_wei_block
);
__syncthreads
();
__syncthreads
();
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
{
blockwise_batch_gemm
.
Run
(
p_wei_block
+
wei_c_x_k_block_desc
.
Get1dIndex
(
0
,
x
,
0
),
blockwise_batch_gemm
.
Run
(
p_in_block
+
p_wei_block
+
wei_c_x_k_block_desc
.
GetOffsetFromMultiIndex
(
0
,
x
,
0
),
in_c_h_w_n_block_desc
.
Get
1d
Index
(
0
,
0
,
x
,
0
),
p_in_block
+
in_c_h_w_n_block_desc
.
Get
OffsetFromMulti
Index
(
0
,
0
,
x
,
0
),
p_out_thread
);
p_out_thread
);
}
}
...
@@ -269,11 +272,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
...
@@ -269,11 +272,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
{
{
const
Float
*
p_in_global_block_offset
=
const
Float
*
p_in_global_block_offset
=
p_in_global
+
p_in_global
+
in_c_h_w_n_global_desc
.
Get
1d
Index
(
in_c_h_w_n_global_desc
.
Get
OffsetFromMulti
Index
(
0
,
hi_block_data_begin
+
y
,
wi_block_data_begin
,
n_block_data_begin
);
0
,
hi_block_data_begin
+
y
,
wi_block_data_begin
,
n_block_data_begin
);
const
Float
*
p_wei_global_block_offset
=
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
y
,
0
,
k_block_data_begin
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
0
,
k_block_data_begin
);
for
(
index_t
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
=
0
;
...
@@ -290,9 +294,9 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
...
@@ -290,9 +294,9 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
{
blockwise_batch_gemm
.
Run
(
p_wei_block
+
wei_c_x_k_block_desc
.
Get1dIndex
(
0
,
x
,
0
),
blockwise_batch_gemm
.
Run
(
p_in_block
+
p_wei_block
+
wei_c_x_k_block_desc
.
GetOffsetFromMultiIndex
(
0
,
x
,
0
),
in_c_h_w_n_block_desc
.
Get
1d
Index
(
0
,
0
,
x
,
0
),
p_in_block
+
in_c_h_w_n_block_desc
.
Get
OffsetFromMulti
Index
(
0
,
0
,
x
,
0
),
p_out_thread
);
p_out_thread
);
}
}
...
@@ -358,12 +362,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
...
@@ -358,12 +362,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
}
}
#endif
#endif
threadwise_tensor_slice_copy
(
threadwise_tensor_slice_copy
(
out_10d_thread_desc
,
out_10d_thread_desc
,
p_out_thread
,
p_out_thread
,
out_10d_global_desc
,
out_10d_global_desc
,
p_out_global
+
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
n_block_data_begin
+
n_thread_data_begin
),
...
@@ -408,12 +412,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
...
@@ -408,12 +412,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
}
}
#endif
#endif
threadwise_tensor_slice_copy
(
threadwise_tensor_slice_copy
(
out_10d_thread_desc
,
out_10d_thread_desc
,
p_out_thread
,
p_out_thread
,
out_10d_global_desc
,
out_10d_global_desc
,
p_out_global
+
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
n_block_data_begin
+
n_thread_data_begin
),
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment