yangql / composable_kernel-1 · Commits · a9031464
"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "e48a403eb6310c9e1af6a0f2d165e5589879ab1e"
Commit a9031464, authored Apr 25, 2019 by Chao Liu

    implicit gemm v1r3 nchw_cyxk_nkhw

Parent: 569ad66e

Changes: 18 — showing 18 changed files with 1202 additions and 523 deletions (+1202 −523)
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp                                  +2    −2
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp                                  +224  −0
driver/driver.hip.cpp                                                                           +4    −2
src/include/Array.hip.hpp                                                                       +1    −1
src/include/ConstantTensorDescriptor.hip.hpp                                                    +7    −2
src/include/Sequence.hip.hpp                                                                    +2    −1
src/include/blockwise_2d_tensor_op.hip.hpp                                                      +0    −1
src/include/blockwise_4d_tensor_op.hip.hpp                                                      +0    −336
src/include/blockwise_batched_gemm.hip.hpp                                                      +0    −3
src/include/blockwise_nd_tensor_op.hip.hpp                                                      +252  −0
src/include/conv_common.hip.hpp                                                                 +0    −1
src/include/data_type.hip.hpp                                                                   +31   −8
src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp                      +12   −12
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp    +12   −12
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp                      +12   −12
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp                      +514  −0
src/include/threadwise_4d_tensor_op.hip.hpp                                                     +0    −129
src/include/threadwise_nd_tensor_op.hip.hpp                                                     +129  −1
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp

@@ -128,7 +128,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
     using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
     using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
     constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
-    constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet
+    constexpr index_t InBlockReorderDataPerWrite_N = 1;

     using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;

@@ -163,7 +163,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
     using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>;
     using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
     constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
-    constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet
+    constexpr index_t InBlockReorderDataPerWrite_N = 2;

     using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp (new file, mode 100644)

#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hip.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp"

template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
                                                        const Tensor<T>& in_nchw,
                                                        WeiDesc,
                                                        const Tensor<T>& wei_kcyx,
                                                        OutDesc,
                                                        Tensor<T>& out_nkhw,
                                                        index_t nrepeat)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_desc  = InDesc{};
    constexpr auto wei_kcyx_desc = WeiDesc{};
    constexpr auto out_nkhw_desc = OutDesc{};

    constexpr index_t Hi = in_nchw_desc.GetLength(I2);
    constexpr index_t Wi = in_nchw_desc.GetLength(I3);

    constexpr index_t N  = out_nkhw_desc.GetLength(I0);
    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);

    constexpr index_t K = wei_kcyx_desc.GetLength(I0);
    constexpr index_t C = wei_kcyx_desc.GetLength(I1);
    constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
    constexpr index_t X = wei_kcyx_desc.GetLength(I3);

    // reorder weight
    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

    auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
        wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
    };

    make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
        std::thread::hardware_concurrency());

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 0
    // for 3x3, 28x28, v1r2, Pascal
    constexpr index_t BlockSize = 128;

    constexpr index_t NPerBlock  = 16;
    constexpr index_t KPerBlock  = 128;
    constexpr index_t CPerBlock  = 8;
    constexpr index_t HoPerBlock = 2;
    constexpr index_t WoPerBlock = 2;

    constexpr index_t NPerThread  = 4;
    constexpr index_t KPerThread  = 8;
    constexpr index_t HoPerThread = 1;
    constexpr index_t WoPerThread = 2;

    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 2;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA   = 4;
    constexpr index_t GemmDataPerReadB   = 4;

    using InBlockReorderSrcSubLengths_NCHW     = Sequence<4, 1, 1, 2>;
    using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
    constexpr index_t InBlockReorderDataPerRead_W  = 2;
    constexpr index_t InBlockReorderDataPerWrite_N = 4;

    using WeiBlockCopyClusterLengths = Sequence<4, 1, 32>;
    constexpr index_t WeiBlockCopyDataPerRead_K = 4;

    constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 0
    // for 3x3, 28x28, v1r3, Pascal, bad
    constexpr index_t BlockSize = 128;

    constexpr index_t NPerBlock  = 16;
    constexpr index_t KPerBlock  = 128;
    constexpr index_t CPerBlock  = 8;
    constexpr index_t HoPerBlock = 2;
    constexpr index_t WoPerBlock = 2;

    constexpr index_t NPerThread  = 4;
    constexpr index_t KPerThread  = 8;
    constexpr index_t HoPerThread = 1;
    constexpr index_t WoPerThread = 2;

    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 2;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA   = 4;
    constexpr index_t GemmDataPerReadB   = 4;

    using InBlockReorderSrcSubLengths_NCHW     = Sequence<4, 1, 1, 1>;
    using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load input for NCHW
    constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet

    using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used
    constexpr index_t WeiBlockCopyDataPerRead_K = 4;

    constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 1
    // for 3x3, 34x34, v1r3, Pascal
    constexpr index_t BlockSize = 128;

    constexpr index_t NPerBlock  = 2;
    constexpr index_t KPerBlock  = 128;
    constexpr index_t CPerBlock  = 8;
    constexpr index_t HoPerBlock = 2;
    constexpr index_t WoPerBlock = 16;

    constexpr index_t NPerThread  = 2;
    constexpr index_t KPerThread  = 8;
    constexpr index_t HoPerThread = 1;
    constexpr index_t WoPerThread = 4;

    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 2;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA   = 4;
    constexpr index_t GemmDataPerReadB   = 4;

    using InBlockReorderSrcSubLengths_NCHW     = Sequence<2, 1, 2, 1>;
    using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>;
    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load input for NCHW
    constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet

    using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used
    constexpr index_t WeiBlockCopyDataPerRead_K = 4;

    constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#endif

    constexpr index_t GridSize =
        ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
        ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    for(index_t i = 0; i < nrepeat; ++i)
    {
        constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw<
            GridSize,
            BlockSize,
            T,
            decltype(in_nchw_desc),
            decltype(wei_cyxk_desc),
            decltype(out_nkhw_desc),
            NPerBlock,
            KPerBlock,
            CPerBlock,
            HoPerBlock,
            WoPerBlock,
            NPerThread,
            KPerThread,
            HoPerThread,
            WoPerThread,
            GemmMPerThreadSubC,
            GemmNPerThreadSubC,
            GemmMLevel0Cluster,
            GemmNLevel0Cluster,
            GemmMLevel1Cluster,
            GemmNLevel1Cluster,
            GemmKPerThreadLoop,
            GemmDataPerReadA,
            GemmDataPerReadB,
            InBlockReorderSrcSubLengths_NCHW,
            InBlockReorderSrcClusterLengths_NCHW,
            InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
            InBlockReorderDataPerRead_W,
            InBlockReorderDataPerWrite_N,
            WeiBlockCopyClusterLengths,
            WeiBlockCopyDataPerRead_K,
            OutThreadCopyDataPerWrite_W>{};

        float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

        printf("Elapsed time : %f ms, %f TFlop/s\n",
               time,
               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                   (std::size_t(1000) * 1000 * 1000) / time);

        usleep(std::min(time * 1000, float(10000)));
    }

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
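Aside on the GridSize expression in the driver above: it is the product of per-dimension ceiling divisions of the output tensor extents by the block tile. A small standalone host-side sketch of the same arithmetic; the problem sizes N = 64, K = 256, Ho = Wo = 32 are assumptions for illustration (a 3x3 filter over the 34x34 input that the "#elif 1" branch targets), while the tile sizes are taken from that branch.

// Sketch of the ceiling-division grid sizing used by the driver above.
// Problem extents here are assumed values, not taken from the repository.
#include <cstdio>

using index_t = unsigned int;

constexpr index_t ceil_div(index_t a, index_t b) { return (a + b - 1) / b; }

int main()
{
    constexpr index_t N = 64, K = 256, Ho = 32, Wo = 32;                               // assumed problem size
    constexpr index_t NPerBlock = 2, KPerBlock = 128, HoPerBlock = 2, WoPerBlock = 16; // "#elif 1" tile

    constexpr index_t GridSize = ceil_div(N, NPerBlock) * ceil_div(K, KPerBlock) *
                                 ceil_div(Ho, HoPerBlock) * ceil_div(Wo, WoPerBlock);

    std::printf("GridSize = %u\n", GridSize); // 32 * 2 * 16 * 2 = 2048 workgroups
    return 0;
}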
driver/driver.hip.cpp

@@ -12,7 +12,7 @@
 //#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
-//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
+#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"

 struct GeneratorTensor_1

@@ -605,8 +605,10 @@ int main(int argc, char* argv[])
     device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
 #elif 0
     device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
+#elif 1
+    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
 #elif 0
     device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
 #endif
src/include/Array.hip.hpp

@@ -24,7 +24,7 @@ struct Array
     {
         Array<TData, NSize + 1> new_array;

-        static_for<0, NSize, 1>{}([=](auto I) {
+        static_for<0, NSize, 1>{}([&](auto I) {
            constexpr index_t i = I.Get();

            new_array[i] = mData[i];
        });
src/include/ConstantTensorDescriptor.hip.hpp

@@ -137,11 +137,16 @@ struct ConstantTensorDescriptor
     }

     template <index_t... Is>
-    __host__ __device__ static constexpr index_t Get1dIndex(Sequence<Is...> multi_id)
+    __host__ __device__ static constexpr index_t Get1dIndex(Sequence<Is...> /* multi_id */)
     {
         static_assert(sizeof...(Is) == nDim, "wrong! Dimension not consistent");

-        return Get1dIndex(Is...);
+        constexpr auto multi_id = Sequence<Is...>{};
+
+        constexpr auto seq_tmp =
+            transform_sequences(mod_conv::multiplies<index_t>{}, multi_id, GetStrides());
+
+        return accumulate_on_sequence(seq_tmp, mod_conv::plus<index_t>{}, Number<0>{});
     }

     __host__ __device__ static Array<index_t, nDim> GetMultiIndex(index_t id)
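The rewritten Get1dIndex above computes the flat offset as a compile-time dot product of the multi-index with the descriptor strides, instead of forwarding to the variadic overload. A minimal host-side sketch of the same rule, without the Sequence/Number machinery; the packed NCHW lengths used below are an assumption for illustration.

// Sketch: offset = sum_i multi_id[i] * stride[i], the rule behind Get1dIndex.
#include <array>
#include <cstdio>

using index_t = unsigned int;

index_t get_1d_index(const std::array<index_t, 4>& multi_id, const std::array<index_t, 4>& strides)
{
    index_t offset = 0;
    for(index_t i = 0; i < 4; ++i)
        offset += multi_id[i] * strides[i];
    return offset;
}

int main()
{
    // assumed packed NCHW tensor with lengths {2, 8, 4, 16} -> strides {512, 64, 16, 1}
    const std::array<index_t, 4> strides{512, 64, 16, 1};
    const std::array<index_t, 4> multi_id{1, 3, 2, 5};
    std::printf("offset = %u\n", get_1d_index(multi_id, strides)); // 512 + 192 + 32 + 5 = 741
    return 0;
}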
src/include/Sequence.hip.hpp

@@ -246,7 +246,8 @@ struct accumulate_on_sequence_f
 };

 template <class Seq, class Reduce, index_t I>
-__host__ __device__ constexpr index_t accumulate_on_sequence(Seq, Reduce, Number<I>)
+__host__ __device__ constexpr index_t
+accumulate_on_sequence(Seq, Reduce, Number<I> /*initial_value*/)
 {
     constexpr index_t a =
         static_const_reduce_n<Seq::mSize>{}(accumulate_on_sequence_f<Seq>{}, Reduce{});
src/include/blockwise_2d_tensor_op.hip.hpp

@@ -471,7 +471,6 @@ struct Blockwise2dTensorCopy3
                           DstDesc{}.GetStride(I0) % DataPerRead == 0,
                       "src and dst stride should be multiple of DataPerRead to keep alignment");

        constexpr index_t L0 = CopyLengths{}.Get(I0);
        constexpr index_t L1 = CopyLengths{}.Get(I1);

        constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
src/include/blockwise_4d_tensor_op.hip.hpp

@@ -761,339 +761,3 @@ struct Blockwise4dTensorCopyReorder1
            SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
    }
};

template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SrcLengths,
          class SrcSubLengths,
          class SrcClusterLengths,
          class MapDst2Src,
          class MapThreadCluster2SrcCluster,
          index_t SrcDataPerRead,
          index_t DstDataPerWrite>
struct Blockwise4dTensorCopyReorder3
{
    static constexpr index_t nDim = SrcLengths::GetSize();

    index_t mSrcMyThreadOffset;
    index_t mDstMyThreadOffset;

    __device__ Blockwise4dTensorCopyReorder3()
    {
        constexpr auto src_desc = SrcDesc{};
        constexpr auto dst_desc = DstDesc{};

        constexpr auto src_lengths = SrcLengths{};

        constexpr auto map_dst2src = MapDst2Src{};

        constexpr auto src_sub_lengths = SrcSubLengths{};
        constexpr auto dst_sub_lengths = src_sub_lengths.ReorderGivenNew2Old(map_dst2src);

        constexpr auto map_thread_cluster_2_src_cluster = MapThreadCluster2SrcCluster{};

        constexpr auto src_cluster_lengths = SrcClusterLengths{};
        constexpr auto thread_cluster_lengths =
            src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(thread_cluster_lengths);

        // sanity check: data type
        static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");

        // sanity check: nDim
        static_assert(SrcDesc::GetDimension() == nDim && DstDesc::GetDimension() == nDim &&
                          SrcLengths::GetSize() == nDim && SrcSubLengths::GetSize() == nDim &&
                          SrcClusterLengths::GetSize() == nDim && MapDst2Src::GetSize() == nDim &&
                          MapThreadCluster2SrcCluster::GetSize() == nDim,
                      "wrong! nDim is not consistent\n");

        // sanity check: BlockSize
        constexpr index_t num_active_thread = thread_cluster_desc.GetElementSize();

        static_assert(BlockSize >= num_active_thread,
                      "wrong! BlockSize is not big enough for ThreadPerDims!");

        // sanity check: work division
        static_for<0, nDim, 1>{}([](auto IDim) {
            constexpr auto I = decltype(IDim){};

            constexpr index_t src_len         = src_lengths.Get(I);
            constexpr index_t src_sub_len     = src_sub_lengths.Get(I);
            constexpr index_t src_cluster_len = src_cluster_lengths.Get(I);

            static_assert(src_len % (src_sub_len * src_cluster_len) == 0,
                          "wrong! cannot evenly divide Src tensor lengths");
        });

        // sanity check: src read
        static_assert(SrcDataPerRead == 1 || SrcDataPerRead == 2 || SrcDataPerRead == 4,
                      "wrong! only support SrcDataPerRead == 1, 2 or 4!\n");

        static_assert(SrcDataPerRead == 1 || src_desc.GetStride(Number<nDim - 1>{}) == 1,
                      "wrong! only support src.stride(nDim-1) == 1 if SrcDataPerRead > 1!\n");

        static_assert(src_sub_lengths.Get(Number<nDim - 1>{}) % SrcDataPerRead == 0,
                      "wrong! src_sub_lengths[nDim-1] % SrcDataPerRead != 0\n");

        static_assert(src_desc.GetStride(Number<nDim - 2>{}) % SrcDataPerRead == 0,
                      "wrong! should satisfy src_desc.stride(nDim-2) % SrcDataPerRead == 0, to "
                      "keep alignment");

        // sanity check: dst write
        static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4,
                      "wrong! only support DstDataPerWrite == 1, 2 or 4!\n");

        static_assert(DstDataPerWrite == 1 || dst_desc.GetStride(Number<nDim - 1>{}) == 1,
                      "wrong! only support dst.stride(nDim-1) == 1 if DstDataPerWrite > 1!\n");

        static_assert(dst_sub_lengths.Get(Number<nDim - 1>{}) % DstDataPerWrite == 0,
                      "wrong! dst_sub_lengths[nDim-1] % DstDataPerWrite != 0\n");

        static_assert(dst_desc.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
                      "wrong! should satisfy dst_desc.stride(nDim-2) % DstDataPerWrite == 0, to "
                      "keep alignment");

        // start dividing work
        if(BlockSize > num_active_thread)
        {
            if(get_thread_local_1d_id() >= num_active_thread)
            {
                return;
            }
        }

        const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());

        // compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
        // regsiters, or only one copy???
        auto src_data_multi_id =
            reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster);

        static_for<0, nDim, 1>{}([&](auto IDim) {
            constexpr auto I    = decltype(IDim){};
            constexpr index_t i = I.Get();

            // compiler: will it really compute index here, or be associated with Get1dIndex and
            // optimized away???
            src_data_multi_id[i] *= src_sub_lengths.Get(I);
        });

        // compiler: will it really compute index here, or be associated with Get1dIndex and
        // optimized away???
        const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src);

        mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id);
        mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id);

#if 0
        if(get_block_1d_id() == 0)
        {
            printf("tid %5u, "
                   "thread_multi_id %5u %5u %5u %5u, "
                   "src_data_multi_id %5u %5u %5u %5u, "
                   "dst_data_multi_id %5u %5u %5u %5u, "
                   "mSrcMyThreadOffset %u, mDstMyThreadOffset %u\n",
                   get_thread_local_1d_id(),
                   thread_multi_id[0],
                   thread_multi_id[1],
                   thread_multi_id[2],
                   thread_multi_id[3],
                   src_data_multi_id[0],
                   src_data_multi_id[1],
                   src_data_multi_id[2],
                   src_data_multi_id[3],
                   dst_data_multi_id[0],
                   dst_data_multi_id[1],
                   dst_data_multi_id[2],
                   dst_data_multi_id[3],
                   mSrcMyThreadOffset,
                   mDstMyThreadOffset);
        }
#endif
    }

    __device__ static constexpr index_t GetRegisterClipboardSize()
    {
        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});

        constexpr auto cluster_per_dims = transform_sequences(
            mod_conv::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

        constexpr auto thread_tensor_lengths = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, cluster_per_dims);

        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);

        return thread_tensor_desc.GetElementSpace();
    }

    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
                                             Float* __restrict__ p_clipboard) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});

        constexpr auto cluster_per_dims = transform_sequences(
            mod_conv::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

        constexpr auto thread_tensor_lengths = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, cluster_per_dims);

        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);

        constexpr auto thread_sub_tensor_desc =
            make_ConstantTensorDescriptor(SrcClusterLengths{}, thread_tensor_desc.GetStrides());

#if 1
        for(index_t icluster_d0 = 0; icluster_d0 < cluster_per_dims.Get(I0); ++icluster_d0)
        {
            for(index_t icluster_d1 = 0; icluster_d1 < cluster_per_dims.Get(I1); ++icluster_d1)
            {
                for(index_t icluster_d2 = 0; icluster_d2 < cluster_per_dims.Get(I2); ++icluster_d2)
                {
                    for(index_t icluster_d3 = 0; icluster_d3 < cluster_per_dims.Get(I3);
                        ++icluster_d3)
                    {
                        const index_t src_offset = SrcDesc{}.Get1dIndex(
                            icluster_d0 * src_data_per_cluster_per_dims.Get(I0),
                            icluster_d1 * src_data_per_cluster_per_dims.Get(I1),
                            icluster_d2 * src_data_per_cluster_per_dims.Get(I2),
                            icluster_d3 * src_data_per_cluster_per_dims.Get(I3));

                        const index_t clipboard_offset = thread_tensor_desc.Get1dIndex(
                            icluster_d0 * thread_sub_tensor_lengths.Get(I0),
                            icluster_d1 * thread_sub_tensor_lengths.Get(I1),
                            icluster_d2 * thread_sub_tensor_lengths.Get(I2),
                            icluster_d3 * thread_sub_tensor_lengths.Get(I3));

                        threadwise_nd_tensor_copy(SrcDesc{},
                                                  p_src + src_offset + mSrcMyThreadOffset,
                                                  thread_tensor_desc,
                                                  p_clipboard + clipboard_offset,
                                                  thread_sub_tensor_lengths,
                                                  Number<SrcDataPerRead>{});
                    }
                }
            }
        }
#else
        static_ford<decltype(cluster_per_dims)>{}([=](auto cluster_ids) {

        });
#endif

#if 0
        if(get_block_1d_id() == 0)
        {
            printf("tid %5u, "
                   "data: %f %f %f %f %f %f %f %f\n",
                   get_thread_local_1d_id(),
                   p_clipboard[0],
                   p_clipboard[1],
                   p_clipboard[2],
                   p_clipboard[3],
                   p_clipboard[4],
                   p_clipboard[5],
                   p_clipboard[6],
                   p_clipboard[7]);
        }
#endif
    }

    __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
                                              Float* __restrict__ p_dst) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});

        constexpr auto cluster_per_dims = transform_sequences(
            mod_conv::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

        constexpr auto thread_tensor_lengths = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, cluster_per_dims);

        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);

        constexpr auto thread_sub_tensor_desc =
            make_ConstantTensorDescriptor(SrcClusterLengths{}, thread_tensor_desc.GetStrides());

        for(index_t icluster_d0 = 0; icluster_d0 < cluster_per_dims.Get(I0); ++icluster_d0)
        {
            for(index_t icluster_d1 = 0; icluster_d1 < cluster_per_dims.Get(I1); ++icluster_d1)
            {
                for(index_t icluster_d2 = 0; icluster_d2 < cluster_per_dims.Get(I2); ++icluster_d2)
                {
                    for(index_t icluster_d3 = 0; icluster_d3 < cluster_per_dims.Get(I3);
                        ++icluster_d3)
                    {
                        const index_t clipboard_offset = thread_tensor_desc.Get1dIndex(
                            icluster_d0 * thread_sub_tensor_lengths.Get(I0),
                            icluster_d1 * thread_sub_tensor_lengths.Get(I1),
                            icluster_d2 * thread_sub_tensor_lengths.Get(I2),
                            icluster_d3 * thread_sub_tensor_lengths.Get(I3));

                        const auto dst_multi_id = reorder_array_given_new2old(
                            Array<index_t, nDim>{
                                icluster_d0 * src_data_per_cluster_per_dims.Get(I0),
                                icluster_d1 * src_data_per_cluster_per_dims.Get(I1),
                                icluster_d2 * src_data_per_cluster_per_dims.Get(I2),
                                icluster_d3 * src_data_per_cluster_per_dims.Get(I3)},
                            MapDst2Src{});

                        const index_t dst_offset = DstDesc{}.Get1dIndex(dst_multi_id);

#if 0
                        if(get_block_1d_id() == 0)
                        {
                            printf("tid %5u, "
                                   "clipboard_offsetm %5u, dst_offset %5u\n",
                                   get_thread_local_1d_id(),
                                   clipboard_offset,
                                   dst_offset);
                        }
#endif

                        threadwise_4d_tensor_copy_reorder_given_dst2src_v2(
                            thread_tensor_desc,
                            p_clipboard + clipboard_offset,
                            DstDesc{},
                            p_dst + dst_offset + mDstMyThreadOffset,
                            thread_sub_tensor_lengths,
                            MapDst2Src{});
                    }
                }
            }
        }
    }

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        Float p_clipboard[GetRegisterClipboardSize()];

        RunLoadRegisterClipboard(p_src, p_clipboard);
        RunStoreRegisterClipboard(p_clipboard, p_dst);
    }
};
src/include/blockwise_batched_gemm.hip.hpp

@@ -53,7 +53,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
        constexpr index_t M = a_block_mtx.NCol(); // A is transposed
        constexpr index_t N = b_block_mtx.NCol();
        constexpr index_t K = a_block_mtx.NRow();

        constexpr index_t MPerThread = c_thread_mtx.NRow();
        constexpr index_t NPerThread = c_thread_mtx.NCol();

@@ -114,8 +113,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
    __device__ MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) const
    {
        constexpr index_t BatchThreadWork = BatchSize / BatchPerThread;

        constexpr index_t ThreadPerLevel1Cluster =
            MLevel0Cluster * NLevel0Cluster * MLevel1Cluster * NLevel1Cluster;
src/include/blockwise_nd_tensor_op.hip.hpp (new file, mode 100644)

#pragma once
#include "threadwise_nd_tensor_op.hip.hpp"

template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SrcLengths,
          class SrcSubLengths,
          class SrcClusterLengths,
          class MapDst2Src,
          class MapThreadCluster2SrcCluster,
          index_t SrcDataPerRead,
          index_t DstDataPerWrite>
struct BlockwiseNdTensorCopyReorder_v3
{
    static constexpr index_t nDim = SrcLengths::GetSize();

    index_t mSrcMyThreadOffset;
    index_t mDstMyThreadOffset;

    __device__ BlockwiseNdTensorCopyReorder_v3()
    {
        constexpr auto src_desc = SrcDesc{};
        constexpr auto dst_desc = DstDesc{};

        constexpr auto src_lengths = SrcLengths{};

        constexpr auto map_dst2src = MapDst2Src{};

        constexpr auto src_sub_lengths = SrcSubLengths{};
        constexpr auto dst_sub_lengths = src_sub_lengths.ReorderGivenNew2Old(map_dst2src);

        constexpr auto map_thread_cluster_2_src_cluster = MapThreadCluster2SrcCluster{};

        constexpr auto src_cluster_lengths = SrcClusterLengths{};
        constexpr auto thread_cluster_lengths =
            src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(thread_cluster_lengths);

        // sanity check: data type
        static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");

        // sanity check: nDim
        static_assert(SrcDesc::GetDimension() == nDim && DstDesc::GetDimension() == nDim &&
                          SrcLengths::GetSize() == nDim && SrcSubLengths::GetSize() == nDim &&
                          SrcClusterLengths::GetSize() == nDim && MapDst2Src::GetSize() == nDim &&
                          MapThreadCluster2SrcCluster::GetSize() == nDim,
                      "wrong! nDim is not consistent\n");

        // sanity check: BlockSize
        constexpr index_t num_active_thread = thread_cluster_desc.GetElementSize();

        static_assert(BlockSize >= num_active_thread,
                      "wrong! BlockSize is not big enough for ThreadPerDims!");

        // sanity check: work division
        static_for<0, nDim, 1>{}([](auto IDim) {
            constexpr auto I = decltype(IDim){};

            constexpr index_t src_len         = src_lengths.Get(I);
            constexpr index_t src_sub_len     = src_sub_lengths.Get(I);
            constexpr index_t src_cluster_len = src_cluster_lengths.Get(I);

            static_assert(src_len % (src_sub_len * src_cluster_len) == 0,
                          "wrong! cannot evenly divide Src tensor lengths");
        });

        // sanity check: src read
        static_assert(SrcDataPerRead == 1 || SrcDataPerRead == 2 || SrcDataPerRead == 4,
                      "wrong! only support SrcDataPerRead == 1, 2 or 4!\n");

        static_assert(SrcDataPerRead == 1 || src_desc.GetStride(Number<nDim - 1>{}) == 1,
                      "wrong! only support src.stride(nDim-1) == 1 if SrcDataPerRead > 1!\n");

        static_assert(src_sub_lengths.Get(Number<nDim - 1>{}) % SrcDataPerRead == 0,
                      "wrong! src_sub_lengths[nDim-1] % SrcDataPerRead != 0\n");

        static_assert(src_desc.GetStride(Number<nDim - 2>{}) % SrcDataPerRead == 0,
                      "wrong! should satisfy src_desc.stride(nDim-2) % SrcDataPerRead == 0, to "
                      "keep alignment");

        // sanity check: dst write
        static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4,
                      "wrong! only support DstDataPerWrite == 1, 2 or 4!\n");

        static_assert(DstDataPerWrite == 1 || dst_desc.GetStride(Number<nDim - 1>{}) == 1,
                      "wrong! only support dst.stride(nDim-1) == 1 if DstDataPerWrite > 1!\n");

        static_assert(dst_sub_lengths.Get(Number<nDim - 1>{}) % DstDataPerWrite == 0,
                      "wrong! dst_sub_lengths[nDim-1] % DstDataPerWrite != 0\n");

        static_assert(dst_desc.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
                      "wrong! should satisfy dst_desc.stride(nDim-2) % DstDataPerWrite == 0, to "
                      "keep alignment");

        // start dividing work
        if(BlockSize > num_active_thread)
        {
            if(get_thread_local_1d_id() >= num_active_thread)
            {
                return;
            }
        }

        const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());

        // compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
        // regsiters, or only one copy???
        auto src_data_multi_id =
            reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster);

        static_for<0, nDim, 1>{}([&](auto IDim) {
            constexpr auto I    = decltype(IDim){};
            constexpr index_t i = I.Get();

            // compiler: will it really compute index here, or be associated with Get1dIndex and
            // optimized away???
            src_data_multi_id[i] *= src_sub_lengths.Get(I);
        });

        // compiler: will it really compute index here, or be associated with Get1dIndex and
        // optimized away???
        const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src);

        mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id);
        mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id);
    }

    __device__ static constexpr index_t GetRegisterClipboardSize()
    {
        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});

        constexpr auto repeat_lengths = transform_sequences(
            mod_conv::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

        constexpr auto thread_tensor_lengths = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, repeat_lengths);

        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);

        return thread_tensor_desc.GetElementSpace();
    }

    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
                                             Float* __restrict__ p_clipboard) const
    {
        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});

        constexpr auto repeat_lengths = transform_sequences(
            mod_conv::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

        constexpr auto thread_tensor_lengths = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, repeat_lengths);

        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

            constexpr auto src_data_multi_id = transform_sequences(
                mod_conv::multiplies<index_t>{}, repeat_multi_id, src_data_per_cluster_per_dims);

            constexpr auto clipboard_data_multi_id = transform_sequences(
                mod_conv::multiplies<index_t>{}, repeat_multi_id, thread_sub_tensor_lengths);

            constexpr index_t src_offset = SrcDesc{}.Get1dIndex(src_data_multi_id);

            constexpr index_t clipboard_offset =
                thread_tensor_desc.Get1dIndex(clipboard_data_multi_id);

            threadwise_nd_tensor_copy(SrcDesc{},
                                      p_src + src_offset + mSrcMyThreadOffset,
                                      thread_tensor_desc,
                                      p_clipboard + clipboard_offset,
                                      thread_sub_tensor_lengths,
                                      Number<SrcDataPerRead>{});
        });
    }

    __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
                                              Float* __restrict__ p_dst) const
    {
        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});

        constexpr auto repeat_lengths = transform_sequences(
            mod_conv::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

        constexpr auto thread_tensor_lengths = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, repeat_lengths);

        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

            constexpr auto clipboard_data_multi_id = transform_sequences(
                mod_conv::multiplies<index_t>{}, repeat_multi_id, thread_sub_tensor_lengths);

            constexpr auto src_data_multi_id = transform_sequences(
                mod_conv::multiplies<index_t>{}, repeat_multi_id, src_data_per_cluster_per_dims);

            // reorder src_data_multi_id to get dst_data_multi_id
            constexpr auto dst_data_multi_id = src_data_multi_id.ReorderGivenNew2Old(MapDst2Src{});

            constexpr index_t clipboard_offset =
                thread_tensor_desc.Get1dIndex(clipboard_data_multi_id);

            constexpr index_t dst_offset = DstDesc{}.Get1dIndex(dst_data_multi_id);

            // write in the order of dst
#if 1
            threadwise_nd_tensor_copy_reorder_given_dst2src_v2(thread_tensor_desc,
                                                               p_clipboard + clipboard_offset,
                                                               DstDesc{},
                                                               p_dst + dst_offset + mDstMyThreadOffset,
                                                               thread_sub_tensor_lengths,
                                                               MapDst2Src{});
#else
            threadwise_nd_tensor_copy_reorder_given_dst2src_v3(thread_tensor_desc,
                                                               p_clipboard + clipboard_offset,
                                                               DstDesc{},
                                                               p_dst + dst_offset + mDstMyThreadOffset,
                                                               thread_sub_tensor_lengths,
                                                               MapDst2Src{},
                                                               Number<DstDataPerWrite>{});
#endif
        });
    }

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        Float p_clipboard[GetRegisterClipboardSize()];

        RunLoadRegisterClipboard(p_src, p_clipboard);
        RunStoreRegisterClipboard(p_clipboard, p_dst);
    }
};
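The register clipboard in the new BlockwiseNdTensorCopyReorder_v3 is sized per thread from SrcSubLengths and the per-dimension repeat counts: repeat_lengths[i] = ceil(SrcLengths[i] / (SrcSubLengths[i] * SrcClusterLengths[i])) and thread_tensor_lengths[i] = SrcSubLengths[i] * repeat_lengths[i]. A small host-side sketch of that sizing rule (ignoring any padding GetElementSpace() might add); the lengths below mirror the #elif 1 configuration of the new driver above, so they are illustrative rather than authoritative.

// Sketch of GetRegisterClipboardSize(): how many elements a thread keeps in registers.
#include <cstdio>

using index_t = unsigned int;

constexpr index_t ceil_div(index_t a, index_t b) { return (a + b - 1) / b; }

int main()
{
    constexpr index_t nDim = 4;
    constexpr index_t src_lengths[nDim]     = {2, 8, 2, 16}; // NPerBlock, CPerBlock, HoPerBlock, WoPerBlock
    constexpr index_t sub_lengths[nDim]     = {2, 1, 2, 1};  // per-thread sub-tensor
    constexpr index_t cluster_lengths[nDim] = {1, 8, 1, 16}; // thread cluster

    index_t clipboard = 1;
    for(index_t i = 0; i < nDim; ++i)
    {
        const index_t repeats = ceil_div(src_lengths[i], sub_lengths[i] * cluster_lengths[i]);
        clipboard *= sub_lengths[i] * repeats;
    }
    std::printf("register clipboard elements per thread = %u\n", clipboard); // 2 * 1 * 2 * 1 = 4
    return 0;
}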
src/include/conv_common.hip.hpp

@@ -73,7 +73,6 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
 template <class InDesc, class WeiDesc, class OutDesc>
 __host__ __device__ constexpr std::size_t calculate_convolution_flops(InDesc, WeiDesc, OutDesc)
 {
     constexpr auto in_desc  = InDesc{};
     constexpr auto wei_desc = WeiDesc{};
     constexpr auto out_desc = OutDesc{};
src/include/data_type.hip.hpp

 #pragma once
 #include "config.h"
+#include "constant_integral.hip.hpp"

 template <class T, index_t N>
 struct vector_type

@@ -10,6 +11,13 @@ template <>
 struct vector_type<float, 1>
 {
     typedef float MemoryType;
+
+    template <index_t I>
+    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+    {
+        static_assert(I < 1, "wrong");
+        *(reinterpret_cast<float*>(&v) + I) = s;
+    }
 };

 template <>

@@ -20,21 +28,29 @@ struct vector_type<float, 2>
     // instruction
     typedef float MemoryType __attribute__((ext_vector_type(2)));
 #elif DEVICE_BACKEND_CUDA
-    // For some reason, CUDA need this definition to, otherwise
+    // For some reason, CUDA need this definition, otherwise
     // compiler won't generate optimal load and store instruction, and
     // kernel would produce wrong result, indicating the compiler fail to generate correct
     // instruction,
     using MemoryType = float2;
 #endif

-    __host__ __device__ static MemoryType Pack(float s0, float s1)
-    {
-        union
-        {
-            MemoryType vector;
-            float scalar[2];
-        } data;
+    union Data
+    {
+        MemoryType vector;
+        float scalar[2];
+    };
+
+    template <index_t I>
+    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+    {
+        static_assert(I < 2, "wrong");
+        *(reinterpret_cast<float*>(&v) + I) = s;
+    }
+
+    __host__ __device__ static MemoryType Pack(float s0, float s1)
+    {
+        Data data;

         data.scalar[0] = s0;
         data.scalar[1] = s1;
         return data.vector;

@@ -49,12 +65,19 @@ struct vector_type<float, 4>
     // instruction
     typedef float MemoryType __attribute__((ext_vector_type(4)));
 #elif DEVICE_BACKEND_CUDA
-    // For some reason, CUDA need this definition to, otherwise
+    // For some reason, CUDA need this definition, otherwise
     // compiler won't generate optimal load and store instruction, and
     // kernel would produce wrong result, indicating the compiler fail to generate correct
     // instruction,
     using MemoryType = float4;
 #endif

+    template <index_t I>
+    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+    {
+        static_assert(I < 4, "wrong");
+        *(reinterpret_cast<float*>(&v) + I) = s;
+    }
 };

 #if 0
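The rewritten Pack above now round-trips through the new Data union instead of relying only on pointer casts, and SetScalar fills one lane of the vector register at a time. A small host-only sketch of the same union-based packing idea (standalone; float2_t here is an assumed stand-in struct, not the project's vector MemoryType):

// Sketch of union-based packing, in the spirit of vector_type<float, 2>::Pack.
#include <cstdio>

struct float2_t { float x, y; }; // stand-in for the 2-wide vector register type

static float2_t pack(float s0, float s1)
{
    union Data {
        float2_t vector;
        float scalar[2];
    } data;
    data.scalar[0] = s0; // fill the scalar view
    data.scalar[1] = s1;
    return data.vector;  // read back the vector view
}

int main()
{
    const float2_t v = pack(1.0f, 2.0f);
    std::printf("%f %f\n", v.x, v.y);
    return 0;
}

Strictly speaking, reading a union member other than the one last written is type punning that ISO C++ does not bless, but it is the idiom the kernel code relies on and the compilers targeted here handle it as expected.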
src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp

@@ -4,7 +4,7 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
 #include "blockwise_3d_tensor_op.hip.hpp"
-#include "blockwise_4d_tensor_op.hip.hpp"
+#include "blockwise_nd_tensor_op.hip.hpp"
 #include "threadwise_nd_tensor_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -125,17 +125,17 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
         constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

         const auto blockwise_in_copy_reorder =
-            Blockwise4dTensorCopyReorder3<BlockSize,
-                                          Float,
-                                          decltype(in_n_c_h_w_global_desc),
-                                          decltype(in_c_h_w_n_block_desc),
-                                          Sequence<NPerBlock, CPerBlock, HoPerBlock, WiPerBlock>,
-                                          InBlockReorderSrcSubLengths_NCHW,
-                                          InBlockReorderSrcClusterLengths_NCHW,
-                                          decltype(map_chwn2nchw),
-                                          InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
-                                          InBlockReorderDataPerRead_W,
-                                          InBlockReorderDataPerWrite_N>{};
+            BlockwiseNdTensorCopyReorder_v3<BlockSize,
+                                            Float,
+                                            decltype(in_n_c_h_w_global_desc),
+                                            decltype(in_c_h_w_n_block_desc),
+                                            Sequence<NPerBlock, CPerBlock, HoPerBlock, WiPerBlock>,
+                                            InBlockReorderSrcSubLengths_NCHW,
+                                            InBlockReorderSrcClusterLengths_NCHW,
+                                            decltype(map_chwn2nchw),
+                                            InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
+                                            InBlockReorderDataPerRead_W,
+                                            InBlockReorderDataPerWrite_N>{};

         // blockwise wei copy
         //   format is [CPerBlock, X * KPerBlock]
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp

@@ -3,7 +3,7 @@
 #include "ConstantTensorDescriptor.hip.hpp"
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
-#include "blockwise_4d_tensor_op.hip.hpp"
+#include "blockwise_nd_tensor_op.hip.hpp"
 #include "threadwise_nd_tensor_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -133,17 +133,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
         constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

         const auto blockwise_in_copy_reorder =
-            Blockwise4dTensorCopyReorder3<BlockSize,
-                                          Float,
-                                          decltype(in_n_c_h_w_global_desc),
-                                          decltype(in_c_h_w_n_block_desc),
-                                          Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
-                                          InBlockReorderSrcSubLengths_NCHW,
-                                          InBlockReorderSrcClusterLengths_NCHW,
-                                          decltype(map_chwn2nchw),
-                                          InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
-                                          InBlockReorderDataPerRead_W,
-                                          InBlockReorderDataPerWrite_N>{};
+            BlockwiseNdTensorCopyReorder_v3<BlockSize,
+                                            Float,
+                                            decltype(in_n_c_h_w_global_desc),
+                                            decltype(in_c_h_w_n_block_desc),
+                                            Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
+                                            InBlockReorderSrcSubLengths_NCHW,
+                                            InBlockReorderSrcClusterLengths_NCHW,
+                                            decltype(map_chwn2nchw),
+                                            InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
+                                            InBlockReorderDataPerRead_W,
+                                            InBlockReorderDataPerWrite_N>{};

         // blockwise wei copy
         //   format is [CPerBlock, KPerBlock]
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp

@@ -3,7 +3,7 @@
 #include "ConstantTensorDescriptor.hip.hpp"
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
-#include "blockwise_4d_tensor_op.hip.hpp"
+#include "blockwise_nd_tensor_op.hip.hpp"
 #include "threadwise_nd_tensor_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -130,17 +130,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
         constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

         const auto blockwise_in_copy_reorder =
-            Blockwise4dTensorCopyReorder3<BlockSize,
-                                          Float,
-                                          decltype(in_n_c_h_w_global_desc),
-                                          decltype(in_c_h_w_n_block_desc),
-                                          Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
-                                          InBlockReorderSrcSubLengths_NCHW,
-                                          InBlockReorderSrcClusterLengths_NCHW,
-                                          decltype(map_chwn2nchw),
-                                          InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
-                                          InBlockReorderDataPerRead_W,
-                                          InBlockReorderDataPerWrite_N>{};
+            BlockwiseNdTensorCopyReorder_v3<BlockSize,
+                                            Float,
+                                            decltype(in_n_c_h_w_global_desc),
+                                            decltype(in_c_h_w_n_block_desc),
+                                            Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
+                                            InBlockReorderSrcSubLengths_NCHW,
+                                            InBlockReorderSrcClusterLengths_NCHW,
+                                            decltype(map_chwn2nchw),
+                                            InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
+                                            InBlockReorderDataPerRead_W,
+                                            InBlockReorderDataPerWrite_N>{};

         // blockwise wei copy
         //   format is [CPerBlock, KPerBlock]
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp (new file, mode 100644)

(This diff is collapsed in the original view.)
src/include/threadwise_4d_tensor_op.hip.hpp

@@ -139,135 +139,6 @@ __device__ void threadwise_4d_tensor_copy_reorder_given_dst2src(SrcDesc,
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
}

#if 0 // replaced threadwise_nd_tensor_copy
template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths>
__device__ void threadwise_4d_tensor_copy(
    SrcDesc, const SrcData* __restrict__ p_src, DstDesc, DstData* __restrict__ p_dst, SrcOpLengths)
{
    auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};

    threadwise_4d_tensor_copy_reorder_given_dst2src(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
}

// need to assume src and dst is aligned
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
__device__ void threadwise_4d_tensor_copy_v2(SrcDesc,
                                             const Float* __restrict__ p_src,
                                             DstDesc,
                                             Float* __restrict__ p_dst,
                                             SrcOpLengths,
                                             Number<DataPerRead>)
{
    static_assert(SrcDesc{}.GetDimension() == 4 && DstDesc{}.GetDimension() == 4 &&
                      SrcOpLengths::GetSize() == 4,
                  "wrong! should be 4 dimension");

    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});

    static_assert(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1,
                  "wrong! only support stride3 == 1!\n");

    static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
                  "wrong! only support DataPerRead == 1, 2 or 4!\n");

    static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
                      DstDesc{}.GetStride(I2) % DataPerRead == 0,
                  "wrong! src and dst stride should be multiple of DataPerRead to keep alignment");

    constexpr index_t L3 = SrcOpLengths{}.Get(I3);

    static_assert(L3 % DataPerRead == 0, "wrong! L3 should be evenly divided by DataPerRead");

    constexpr index_t nloop_d3 = L3 / DataPerRead;

    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
    {
        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
        {
            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
            {
                for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
                {
                    const index_t src_index =
                        src_desc.Get1dIndex(did0, did1, did2, iloop_d3 * DataPerRead);

                    const index_t dst_index =
                        dst_desc.Get1dIndex(did0, did1, did2, iloop_d3 * DataPerRead);

                    *(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
                        *(reinterpret_cast<const vector_t*>(&p_src[src_index]));
                }
            }
        }
    }
}
#endif

template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths, class MapDst2Src>
__device__ void threadwise_4d_tensor_copy_reorder_given_dst2src_v2(SrcDesc,
                                                                   const SrcData* __restrict__ p_src,
                                                                   DstDesc,
                                                                   DstData* __restrict__ p_dst,
                                                                   SrcOpLengths,
                                                                   MapDst2Src)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr index_t IR0 = MapDst2Src{}.Get(I0);
    constexpr index_t IR1 = MapDst2Src{}.Get(I1);
    constexpr index_t IR2 = MapDst2Src{}.Get(I2);
    constexpr index_t IR3 = MapDst2Src{}.Get(I3);

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    // ref_desc has dst_desc's ordering
    constexpr auto ref_desc =
        make_ConstantTensorDescriptor(SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{}));

    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
    {
        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
        {
            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
            {
                for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
                {
                    const auto dst_multi_id = Array<index_t, 4>{did0, did1, did2, did3};

                    const auto src_multi_id =
                        reorder_array_given_old2new(dst_multi_id, MapDst2Src{});

                    const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
                    const index_t src_index = src_desc.Get1dIndex(src_multi_id);

                    p_dst[dst_index] = p_src[src_index];
                }
            }
        }
    }
}

template <class Float, class Desc, class IDim, class NShift>
__device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
{
src/include/threadwise_nd_tensor_op.hip.hpp

@@ -50,7 +50,7 @@ __device__ void threadwise_nd_tensor_copy(SrcDesc,
     constexpr index_t nRead = L_Back / DataPerRead;

     static_ford<decltype(ref_desc.GetLengths().PopBack())>{}([=](auto Ids) {
-        static_for<0, nRead, 1>{}([=](auto IRead) {
+        static_for<0, nRead, 1>{}([&](auto IRead) {
             constexpr auto multi_id =
                 decltype(Ids){}.PushBack(Number<IRead.Get() * DataPerRead>{});

             const index_t src_index = src_desc.Get1dIndex(multi_id);

@@ -62,3 +62,131 @@ __device__ void threadwise_nd_tensor_copy(SrcDesc,
        });
    });
}

// write in order of src
template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths, class MapDst2Src>
__device__ void threadwise_nd_tensor_copy_reorder_given_dst2src_v1(SrcDesc,
                                                                   const SrcData* __restrict__ p_src,
                                                                   DstDesc,
                                                                   DstData* __restrict__ p_dst,
                                                                   SrcOpLengths,
                                                                   MapDst2Src)
{
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    ford<SrcOpLengths>{}([&](auto src_multi_id) {
        const auto dst_multi_id = reorder_array_given_new2old(src_multi_id, MapDst2Src{});

        const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
        const index_t src_index = src_desc.Get1dIndex(src_multi_id);

        p_dst[dst_index] = p_src[src_index];
    });
}

// write in order of dst
template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths, class MapDst2Src>
__device__ void threadwise_nd_tensor_copy_reorder_given_dst2src_v2(SrcDesc,
                                                                   const SrcData* __restrict__ p_src,
                                                                   DstDesc,
                                                                   DstData* __restrict__ p_dst,
                                                                   SrcOpLengths,
                                                                   MapDst2Src)
{
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    constexpr auto dst_op_lengths = SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{});

    ford<decltype(dst_op_lengths)>{}([&](auto dst_multi_id) {
        const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});

        const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
        const index_t src_index = src_desc.Get1dIndex(src_multi_id);

        p_dst[dst_index] = p_src[src_index];
    });
}

// write in order of dst
template <class Float,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
          class MapDst2Src,
          index_t DstDataPerWrite>
__device__ void threadwise_nd_tensor_copy_reorder_given_dst2src_v3(SrcDesc,
                                                                   const Float* __restrict__ p_src,
                                                                   DstDesc,
                                                                   Float* __restrict__ p_dst,
                                                                   SrcOpLengths,
                                                                   MapDst2Src,
                                                                   Number<DstDataPerWrite>)
{
    using vector_t = typename vector_type<Float, DstDataPerWrite>::MemoryType;

    constexpr index_t nDim = SrcOpLengths::GetSize();

    static_assert(DstDataPerWrite == 1 || DstDesc{}.GetStride(Number<nDim - 1>{}) == 1,
                  "wrong! only support dst.stride[nDim-1] == 1, if DstDataPerWrite != 1");

    static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4,
                  "wrong! only support DstDataPerWrite == 1, 2 or 4");

    static_assert(DstDesc{}.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
                  "wrong! dst.stride[nDim-2] should be multiple of DstDataPerWrite to keep alignment");

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    constexpr auto dst_op_lengths = SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{});

    constexpr index_t L_Dst_Back = dst_op_lengths.Back();

    static_assert(L_Dst_Back % DstDataPerWrite == 0,
                  "wrong! dst.lengths[nDim-1] should be evenly divided by DstDataPerWrite");

    constexpr index_t nWrite = L_Dst_Back / DstDataPerWrite;

    ford<decltype(dst_op_lengths.PopBack())>{}([&](auto ids) {
        static_for<0, nWrite, 1>{}([&](auto IWrite) {
            vector_t dst_vec_data;

            // pack data
            static_for<0, DstDataPerWrite, 1>{}([&](auto IDstData) {
                const auto dst_multi_id =
                    ids.PushBack(IWrite.Get() * DstDataPerWrite + IDstData.Get());

                const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});

                const index_t src_index = src_desc.Get1dIndex(src_multi_id);

                vector_type<Float, DstDataPerWrite>::SetScalar(
                    dst_vec_data, p_src[src_index], IDstData);
            });

            // write data
            const auto dst_multi_id = ids.PushBack(IWrite.Get() * DstDataPerWrite);

            const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);

            *(reinterpret_cast<vector_t*>(&p_dst[dst_index])) = dst_vec_data;
        });
    });
}
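The v1/v2/v3 copy-reorder variants above differ mainly in which side (src or dst) drives the loop nest and whether writes are vectorized; in each case one multi-index is derived from the other through MapDst2Src. A minimal host-side sketch of one plausible reading of that index remapping, using the Sequence<1, 2, 0, 3> map that appears throughout this commit; the helper name and its exact new2old semantics below are illustrative assumptions, not lifted from the project.

// Sketch: remap a multi-index through a dimension map, new_id[i] = old_id[map[i]].
#include <array>
#include <cstdio>

using index_t = unsigned int;
constexpr index_t nDim = 4;

std::array<index_t, nDim> remap_new_from_old(const std::array<index_t, nDim>& old_id,
                                             const std::array<index_t, nDim>& map_new2old)
{
    std::array<index_t, nDim> new_id{};
    for(index_t i = 0; i < nDim; ++i)
        new_id[i] = old_id[map_new2old[i]];
    return new_id;
}

int main()
{
    constexpr std::array<index_t, nDim> map_dst2src{1, 2, 0, 3}; // dst dim i reads src dim map[i]
    const std::array<index_t, nDim> src_id{7, 3, 5, 9};          // e.g. an (n, c, h, w) index
    const auto dst_id = remap_new_from_old(src_id, map_dst2src);
    std::printf("dst_multi_id = %u %u %u %u\n", dst_id[0], dst_id[1], dst_id[2], dst_id[3]); // 3 5 7 9
    return 0;
}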