Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
88b77181
"references/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "86db394eac46bd779d6a22afaecf09c9291f5c76"
Commit
88b77181
authored
Jun 11, 2019
by
Chao Liu
Browse files
rename files, added header guard, added namespace
parent
05e04665
Changes
62
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
136 additions
and
350 deletions
+136
-350
driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
+4
-2
driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
...er/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+6
-4
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
...er/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
+0
-282
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
...er/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+7
-5
driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
...er/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
+4
-2
driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
...er/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+6
-4
driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
...er/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
+6
-4
driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
...device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
+2
-0
driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
...ice_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
+2
-0
driver/driver.cpp
driver/driver.cpp
+4
-10
src/CMakeLists.txt
src/CMakeLists.txt
+1
-1
src/device.cpp
src/device.cpp
+1
-1
src/include/Array.hpp
src/include/Array.hpp
+15
-8
src/include/ConstantMatrixDescriptor.hpp
src/include/ConstantMatrixDescriptor.hpp
+9
-1
src/include/ConstantMergedTensorDescriptor.hpp
src/include/ConstantMergedTensorDescriptor.hpp
+8
-1
src/include/ConstantTensorDescriptor.hpp
src/include/ConstantTensorDescriptor.hpp
+16
-10
src/include/Sequence.hpp
src/include/Sequence.hpp
+8
-1
src/include/amd_inline_asm.hpp
src/include/amd_inline_asm.hpp
+8
-1
src/include/blockwise_2d_tensor_op.hpp
src/include/blockwise_2d_tensor_op.hpp
+12
-4
src/include/blockwise_3d_tensor_op.hpp
src/include/blockwise_3d_tensor_op.hpp
+17
-9
No files found.
driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
View file @
88b77181
#pragma once
#pragma once
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_
kernel_
wrapper.hpp"
#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
using
namespace
ck
;
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution_direct_v2_nchw_kcyx_nkhw
(
InDesc
,
void
device_convolution_direct_v2_nchw_kcyx_nkhw
(
InDesc
,
const
Tensor
<
T
>&
in
,
const
Tensor
<
T
>&
in
,
...
@@ -79,7 +81,7 @@ void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
...
@@ -79,7 +81,7 @@ void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
WoPerThread
,
WoPerThread
,
InBlockCopyDataPerRead
,
InBlockCopyDataPerRead
,
WeiBlockCopyDataPerRead
>
;
WeiBlockCopyDataPerRead
>
;
float
time
=
launch_kernel
(
run_gridwise_convolution
<
gridwise_conv
,
T
>
,
float
time
=
launch_kernel
(
run_gridwise_convolution
_kernel
<
gridwise_conv
,
T
>
,
dim3
(
GridSize
),
dim3
(
GridSize
),
dim3
(
BlockSize
),
dim3
(
BlockSize
),
0
,
0
,
...
...
driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
View file @
88b77181
#pragma once
#pragma once
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_
kernel_
wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
using
namespace
ck
;
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
(
InDesc
,
void
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
(
InDesc
,
...
@@ -478,7 +480,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
...
@@ -478,7 +480,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
#elif 0
#elif 0
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
#elif 1
#elif 1
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer
_chwn_cyxk_khwn
GridwiseConvolutionImplicitGemm_v1r3_
chwn_cyxk_khwn_
lds_double_buffer
#endif
#endif
<
GridSize
,
<
GridSize
,
BlockSize
,
BlockSize
,
...
@@ -509,7 +511,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
...
@@ -509,7 +511,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
WeiBlockCopyDataPerRead_K
,
WeiBlockCopyDataPerRead_K
,
OutThreadCopyDataPerWrite_N
>
{};
OutThreadCopyDataPerWrite_N
>
{};
float
time
=
launch_kernel
(
run_gridwise_convolution
<
decltype
(
gridwise_conv
),
T
>
,
float
time
=
launch_kernel
(
run_gridwise_convolution
_kernel
<
decltype
(
gridwise_conv
),
T
>
,
dim3
(
GridSize
),
dim3
(
GridSize
),
dim3
(
BlockSize
),
dim3
(
BlockSize
),
0
,
0
,
...
...
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
deleted
100644 → 0
View file @
05e04665
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
index_t
nrepeat
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_nchw_desc
=
InDesc
{};
constexpr
auto
wei_kcyx_desc
=
WeiDesc
{};
constexpr
auto
out_nkhw_desc
=
OutDesc
{};
constexpr
index_t
Hi
=
in_nchw_desc
.
GetLength
(
I2
);
constexpr
index_t
Wi
=
in_nchw_desc
.
GetLength
(
I3
);
constexpr
index_t
N
=
out_nkhw_desc
.
GetLength
(
I0
);
constexpr
index_t
Ho
=
out_nkhw_desc
.
GetLength
(
I2
);
constexpr
index_t
Wo
=
out_nkhw_desc
.
GetLength
(
I3
);
constexpr
index_t
K
=
wei_kcyx_desc
.
GetLength
(
I0
);
constexpr
index_t
C
=
wei_kcyx_desc
.
GetLength
(
I1
);
constexpr
index_t
Y
=
wei_kcyx_desc
.
GetLength
(
I2
);
constexpr
index_t
X
=
wei_kcyx_desc
.
GetLength
(
I3
);
// reorder weight
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
auto
f_reorder_kcyx2cyxk
=
[
&
](
auto
k
,
auto
c
,
auto
y
,
auto
x
)
{
wei_cyxk
(
c
,
y
,
x
,
k
)
=
wei_kcyx
(
k
,
c
,
y
,
x
);
};
make_ParallelTensorFunctor
(
f_reorder_kcyx2cyxk
,
K
,
C
,
Y
,
X
)(
std
::
thread
::
hardware_concurrency
());
// output
auto
out_khwn_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
K
,
Ho
,
Wo
,
N
>
{});
ostream_ConstantTensorDescriptor
(
out_khwn_desc
,
std
::
cout
<<
"out_khwn_desc: "
);
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_nchw_device_buf
(
data_sz
*
in_nchw
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_cyxk_device_buf
(
data_sz
*
wei_cyxk
.
mDesc
.
GetElementSpace
());
DeviceMem
out_khwn_device_buf
(
data_sz
*
out_khwn
.
mDesc
.
GetElementSpace
());
in_nchw_device_buf
.
ToDevice
(
in_nchw
.
mData
.
data
());
wei_cyxk_device_buf
.
ToDevice
(
wei_cyxk
.
mData
.
data
());
out_khwn_device_buf
.
ToDevice
(
out_khwn
.
mData
.
data
());
#if 1
// for 3x3, 34x34, v1r3, Pascal
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
NPerBlock
=
2
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
CPerBlock
=
8
;
constexpr
index_t
HoPerBlock
=
2
;
constexpr
index_t
WoPerBlock
=
16
;
constexpr
index_t
NPerThread
=
2
;
constexpr
index_t
KPerThread
=
8
;
constexpr
index_t
HoPerThread
=
1
;
constexpr
index_t
WoPerThread
=
4
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
2
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
2
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockReorderSrcSubLengths_NCHW
=
Sequence
<
2
,
1
,
2
,
1
>
;
using
InBlockReorderSrcClusterLengths_NCHW
=
Sequence
<
1
,
8
,
1
,
16
>
;
using
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW
=
Sequence
<
1
,
2
,
0
,
3
>
;
constexpr
index_t
InBlockReorderDataPerRead_W
=
1
;
// v1r3 cannot do vector load input for NCHW
constexpr
index_t
InBlockReorderDataPerWrite_N
=
2
;
using
WeiBlockCopyClusterLengths
=
void
;
constexpr
index_t
WeiBlockCopyDataPerRead_K
=
4
;
constexpr
index_t
OutThreadCopyDataPerWrite_N
=
2
;
#elif 1
// for 3x3, 34x34, v1r3, Vega 20
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
NPerBlock
=
2
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
CPerBlock
=
8
;
constexpr
index_t
HoPerBlock
=
4
;
constexpr
index_t
WoPerBlock
=
16
;
constexpr
index_t
NPerThread
=
2
;
constexpr
index_t
KPerThread
=
8
;
constexpr
index_t
HoPerThread
=
1
;
constexpr
index_t
WoPerThread
=
4
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
2
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
2
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockReorderSrcSubLengths_NCHW
=
Sequence
<
2
,
1
,
2
,
1
>
;
using
InBlockReorderSrcClusterLengths_NCHW
=
Sequence
<
1
,
8
,
2
,
16
>
;
using
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW
=
Sequence
<
1
,
2
,
0
,
3
>
;
constexpr
index_t
InBlockReorderDataPerRead_W
=
1
;
// v1r3 cannot do vector load input for NCHW
constexpr
index_t
InBlockReorderDataPerWrite_N
=
2
;
using
WeiBlockCopyClusterLengths
=
void
;
constexpr
index_t
WeiBlockCopyDataPerRead_K
=
4
;
constexpr
index_t
OutThreadCopyDataPerWrite_N
=
2
;
#elif 0
// for 3x3, 28x28, v1r2, Pascal
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
NPerBlock
=
16
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
CPerBlock
=
8
;
constexpr
index_t
HoPerBlock
=
2
;
constexpr
index_t
WoPerBlock
=
2
;
constexpr
index_t
NPerThread
=
4
;
constexpr
index_t
KPerThread
=
8
;
constexpr
index_t
HoPerThread
=
1
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
2
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
2
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockReorderSrcSubLengths_NCHW
=
Sequence
<
4
,
1
,
1
,
2
>
;
using
InBlockReorderSrcClusterLengths_NCHW
=
Sequence
<
4
,
8
,
2
,
2
>
;
using
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW
=
Sequence
<
1
,
2
,
0
,
3
>
;
constexpr
index_t
InBlockReorderDataPerRead_W
=
2
;
constexpr
index_t
InBlockReorderDataPerWrite_N
=
4
;
using
WeiBlockCopyClusterLengths
=
Sequence
<
4
,
1
,
32
>
;
constexpr
index_t
WeiBlockCopyDataPerRead_K
=
4
;
constexpr
index_t
OutThreadCopyDataPerWrite_N
=
2
;
#elif 0
// for 3x3, 28x28, v1r3, Pascal, bad
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
NPerBlock
=
16
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
CPerBlock
=
8
;
constexpr
index_t
HoPerBlock
=
2
;
constexpr
index_t
WoPerBlock
=
2
;
constexpr
index_t
NPerThread
=
4
;
constexpr
index_t
KPerThread
=
8
;
constexpr
index_t
HoPerThread
=
1
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
2
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
2
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockReorderSrcSubLengths_NCHW
=
Sequence
<
4
,
1
,
1
,
1
>
;
using
InBlockReorderSrcClusterLengths_NCHW
=
Sequence
<
4
,
8
,
2
,
2
>
;
using
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW
=
Sequence
<
1
,
2
,
0
,
3
>
;
constexpr
index_t
InBlockReorderDataPerRead_W
=
1
;
// v1r3 cannot do vector load input for NCHW
constexpr
index_t
InBlockReorderDataPerWrite_N
=
1
;
using
WeiBlockCopyClusterLengths
=
void
;
constexpr
index_t
WeiBlockCopyDataPerRead_K
=
4
;
constexpr
index_t
OutThreadCopyDataPerWrite_N
=
2
;
#endif
constexpr
index_t
GridSize
=
((
N
+
NPerBlock
-
1
)
/
NPerBlock
)
*
((
K
+
KPerBlock
-
1
)
/
KPerBlock
)
*
((
Ho
+
HoPerBlock
-
1
)
/
HoPerBlock
)
*
((
Wo
+
WoPerBlock
-
1
)
/
WoPerBlock
);
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
for
(
index_t
i
=
0
;
i
<
nrepeat
;
++
i
)
{
constexpr
auto
gridwise_conv
=
#if 0
GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
#elif
0
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
#elif 1
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
#endif
<
GridSize
,
BlockSize
,
T
,
decltype
(
in_nchw_desc
),
decltype
(
wei_cyxk_desc
),
decltype
(
out_khwn_desc
),
NPerBlock
,
KPerBlock
,
CPerBlock
,
HoPerBlock
,
WoPerBlock
,
NPerThread
,
KPerThread
,
HoPerThread
,
WoPerThread
,
GemmMPerThreadSubC
,
GemmNPerThreadSubC
,
GemmMLevel0Cluster
,
GemmNLevel0Cluster
,
GemmMLevel1Cluster
,
GemmNLevel1Cluster
,
GemmKPerThreadLoop
,
GemmDataPerReadA
,
GemmDataPerReadB
,
InBlockReorderSrcSubLengths_NCHW
,
InBlockReorderSrcClusterLengths_NCHW
,
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW
,
InBlockReorderDataPerRead_W
,
InBlockReorderDataPerWrite_N
,
WeiBlockCopyClusterLengths
,
WeiBlockCopyDataPerRead_K
,
OutThreadCopyDataPerWrite_N
>
{};
float
time
=
launch_kernel
(
run_gridwise_convolution
<
decltype
(
gridwise_conv
),
T
>
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
static_cast
<
T
*>
(
in_nchw_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
wei_cyxk_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
out_khwn_device_buf
.
GetDeviceBuffer
()));
printf
(
"Elapsed time : %f ms, %f TFlop/s
\n
"
,
time
,
(
float
)
calculate_convolution_flops
(
InDesc
{},
WeiDesc
{},
OutDesc
{})
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
time
);
usleep
(
std
::
min
(
time
*
1000
,
float
(
10000
)));
}
out_khwn_device_buf
.
FromDevice
(
out_khwn
.
mData
.
data
());
// reorder output
auto
f_reorder_khwn2nkhw
=
[
&
](
auto
k
,
auto
ho
,
auto
wo
,
auto
n
)
{
out_nkhw
(
n
,
k
,
ho
,
wo
)
=
out_khwn
(
k
,
ho
,
wo
,
n
);
};
make_ParallelTensorFunctor
(
f_reorder_khwn2nkhw
,
K
,
Ho
,
Wo
,
N
)(
std
::
thread
::
hardware_concurrency
());
}
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
View file @
88b77181
#pragma once
#pragma once
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_
kernel_
wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
using
namespace
ck
;
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
(
InDesc
,
void
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
(
InDesc
,
...
@@ -313,10 +315,10 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
...
@@ -313,10 +315,10 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
for
(
index_t
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
index_t
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
constexpr
auto
gridwise_conv
=
constexpr
auto
gridwise_conv
=
#if
1
#if
0
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
#else
#else
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer
_nchw_cyxk_nkhw
GridwiseConvolutionImplicitGemm_v1r3_
nchw_cyxk_nkhw_
lds_double_buffer
#endif
#endif
<
GridSize
,
<
GridSize
,
BlockSize
,
BlockSize
,
...
@@ -351,7 +353,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
...
@@ -351,7 +353,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
WeiBlockCopyDataPerRead_K
,
WeiBlockCopyDataPerRead_K
,
OutThreadCopyDataPerWrite_W
>
{};
OutThreadCopyDataPerWrite_W
>
{};
float
time
=
launch_kernel
(
run_gridwise_convolution
<
decltype
(
gridwise_conv
),
T
>
,
float
time
=
launch_kernel
(
run_gridwise_convolution
_kernel
<
decltype
(
gridwise_conv
),
T
>
,
dim3
(
GridSize
),
dim3
(
GridSize
),
dim3
(
BlockSize
),
dim3
(
BlockSize
),
0
,
0
,
...
...
driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
View file @
88b77181
#pragma once
#pragma once
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_
kernel_
wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
using
namespace
ck
;
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
(
InDesc
,
void
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Tensor
<
T
>&
in_nchw
,
...
@@ -303,7 +305,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
...
@@ -303,7 +305,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
WeiBlockCopyDataPerRead
,
WeiBlockCopyDataPerRead
,
OutThreadCopyDataPerWrite
>
{};
OutThreadCopyDataPerWrite
>
{};
float
time
=
launch_kernel
(
run_gridwise_convolution
<
decltype
(
gridwise_conv
),
T
>
,
float
time
=
launch_kernel
(
run_gridwise_convolution
_kernel
<
decltype
(
gridwise_conv
),
T
>
,
dim3
(
GridSize
),
dim3
(
GridSize
),
dim3
(
BlockSize
),
dim3
(
BlockSize
),
0
,
0
,
...
...
driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
View file @
88b77181
#pragma once
#pragma once
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_
kernel_
wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
using
namespace
ck
;
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
(
InDesc
,
void
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
(
InDesc
,
...
@@ -102,7 +104,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
...
@@ -102,7 +104,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
#if 0
#if 0
GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
#else
#else
GridwiseConvolutionImplicitGemm_v3_lds_double_buffer
_nchw_cyxk_nkhw
GridwiseConvolutionImplicitGemm_v3_
nchw_cyxk_nkhw_
lds_double_buffer
#endif
#endif
<
GridSize
,
<
GridSize
,
BlockSize
,
BlockSize
,
...
@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
...
@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
WeiBlockCopyDataPerAccess_K
>
{};
WeiBlockCopyDataPerAccess_K
>
{};
#if 1
#if 1
float
time
=
launch_kernel
(
run_gridwise_convolution
<
decltype
(
gridwise_conv
),
T
>
,
float
time
=
launch_kernel
(
run_gridwise_convolution
_kernel
<
decltype
(
gridwise_conv
),
T
>
,
dim3
(
GridSize
),
dim3
(
GridSize
),
dim3
(
BlockSize
),
dim3
(
BlockSize
),
0
,
0
,
...
...
driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
View file @
88b77181
#pragma once
#pragma once
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_
kernel_
wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
using
namespace
ck
;
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
(
InDesc
,
void
device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
(
InDesc
,
...
@@ -96,7 +98,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
...
@@ -96,7 +98,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
#if 0
#if 0
GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
#else
#else
GridwiseConvolutionImplicitGemm_v4_lds_double_buffer
_nchw_kcyx_nkhw
GridwiseConvolutionImplicitGemm_v4_
nchw_kcyx_nkhw_
lds_double_buffer
#endif
#endif
<
GridSize
,
<
GridSize
,
BlockSize
,
BlockSize
,
...
@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
...
@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
WeiBlockCopySrcDataPerRead_E
,
WeiBlockCopySrcDataPerRead_E
,
WeiBlockCopyDstDataPerWrite_K
>
{};
WeiBlockCopyDstDataPerWrite_K
>
{};
float
time
=
launch_kernel
(
run_gridwise_convolution
<
decltype
(
gridwise_conv
),
T
>
,
float
time
=
launch_kernel
(
run_gridwise_convolution
_kernel
<
decltype
(
gridwise_conv
),
T
>
,
dim3
(
GridSize
),
dim3
(
GridSize
),
dim3
(
BlockSize
),
dim3
(
BlockSize
),
0
,
0
,
...
...
driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
View file @
88b77181
...
@@ -3,6 +3,8 @@
...
@@ -3,6 +3,8 @@
#include "device.hpp"
#include "device.hpp"
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
using
namespace
ck
;
template
<
class
TInWei
,
class
TOut
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
TInWei
,
class
TOut
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
(
InDesc
,
void
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
(
InDesc
,
const
Tensor
<
TInWei
>&
in_nchw
,
const
Tensor
<
TInWei
>&
in_nchw
,
...
...
driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
View file @
88b77181
...
@@ -3,6 +3,8 @@
...
@@ -3,6 +3,8 @@
#include "device.hpp"
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
using
namespace
ck
;
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
class
LowerPads
,
class
UpperPads
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
class
LowerPads
,
class
UpperPads
>
void
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded
(
InDesc
,
void
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Tensor
<
T
>&
in_nchw
,
...
...
driver/driver.cpp
View file @
88b77181
...
@@ -3,19 +3,19 @@
...
@@ -3,19 +3,19 @@
#include <initializer_list>
#include <initializer_list>
#include <cstdlib>
#include <cstdlib>
#include <stdlib.h>
#include <stdlib.h>
#include "config.h"
#include "config.h
pp
"
#include "tensor.hpp"
#include "tensor.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "conv_common.hpp"
#include "conv_common.hpp"
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
using
namespace
ck
;
struct
GeneratorTensor_1
struct
GeneratorTensor_1
{
{
template
<
class
...
Is
>
template
<
class
...
Is
>
...
@@ -419,7 +419,7 @@ int main(int argc, char* argv[])
...
@@ -419,7 +419,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
constexpr index_t WPad = 0;
#elif
0
#elif
1
// 3x3, 34x34
// 3x3, 34x34
constexpr
index_t
N
=
64
;
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
256
;
constexpr
index_t
C
=
256
;
...
@@ -633,15 +633,9 @@ int main(int argc, char* argv[])
...
@@ -633,15 +633,9 @@ int main(int argc, char* argv[])
#if 1
#if 1
#if 0
#if 0
device_direct_convolution_1
#elif
0
device_convolution_direct_v2_nchw_kcyx_nkhw
device_convolution_direct_v2_nchw_kcyx_nkhw
#elif 0
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
#elif
0
#elif
0
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
#elif 0
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
#elif 0
#elif 0
...
...
src/CMakeLists.txt
View file @
88b77181
configure_file
(
"
${
PROJECT_SOURCE_DIR
}
/src/include/config.h.in"
"
${
PROJECT_BINARY_DIR
}
/src/include/config.h"
)
configure_file
(
"
${
PROJECT_SOURCE_DIR
}
/src/include/config.h
pp
.in"
"
${
PROJECT_BINARY_DIR
}
/src/include/config.h
pp
"
)
set
(
TENSOR_SOURCE
set
(
TENSOR_SOURCE
tensor.cpp;
tensor.cpp;
...
...
src/device.cpp
View file @
88b77181
#include "config.h"
#include "config.h
pp
"
#include "device.hpp"
#include "device.hpp"
DeviceMem
::
DeviceMem
(
std
::
size_t
mem_size
)
:
mMemSize
(
mem_size
)
DeviceMem
::
DeviceMem
(
std
::
size_t
mem_size
)
:
mMemSize
(
mem_size
)
...
...
src/include/Array.hpp
View file @
88b77181
#pragma once
#ifndef CK_ARRAY_HPP
#define CK_ARRAY_HPP
#include "Sequence.hpp"
#include "Sequence.hpp"
#include "functional2.hpp"
#include "functional2.hpp"
namespace
ck
{
template
<
class
TData
,
index_t
NSize
>
template
<
class
TData
,
index_t
NSize
>
struct
Array
struct
Array
{
{
...
@@ -96,7 +100,7 @@ __host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData
...
@@ -96,7 +100,7 @@ __host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData
static_assert
(
is_valid_sequence_map
<
Sequence
<
IRs
...
>>::
value
,
"wrong! invalid reorder map"
);
static_assert
(
is_valid_sequence_map
<
Sequence
<
IRs
...
>>::
value
,
"wrong! invalid reorder map"
);
return
Array
<
TData
,
NSize
>
{
old_array
.
mSize
[
IRs
]...};
return
Array
<
TData
,
NSize
>
{
old_array
[
IRs
]...};
}
}
template
<
class
TData
,
index_t
NSize
,
class
MapOld2New
>
template
<
class
TData
,
index_t
NSize
,
class
MapOld2New
>
...
@@ -180,7 +184,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData,
...
@@ -180,7 +184,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData,
{
{
Array
<
TData
,
NSize
>
result
;
Array
<
TData
,
NSize
>
result
;
auto
f
=
m
od_conv
::
plus
<
index_t
>
{};
auto
f
=
m
ath
::
plus
<
index_t
>
{};
static_for
<
0
,
NSize
,
1
>
{}(
static_for
<
0
,
NSize
,
1
>
{}(
lambda_array_math
<
decltype
(
f
),
decltype
(
a
),
decltype
(
b
),
decltype
(
result
)
>
(
lambda_array_math
<
decltype
(
f
),
decltype
(
a
),
decltype
(
b
),
decltype
(
result
)
>
(
...
@@ -195,7 +199,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData,
...
@@ -195,7 +199,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData,
{
{
Array
<
TData
,
NSize
>
result
;
Array
<
TData
,
NSize
>
result
;
auto
f
=
m
od_conv
::
minus
<
index_t
>
{};
auto
f
=
m
ath
::
minus
<
index_t
>
{};
static_for
<
0
,
NSize
,
1
>
{}(
static_for
<
0
,
NSize
,
1
>
{}(
lambda_array_math
<
decltype
(
f
),
decltype
(
a
),
decltype
(
b
),
decltype
(
result
)
>
(
lambda_array_math
<
decltype
(
f
),
decltype
(
a
),
decltype
(
b
),
decltype
(
result
)
>
(
...
@@ -212,7 +216,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is.
...
@@ -212,7 +216,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is.
Array
<
TData
,
NSize
>
result
;
Array
<
TData
,
NSize
>
result
;
auto
f
=
m
od_conv
::
plus
<
index_t
>
{};
auto
f
=
m
ath
::
plus
<
index_t
>
{};
static_for
<
0
,
NSize
,
1
>
{}(
static_for
<
0
,
NSize
,
1
>
{}(
lambda_array_math
<
decltype
(
f
),
decltype
(
a
),
decltype
(
b
),
decltype
(
result
)
>
(
lambda_array_math
<
decltype
(
f
),
decltype
(
a
),
decltype
(
b
),
decltype
(
result
)
>
(
...
@@ -229,7 +233,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is.
...
@@ -229,7 +233,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is.
Array
<
TData
,
NSize
>
result
;
Array
<
TData
,
NSize
>
result
;
auto
f
=
m
od_conv
::
minus
<
index_t
>
{};
auto
f
=
m
ath
::
minus
<
index_t
>
{};
static_for
<
0
,
NSize
,
1
>
{}(
static_for
<
0
,
NSize
,
1
>
{}(
lambda_array_math
<
decltype
(
f
),
decltype
(
a
),
decltype
(
b
),
decltype
(
result
)
>
(
lambda_array_math
<
decltype
(
f
),
decltype
(
a
),
decltype
(
b
),
decltype
(
result
)
>
(
...
@@ -246,7 +250,7 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
...
@@ -246,7 +250,7 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
Array
<
TData
,
NSize
>
result
;
Array
<
TData
,
NSize
>
result
;
auto
f
=
m
od_conv
::
multiplies
<
index_t
>
{};
auto
f
=
m
ath
::
multiplies
<
index_t
>
{};
static_for
<
0
,
NSize
,
1
>
{}(
static_for
<
0
,
NSize
,
1
>
{}(
lambda_array_math
<
decltype
(
f
),
decltype
(
a
),
decltype
(
b
),
decltype
(
result
)
>
(
lambda_array_math
<
decltype
(
f
),
decltype
(
a
),
decltype
(
b
),
decltype
(
result
)
>
(
...
@@ -263,7 +267,7 @@ __host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSi
...
@@ -263,7 +267,7 @@ __host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSi
Array
<
TData
,
NSize
>
result
;
Array
<
TData
,
NSize
>
result
;
auto
f
=
m
od_conv
::
minus
<
index_t
>
{};
auto
f
=
m
ath
::
minus
<
index_t
>
{};
static_for
<
0
,
NSize
,
1
>
{}(
static_for
<
0
,
NSize
,
1
>
{}(
lambda_array_math
<
decltype
(
f
),
decltype
(
a
),
decltype
(
b
),
decltype
(
result
)
>
(
lambda_array_math
<
decltype
(
f
),
decltype
(
a
),
decltype
(
b
),
decltype
(
result
)
>
(
...
@@ -368,3 +372,6 @@ __host__ __device__ void print_Array(const char* s, Array<T, NSize> a)
...
@@ -368,3 +372,6 @@ __host__ __device__ void print_Array(const char* s, Array<T, NSize> a)
a
[
9
]);
a
[
9
]);
});
});
}
}
}
// namespace ck
#endif
src/include/ConstantMatrixDescriptor.hpp
View file @
88b77181
#pragma once
#ifndef CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
#define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
#include "common.hpp"
#include "common.hpp"
namespace
ck
{
template
<
index_t
NRow_
,
index_t
NCol_
,
index_t
RowStride_
>
template
<
index_t
NRow_
,
index_t
NCol_
,
index_t
RowStride_
>
struct
ConstantMatrixDescriptor
struct
ConstantMatrixDescriptor
{
{
...
@@ -57,3 +61,7 @@ __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s)
...
@@ -57,3 +61,7 @@ __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s)
printf
(
"%s NRow %u NCol %u RowStride %u
\n
"
,
s
,
desc
.
NRow
(),
desc
.
NCol
(),
desc
.
RowStride
());
printf
(
"%s NRow %u NCol %u RowStride %u
\n
"
,
s
,
desc
.
NRow
(),
desc
.
NCol
(),
desc
.
RowStride
());
}
}
}
// namespace ck
#endif
src/include/ConstantMergedTensorDescriptor.hpp
View file @
88b77181
#pragma once
#ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP
#define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP
#include "common.hpp"
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor.hpp"
namespace
ck
{
// OriginalTensorDesc : ConstantTensorDescriptor<...>
// OriginalTensorDesc : ConstantTensorDescriptor<...>
// it's the tensor whose dimensions are to be merged
// it's the tensor whose dimensions are to be merged
// OriginalDimMergeSeqs : Sequence<...>...
// OriginalDimMergeSeqs : Sequence<...>...
...
@@ -184,3 +188,6 @@ __host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDe
...
@@ -184,3 +188,6 @@ __host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDe
{
{
print_ConstantTensorDescriptor
(
s
,
TDesc
::
GetOriginalTensorDescriptor
());
print_ConstantTensorDescriptor
(
s
,
TDesc
::
GetOriginalTensorDescriptor
());
}
}
}
// namespace ck
#endif
src/include/ConstantTensorDescriptor.hpp
View file @
88b77181
#pragma once
#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_HPP
#define CK_CONSTANT_TENSOR_DESCRIPTOR_HPP
#include "common.hpp"
#include "common.hpp"
namespace
ck
{
template
<
class
Lengths
>
template
<
class
Lengths
>
__host__
__device__
constexpr
auto
calculate_tensor_strides_packed
(
Lengths
)
__host__
__device__
constexpr
auto
calculate_tensor_strides_packed
(
Lengths
)
{
{
return
reverse_inclusive_scan_sequence
(
return
reverse_inclusive_scan_sequence
(
Lengths
{}.
PopFront
(),
m
od_conv
::
multiplies
<
index_t
>
{},
Number
<
1
>
{})
Lengths
{}.
PopFront
(),
m
ath
::
multiplies
<
index_t
>
{},
Number
<
1
>
{})
.
PushBack
(
Number
<
1
>
{});
.
PushBack
(
Number
<
1
>
{});
}
}
...
@@ -13,7 +17,7 @@ template <class Lengths, index_t Align>
...
@@ -13,7 +17,7 @@ template <class Lengths, index_t Align>
__host__
__device__
constexpr
auto
calculate_tensor_strides_aligned
(
Lengths
,
Number
<
Align
>
)
__host__
__device__
constexpr
auto
calculate_tensor_strides_aligned
(
Lengths
,
Number
<
Align
>
)
{
{
constexpr
index_t
L_back_align
=
constexpr
index_t
L_back_align
=
Align
*
m
od_conv
::
integer_divide_ceiler
<
index_t
>
{}(
Lengths
{}.
Back
(),
Align
);
Align
*
m
ath
::
integer_divide_ceiler
<
index_t
>
{}(
Lengths
{}.
Back
(),
Align
);
return
calculate_tensor_strides_packed
(
return
calculate_tensor_strides_packed
(
Lengths
{}.
Modify
(
Number
<
Lengths
{}.
GetSize
()
-
1
>
{},
Number
<
L_back_align
>
{}));
Lengths
{}.
Modify
(
Number
<
Lengths
{}.
GetSize
()
-
1
>
{},
Number
<
L_back_align
>
{}));
...
@@ -100,7 +104,7 @@ struct ConstantTensorDescriptor
...
@@ -100,7 +104,7 @@ struct ConstantTensorDescriptor
__host__
__device__
static
constexpr
index_t
GetElementSize
()
__host__
__device__
static
constexpr
index_t
GetElementSize
()
{
{
return
accumulate_on_sequence
(
Lengths
{},
m
od_conv
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
return
accumulate_on_sequence
(
Lengths
{},
m
ath
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
}
}
template
<
class
Align
=
Number
<
1
>
>
template
<
class
Align
=
Number
<
1
>
>
...
@@ -109,7 +113,7 @@ struct ConstantTensorDescriptor
...
@@ -109,7 +113,7 @@ struct ConstantTensorDescriptor
// This is WRONG! align shouldbe applied to the last memory rank, not the last tensor
// This is WRONG! align shouldbe applied to the last memory rank, not the last tensor
// dimension
// dimension
constexpr
index_t
element_space_unaligned
=
accumulate_on_sequence
(
constexpr
index_t
element_space_unaligned
=
accumulate_on_sequence
(
(
GetLengths
()
-
Number
<
1
>
{})
*
GetStrides
(),
m
od_conv
::
plus
<
index_t
>
{},
Number
<
1
>
{});
(
GetLengths
()
-
Number
<
1
>
{})
*
GetStrides
(),
m
ath
::
plus
<
index_t
>
{},
Number
<
1
>
{});
return
align
.
Get
()
*
((
element_space_unaligned
+
align
.
Get
()
-
1
)
/
align
.
Get
());
return
align
.
Get
()
*
((
element_space_unaligned
+
align
.
Get
()
-
1
)
/
align
.
Get
());
}
}
...
@@ -161,8 +165,7 @@ struct ConstantTensorDescriptor
...
@@ -161,8 +165,7 @@ struct ConstantTensorDescriptor
constexpr
auto
multi_id
=
Sequence
<
Is
...
>
{};
constexpr
auto
multi_id
=
Sequence
<
Is
...
>
{};
return
accumulate_on_sequence
(
return
accumulate_on_sequence
(
multi_id
*
GetStrides
(),
math
::
plus
<
index_t
>
{},
Number
<
0
>
{});
multi_id
*
GetStrides
(),
mod_conv
::
plus
<
index_t
>
{},
Number
<
0
>
{});
}
}
// emulate constexpr lambda
// emulate constexpr lambda
...
@@ -323,7 +326,7 @@ struct ConstantTensorDescriptor
...
@@ -323,7 +326,7 @@ struct ConstantTensorDescriptor
constexpr
auto
fold_intervals
=
Sequence
<
FoldIntervals
...
>
{};
constexpr
auto
fold_intervals
=
Sequence
<
FoldIntervals
...
>
{};
constexpr
index_t
fold_intervals_product
=
constexpr
index_t
fold_intervals_product
=
accumulate_on_sequence
(
fold_intervals
,
m
od_conv
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
accumulate_on_sequence
(
fold_intervals
,
m
ath
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
constexpr
auto
unfold_length
=
GetLength
(
Number
<
IDim
>
{});
constexpr
auto
unfold_length
=
GetLength
(
Number
<
IDim
>
{});
constexpr
auto
unfold_stride
=
GetStride
(
Number
<
IDim
>
{});
constexpr
auto
unfold_stride
=
GetStride
(
Number
<
IDim
>
{});
...
@@ -341,7 +344,7 @@ struct ConstantTensorDescriptor
...
@@ -341,7 +344,7 @@ struct ConstantTensorDescriptor
constexpr
auto
fold_strides
=
constexpr
auto
fold_strides
=
Number
<
unfold_stride
>
{}
*
Number
<
unfold_stride
>
{}
*
reverse_inclusive_scan_sequence
(
reverse_inclusive_scan_sequence
(
fold_intervals
.
PushBack
(
Number
<
1
>
{}),
m
od_conv
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
fold_intervals
.
PushBack
(
Number
<
1
>
{}),
m
ath
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
// left and right
// left and right
constexpr
auto
left
=
typename
arithmetic_sequence_gen
<
0
,
IDim
,
1
>::
SeqType
{};
constexpr
auto
left
=
typename
arithmetic_sequence_gen
<
0
,
IDim
,
1
>::
SeqType
{};
...
@@ -376,7 +379,7 @@ struct ConstantTensorDescriptor
...
@@ -376,7 +379,7 @@ struct ConstantTensorDescriptor
// unfolded length, stride
// unfolded length, stride
constexpr
index_t
unfold_length
=
accumulate_on_sequence
(
constexpr
index_t
unfold_length
=
accumulate_on_sequence
(
GetLengths
().
Extract
(
middle
),
m
od_conv
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
GetLengths
().
Extract
(
middle
),
m
ath
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
constexpr
index_t
unfold_stride
=
GetStride
(
Number
<
LastUnfoldDim
>
{});
constexpr
index_t
unfold_stride
=
GetStride
(
Number
<
LastUnfoldDim
>
{});
...
@@ -511,3 +514,6 @@ print_ConstantTensorDescriptor(const char* s,
...
@@ -511,3 +514,6 @@ print_ConstantTensorDescriptor(const char* s,
Strides
...);
Strides
...);
});
});
}
}
}
// namespace ck
#endif
src/include/Sequence.hpp
View file @
88b77181
#pragma once
#ifndef CK_SEQUENCE_HPP
#define CK_SEQUENCE_HPP
#include "integral_constant.hpp"
#include "integral_constant.hpp"
#include "functional.hpp"
#include "functional.hpp"
namespace
ck
{
template
<
class
Seq
>
template
<
class
Seq
>
struct
is_valid_sequence_map
;
struct
is_valid_sequence_map
;
...
@@ -547,3 +551,6 @@ __host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
...
@@ -547,3 +551,6 @@ __host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
static_if
<
nsize
==
10
>
{}(
static_if
<
nsize
==
10
>
{}(
[
&
](
auto
)
{
printf
(
"%s size %u, {%u %u %u %u %u %u %u %u %u %u}
\n
"
,
s
,
nsize
,
Xs
...);
});
[
&
](
auto
)
{
printf
(
"%s size %u, {%u %u %u %u %u %u %u %u %u %u}
\n
"
,
s
,
nsize
,
Xs
...);
});
}
}
}
// namespace ck
#endif
src/include/amd_inline_asm.hpp
View file @
88b77181
#pragma once
#ifndef CK_AMD_INLINE_ASM_HPP
#define CK_AMD_INLINE_ASM_HPP
#include "common.hpp"
#include "common.hpp"
#define NO_VM_WAIT 0
#define NO_VM_WAIT 0
...
@@ -7,6 +9,8 @@
...
@@ -7,6 +9,8 @@
#define NO_DS_WRITE 0
#define NO_DS_WRITE 0
#define NO_GLB_READ 0
#define NO_GLB_READ 0
namespace
ck
{
// cast a pointer of LDS to its address
// cast a pointer of LDS to its address
extern
"C"
__attribute__
((
address_space
(
3
)))
void
*
__to_local
(
void
*
p
)[[
hc
]];
extern
"C"
__attribute__
((
address_space
(
3
)))
void
*
__to_local
(
void
*
p
)[[
hc
]];
...
@@ -759,3 +763,6 @@ ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t off
...
@@ -759,3 +763,6 @@ ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t off
}
}
#endif
#endif
}
}
}
// namespace ck
#endif
src/include/blockwise_2d_tensor_op.hpp
View file @
88b77181
#pragma once
#ifndef CK_BLOCKWISE_2D_TENSOR_OP_HPP
#define CK_BLOCKWISE_2D_TENSOR_OP_HPP
#include "common.hpp"
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor.hpp"
namespace
ck
{
template
<
index_t
BlockSize
,
class
Float
,
class
DstDesc
,
class
F
>
template
<
index_t
BlockSize
,
class
Float
,
class
DstDesc
,
class
F
>
__device__
void
__device__
void
blockwise_2d_tensor_pointwise_operation_unary
(
DstDesc
,
Float
*
__restrict__
p_dst
,
F
f
)
blockwise_2d_tensor_pointwise_operation_unary
(
DstDesc
,
Float
*
__restrict__
p_dst
,
F
f
)
...
@@ -192,7 +196,7 @@ struct Blockwise2dTensorCopy1
...
@@ -192,7 +196,7 @@ struct Blockwise2dTensorCopy1
// but we need to make sure dst stride0 is big enough,
// but we need to make sure dst stride0 is big enough,
// so that the out-of-bound write won't contaminate next line in dst
// so that the out-of-bound write won't contaminate next line in dst
constexpr
index_t
L1
=
CopyLengths
{}.
Get
(
I1
);
constexpr
index_t
L1
=
CopyLengths
{}.
Get
(
I1
);
constexpr
index_t
read_per_d1
=
m
od_conv
::
integer_divide_ceil
(
L1
,
DataPerRead
);
constexpr
index_t
read_per_d1
=
m
ath
::
integer_divide_ceil
(
L1
,
DataPerRead
);
static_assert
(
read_per_d1
*
DataPerRead
<=
DstDesc
{}.
GetStride
(
I0
),
static_assert
(
read_per_d1
*
DataPerRead
<=
DstDesc
{}.
GetStride
(
I0
),
"wrong! out-of-bound write will contaminate next line!
\n
"
);
"wrong! out-of-bound write will contaminate next line!
\n
"
);
...
@@ -209,7 +213,7 @@ struct Blockwise2dTensorCopy1
...
@@ -209,7 +213,7 @@ struct Blockwise2dTensorCopy1
constexpr
index_t
L0
=
CopyLengths
{}.
Get
(
I0
);
constexpr
index_t
L0
=
CopyLengths
{}.
Get
(
I0
);
constexpr
index_t
L1
=
CopyLengths
{}.
Get
(
I1
);
constexpr
index_t
L1
=
CopyLengths
{}.
Get
(
I1
);
constexpr
index_t
read_per_d1
=
m
od_conv
::
integer_divide_ceil
(
L1
,
DataPerRead
);
constexpr
index_t
read_per_d1
=
m
ath
::
integer_divide_ceil
(
L1
,
DataPerRead
);
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
L0
,
read_per_d1
>
{});
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
L0
,
read_per_d1
>
{});
...
@@ -676,7 +680,7 @@ struct Blockwise2dTensorCopy3
...
@@ -676,7 +680,7 @@ struct Blockwise2dTensorCopy3
}
}
}
}
#if USE_AMD_INLINE_ASM
#if
CK_
USE_AMD_INLINE_ASM
__device__
void
RunLoadRegisterClipboard_asm
(
const
Float
*
__restrict__
p_src
,
__device__
void
RunLoadRegisterClipboard_asm
(
const
Float
*
__restrict__
p_src
,
Float
*
p_clipboard
)
const
Float
*
p_clipboard
)
const
{
{
...
@@ -796,3 +800,7 @@ struct Blockwise2dTensorCopy3
...
@@ -796,3 +800,7 @@ struct Blockwise2dTensorCopy3
}
}
#endif
#endif
};
};
}
// namespace ck
#endif
src/include/blockwise_3d_tensor_op.hpp
View file @
88b77181
#pragma once
#ifndef CK_BLOCKWISE_3D_TENSOR_OP_HPP
#define CK_BLOCKWISE_3D_TENSOR_OP_HPP
#include "common.hpp"
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor.hpp"
namespace
ck
{
template
<
index_t
BlockSize
,
template
<
index_t
BlockSize
,
class
Float
,
class
Float
,
class
SrcDesc
,
class
SrcDesc
,
...
@@ -33,7 +37,7 @@ struct Blockwise3dTensorCopy1
...
@@ -33,7 +37,7 @@ struct Blockwise3dTensorCopy1
// but we need to make sure dst stride2 is big enough,
// but we need to make sure dst stride2 is big enough,
// so that the out-of-bound write won't contaminate next line in dst
// so that the out-of-bound write won't contaminate next line in dst
constexpr
index_t
L2
=
CopyLengths
{}.
Get
(
I2
);
constexpr
index_t
L2
=
CopyLengths
{}.
Get
(
I2
);
constexpr
index_t
read_per_d2
=
m
od_conv
::
integer_divide_ceil
(
L2
,
DataPerRead
);
constexpr
index_t
read_per_d2
=
m
ath
::
integer_divide_ceil
(
L2
,
DataPerRead
);
static_assert
(
read_per_d2
*
DataPerRead
<=
DstDesc
{}.
GetStride
(
I1
),
static_assert
(
read_per_d2
*
DataPerRead
<=
DstDesc
{}.
GetStride
(
I1
),
"wrong! out-of-bound write will contaminate next line!
\n
"
);
"wrong! out-of-bound write will contaminate next line!
\n
"
);
...
@@ -52,7 +56,7 @@ struct Blockwise3dTensorCopy1
...
@@ -52,7 +56,7 @@ struct Blockwise3dTensorCopy1
constexpr
index_t
L1
=
CopyLengths
{}.
Get
(
I1
);
constexpr
index_t
L1
=
CopyLengths
{}.
Get
(
I1
);
constexpr
index_t
L2
=
CopyLengths
{}.
Get
(
I2
);
constexpr
index_t
L2
=
CopyLengths
{}.
Get
(
I2
);
constexpr
index_t
read_per_d2
=
m
od_conv
::
integer_divide_ceil
(
L2
,
DataPerRead
);
constexpr
index_t
read_per_d2
=
m
ath
::
integer_divide_ceil
(
L2
,
DataPerRead
);
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
L0
,
L1
,
read_per_d2
>
{});
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
L0
,
L1
,
read_per_d2
>
{});
...
@@ -146,7 +150,7 @@ struct Blockwise3dTensorCopy3
...
@@ -146,7 +150,7 @@ struct Blockwise3dTensorCopy3
// we allow out-of-bound read from src in D2 dimension,
// we allow out-of-bound read from src in D2 dimension,
// but we need to make sure dst stride is big enough,
// but we need to make sure dst stride is big enough,
// so that the out-of-bound write won't contaminate next line in dst
// so that the out-of-bound write won't contaminate next line in dst
constexpr
index_t
nloop_d2
=
m
od_conv
::
integer_divide_ceil
(
L2
,
thread_per_d2
*
DataPerRead
);
constexpr
index_t
nloop_d2
=
m
ath
::
integer_divide_ceil
(
L2
,
thread_per_d2
*
DataPerRead
);
static_assert
(
nloop_d2
*
thread_per_d2
*
DataPerRead
<=
DstDesc
{}.
GetStride
(
I1
),
static_assert
(
nloop_d2
*
thread_per_d2
*
DataPerRead
<=
DstDesc
{}.
GetStride
(
I1
),
"wrong! out-of-bound write will contaminate next line!
\n
"
);
"wrong! out-of-bound write will contaminate next line!
\n
"
);
...
@@ -158,7 +162,7 @@ struct Blockwise3dTensorCopy3
...
@@ -158,7 +162,7 @@ struct Blockwise3dTensorCopy3
"wrrong! BlockSize is not big enough for ThreadPerDims!"
);
"wrrong! BlockSize is not big enough for ThreadPerDims!"
);
constexpr
index_t
num_active_thread
=
constexpr
index_t
num_active_thread
=
accumulate_on_sequence
(
ThreadPerDims
{},
m
od_conv
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
accumulate_on_sequence
(
ThreadPerDims
{},
m
ath
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
if
(
BlockSize
>
num_active_thread
)
if
(
BlockSize
>
num_active_thread
)
{
{
...
@@ -205,7 +209,7 @@ struct Blockwise3dTensorCopy3
...
@@ -205,7 +209,7 @@ struct Blockwise3dTensorCopy3
constexpr
index_t
nloop_d0
=
L0
/
thread_per_d0
;
constexpr
index_t
nloop_d0
=
L0
/
thread_per_d0
;
constexpr
index_t
nloop_d1
=
L1
/
thread_per_d1
;
constexpr
index_t
nloop_d1
=
L1
/
thread_per_d1
;
constexpr
index_t
nloop_d2
=
m
od_conv
::
integer_divide_ceil
(
L2
,
thread_per_d2
*
DataPerRead
);
constexpr
index_t
nloop_d2
=
m
ath
::
integer_divide_ceil
(
L2
,
thread_per_d2
*
DataPerRead
);
#pragma unroll
#pragma unroll
for
(
index_t
iloop_d0
=
0
;
iloop_d0
<
nloop_d0
;
++
iloop_d0
)
for
(
index_t
iloop_d0
=
0
;
iloop_d0
<
nloop_d0
;
++
iloop_d0
)
...
@@ -251,7 +255,7 @@ struct Blockwise3dTensorCopy3
...
@@ -251,7 +255,7 @@ struct Blockwise3dTensorCopy3
constexpr
index_t
nloop_d0
=
L0
/
thread_per_d0
;
constexpr
index_t
nloop_d0
=
L0
/
thread_per_d0
;
constexpr
index_t
nloop_d1
=
L1
/
thread_per_d1
;
constexpr
index_t
nloop_d1
=
L1
/
thread_per_d1
;
constexpr
index_t
nloop_d2
=
m
od_conv
::
integer_divide_ceil
(
L2
,
thread_per_d2
*
DataPerRead
);
constexpr
index_t
nloop_d2
=
m
ath
::
integer_divide_ceil
(
L2
,
thread_per_d2
*
DataPerRead
);
return
DataPerRead
*
nloop_d0
*
nloop_d1
*
nloop_d2
;
return
DataPerRead
*
nloop_d0
*
nloop_d1
*
nloop_d2
;
}
}
...
@@ -283,7 +287,7 @@ struct Blockwise3dTensorCopy3
...
@@ -283,7 +287,7 @@ struct Blockwise3dTensorCopy3
constexpr
index_t
nloop_d0
=
L0
/
thread_per_d0
;
constexpr
index_t
nloop_d0
=
L0
/
thread_per_d0
;
constexpr
index_t
nloop_d1
=
L1
/
thread_per_d1
;
constexpr
index_t
nloop_d1
=
L1
/
thread_per_d1
;
constexpr
index_t
nloop_d2
=
m
od_conv
::
integer_divide_ceil
(
L2
,
thread_per_d2
*
DataPerRead
);
constexpr
index_t
nloop_d2
=
m
ath
::
integer_divide_ceil
(
L2
,
thread_per_d2
*
DataPerRead
);
constexpr
auto
clipboard_desc
=
constexpr
auto
clipboard_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
*
DataPerRead
>
{});
make_ConstantTensorDescriptor
(
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
*
DataPerRead
>
{});
...
@@ -339,7 +343,7 @@ struct Blockwise3dTensorCopy3
...
@@ -339,7 +343,7 @@ struct Blockwise3dTensorCopy3
constexpr
index_t
nloop_d0
=
L0
/
thread_per_d0
;
constexpr
index_t
nloop_d0
=
L0
/
thread_per_d0
;
constexpr
index_t
nloop_d1
=
L1
/
thread_per_d1
;
constexpr
index_t
nloop_d1
=
L1
/
thread_per_d1
;
constexpr
index_t
nloop_d2
=
m
od_conv
::
integer_divide_ceil
(
L2
,
thread_per_d2
*
DataPerRead
);
constexpr
index_t
nloop_d2
=
m
ath
::
integer_divide_ceil
(
L2
,
thread_per_d2
*
DataPerRead
);
constexpr
auto
clipboard_desc
=
constexpr
auto
clipboard_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
*
DataPerRead
>
{});
make_ConstantTensorDescriptor
(
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
*
DataPerRead
>
{});
...
@@ -368,3 +372,7 @@ struct Blockwise3dTensorCopy3
...
@@ -368,3 +372,7 @@ struct Blockwise3dTensorCopy3
}
}
}
}
};
};
}
// namespace ck
#endif
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment