Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
5ed51b71
Commit
5ed51b71
authored
Jul 26, 2021
by
Jing Zhang
Browse files
init commit of conv+add
parent
b53926e9
Changes
11
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
2672 additions
and
122 deletions
+2672
-122
composable_kernel/include/driver/driver_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
..._add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
+1139
-0
composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
...tion_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
+9
-11
composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp
...rnel/include/tensor_operation/gridwise_static_gemm_v2.hpp
+6
-4
composable_kernel/include/tensor_operation/gridwise_static_gemm_v3.hpp
...rnel/include/tensor_operation/gridwise_static_gemm_v3.hpp
+476
-0
host/driver_offline/CMakeLists.txt
host/driver_offline/CMakeLists.txt
+6
-0
host/driver_offline/conv_activ_fwd_driver_offline.cpp
host/driver_offline/conv_activ_fwd_driver_offline.cpp
+341
-0
host/driver_offline/conv_add_fwd_driver_offline.cpp
host/driver_offline/conv_add_fwd_driver_offline.cpp
+367
-0
host/driver_offline/conv_fwd_driver_offline.cpp
host/driver_offline/conv_fwd_driver_offline.cpp
+33
-105
host/driver_offline/include/device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
...olution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+184
-0
host/host_tensor/include/host_conv.hpp
host/host_tensor/include/host_conv.hpp
+109
-0
script/run.sh
script/run.sh
+2
-2
No files found.
composable_kernel/include/driver/driver_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
0 → 100644
View file @
5ed51b71
This diff is collapsed.
Click to expand it.
composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
View file @
5ed51b71
...
@@ -226,7 +226,7 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
...
@@ -226,7 +226,7 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}));
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}));
// GEMM
// GEMM
using
gridwise_gemm
=
GridwiseStaticGemm_km_kn_mn_v
3
<
using
gridwise_gemm
=
GridwiseStaticGemm_km_kn_mn_v
2
<
BlockSize
,
BlockSize
,
FloatAB
,
FloatAB
,
FloatAcc
,
FloatAcc
,
...
@@ -273,11 +273,10 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
...
@@ -273,11 +273,10 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
index_t
nrepeat
=
100
;
index_t
nrepeat
=
100
;
std
::
cout
<<
"conv_v5r1__NCHWc"
<<
K1
<<
"_n"
<<
N
<<
"c"
<<
C
<<
"h"
<<
Hi
<<
"w"
<<
Wi
std
::
cout
<<
"NCHWc"
<<
K1
<<
"_n"
<<
N
<<
"c"
<<
C
<<
"h"
<<
Hi
<<
"w"
<<
Wi
<<
"-k"
<<
K
<<
"-k"
<<
K
<<
"c"
<<
C
<<
"y"
<<
Y
<<
"x"
<<
X
<<
"-u"
<<
conv_strides
[
I0
]
<<
"c"
<<
C
<<
"y"
<<
Y
<<
"x"
<<
X
<<
"-u"
<<
conv_strides
[
I0
]
<<
"v"
<<
"v"
<<
conv_strides
[
I1
]
<<
"l"
<<
conv_dilations
[
I0
]
<<
"j"
<<
conv_strides
[
I1
]
<<
"l"
<<
conv_dilations
[
I0
]
<<
"j"
<<
conv_dilations
[
I1
]
<<
conv_dilations
[
I1
]
<<
"q"
<<
in_left_pads
[
I0
]
<<
"p"
<<
in_right_pads
[
I0
]
<<
"q"
<<
in_left_pads
[
I0
]
<<
"p"
<<
in_right_pads
[
I0
]
<<
std
::
endl
;
<<
std
::
endl
;
std
::
cout
<<
"GridSize = "
<<
GridSize
<<
" BlockSize = "
<<
BlockSize
<<
std
::
endl
;
std
::
cout
<<
"GridSize = "
<<
GridSize
<<
" BlockSize = "
<<
BlockSize
<<
std
::
endl
;
...
@@ -990,11 +989,10 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_1x1
...
@@ -990,11 +989,10 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_1x1
index_t
nrepeat
=
100
;
index_t
nrepeat
=
100
;
std
::
cout
<<
"conv_v5r1_NCHWc"
<<
K1
<<
"_n"
<<
N
<<
"c"
<<
C
<<
"h"
<<
Hi
<<
"w"
<<
Wi
std
::
cout
<<
"NCHWc"
<<
K1
<<
"_n"
<<
N
<<
"c"
<<
C
<<
"h"
<<
Hi
<<
"w"
<<
Wi
<<
"-k"
<<
K
<<
"-k"
<<
K
<<
"c"
<<
C
<<
"y"
<<
Y
<<
"x"
<<
X
<<
"-u"
<<
conv_strides
[
I0
]
<<
"c"
<<
C
<<
"y"
<<
Y
<<
"x"
<<
X
<<
"-u"
<<
conv_strides
[
I0
]
<<
"v"
<<
"v"
<<
conv_strides
[
I1
]
<<
"l"
<<
conv_dilations
[
I0
]
<<
"j"
<<
conv_strides
[
I1
]
<<
"l"
<<
conv_dilations
[
I0
]
<<
"j"
<<
conv_dilations
[
I1
]
<<
conv_dilations
[
I1
]
<<
"q"
<<
in_left_pads
[
I0
]
<<
"p"
<<
in_right_pads
[
I0
]
<<
"q"
<<
in_left_pads
[
I0
]
<<
"p"
<<
in_right_pads
[
I0
]
<<
std
::
endl
;
<<
std
::
endl
;
std
::
cout
<<
"GridSize = "
<<
GridSize
<<
" BlockSize = "
<<
BlockSize
<<
std
::
endl
;
std
::
cout
<<
"GridSize = "
<<
GridSize
<<
" BlockSize = "
<<
BlockSize
<<
std
::
endl
;
...
...
composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp
View file @
5ed51b71
...
@@ -47,7 +47,7 @@ template <index_t BlockSize,
...
@@ -47,7 +47,7 @@ template <index_t BlockSize,
typename
CGlobalIteratorHacks
,
typename
CGlobalIteratorHacks
,
typename
AGlobalMoveSliceWindowIteratorHacks
,
typename
AGlobalMoveSliceWindowIteratorHacks
,
typename
BGlobalMoveSliceWindowIteratorHacks
>
typename
BGlobalMoveSliceWindowIteratorHacks
>
struct
GridwiseStaticGemm_km_kn_mn_v
3
struct
GridwiseStaticGemm_km_kn_mn_v
2
{
{
__host__
__device__
static
constexpr
index_t
GetSharedMemoryNumberOfByte
()
__host__
__device__
static
constexpr
index_t
GetSharedMemoryNumberOfByte
()
{
{
...
@@ -237,10 +237,12 @@ struct GridwiseStaticGemm_km_kn_mn_v3
...
@@ -237,10 +237,12 @@ struct GridwiseStaticGemm_km_kn_mn_v3
c_thread_buf
;
c_thread_buf
;
// initialize output thread tensor
// initialize output thread tensor
#if 0
ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
decltype(c_k_n_ho_wo_thread_desc),
decltype(c_k_n_ho_wo_thread_desc),
Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
.Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
.Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
#endif
constexpr
auto
b_thread_slice_copy_step
=
make_multi_index
(
EPerBlock
,
0
,
0
,
0
);
constexpr
auto
b_thread_slice_copy_step
=
make_multi_index
(
EPerBlock
,
0
,
0
,
0
);
...
...
composable_kernel/include/tensor_operation/gridwise_static_gemm_v3.hpp
0 → 100644
View file @
5ed51b71
This diff is collapsed.
Click to expand it.
host/driver_offline/CMakeLists.txt
View file @
5ed51b71
...
@@ -13,9 +13,15 @@ include_directories(BEFORE
...
@@ -13,9 +13,15 @@ include_directories(BEFORE
set
(
CONV_FWD_DRIVER_OFFLINE_SOURCE conv_fwd_driver_offline.cpp
)
set
(
CONV_FWD_DRIVER_OFFLINE_SOURCE conv_fwd_driver_offline.cpp
)
set
(
CONV_BWD_DRIVER_OFFLINE_SOURCE conv_bwd_driver_offline.cpp
)
set
(
CONV_BWD_DRIVER_OFFLINE_SOURCE conv_bwd_driver_offline.cpp
)
set
(
CONV_ADD_FWD_DRIVER_OFFLINE_SOURCE conv_add_fwd_driver_offline.cpp
)
set
(
CONV_ACTIV_FWD_DRIVER_OFFLINE_SOURCE conv_activ_fwd_driver_offline.cpp
)
add_executable
(
conv_fwd_driver_offline
${
CONV_FWD_DRIVER_OFFLINE_SOURCE
}
)
add_executable
(
conv_fwd_driver_offline
${
CONV_FWD_DRIVER_OFFLINE_SOURCE
}
)
add_executable
(
conv_bwd_driver_offline
${
CONV_BWD_DRIVER_OFFLINE_SOURCE
}
)
add_executable
(
conv_bwd_driver_offline
${
CONV_BWD_DRIVER_OFFLINE_SOURCE
}
)
add_executable
(
conv_add_fwd_driver_offline
${
CONV_ADD_FWD_DRIVER_OFFLINE_SOURCE
}
)
add_executable
(
conv_activ_fwd_driver_offline
${
CONV_ACTIV_FWD_DRIVER_OFFLINE_SOURCE
}
)
target_link_libraries
(
conv_fwd_driver_offline PRIVATE host_tensor
)
target_link_libraries
(
conv_fwd_driver_offline PRIVATE host_tensor
)
target_link_libraries
(
conv_bwd_driver_offline PRIVATE host_tensor
)
target_link_libraries
(
conv_bwd_driver_offline PRIVATE host_tensor
)
target_link_libraries
(
conv_add_fwd_driver_offline PRIVATE host_tensor
)
target_link_libraries
(
conv_activ_fwd_driver_offline PRIVATE host_tensor
)
host/driver_offline/conv_activ_fwd_driver_offline.cpp
0 → 100644
View file @
5ed51b71
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHW 1
enum
ConvForwardAlgo
{
V5R1NCHW
};
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
// static mode
if
(
argc
<
7
)
{
printf
(
"arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat
\n
"
);
exit
(
1
);
}
const
ConvTensorLayout
layout
=
static_cast
<
ConvTensorLayout
>
(
atoi
(
argv
[
1
]));
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
atoi
(
argv
[
2
]));
const
bool
do_verification
=
atoi
(
argv
[
3
]);
const
int
init_method
=
atoi
(
argv
[
4
]);
const
bool
do_log
=
atoi
(
argv
[
5
]);
const
int
nrepeat
=
atoi
(
argv
[
6
]);
#if 1
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
540
;
constexpr
index_t
Wi
=
960
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
480
;
constexpr
index_t
Wi
=
270
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
240
;
constexpr
index_t
Wi
=
135
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
540
;
constexpr
index_t
Wi
=
960
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
480
;
constexpr
index_t
Wi
=
270
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
8
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
4
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#endif
const
index_t
conv_stride_h
=
1
;
const
index_t
conv_stride_w
=
1
;
const
index_t
conv_dilation_h
=
1
;
const
index_t
conv_dilation_w
=
1
;
const
index_t
in_left_pad_h
=
1
;
const
index_t
in_left_pad_w
=
1
;
const
index_t
in_right_pad_h
=
1
;
const
index_t
in_right_pad_w
=
1
;
const
index_t
YEff
=
(
Y
-
1
)
*
conv_dilation_h
+
1
;
const
index_t
XEff
=
(
X
-
1
)
*
conv_dilation_w
+
1
;
const
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
YEff
)
/
conv_stride_h
+
1
;
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
#if 0
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif
1
using
in_data_t
=
half_t
;
using
acc_data_t
=
float
;
using
out_data_t
=
half_t
;
#elif 1
using
in_data_t
=
int8_t
;
using
acc_data_t
=
int32_t
;
using
out_data_t
=
int8_t
;
#endif
std
::
vector
<
std
::
size_t
>
in_lengths_host
(
4
),
wei_lengths_host
(
4
),
out_lengths_host
(
4
);
switch
(
layout
)
{
case
ConvTensorLayout
::
NCHW
:
// NCHW
in_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
in_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
C
);
in_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Hi
);
in_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wi
);
wei_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
wei_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
C
);
wei_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Y
);
wei_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
X
);
out_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
out_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K
);
out_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Ho
);
out_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wo
);
break
;
case
ConvTensorLayout
::
NHWC
:
// NHWC
in_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
in_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Hi
);
in_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Wi
);
in_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
C
);
wei_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
wei_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Y
);
wei_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
X
);
wei_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
C
);
out_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
out_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Ho
);
out_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Wo
);
out_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
K
);
break
;
default:
throw
std
::
runtime_error
(
"wrong! not implemented"
);
}
Tensor
<
in_data_t
>
in
(
in_lengths_host
);
Tensor
<
in_data_t
>
wei
(
wei_lengths_host
);
Tensor
<
out_data_t
>
out_host
(
out_lengths_host
);
Tensor
<
out_data_t
>
out_device
(
out_lengths_host
);
Tensor
<
out_data_t
>
add_device
(
out_lengths_host
);
std
::
cout
<<
"layout: "
<<
layout
<<
std
::
endl
;
ostream_HostTensorDescriptor
(
in
.
mDesc
,
std
::
cout
<<
"in: "
);
ostream_HostTensorDescriptor
(
wei
.
mDesc
,
std
::
cout
<<
"wei: "
);
ostream_HostTensorDescriptor
(
out_host
.
mDesc
,
std
::
cout
<<
"out: "
);
print_array
(
"InLeftPads"
,
make_tuple
(
in_left_pad_h
,
in_left_pad_w
));
print_array
(
"InRightPads"
,
make_tuple
(
in_right_pad_h
,
in_right_pad_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
switch
(
init_method
)
{
case
0
:
// no initialization
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
break
;
case
2
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
break
;
case
3
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
break
;
case
4
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
break
;
case
5
:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
float
>
{
0.0
,
1.0
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
float
>
{
-
0.5
,
0.5
},
num_thread
);
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
1
,
5
},
num_thread
);
auto
gen_wei
=
[](
auto
...
is
)
{
return
GeneratorTensor_2
{
1
,
5
}(
is
...)
*
GeneratorTensor_Checkboard
{}(
is
...);
};
wei
.
GenerateTensorValue
(
gen_wei
,
num_thread
);
}
auto
f_make_for_device_nchw
=
[
&
]()
{
const
auto
in_lengths_dev
=
make_tuple
(
Number
<
N
>
{},
Number
<
C
>
{},
Number
<
Hi
>
{},
Number
<
Wi
>
{});
const
auto
wei_lengths_dev
=
make_tuple
(
Number
<
K
>
{},
Number
<
C
>
{},
Number
<
Y
>
{},
Number
<
X
>
{});
const
auto
out_lengths_dev
=
make_tuple
(
Number
<
N
>
{},
Number
<
K
>
{},
Number
<
Ho
>
{},
Number
<
Wo
>
{});
const
auto
conv_strides_dev
=
make_tuple
(
Number
<
conv_stride_h
>
{},
Number
<
conv_stride_w
>
{});
const
auto
conv_dilations_dev
=
make_tuple
(
Number
<
conv_dilation_h
>
{},
Number
<
conv_dilation_w
>
{});
const
auto
in_left_pads_dev
=
make_tuple
(
Number
<
in_left_pad_h
>
{},
Number
<
in_left_pad_w
>
{});
const
auto
in_right_pads_dev
=
make_tuple
(
Number
<
in_right_pad_h
>
{},
Number
<
in_right_pad_w
>
{});
return
make_tuple
(
in_lengths_dev
,
wei_lengths_dev
,
out_lengths_dev
,
conv_strides_dev
,
conv_dilations_dev
,
in_left_pads_dev
,
in_right_pads_dev
);
};
auto
f_make_for_device_nhwc
=
[
&
]()
{
const
auto
in_lengths_dev
=
make_tuple
(
Number
<
N
>
{},
Number
<
Hi
>
{},
Number
<
Wi
>
{},
Number
<
C
>
{});
const
auto
wei_lengths_dev
=
make_tuple
(
Number
<
K
>
{},
Number
<
Y
>
{},
Number
<
X
>
{},
Number
<
C
>
{});
const
auto
out_lengths_dev
=
make_tuple
(
Number
<
N
>
{},
Number
<
Ho
>
{},
Number
<
Wo
>
{},
Number
<
K
>
{});
const
auto
conv_strides_dev
=
make_tuple
(
Number
<
conv_stride_h
>
{},
Number
<
conv_stride_w
>
{});
const
auto
conv_dilations_dev
=
make_tuple
(
Number
<
conv_dilation_h
>
{},
Number
<
conv_dilation_w
>
{});
const
auto
in_left_pads_dev
=
make_tuple
(
Number
<
in_left_pad_h
>
{},
Number
<
in_left_pad_w
>
{});
const
auto
in_right_pads_dev
=
make_tuple
(
Number
<
in_right_pad_h
>
{},
Number
<
in_right_pad_w
>
{});
return
make_tuple
(
in_lengths_dev
,
wei_lengths_dev
,
out_lengths_dev
,
conv_strides_dev
,
conv_dilations_dev
,
in_left_pads_dev
,
in_right_pads_dev
);
};
constexpr
ck
::
index_t
activ_type
=
2
;
#if USE_CONV_FWD_V5R1_NCHW
if
(
algo
==
ConvForwardAlgo
::
V5R1NCHW
)
{
if
(
layout
!=
ConvTensorLayout
::
NCHW
)
{
throw
std
::
runtime_error
(
"wrong! layout"
);
}
const
auto
tmp
=
f_make_for_device_nchw
();
device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
<
in_data_t
,
8
,
8
,
activ_type
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
tmp
[
I4
],
tmp
[
I5
],
tmp
[
I6
],
in
,
wei
,
out_device
,
nrepeat
);
}
#endif
if
(
do_verification
)
{
host_direct_convolution_activ
(
in
,
wei
,
out_host
,
make_tuple
(
conv_stride_h
,
conv_stride_w
),
make_tuple
(
conv_dilation_h
,
conv_dilation_w
),
make_tuple
(
in_left_pad_h
,
in_left_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
activ_type
,
layout
);
check_error
(
out_host
,
out_device
);
#if 0
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
}
#endif
}
}
host/driver_offline/conv_add_fwd_driver_offline.cpp
0 → 100644
View file @
5ed51b71
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHW 1
enum
ConvForwardAlgo
{
V5R1NCHW
};
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
constexpr
auto
I7
=
Number
<
7
>
{};
// static mode
if
(
argc
<
7
)
{
printf
(
"arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat
\n
"
);
exit
(
1
);
}
const
ConvTensorLayout
layout
=
static_cast
<
ConvTensorLayout
>
(
atoi
(
argv
[
1
]));
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
atoi
(
argv
[
2
]));
const
bool
do_verification
=
atoi
(
argv
[
3
]);
const
int
init_method
=
atoi
(
argv
[
4
]);
const
bool
do_log
=
atoi
(
argv
[
5
]);
const
int
nrepeat
=
atoi
(
argv
[
6
]);
#if 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 1080;
constexpr index_t Wi = 1920;
constexpr index_t K = 16;
constexpr index_t Y = 3;
constexpr index_t X = 3;
#elif
0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
540
;
constexpr
index_t
Wi
=
960
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
480
;
constexpr
index_t
Wi
=
270
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 1
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
240
;
constexpr
index_t
Wi
=
135
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
540
;
constexpr
index_t
Wi
=
960
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
480
;
constexpr
index_t
Wi
=
270
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
8
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
4
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#endif
const
index_t
conv_stride_h
=
1
;
const
index_t
conv_stride_w
=
1
;
const
index_t
conv_dilation_h
=
1
;
const
index_t
conv_dilation_w
=
1
;
const
index_t
in_left_pad_h
=
1
;
const
index_t
in_left_pad_w
=
1
;
const
index_t
in_right_pad_h
=
1
;
const
index_t
in_right_pad_w
=
1
;
const
index_t
YEff
=
(
Y
-
1
)
*
conv_dilation_h
+
1
;
const
index_t
XEff
=
(
X
-
1
)
*
conv_dilation_w
+
1
;
const
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
YEff
)
/
conv_stride_h
+
1
;
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
const
index_t
Hox2
=
Ho
*
2
;
const
index_t
Wox2
=
Wo
*
2
;
#if 0
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif
1
using
in_data_t
=
half_t
;
using
acc_data_t
=
float
;
using
out_data_t
=
half_t
;
#elif 1
using
in_data_t
=
int8_t
;
using
acc_data_t
=
int32_t
;
using
out_data_t
=
int8_t
;
#endif
std
::
vector
<
std
::
size_t
>
in_lengths_host
(
4
),
wei_lengths_host
(
4
),
out_lengths_host
(
4
),
add_lengths_host
(
4
);
switch
(
layout
)
{
case
ConvTensorLayout
::
NCHW
:
// NCHW
in_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
in_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
C
);
in_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Hi
);
in_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wi
);
wei_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
wei_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
C
);
wei_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Y
);
wei_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
X
);
out_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
out_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K
);
out_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Ho
);
out_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wo
);
add_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
add_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K
);
add_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Hox2
);
add_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wox2
);
break
;
case
ConvTensorLayout
::
NHWC
:
// NHWC
in_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
in_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Hi
);
in_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Wi
);
in_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
C
);
wei_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
wei_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Y
);
wei_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
X
);
wei_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
C
);
out_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
out_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Ho
);
out_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Wo
);
out_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
K
);
add_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
add_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Hox2
);
add_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Wox2
);
add_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
K
);
break
;
default:
throw
std
::
runtime_error
(
"wrong! not implemented"
);
}
Tensor
<
in_data_t
>
in
(
in_lengths_host
);
Tensor
<
in_data_t
>
wei
(
wei_lengths_host
);
Tensor
<
in_data_t
>
add
(
add_lengths_host
);
Tensor
<
out_data_t
>
out_host
(
add_lengths_host
);
Tensor
<
out_data_t
>
out_device
(
add_lengths_host
);
std
::
cout
<<
"layout: "
<<
layout
<<
std
::
endl
;
ostream_HostTensorDescriptor
(
in
.
mDesc
,
std
::
cout
<<
"in: "
);
ostream_HostTensorDescriptor
(
wei
.
mDesc
,
std
::
cout
<<
"wei: "
);
ostream_HostTensorDescriptor
(
add
.
mDesc
,
std
::
cout
<<
"add: "
);
ostream_HostTensorDescriptor
(
out_host
.
mDesc
,
std
::
cout
<<
"out: "
);
print_array
(
"InLeftPads"
,
make_tuple
(
in_left_pad_h
,
in_left_pad_w
));
print_array
(
"InRightPads"
,
make_tuple
(
in_right_pad_h
,
in_right_pad_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
switch
(
init_method
)
{
case
0
:
// no initialization
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
break
;
case
2
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
break
;
case
3
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
break
;
case
4
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
break
;
case
5
:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
float
>
{
0.0
,
1.0
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
float
>
{
-
0.5
,
0.5
},
num_thread
);
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
1
,
5
},
num_thread
);
auto
gen_wei
=
[](
auto
...
is
)
{
return
GeneratorTensor_2
{
1
,
5
}(
is
...)
*
GeneratorTensor_Checkboard
{}(
is
...);
};
wei
.
GenerateTensorValue
(
gen_wei
,
num_thread
);
}
auto
f_make_for_device_nchw
=
[
&
]()
{
const
auto
in_lengths_dev
=
make_tuple
(
Number
<
N
>
{},
Number
<
C
>
{},
Number
<
Hi
>
{},
Number
<
Wi
>
{});
const
auto
wei_lengths_dev
=
make_tuple
(
Number
<
K
>
{},
Number
<
C
>
{},
Number
<
Y
>
{},
Number
<
X
>
{});
const
auto
out_lengths_dev
=
make_tuple
(
Number
<
N
>
{},
Number
<
K
>
{},
Number
<
Ho
>
{},
Number
<
Wo
>
{});
const
auto
add_lengths_dev
=
make_tuple
(
Number
<
N
>
{},
Number
<
K
>
{},
Number
<
Hox2
>
{},
Number
<
Wox2
>
{});
const
auto
conv_strides_dev
=
make_tuple
(
Number
<
conv_stride_h
>
{},
Number
<
conv_stride_w
>
{});
const
auto
conv_dilations_dev
=
make_tuple
(
Number
<
conv_dilation_h
>
{},
Number
<
conv_dilation_w
>
{});
const
auto
in_left_pads_dev
=
make_tuple
(
Number
<
in_left_pad_h
>
{},
Number
<
in_left_pad_w
>
{});
const
auto
in_right_pads_dev
=
make_tuple
(
Number
<
in_right_pad_h
>
{},
Number
<
in_right_pad_w
>
{});
return
make_tuple
(
in_lengths_dev
,
wei_lengths_dev
,
add_lengths_dev
,
out_lengths_dev
,
conv_strides_dev
,
conv_dilations_dev
,
in_left_pads_dev
,
in_right_pads_dev
);
};
auto
f_make_for_device_nhwc
=
[
&
]()
{
const
auto
in_lengths_dev
=
make_tuple
(
Number
<
N
>
{},
Number
<
Hi
>
{},
Number
<
Wi
>
{},
Number
<
C
>
{});
const
auto
wei_lengths_dev
=
make_tuple
(
Number
<
K
>
{},
Number
<
Y
>
{},
Number
<
X
>
{},
Number
<
C
>
{});
const
auto
out_lengths_dev
=
make_tuple
(
Number
<
N
>
{},
Number
<
Ho
>
{},
Number
<
Wo
>
{},
Number
<
K
>
{});
const
auto
add_lengths_dev
=
make_tuple
(
Number
<
N
>
{},
Number
<
Hox2
>
{},
Number
<
Wox2
>
{},
Number
<
K
>
{});
const
auto
conv_strides_dev
=
make_tuple
(
Number
<
conv_stride_h
>
{},
Number
<
conv_stride_w
>
{});
const
auto
conv_dilations_dev
=
make_tuple
(
Number
<
conv_dilation_h
>
{},
Number
<
conv_dilation_w
>
{});
const
auto
in_left_pads_dev
=
make_tuple
(
Number
<
in_left_pad_h
>
{},
Number
<
in_left_pad_w
>
{});
const
auto
in_right_pads_dev
=
make_tuple
(
Number
<
in_right_pad_h
>
{},
Number
<
in_right_pad_w
>
{});
return
make_tuple
(
in_lengths_dev
,
wei_lengths_dev
,
add_lengths_dev
,
out_lengths_dev
,
conv_strides_dev
,
conv_dilations_dev
,
in_left_pads_dev
,
in_right_pads_dev
);
};
constexpr
ck
::
index_t
activ_type
=
2
;
#if USE_CONV_FWD_V5R1_NCHW
if
(
algo
==
ConvForwardAlgo
::
V5R1NCHW
)
{
if
(
layout
!=
ConvTensorLayout
::
NCHW
)
{
throw
std
::
runtime_error
(
"wrong! layout"
);
}
const
auto
tmp
=
f_make_for_device_nchw
();
#if 1
device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
<
in_data_t
,
8
,
8
,
activ_type
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
// in_lengths_dev
tmp
[
I1
],
// wei_lengths_dev
tmp
[
I2
],
// add_lengths_dev
tmp
[
I3
],
// out_lengths_dev
tmp
[
I4
],
tmp
[
I5
],
tmp
[
I6
],
tmp
[
I7
],
in
,
wei
,
add
,
out_device
,
nrepeat
);
#endif
}
#endif
if
(
do_verification
)
{
host_direct_convolution_add
(
in
,
wei
,
add
,
out_host
,
make_tuple
(
conv_stride_h
,
conv_stride_w
),
make_tuple
(
conv_dilation_h
,
conv_dilation_w
),
make_tuple
(
in_left_pad_h
,
in_left_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
activ_type
,
layout
);
check_error
(
out_host
,
out_device
);
#if 0
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
}
#endif
}
}
host/driver_offline/conv_fwd_driver_offline.cpp
View file @
5ed51b71
...
@@ -15,16 +15,15 @@
...
@@ -15,16 +15,15 @@
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE
0
#define USE_DYNAMIC_MODE
1
#define USE_CONV_FWD_V4R4_NCHW
0
#define USE_CONV_FWD_V4R4_NCHW
1
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V5R1_NCHW
1
#define USE_CONV_FWD_V5R1_NCHW
0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
...
@@ -103,82 +102,16 @@ int main(int argc, char* argv[])
...
@@ -103,82 +102,16 @@ int main(int argc, char* argv[])
const
bool
do_log
=
atoi
(
argv
[
5
]);
const
bool
do_log
=
atoi
(
argv
[
5
]);
const
int
nrepeat
=
atoi
(
argv
[
6
]);
const
int
nrepeat
=
atoi
(
argv
[
6
]);
#if 1
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
192
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
71
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
71
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
256
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
540
;
constexpr
index_t
Wi
=
960
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
480
;
constexpr
index_t
Wi
=
270
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
240
;
constexpr
index_t
Wi
=
135
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
540
;
constexpr
index_t
Wi
=
960
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
480
;
constexpr
index_t
Wi
=
270
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
8
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
4
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
constexpr
index_t
X
=
3
;
#endif
const
index_t
conv_stride_h
=
1
;
const
index_t
conv_stride_h
=
2
;
const
index_t
conv_stride_w
=
1
;
const
index_t
conv_stride_w
=
2
;
const
index_t
conv_dilation_h
=
1
;
const
index_t
conv_dilation_h
=
1
;
const
index_t
conv_dilation_w
=
1
;
const
index_t
conv_dilation_w
=
1
;
const
index_t
in_left_pad_h
=
1
;
const
index_t
in_left_pad_h
=
1
;
...
@@ -193,7 +126,7 @@ int main(int argc, char* argv[])
...
@@ -193,7 +126,7 @@ int main(int argc, char* argv[])
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
#endif
#endif
#if
0
#if
1
using
in_data_t
=
float
;
using
in_data_t
=
float
;
using
acc_data_t
=
float
;
using
acc_data_t
=
float
;
using
out_data_t
=
float
;
using
out_data_t
=
float
;
...
@@ -437,8 +370,6 @@ int main(int argc, char* argv[])
...
@@ -437,8 +370,6 @@ int main(int argc, char* argv[])
}
}
#endif
#endif
constexpr
ck
::
index_t
activ_type
=
2
;
#if USE_CONV_FWD_V5R1_NCHW
#if USE_CONV_FWD_V5R1_NCHW
if
(
algo
==
ConvForwardAlgo
::
V5R1NCHW
)
if
(
algo
==
ConvForwardAlgo
::
V5R1NCHW
)
{
{
...
@@ -449,12 +380,10 @@ int main(int argc, char* argv[])
...
@@ -449,12 +380,10 @@ int main(int argc, char* argv[])
const
auto
tmp
=
f_make_for_device_nchw
();
const
auto
tmp
=
f_make_for_device_nchw
();
#if 1
device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
<
in_data_t
,
device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
16
,
#else
acc_data_t
,
device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
out_data_t
>
(
tmp
[
I0
],
#endif
<
in_data_t
,
8
,
8
,
activ_type
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I2
],
tmp
[
I3
],
tmp
[
I3
],
...
@@ -524,14 +453,13 @@ int main(int argc, char* argv[])
...
@@ -524,14 +453,13 @@ int main(int argc, char* argv[])
if
(
do_verification
)
if
(
do_verification
)
{
{
host_direct_convolution
_activ
(
in
,
host_direct_convolution
(
in
,
wei
,
wei
,
out_host
,
out_host
,
make_tuple
(
conv_stride_h
,
conv_stride_w
),
make_tuple
(
conv_stride_h
,
conv_stride_w
),
make_tuple
(
conv_dilation_h
,
conv_dilation_w
),
make_tuple
(
conv_dilation_h
,
conv_dilation_w
),
make_tuple
(
in_left_pad_h
,
in_left_pad_w
),
make_tuple
(
in_left_pad_h
,
in_left_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
activ_type
,
layout
);
layout
);
check_error
(
out_host
,
out_device
);
check_error
(
out_host
,
out_device
);
...
...
host/driver_offline/include/device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
0 → 100644
View file @
5ed51b71
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp"
template
<
typename
TInWei
,
ck
::
index_t
InWeiVectorSize
,
ck
::
index_t
OutVectorSize
,
ck
::
index_t
activ_type
,
typename
TAcc
,
typename
TOut
,
typename
InLengths
,
typename
WeiLengths
,
typename
AddLengths
,
typename
OutLengths
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
(
const
InLengths
&
in_n_c_hi_wi_lengths
,
const
WeiLengths
&
wei_k_c_y_x_lengths
,
const
AddLengths
&
add_n_k_hox2_wox2_lengths
,
const
OutLengths
&
out_n_k_ho_wo_lengths
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
in_right_pads
,
const
Tensor
<
TInWei
>&
in_n_c_hi_wi
,
const
Tensor
<
TInWei
>&
wei_k_c_y_x
,
const
Tensor
<
TOut
>&
add_n_k_hox2_wox2
,
Tensor
<
TOut
>&
out_n_k_hox2_wox2
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
std
::
cout
<<
__func__
<<
std
::
endl
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
const
auto
N
=
out_n_k_ho_wo_lengths
[
I0
];
const
auto
K
=
out_n_k_ho_wo_lengths
[
I1
];
const
auto
C
=
wei_k_c_y_x_lengths
[
I1
];
const
auto
Hi
=
in_n_c_hi_wi_lengths
[
I2
];
const
auto
Wi
=
in_n_c_hi_wi_lengths
[
I3
];
const
auto
Ho
=
out_n_k_ho_wo_lengths
[
I2
];
const
auto
Wo
=
out_n_k_ho_wo_lengths
[
I3
];
const
auto
Hox2
=
Ho
*
2
;
const
auto
Wox2
=
Wo
*
2
;
const
auto
Y
=
wei_k_c_y_x_lengths
[
I2
];
const
auto
X
=
wei_k_c_y_x_lengths
[
I3
];
const
auto
C0
=
C
/
Number
<
InWeiVectorSize
>
{};
const
auto
C1
=
Number
<
InWeiVectorSize
>
{};
const
auto
K0
=
K
/
Number
<
OutVectorSize
>
{};
const
auto
K1
=
Number
<
OutVectorSize
>
{};
Tensor
<
TInWei
>
in_n_c0_hi_wi_c1
(
HostTensorDescriptor
(
std
::
initializer_list
<
index_t
>
{
N
,
C0
,
Hi
,
Wi
,
C1
}));
Tensor
<
TInWei
>
wei_k_c0_y_x_c1
(
HostTensorDescriptor
(
std
::
initializer_list
<
index_t
>
{
K
,
C0
,
Y
,
X
,
C1
}));
Tensor
<
TOut
>
out_n_k0_hox2_wox2_k1
(
HostTensorDescriptor
(
std
::
initializer_list
<
index_t
>
{
N
,
K0
,
Hox2
,
Wox2
,
K1
}));
Tensor
<
TOut
>
add_n_k0_hox2_wox2_k1
(
HostTensorDescriptor
(
std
::
initializer_list
<
index_t
>
{
N
,
K0
,
Hox2
,
Wox2
,
K1
}));
auto
f_nchw2nc0hwc1
=
[
&
](
auto
n
,
auto
hi
,
auto
wi
,
auto
c
)
{
in_n_c0_hi_wi_c1
(
n
,
c
/
InWeiVectorSize
,
hi
,
wi
,
c
%
InWeiVectorSize
)
=
in_n_c_hi_wi
(
n
,
c
,
hi
,
wi
);
};
auto
f_kcyx2kc0yxc1
=
[
&
](
auto
k
,
auto
y
,
auto
x
,
auto
c
)
{
wei_k_c0_y_x_c1
(
k
,
c
/
InWeiVectorSize
,
y
,
x
,
c
%
InWeiVectorSize
)
=
wei_k_c_y_x
(
k
,
c
,
y
,
x
);
};
auto
f_nchx2wx2_to_nc0hx2wx2c1
=
[
&
](
auto
n
,
auto
ho
,
auto
wo
,
auto
c
)
{
add_n_k0_hox2_wox2_k1
(
n
,
c
/
InWeiVectorSize
,
ho
,
wo
,
c
%
InWeiVectorSize
)
=
add_n_k_hox2_wox2
(
n
,
c
,
ho
,
wo
);
};
make_ParallelTensorFunctor
(
f_nchw2nc0hwc1
,
N
,
Hi
,
Wi
,
C
)();
make_ParallelTensorFunctor
(
f_kcyx2kc0yxc1
,
K
,
Y
,
X
,
C
)();
make_ParallelTensorFunctor
(
f_nchx2wx2_to_nc0hx2wx2c1
,
N
,
Hox2
,
Wox2
,
K
)();
DeviceMem
in_n_c0_hi_wi_c1_device_buf
(
sizeof
(
TInWei
)
*
in_n_c0_hi_wi_c1
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_c0_y_x_c1_device_buf
(
sizeof
(
TInWei
)
*
wei_k_c0_y_x_c1
.
mDesc
.
GetElementSpace
());
DeviceMem
add_n_k0_hox2_wox2_k1_device_buf
(
sizeof
(
TOut
)
*
add_n_k0_hox2_wox2_k1
.
mDesc
.
GetElementSpace
());
DeviceMem
out_n_k0_hox2_wox2_k1_device_buf
(
sizeof
(
TOut
)
*
out_n_k0_hox2_wox2_k1
.
mDesc
.
GetElementSpace
());
in_n_c0_hi_wi_c1_device_buf
.
ToDevice
(
in_n_c0_hi_wi_c1
.
mData
.
data
());
wei_k_c0_y_x_c1_device_buf
.
ToDevice
(
wei_k_c0_y_x_c1
.
mData
.
data
());
const
auto
in_n_c0_hi_wi_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
N
,
C0
,
Hi
,
Wi
));
const
auto
wei_k_c0_y_x_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
K
,
C0
,
Y
,
X
));
const
auto
out_n_k0_ho_wo_k1_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
));
const
auto
add_n_k0_hox2_wox2_k1_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
N
,
K0
,
Hox2
,
Wox2
,
K1
));
// cdata = 64, BlockSize = 64, 16x8x32x4
constexpr
index_t
BlockSize
=
64
;
constexpr
index_t
KPerBlock
=
K
;
constexpr
index_t
HoPerBlock
=
8
;
constexpr
index_t
WoPerBlock
=
32
;
constexpr
index_t
EPerBlock
=
C0
;
constexpr
index_t
KPerThread
=
KPerBlock
;
constexpr
index_t
HoPerThread
=
2
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
EPerThread
=
EPerBlock
;
using
ABlockTransferThreadSliceLengths_E_K
=
Sequence
<
Y
*
X
,
1
>
;
using
ABlockTransferThreadClusterLengths_E_K
=
Sequence
<
EPerBlock
,
KPerBlock
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_E
=
1
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K
=
1
;
constexpr
index_t
BThreadTransferSrcScalarPerVector_W
=
1
;
constexpr
index_t
CThreadTransferDstScalarPerVector_W
=
K1
;
static_assert
(
KPerThread
%
CThreadTransferDstScalarPerVector_W
==
0
,
""
);
constexpr
auto
conv_driver
=
DriverStaticConvolutionAddForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
<
BlockSize
,
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
,
TAcc
,
TOut
,
KPerBlock
,
HoPerBlock
,
WoPerBlock
,
EPerBlock
,
KPerThread
,
HoPerThread
,
WoPerThread
,
EPerThread
,
ABlockTransferThreadSliceLengths_E_K
,
ABlockTransferThreadClusterLengths_E_K
,
ABlockTransferSrcScalarPerVector_E
,
ABlockTransferDstScalarPerVector_K
,
BThreadTransferSrcScalarPerVector_W
,
CThreadTransferDstScalarPerVector_W
>
{};
conv_driver
.
Run
(
wei_k_c0_y_x_desc
,
in_n_c0_hi_wi_desc
,
add_n_k0_hox2_wox2_k1_desc
,
out_n_k0_ho_wo_k1_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
,
Number
<
activ_type
>
{},
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
wei_k_c0_y_x_c1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
in_n_c0_hi_wi_c1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
add_n_k0_hox2_wox2_k1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
out_n_k0_hox2_wox2_k1_device_buf
.
GetDeviceBuffer
()));
out_n_k0_hox2_wox2_k1_device_buf
.
FromDevice
(
out_n_k0_hox2_wox2_k1
.
mData
.
data
());
auto
f_nk0hwk1_to_nkhw
=
[
&
](
auto
n
,
auto
k
,
auto
ho
,
auto
wo
)
{
out_n_k_hox2_wox2
(
n
,
k
,
ho
,
wo
)
=
out_n_k0_hox2_wox2_k1
(
n
,
k
/
InWeiVectorSize
,
ho
,
wo
,
k
%
InWeiVectorSize
);
};
make_ParallelTensorFunctor
(
f_nk0hwk1_to_nkhw
,
N
,
K
,
Hox2
,
Wox2
)();
}
host/host_tensor/include/host_conv.hpp
View file @
5ed51b71
...
@@ -188,6 +188,115 @@ void host_direct_convolution_activ(const Tensor<TIn>& in,
...
@@ -188,6 +188,115 @@ void host_direct_convolution_activ(const Tensor<TIn>& in,
}
}
}
}
template
<
typename
TIn
,
typename
TWei
,
typename
TOut
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
host_direct_convolution_add
(
const
Tensor
<
TIn
>&
in
,
const
Tensor
<
TWei
>&
wei
,
const
Tensor
<
TOut
>&
add
,
Tensor
<
TOut
>&
out
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
in_right_pads
,
const
ck
::
index_t
activ_type
,
const
ConvTensorLayout
layout
=
ConvTensorLayout
::
NCHW
)
{
using
namespace
ck
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
auto
f_nchw
=
[
&
](
auto
n
,
auto
k
,
auto
ho
,
auto
wo
)
{
double
v
=
0
;
for
(
int
c
=
0
;
c
<
wei
.
mDesc
.
GetLengths
()[
1
];
++
c
)
{
for
(
int
y
=
0
;
y
<
wei
.
mDesc
.
GetLengths
()[
2
];
++
y
)
{
int
hi
=
ho
*
conv_strides
[
I0
]
+
y
*
conv_dilations
[
I0
]
-
in_left_pads
[
I0
];
for
(
int
x
=
0
;
x
<
wei
.
mDesc
.
GetLengths
()[
3
];
++
x
)
{
int
wi
=
wo
*
conv_strides
[
I1
]
+
x
*
conv_dilations
[
I1
]
-
in_left_pads
[
I1
];
if
(
hi
>=
0
&&
hi
<
in
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
wi
<
in
.
mDesc
.
GetLengths
()[
3
])
{
v
+=
static_cast
<
const
double
>
(
in
(
n
,
c
,
hi
,
wi
))
*
static_cast
<
const
double
>
(
wei
(
k
,
c
,
y
,
x
));
}
}
}
}
index_t
hox2
=
ho
*
2
;
index_t
wox2
=
wo
*
2
;
v
=
activ
(
v
,
activ_type
);
out
(
n
,
k
,
hox2
,
wox2
)
=
v
+
add
(
n
,
k
,
hox2
,
wox2
);
out
(
n
,
k
,
hox2
,
wox2
+
1
)
=
v
+
add
(
n
,
k
,
hox2
,
wox2
+
1
);
out
(
n
,
k
,
hox2
+
1
,
wox2
)
=
v
+
add
(
n
,
k
,
hox2
+
1
,
wox2
);
out
(
n
,
k
,
hox2
+
1
,
wox2
+
1
)
=
v
+
add
(
n
,
k
,
hox2
+
1
,
wox2
+
1
);
};
auto
f_nhwc
=
[
&
](
auto
n
,
auto
ho
,
auto
wo
,
auto
k
)
{
double
v
=
0
;
for
(
int
c
=
0
;
c
<
wei
.
mDesc
.
GetLengths
()[
3
];
++
c
)
{
for
(
int
y
=
0
;
y
<
wei
.
mDesc
.
GetLengths
()[
1
];
++
y
)
{
int
hi
=
ho
*
conv_strides
[
I0
]
+
y
*
conv_dilations
[
I0
]
-
in_left_pads
[
I0
];
for
(
int
x
=
0
;
x
<
wei
.
mDesc
.
GetLengths
()[
2
];
++
x
)
{
int
wi
=
wo
*
conv_strides
[
I1
]
+
x
*
conv_dilations
[
I1
]
-
in_left_pads
[
I1
];
if
(
hi
>=
0
&&
hi
<
in
.
mDesc
.
GetLengths
()[
1
]
&&
wi
>=
0
&&
wi
<
in
.
mDesc
.
GetLengths
()[
2
])
{
v
+=
static_cast
<
const
double
>
(
in
(
n
,
hi
,
wi
,
c
))
*
static_cast
<
const
double
>
(
wei
(
k
,
y
,
x
,
c
));
}
}
}
}
index_t
hox2
=
ho
*
2
;
index_t
wox2
=
wo
*
2
;
v
=
activ
(
v
,
activ_type
);
out
(
n
,
k
,
hox2
,
wox2
)
=
v
+
add
(
n
,
k
,
hox2
,
wox2
);
out
(
n
,
k
,
hox2
,
wox2
+
1
)
=
v
+
add
(
n
,
k
,
hox2
,
wox2
+
1
);
out
(
n
,
k
,
hox2
+
1
,
wox2
)
=
v
+
add
(
n
,
k
,
hox2
+
1
,
wox2
);
out
(
n
,
k
,
hox2
+
1
,
wox2
+
1
)
=
v
+
add
(
n
,
k
,
hox2
+
1
,
wox2
+
1
);
};
switch
(
layout
)
{
case
ConvTensorLayout
::
NCHW
:
make_ParallelTensorFunctor
(
f_nchw
,
out
.
mDesc
.
GetLengths
()[
0
],
out
.
mDesc
.
GetLengths
()[
1
],
out
.
mDesc
.
GetLengths
()[
2
]
/
2
,
out
.
mDesc
.
GetLengths
()[
3
]
/
2
)(
std
::
thread
::
hardware_concurrency
());
break
;
case
ConvTensorLayout
::
NHWC
:
make_ParallelTensorFunctor
(
f_nhwc
,
out
.
mDesc
.
GetLengths
()[
0
],
out
.
mDesc
.
GetLengths
()[
1
],
out
.
mDesc
.
GetLengths
()[
2
]
/
2
,
out
.
mDesc
.
GetLengths
()[
3
]
/
2
)(
std
::
thread
::
hardware_concurrency
());
break
;
default:
throw
std
::
runtime_error
(
"wrong! not supported layout"
);
}
}
template
<
typename
TIn
,
typename
TWei
,
typename
TOut
,
typename
InLeftPads
,
typename
InRightPads
>
template
<
typename
TIn
,
typename
TWei
,
typename
TOut
,
typename
InLeftPads
,
typename
InRightPads
>
void
host_winograd_3x3_convolution
(
const
Tensor
<
TIn
>&
in_nchw
,
void
host_winograd_3x3_convolution
(
const
Tensor
<
TIn
>&
in_nchw
,
const
Tensor
<
TWei
>&
wei_kcyx
,
const
Tensor
<
TWei
>&
wei_kcyx
,
...
...
script/run.sh
View file @
5ed51b71
...
@@ -12,7 +12,7 @@
...
@@ -12,7 +12,7 @@
#export OLC_DEBUG_HIP_DUMP=1
#export OLC_DEBUG_HIP_DUMP=1
#export OLC_DEBUG_SAVE_TEMP_DIR=1
#export OLC_DEBUG_SAVE_TEMP_DIR=1
make
-j
conv_fwd_driver_offline
make
-j
conv_
add_
fwd_driver_offline
#make -j conv_bwd_driver_offline
#make -j conv_bwd_driver_offline
#make -j conv_fwd_driver_online
#make -j conv_fwd_driver_online
...
@@ -26,7 +26,7 @@ INIT=$4
...
@@ -26,7 +26,7 @@ INIT=$4
LOG
=
$5
LOG
=
$5
REPEAT
=
$6
REPEAT
=
$6
./host/driver_offline/conv_fwd_driver_offline
$LAYOUT
$ALGO
$VERIFY
$INIT
$LOG
$REPEAT
./host/driver_offline/conv_
add_
fwd_driver_offline
$LAYOUT
$ALGO
$VERIFY
$INIT
$LOG
$REPEAT
################################################ layout algo verify init log repeat N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
################################################ layout algo verify init log repeat N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment