Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
96d1a7a9
"...composable_kernel.git" did not exist on "42facfc6b7433595e052c6c0b53138dae84530d7"
Commit
96d1a7a9
authored
Dec 05, 2021
by
Chao Liu
Browse files
added profiler for conv+bias+relu+add
parent
cd929111
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
148 additions
and
145 deletions
+148
-145
example/4_conv_xdl_bias_relu_add/conv_xdl_bias_relu_add.cpp
example/4_conv_xdl_bias_relu_add/conv_xdl_bias_relu_add.cpp
+16
-79
profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
+132
-66
No files found.
example/4_conv_xdl_bias_relu_add/conv_xdl_bias_relu_add.cpp
View file @
96d1a7a9
...
@@ -12,17 +12,9 @@
...
@@ -12,17 +12,9 @@
#include "device_tensor.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "tensor_layout.hpp"
#include "device_conv2d_fwd_xdl_bias_activation_add_nhwc_kyxc_nhwk.hpp"
#include "device_conv2d_fwd_xdl_bias_activation_add_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
struct
PassThrough
struct
AddLeakyReluAdd
{
template
<
typename
T
>
__host__
__device__
constexpr
T
operator
()(
T
v
)
const
{
return
v
;
}
};
struct
BiasLeakyReluAdd
{
{
template
<
typename
T1
,
typename
T2
>
template
<
typename
T1
,
typename
T2
>
__host__
constexpr
float
operator
()(
float
v0
,
T1
v1
,
T2
v2
)
const
__host__
constexpr
float
operator
()(
float
v0
,
T1
v1
,
T2
v2
)
const
...
@@ -96,61 +88,6 @@ struct BiasLeakyReluAdd
...
@@ -96,61 +88,6 @@ struct BiasLeakyReluAdd
}
}
};
};
struct
BiasReluAdd
{
template
<
typename
T1
,
typename
T2
>
__host__
constexpr
float
operator
()(
float
v0
,
T1
v1
,
T2
v2
)
const
{
float
b
=
v0
+
v1
;
float
c
=
b
>
0
?
b
:
0
;
float
d
=
c
+
v2
;
return
d
;
}
template
<
typename
T1
,
typename
T2
>
__device__
constexpr
float
operator
()(
float
v0
,
T1
v1
,
T2
v2
)
const
{
#if 0
float a = v1 + v0;
float b = max(a, float(0));
float c = b + v2;
return c;
#else
float
b
=
v1
+
v2
;
float
c
=
(
v0
>
-
v1
)
?
b
+
v0
:
v2
;
return
c
;
#endif
}
};
struct
BiasLeakyRelu
{
template
<
typename
T1
,
typename
T2
>
__host__
constexpr
float
operator
()(
float
v0
,
T1
v1
,
T2
)
const
{
float
a
=
v0
+
v1
;
float
b
=
0.1
*
a
;
float
c
=
b
>
0
?
b
:
0
;
return
c
;
}
template
<
typename
T1
,
typename
T2
>
__device__
constexpr
float
operator
()(
float
v0
,
T1
v1
,
T2
)
const
{
constexpr
float
alpha
=
0.1
;
float
b
=
v1
+
v0
;
float
c
=
max
(
b
,
float
(
0
));
float
d
=
alpha
*
c
;
return
d
;
}
};
using
InDataType
=
ck
::
half_t
;
using
InDataType
=
ck
::
half_t
;
using
WeiDataType
=
ck
::
half_t
;
using
WeiDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
...
@@ -163,18 +100,18 @@ using InLayout = ck::tensor_layout::convolution::NHWC;
...
@@ -163,18 +100,18 @@ using InLayout = ck::tensor_layout::convolution::NHWC;
using
WeiLayout
=
ck
::
tensor_layout
::
convolution
::
KYXC
;
using
WeiLayout
=
ck
::
tensor_layout
::
convolution
::
KYXC
;
using
OutLayout
=
ck
::
tensor_layout
::
convolution
::
NHWK
;
using
OutLayout
=
ck
::
tensor_layout
::
convolution
::
NHWK
;
using
InElementOp
=
PassThrough
;
using
InElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
WeiElementOp
=
PassThrough
;
using
WeiElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
OutElementOp
=
Bias
ReluAdd
;
using
OutElementOp
=
ck
::
tensor_operation
::
element_wise
::
Add
ReluAdd
;
// clang-format off
using
DeviceConvFwdInstance
=
ck
::
tensor_operation
::
device
::
using
DeviceConvFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceConv2dFwdXdl_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
DeviceConv2dFwdXdl_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
// clang-format off
// | InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
// | InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
// | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
// | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<
InDataType
,
WeiDataType
,
OutDataType
,
AccDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
256
,
128
,
256
,
4
,
8
,
32
,
32
,
2
,
4
,
S
<
1
,
2
,
8
>
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
S
<
1
,
4
,
8
>
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
7
,
1
,
true
,
true
>
;
<
InDataType
,
WeiDataType
,
OutDataType
,
AccDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
256
,
128
,
256
,
4
,
8
,
32
,
32
,
2
,
4
,
S
<
1
,
2
,
8
>
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
S
<
1
,
4
,
8
>
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
7
,
1
,
true
,
true
>
;
// clang-format on
// clang-format on
template
<
typename
TIn
,
template
<
typename
TIn
,
...
@@ -191,7 +128,7 @@ void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
...
@@ -191,7 +128,7 @@ void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
const
std
::
vector
<
ck
::
index_t
>&
conv_strides
,
const
std
::
vector
<
ck
::
index_t
>&
conv_strides
,
const
std
::
vector
<
ck
::
index_t
>&
conv_dilations
,
const
std
::
vector
<
ck
::
index_t
>&
conv_dilations
,
const
std
::
vector
<
ck
::
index_t
>&
in_left_pads
,
const
std
::
vector
<
ck
::
index_t
>&
in_left_pads
,
const
std
::
vector
<
ck
::
index_t
>&
,
const
std
::
vector
<
ck
::
index_t
>&
/* in_right_pads */
,
const
InElementOp
&
in_element_op
,
const
InElementOp
&
in_element_op
,
const
WeiElementOp
&
wei_element_op
,
const
WeiElementOp
&
wei_element_op
,
const
OutElementOp
&
out_element_op
)
const
OutElementOp
&
out_element_op
)
...
@@ -356,8 +293,8 @@ int main(int argc, char* argv[])
...
@@ -356,8 +293,8 @@ int main(int argc, char* argv[])
default:
default:
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
0.0
,
1.0
});
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
0.0
,
1.0
});
wei_k_c_y_x
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
0.5
,
0.5
});
wei_k_c_y_x
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
0.5
,
0.5
});
bias_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
-
0.
5
,
0.5
});
bias_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
0.
0
,
1.0
});
resi_n_k_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
-
0.
5
,
0.5
});
resi_n_k_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
0.
0
,
1.0
});
}
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
...
@@ -397,8 +334,8 @@ int main(int argc, char* argv[])
...
@@ -397,8 +334,8 @@ int main(int argc, char* argv[])
if
(
!
conv
.
IsSupportedArgument
(
argument
))
if
(
!
conv
.
IsSupportedArgument
(
argument
))
{
{
throw
std
::
runtime_error
(
throw
std
::
runtime_error
(
"wrong! device
_conv
with the specified compilation parameters does "
"wrong! device
operator
with the specified compilation parameters does "
"not support this
Conv
problem"
);
"not support this problem"
);
}
}
float
ave_time
=
invoker
.
Run
(
argument
,
nrepeat
);
float
ave_time
=
invoker
.
Run
(
argument
,
nrepeat
);
...
...
profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
View file @
96d1a7a9
...
@@ -6,21 +6,23 @@
...
@@ -6,21 +6,23 @@
#include "host_conv.hpp"
#include "host_conv.hpp"
#include "tensor_layout.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_tensor.hpp"
#include "device_conv_fwd.hpp"
#include "device_conv_fwd
_bias_activation_add
.hpp"
#include "element_wise_operation.hpp"
#include "element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_conv2d_fwd_instance
{
namespace
device_conv2d_fwd_
bias_activation_add_
instance
{
using
DeviceConvFwdNoOpPtr
=
DeviceConvFwdPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
using
DeviceConvFwdBiasReluAddPtr
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
DeviceConvFwdBiasActivationAddPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
AddReluAdd
>
;
void
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp16_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
void
add_device_conv2d_fwd_bias_relu_add_xdl_nhwc_kyxc_nhwk_fp16_instances
(
std
::
vector
<
DeviceConvFwdBiasReluAddPtr
>&
);
}
// namespace device_conv2d_fwd_instance
}
// namespace device_conv2d_fwd_
bias_activation_add_
instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
...
@@ -28,6 +30,56 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp16_instances(std::vector<DeviceC
...
@@ -28,6 +30,56 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp16_instances(std::vector<DeviceC
namespace
ck
{
namespace
ck
{
namespace
profiler
{
namespace
profiler
{
template
<
typename
TIn
,
typename
TWei
,
typename
TOut
,
typename
InElementOp
,
typename
WeiElementOp
,
typename
OutElementOp
>
void
host_reference_calculation
(
const
Tensor
<
TIn
>&
in_n_c_hi_wi
,
const
Tensor
<
TWei
>&
wei_k_c_y_x
,
Tensor
<
TOut
>&
out_n_k_ho_wo
,
const
Tensor
<
TOut
>&
bias_k
,
const
Tensor
<
TOut
>&
resi_n_k_ho_wo
,
const
std
::
vector
<
ck
::
index_t
>&
conv_strides
,
const
std
::
vector
<
ck
::
index_t
>&
conv_dilations
,
const
std
::
vector
<
ck
::
index_t
>&
in_left_pads
,
const
std
::
vector
<
ck
::
index_t
>&
/* in_right_pads */
,
const
InElementOp
&
in_element_op
,
const
WeiElementOp
&
wei_element_op
,
const
OutElementOp
&
out_element_op
)
{
auto
f_nchw
=
[
&
](
auto
n
,
auto
k
,
auto
ho
,
auto
wo
)
{
double
v
=
0
;
for
(
int
c
=
0
;
c
<
wei_k_c_y_x
.
mDesc
.
GetLengths
()[
1
];
++
c
)
{
for
(
int
y
=
0
;
y
<
wei_k_c_y_x
.
mDesc
.
GetLengths
()[
2
];
++
y
)
{
int
hi
=
ho
*
conv_strides
[
0
]
+
y
*
conv_dilations
[
0
]
-
in_left_pads
[
0
];
for
(
int
x
=
0
;
x
<
wei_k_c_y_x
.
mDesc
.
GetLengths
()[
3
];
++
x
)
{
int
wi
=
wo
*
conv_strides
[
1
]
+
x
*
conv_dilations
[
1
]
-
in_left_pads
[
1
];
if
(
hi
>=
0
&&
hi
<
in_n_c_hi_wi
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
wi
<
in_n_c_hi_wi
.
mDesc
.
GetLengths
()[
3
])
{
v
+=
in_element_op
(
static_cast
<
const
double
>
(
in_n_c_hi_wi
(
n
,
c
,
hi
,
wi
)))
*
wei_element_op
(
static_cast
<
const
double
>
(
wei_k_c_y_x
(
k
,
c
,
y
,
x
)));
}
}
}
}
out_n_k_ho_wo
(
n
,
k
,
ho
,
wo
)
=
out_element_op
(
v
,
bias_k
(
k
),
resi_n_k_ho_wo
(
n
,
k
,
ho
,
wo
));
};
make_ParallelTensorFunctor
(
f_nchw
,
out_n_k_ho_wo
.
mDesc
.
GetLengths
()[
0
],
out_n_k_ho_wo
.
mDesc
.
GetLengths
()[
1
],
out_n_k_ho_wo
.
mDesc
.
GetLengths
()[
2
],
out_n_k_ho_wo
.
mDesc
.
GetLengths
()[
3
])(
std
::
thread
::
hardware_concurrency
());
}
template
<
int
NDimSpatial
,
template
<
int
NDimSpatial
,
typename
InDataType
,
typename
InDataType
,
typename
WeiDataType
,
typename
WeiDataType
,
...
@@ -35,20 +87,20 @@ template <int NDimSpatial,
...
@@ -35,20 +87,20 @@ template <int NDimSpatial,
typename
InLayout
,
typename
InLayout
,
typename
WeiLayout
,
typename
WeiLayout
,
typename
OutLayout
>
typename
OutLayout
>
void
profile_conv_fwd_bias_relu_
bias
_impl
(
int
do_verification
,
void
profile_conv_fwd_bias_relu_
add
_impl
(
int
do_verification
,
int
init_method
,
int
init_method
,
bool
do_log
,
bool
do_log
,
int
nrepeat
,
int
nrepeat
,
ck
::
index_t
N
,
ck
::
index_t
N
,
ck
::
index_t
K
,
ck
::
index_t
K
,
ck
::
index_t
C
,
ck
::
index_t
C
,
std
::
vector
<
ck
::
index_t
>
input_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
input_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
filter_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
filter_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
output_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
output_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
conv_filter_strides
,
std
::
vector
<
ck
::
index_t
>
conv_filter_strides
,
std
::
vector
<
ck
::
index_t
>
conv_filter_dilations
,
std
::
vector
<
ck
::
index_t
>
conv_filter_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
{
{
const
ck
::
index_t
Y
=
filter_spatial_lengths
[
0
];
const
ck
::
index_t
Y
=
filter_spatial_lengths
[
0
];
const
ck
::
index_t
X
=
filter_spatial_lengths
[
1
];
const
ck
::
index_t
X
=
filter_spatial_lengths
[
1
];
...
@@ -84,9 +136,18 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
...
@@ -84,9 +136,18 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
Tensor
<
OutDataType
>
out_n_k_ho_wo_device_result
(
Tensor
<
OutDataType
>
out_n_k_ho_wo_device_result
(
f_host_tensor_descriptor
(
N
,
K
,
Ho
,
Wo
,
OutLayout
{}));
f_host_tensor_descriptor
(
N
,
K
,
Ho
,
Wo
,
OutLayout
{}));
// bias: assume contiguous 1d vector
Tensor
<
OutDataType
>
bias_k
(
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
static_cast
<
std
::
size_t
>
(
K
)})));
// residual: assume same layout as output tensor
Tensor
<
OutDataType
>
resi_n_k_ho_wo
(
f_host_tensor_descriptor
(
N
,
K
,
Ho
,
Wo
,
OutLayout
{}));
std
::
cout
<<
"in_n_c_hi_wi: "
<<
in_n_c_hi_wi
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"in_n_c_hi_wi: "
<<
in_n_c_hi_wi
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei_k_c_y_x: "
<<
wei_k_c_y_x
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei_k_c_y_x: "
<<
wei_k_c_y_x
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out_n_k_ho_wo: "
<<
out_n_k_ho_wo_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out_n_k_ho_wo: "
<<
out_n_k_ho_wo_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"bias_k: "
<<
bias_k
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"resi_n_k_ho_wo: "
<<
resi_n_k_ho_wo
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
@@ -94,61 +155,63 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
...
@@ -94,61 +155,63 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
case
1
:
case
1
:
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
wei_k_c_y_x
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiDataType
>
{
-
5
,
5
});
wei_k_c_y_x
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiDataType
>
{
-
5
,
5
});
bias_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
OutDataType
>
{
-
5
,
5
});
resi_n_k_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_2
<
OutDataType
>
{
-
5
,
5
});
break
;
break
;
default:
default:
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
0.0
,
1.0
});
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
0.0
,
1.0
});
wei_k_c_y_x
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
0.5
,
0.5
});
wei_k_c_y_x
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
0.5
,
0.5
});
bias_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
0.0
,
1.0
});
resi_n_k_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
0.0
,
1.0
});
}
}
using
InElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
WeiElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
OutElementOp
=
ck
::
tensor_operation
::
element_wise
::
AddReluAdd
;
if
(
do_verification
)
if
(
do_verification
)
{
{
host_conv_nchw_kcyx_nkhw
(
in_n_c_hi_wi
,
host_reference_calculation
(
in_n_c_hi_wi
,
wei_k_c_y_x
,
wei_k_c_y_x
,
out_n_k_ho_wo_host_result
,
out_n_k_ho_wo_host_result
,
conv_filter_strides
,
bias_k
,
conv_filter_dilations
,
resi_n_k_ho_wo
,
input_left_pads
,
conv_filter_strides
,
input_right_pads
);
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
InElementOp
{},
WeiElementOp
{},
OutElementOp
{});
}
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_n_k_ho_wo_device_result
.
mDesc
.
GetElementSpace
());
out_n_k_ho_wo_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
bias_device_buf
(
sizeof
(
OutDataType
)
*
bias_k
.
mDesc
.
GetElementSpace
());
DeviceMem
resi_device_buf
(
sizeof
(
OutDataType
)
*
resi_n_k_ho_wo
.
mDesc
.
GetElementSpace
());
in_device_buf
.
ToDevice
(
in_n_c_hi_wi
.
mData
.
data
());
in_device_buf
.
ToDevice
(
in_n_c_hi_wi
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei_k_c_y_x
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei_k_c_y_x
.
mData
.
data
());
bias_device_buf
.
ToDevice
(
bias_k
.
mData
.
data
());
resi_device_buf
.
ToDevice
(
resi_n_k_ho_wo
.
mData
.
data
());
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceConvFwdBiasReluAddPtr
=
ck
::
tensor_operation
::
device
::
DeviceConvFwdBiasActivationAddPtr
<
InElementOp
,
WeiElementOp
,
OutElementOp
>
;
using
DeviceConvFwdNoOpPtr
=
ck
::
tensor_operation
::
device
::
DeviceConvFwdPtr
<
PassThrough
,
PassThrough
,
PassThrough
>
;
// add device
Conv
instances
// add device
operator
instances
std
::
vector
<
DeviceConvFwd
NoOp
Ptr
>
conv
_ptrs
;
std
::
vector
<
DeviceConvFwd
BiasReluAdd
Ptr
>
op
_ptrs
;
if
constexpr
(
ck
::
is_same_v
<
ck
::
remove_cv_t
<
InDataType
>
,
floa
t
>
&&
if
constexpr
(
ck
::
is_same_v
<
ck
::
remove_cv_t
<
InDataType
>
,
ck
::
half_
t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
WeiDataType
>
,
floa
t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
WeiDataType
>
,
ck
::
half_
t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
OutDataType
>
,
floa
t
>
)
ck
::
is_same_v
<
ck
::
remove_cv_t
<
OutDataType
>
,
ck
::
half_
t
>
)
{
{
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_instance
::
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_bias_activation_add_instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp32_instances
(
conv_ptrs
);
add_device_conv2d_fwd_bias_relu_add_xdl_nhwc_kyxc_nhwk_fp16_instances
(
op_ptrs
);
}
else
if
constexpr
(
ck
::
is_same_v
<
ck
::
remove_cv_t
<
InDataType
>
,
ck
::
half_t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
WeiDataType
>
,
ck
::
half_t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
OutDataType
>
,
ck
::
half_t
>
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp16_instances
(
conv_ptrs
);
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_fp16_instances
(
conv_ptrs
);
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_fp16_instances
(
conv_ptrs
);
}
}
if
(
conv
_ptrs
.
size
()
<=
0
)
if
(
op
_ptrs
.
size
()
<=
0
)
{
{
throw
std
::
runtime_error
(
"wrong! no device Conv instance found"
);
throw
std
::
runtime_error
(
"wrong! no device Conv instance found"
);
}
}
...
@@ -159,12 +222,14 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
...
@@ -159,12 +222,14 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
float
best_gb_per_sec
=
0
;
float
best_gb_per_sec
=
0
;
// profile device Conv instances
// profile device Conv instances
for
(
auto
&
conv
_ptr
:
conv
_ptrs
)
for
(
auto
&
op
_ptr
:
op
_ptrs
)
{
{
auto
argument_ptr
=
conv
_ptr
->
MakeArgumentPointer
(
auto
argument_ptr
=
op
_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
const
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
WeiDataType
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
static_cast
<
const
WeiDataType
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
static_cast
<
const
OutDataType
*>
(
bias_device_buf
.
GetDeviceBuffer
()),
static_cast
<
const
OutDataType
*>
(
resi_device_buf
.
GetDeviceBuffer
()),
N
,
N
,
K
,
K
,
C
,
C
,
...
@@ -175,23 +240,24 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
...
@@ -175,23 +240,24 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
conv_filter_dilations
,
conv_filter_dilations
,
input_left_pads
,
input_left_pads
,
input_right_pads
,
input_right_pads
,
PassThrough
{},
InElementOp
{},
PassThrough
{},
WeiElementOp
{},
PassThrough
{});
OutElementOp
{});
auto
invoker_ptr
=
conv
_ptr
->
MakeInvokerPointer
();
auto
invoker_ptr
=
op
_ptr
->
MakeInvokerPointer
();
if
(
conv
_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
op
_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
std
::
string
conv_name
=
conv
_ptr
->
GetTypeString
();
std
::
string
conv_name
=
op
_ptr
->
GetTypeString
();
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
;
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
;
std
::
size_t
num_btype
=
sizeof
(
InDataType
)
*
(
N
*
C
*
Hi
*
Wi
)
+
std
::
size_t
num_btype
=
sizeof
(
WeiDataType
)
*
(
K
*
C
*
Y
*
X
)
+
sizeof
(
InDataType
)
*
(
N
*
C
*
Hi
*
Wi
)
+
sizeof
(
WeiDataType
)
*
(
K
*
C
*
Y
*
X
)
+
sizeof
(
OutDataType
)
*
(
N
*
K
*
Ho
*
Wo
);
sizeof
(
OutDataType
)
*
(
N
*
K
*
Ho
*
Wo
)
+
sizeof
(
OutDataType
)
*
(
K
)
+
sizeof
(
OutDataType
)
*
(
N
*
K
*
Ho
*
Wo
);
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment