gaoqiong / composable_kernel

Commit fe427fd1, authored Jul 23, 2021 by Jing Zhang
init commit for conv+activ
parent b8bb1480
Showing 6 changed files with 901 additions and 36 deletions (+901, -36):
- composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp (+711, -2)
- composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp (+1, -1)
- composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp (+13, -0)
- host/driver_offline/conv_fwd_driver_offline.cpp (+62, -29)
- host/driver_offline/include/device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp (+7, -4)
- host/host_tensor/include/host_conv.hpp (+107, -0)
composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp (+711, -2)

(This diff is collapsed.)
composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp (+1, -1)

@@ -32,7 +32,7 @@ __host__ __device__ constexpr auto make_left_pad_transform(
     return DynamicLeftPad<LowLength, LeftPad, SkipIsValidCheck>{low_length, left_pad};
 }
 
-template <typename LowLength, typename RightPad, bool SkipIsValidCheck>
+template <typename LowLength, typename RightPad, bool SkipIsValidCheck = false>
 __host__ __device__ constexpr auto make_right_pad_transform(const LowLength& low_length,
                                                             const RightPad& right_pad,
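The single-line change above gives SkipIsValidCheck a default of false, matching make_left_pad_transform a few lines earlier. Because a trailing non-type template parameter cannot be deduced from the function arguments, the default is what lets a plain two-argument call compile. A minimal stand-alone illustration (stand-in names, not the library's API):

// Illustration only: `right_pad_demo` stands in for make_right_pad_transform.
// Without "= false", the two-argument call below would fail to compile,
// since SkipIsValidCheck cannot be deduced from low_length/right_pad.
template <typename LowLength, typename RightPad, bool SkipIsValidCheck = false>
constexpr bool right_pad_demo(const LowLength&, const RightPad&)
{
    return SkipIsValidCheck;
}

static_assert(right_pad_demo(4, 1) == false, "default used when omitted");
static_assert(right_pad_demo<int, int, true>(4, 1) == true, "still settable explicitly");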
composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp (+13, -0)

@@ -346,6 +346,19 @@ struct GridwiseStaticGemm_km_kn_mn_v3
             blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
         }
 
+        // activ
+        {
+            constexpr index_t activ_type = 2;
+
+            static_for<0, c_k_n_ho_wo_thread_desc.GetElementSpaceSize(), 1>{}([&](auto i) {
+                if constexpr(activ_type == 1)
+                    c_thread_buf(i) = c_thread_buf[i] >= 0 ? c_thread_buf[i] : 0.0;
+                else if constexpr(activ_type == 2)
+                    c_thread_buf(i) = 1.0 / (1.0 + exp(-c_thread_buf[i]));
+            });
+        }
+
         // output: register to global memory
         {
             // hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
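The added block applies the activation in registers, between the blockwise GEMM and the write-out to global memory; activ_type is hard-coded to 2 (sigmoid) in this initial commit, with 1 selecting ReLU, and static_for unrolls the loop over the thread-local element space at compile time. A rough host-side analogue of the epilogue (plain C++17, illustrative only; the real code runs on a per-thread register buffer):

// Host-side analogue of the in-register epilogue above. ACTIV_TYPE mirrors
// the hard-coded constant in the kernel (1 = relu, 2 = sigmoid).
#include <cmath>

constexpr int ACTIV_TYPE = 2;

template <int N>
void epilogue(double (&c_thread)[N])
{
    for(int i = 0; i < N; ++i)
    {
        if constexpr(ACTIV_TYPE == 1)
            c_thread[i] = c_thread[i] >= 0 ? c_thread[i] : 0.0; // relu
        else if constexpr(ACTIV_TYPE == 2)
            c_thread[i] = 1.0 / (1.0 + std::exp(-c_thread[i])); // sigmoid
    }
}

int main()
{
    double acc[4] = {-2.0, -0.5, 0.5, 2.0};
    epilogue(acc); // applies sigmoid in place, as each GPU thread does
}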
host/driver_offline/conv_fwd_driver_offline.cpp (+62, -29)

@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     const bool do_log  = atoi(argv[5]);
     const int nrepeat  = atoi(argv[6]);
 
-#if 0
+#if 1
     constexpr index_t N  = 1;
     constexpr index_t C  = 16;
     constexpr index_t Hi = 1080;
@@ -127,7 +127,7 @@ int main(int argc, char* argv[])
     constexpr index_t K = 16;
     constexpr index_t Y = 3;
     constexpr index_t X = 3;
-#elif 1
+#elif 0
     constexpr index_t N  = 1;
     constexpr index_t C  = 16;
     constexpr index_t Hi = 240;
@@ -135,7 +135,7 @@ int main(int argc, char* argv[])
     constexpr index_t K = 16;
     constexpr index_t Y = 3;
     constexpr index_t X = 3;
-#elif 1
+#elif 0
     constexpr index_t N  = 1;
     constexpr index_t C  = 16;
     constexpr index_t Hi = 1080;
@@ -143,6 +143,38 @@ int main(int argc, char* argv[])
     constexpr index_t K = 16;
     constexpr index_t Y = 1;
     constexpr index_t X = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 540;
+    constexpr index_t Wi = 960;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 480;
+    constexpr index_t Wi = 270;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 8;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 4;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
 #endif
 
     const index_t conv_stride_h = 1;
@@ -420,7 +452,7 @@ int main(int argc, char* argv[])
 #else
     device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
 #endif
-        <in_data_t, 8, acc_data_t, out_data_t>(tmp[I0],
+        <in_data_t, 8, 8, acc_data_t, out_data_t>(tmp[I0],
                                                   tmp[I1],
                                                   tmp[I2],
                                                   tmp[I3],
@@ -490,14 +522,15 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-        host_direct_convolution(in,
+        host_direct_convolution_activ(in,
                                       wei,
                                       out_host,
                                       make_tuple(conv_stride_h, conv_stride_w),
                                       make_tuple(conv_dilation_h, conv_dilation_w),
                                       make_tuple(in_left_pad_h, in_left_pad_w),
                                       make_tuple(in_right_pad_h, in_right_pad_w),
-                                      layout);
+                                      layout,
+                                      ActivType_t::sigmoid);
 
         check_error(out_host, out_device);
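For orientation, the output spatial size implied by these configurations follows the standard convolution formula. A sketch (the formula is standard; the pad values are an assumption for a "same" 3x3 convolution, since the actual pads are set further down in the driver):

// Standard convolution output-length formula, evaluated for the config this
// commit enables (Hi = 1080, Wi = 1920, Y = X = 3, stride 1, dilation 1).
// Pads of 1 are assumed here for illustration.
#include <cstdio>

int conv_out_len(int in_len, int pad_l, int pad_r, int filter, int dilation, int stride)
{
    const int eff_filter = dilation * (filter - 1) + 1; // dilated filter extent
    return (in_len + pad_l + pad_r - eff_filter) / stride + 1;
}

int main()
{
    std::printf("Ho = %d\n", conv_out_len(1080, 1, 1, 3, 1, 1)); // 1080
    std::printf("Wo = %d\n", conv_out_len(1920, 1, 1, 3, 1, 1)); // 1920
}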
host/driver_offline/include/device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp (+7, -4)

@@ -6,6 +6,7 @@
 template <typename TInWei,
           ck::index_t InWeiVectorSize,
+          ck::index_t OutVectorSize,
           typename TAcc,
           typename TOut,
           typename InLengths,
@@ -53,8 +54,8 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
     const auto C0 = C / Number<InWeiVectorSize>{};
     const auto C1 = Number<InWeiVectorSize>{};
 
-    const auto K0 = K / Number<InWeiVectorSize>{};
-    const auto K1 = Number<InWeiVectorSize>{};
+    const auto K0 = K / Number<OutVectorSize>{};
+    const auto K1 = Number<OutVectorSize>{};
 
     Tensor<TInWei> in_n_c0_hi_wi_c1(
         HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
@@ -105,7 +106,7 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
     constexpr index_t WoPerThread = 2;
     constexpr index_t EPerThread  = EPerBlock;
 
-    using ABlockTransferThreadSliceLengths_E_K   = Sequence<9, 1>;
+    using ABlockTransferThreadSliceLengths_E_K   = Sequence<Y * X, 1>;
     using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, KPerBlock>;
 
     constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
@@ -120,8 +121,10 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
     constexpr auto conv_driver =
 #if 0
         DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
-#else
+#elif 1
        DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
+#elif 1
+        DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad_1x1
 #endif
         <BlockSize,
          typename vector_type<TInWei, InWeiVectorSize>::type,
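The new OutVectorSize template parameter decouples the output-side K vectorization from the input/weight vector width: K0/K1 previously reused InWeiVectorSize. The driver now instantiates the device function as <in_data_t, 8, 8, acc_data_t, out_data_t>, passing both widths. A trivial sketch of the split and its divisibility constraint (plain ints standing in for ck::Number):

// Sketch of the K-dimension split introduced above: K = K0 * K1, with K1
// taken from OutVectorSize rather than InWeiVectorSize. Values mirror the
// driver's K = 16, OutVectorSize = 8.
#include <cassert>

int main()
{
    const int K             = 16; // output channels
    const int OutVectorSize = 8;  // new template parameter

    const int K0 = K / OutVectorSize; // number of output vectors
    const int K1 = OutVectorSize;     // elements per output vector

    assert(K0 * K1 == K); // K must be divisible by OutVectorSize
    return 0;
}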
host/host_tensor/include/host_conv.hpp (+107, -0)

 #pragma once
 #include "host_tensor.hpp"
 
+typedef enum
+{
+    passthrough = 0,
+    relu,
+    sigmoid
+} ActivType_t;
+
 template <typename TIn,
           typename TWei,
           typename TOut,
@@ -88,6 +95,106 @@ void host_direct_convolution(const Tensor<TIn>& in,
     }
 }
 
+template <typename T>
+inline auto activ(T v, const ActivType_t activ_type)
+{
+    switch(activ_type)
+    {
+    case passthrough: return v;
+    case relu: return (v >= 0 ? v : 0);
+    case sigmoid: return (1 / (1 + exp(-v)));
+    default: throw std::runtime_error("unsupported activ type"); break;
+    }
+}
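A note on the helper: activ has a deduced (auto) return type, so all three return statements must agree in type; that holds because the reference convolution below accumulates into a double. A self-contained usage sketch (duplicating the helper so it compiles on its own):

// Usage sketch for the reference activation helper, mirroring how the
// convolution lambdas below call it on a double accumulator.
#include <cmath>
#include <cstdio>
#include <stdexcept>

typedef enum { passthrough = 0, relu, sigmoid } ActivType_t;

template <typename T>
inline auto activ(T v, const ActivType_t activ_type)
{
    switch(activ_type)
    {
    case passthrough: return v;
    case relu: return (v >= 0 ? v : 0);
    case sigmoid: return (1 / (1 + std::exp(-v)));
    default: throw std::runtime_error("unsupported activ type");
    }
}

int main()
{
    const double v = -1.5;
    std::printf("relu:    %f\n", activ(v, relu));    // 0.000000
    std::printf("sigmoid: %f\n", activ(v, sigmoid)); // ~0.182426
}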
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void host_direct_convolution_activ(const Tensor<TIn>& in,
+                                   const Tensor<TWei>& wei,
+                                   Tensor<TOut>& out,
+                                   const ConvStrides& conv_strides,
+                                   const ConvDilations& conv_dilations,
+                                   const InLeftPads& in_left_pads,
+                                   const InRightPads& in_right_pads,
+                                   const ConvTensorLayout layout = ConvTensorLayout::NCHW,
+                                   const ActivType_t activ_type  = ActivType_t::passthrough)
+{
+    using namespace ck;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+        double v = 0;
+
+        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[3])
+                    {
+                        v += static_cast<const double>(in(n, c, hi, wi)) *
+                             static_cast<const double>(wei(k, c, y, x));
+                    }
+                }
+            }
+        }
+        out(n, k, ho, wo) = activ(v, activ_type);
+    };
+
+    auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
+        double v = 0;
+
+        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[2])
+                    {
+                        v += static_cast<const double>(in(n, hi, wi, c)) *
+                             static_cast<const double>(wei(k, y, x, c));
+                    }
+                }
+            }
+        }
+        out(n, k, ho, wo) = activ(v, activ_type);
+    };
+
+    switch(layout)
+    {
+    case ConvTensorLayout::NCHW:
+        make_ParallelTensorFunctor(f_nchw,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    case ConvTensorLayout::NHWC:
+        make_ParallelTensorFunctor(f_nhwc,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    default: throw std::runtime_error("wrong! not supported layout");
+    }
+}
+
 template <typename TIn,
           typename TWei,
           typename TOut,
           typename InLeftPads,
           typename InRightPads>
 void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                                    const Tensor<TWei>& wei_kcyx,
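The reference path fans the per-output-element functor out across std::thread::hardware_concurrency() workers via the library's make_ParallelTensorFunctor, applying activ after the accumulation, exactly as the device kernel now does in registers. A minimal stand-in for that parallelization pattern (illustrative, not the library's implementation):

// Illustrative stand-in for the make_ParallelTensorFunctor pattern: split one
// output dimension across hardware threads, each worker handling a strided
// subset of indices.
#include <algorithm>
#include <thread>
#include <vector>

template <typename F>
void parallel_over_n(int N, F f)
{
    const unsigned num_threads = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers;
    for(unsigned t = 0; t < num_threads; ++t)
    {
        workers.emplace_back([=] {
            for(int n = static_cast<int>(t); n < N; n += static_cast<int>(num_threads))
                f(n); // distinct n per worker, so no write conflicts below
        });
    }
    for(auto& w : workers)
        w.join();
}

int main()
{
    std::vector<int> out(64, 0);
    parallel_over_n(64, [&](int n) { out[n] = n * n; }); // e.g. one output element per task
}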