gaoqiong / composable_kernel
Commit c5143bca, authored Oct 10, 2019 by Wen-Heng (Jack) Chung
Added backward weight cpu changes in driver
Parent: 32850b93
Showing 4 changed files with 95 additions and 16 deletions (+95 −16).
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.hpp  +2 −0
driver/include/device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw.hpp  +15 −11
driver/include/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp  +1 −1
driver/src/driver.cpp  +77 −4
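Taken together, the changes wire a backward-weight (WrW) verification path through the stack: the gridwise kernel header gains a one-thread debug print of the convolution direction, the v5 device wrapper is switched to the backward-weight problem and builds its descriptors by swapping N and C, and driver.cpp gains CONV_DIRECTION_* switches with a CPU reference computed by host_direct_convolution (strides and dilations exchanged) and compared via check_error.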
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -80,6 +80,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buf
                       const Float* const __restrict__ p_wei_global,
                       Float* const __restrict__ p_out_global) const
     {
+        if(blockIdx.x * blockDim.x + threadIdx.x == 0)
+            printf("conv dir %d", conv_dir);
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
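The two added lines are the usual "print once" debug idiom: guarding on global thread id 0 makes the message appear once per kernel launch rather than once per thread (note the added format string has no trailing newline). A minimal standalone sketch of the same idiom in CUDA/HIP syntax (illustrative, not the repo's code; under HIP also include <hip/hip_runtime.h>):

#include <cstdio>

// Only thread 0 of block 0 satisfies the guard, so the message is emitted
// exactly once per launch; every other thread skips the branch.
__global__ void print_once(int conv_dir)
{
    if(blockIdx.x * blockDim.x + threadIdx.x == 0)
        printf("conv dir %d\n", conv_dir);
}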
driver/include/device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw.hpp

@@ -16,13 +16,13 @@
 #include "gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp"
 
 #define CK_ENABLE_XDLOPS 0
-#define CK_PARAM_PROBLEM_DIRECTION 0
+#define CK_PARAM_PROBLEM_DIRECTION 2
 #define CK_PARAM_EPACK_LENGTH 1
 #define CK_PARAM_TUNABLE_BLOCK_SIZE 64
 #define CK_PARAM_TUNABLE_K_PER_BLOCK 32
 #define CK_PARAM_TUNABLE_B_PER_BLOCK 64
 #define CK_PARAM_TUNABLE_E_PER_BLOCK 8
-#define CK_PARAM_DEPENDENT_GRID_SIZE 16
+#define CK_PARAM_DEPENDENT_GRID_SIZE 2
 #define CK_PARAM_GEMM_M_PER_WAVE 32
 #define CK_PARAM_GEMM_N_PER_WAVE 64
 #define CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E 8
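Two tunables change here: CK_PARAM_PROBLEM_DIRECTION moves from 0 to 2, which, given that the same file later pins constexpr auto dir = ImplicitGemmDirection::BackwardWeight;, evidently selects the backward-weight problem, and CK_PARAM_DEPENDENT_GRID_SIZE drops from 16 to 2, presumably retuned for the reinterpreted problem shape.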
@@ -109,17 +109,21 @@ void device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw(InDesc,
+    // of the wrw convolution when used in a fwd context
+    printf("backward weight is executed\n");
+
+    constexpr auto tmp_in_nchw_desc =
+        make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
+    constexpr auto tmp_wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
+    constexpr auto tmp_out_nkhw_desc =
+        make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
+
+    constexpr auto in_nchw_desc = tmp_in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
+    // wei and out are swapped in the solver
+    constexpr auto wei_kcyx_desc = tmp_out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
+    constexpr auto out_nkhw_desc = tmp_wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
-    // constexpr auto tmp_in_nchw_desc =
-    //     make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
-    // constexpr auto tmp_wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
-    // constexpr auto tmp_out_nkhw_desc =
-    //     make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
-    // constexpr auto in_nchw_desc = tmp_in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
-    // // wei and out are swapped in the solver
-    // constexpr auto wei_kcyx_desc = tmp_out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
-    // constexpr auto out_nkhw_desc = tmp_wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
     constexpr auto dir = ImplicitGemmDirection::BackwardWeight;
     constexpr auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
     constexpr auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
     constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
     // swap stride and dilation
     // using ConvDilations = Sequence<ConvStrideH, ConvStrideW>;
     // using ConvStrides = Sequence<ConvDilationH, ConvDilationW>;
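The hunk promotes a previously commented-out draft to active code: the packed NCHW/NKHW descriptors are permuted into the layouts the WrW-as-forward trick needs, and wei/out trade places ("wei and out are swapped in the solver"). A minimal model of what ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{}) appears to do to the dimension lengths (illustrative only; the real descriptor also carries strides): new dimension i takes old dimension new2old[i], so NCHW becomes CNHW.

#include <array>
#include <cstdio>

int main()
{
    std::array<int, 4> nchw    = {128, 64, 28, 28}; // N, C, H, W
    std::array<int, 4> new2old = {1, 0, 2, 3};
    std::array<int, 4> cnhw{};
    for(int i = 0; i < 4; ++i)
        cnhw[i] = nchw[new2old[i]]; // new dim i <- old dim new2old[i]
    std::printf("%d %d %d %d\n", cnhw[0], cnhw[1], cnhw[2], cnhw[3]); // 64 128 28 28
}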
driver/include/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
(+1 −1; the changed lines are not rendered on this page)
driver/src/driver.cpp

@@ -17,6 +17,10 @@
 using namespace ck;
 
+#define CONV_DIRECTION_FWD_DATA 0
+#define CONV_DIRECTION_BWD_DATA 0
+#define CONV_DIRECTION_BWD_WEIT 1
+
 struct GeneratorTensor_1
 {
     template <class... Is>
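Exactly one of the three new CONV_DIRECTION_* switches is enabled (BWD_WEIT); the #if/#elif blocks added throughout main() below dispatch on them, so flipping a single macro selects which descriptor layout, tensor set, generator, device call, host reference, and error check get compiled in.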
@@ -29,7 +33,7 @@ struct GeneratorTensor_1
 struct GeneratorTensor_2
 {
     int min_value = 0;
-    int max_value = 1;
+    int max_value = 16;
 
     template <class... Is>
     double operator()(Is...)
@@ -50,13 +54,27 @@ struct GeneratorTensor_3
         return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
     }
 };
 
+struct GeneratorTensor_fixed
+{
+    template <class... Is>
+    double operator()(Is... is)
+    {
+        std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
+        if(dims[0] == 0)
+            return (dims[1] * 16 + dims[2] * 4 + dims[3]);
+        else
+            return 1;
+    }
+};
+
 struct GeneratorTensor_Checkboard
 {
     template <class... Ts>
     double operator()(Ts... Xs) const
     {
-        std::array<index_t, sizeof...(Ts)> dims = {{Xs...}};
+        std::array<index_t, sizeof...(Ts)> dims = {{static_cast<index_t>(Xs)...}};
         return std::accumulate(dims.begin(), dims.end(), true,
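A standalone copy of the new functor (with the repo's index_t replaced by int so it compiles on its own) shows what it generates: slice 0 along the first dimension gets a unique value per (d1, d2, d3) index, and every other slice is filled with 1, which makes indexing mistakes stand out in the verification printout.

#include <array>
#include <cstdio>

struct GeneratorTensor_fixed
{
    template <class... Is>
    double operator()(Is... is)
    {
        std::array<int, sizeof...(Is)> dims = {{static_cast<int>(is)...}};
        if(dims[0] == 0)
            return dims[1] * 16 + dims[2] * 4 + dims[3]; // unique per index in slice 0
        else
            return 1; // constant everywhere else
    }
};

int main()
{
    GeneratorTensor_fixed g;
    std::printf("%.0f %.0f %.0f\n", g(0, 1, 2, 3), g(0, 0, 0, 1), g(1, 2, 3, 0)); // 27 1 1
}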
@@ -401,7 +419,7 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
     float ref_value = 0, result_value = 0;
     for(int i = 0; i < ref.mData.size(); ++i)
     {
-        std::cout << result.mData[i] << " ";
+        std::cout << result.mData[i] << "," << ref.mData[i] << " ";
         error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
         float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
         if(max_diff < diff)
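The loop now prints the device value next to the reference value while accumulating the total absolute error and tracking the largest element-wise difference. A self-contained sketch of that comparison (the declarations of error, max_diff, ref_value and result_value are assumed from the visible fragment):

#include <cmath>
#include <cstdio>
#include <vector>

void check_error_sketch(const std::vector<float>& ref, const std::vector<float>& result)
{
    double error = 0, max_diff = -1;
    float ref_value = 0, result_value = 0;
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        double diff = std::abs(double(ref[i]) - double(result[i]));
        error += diff;      // total absolute error
        if(max_diff < diff) // remember the worst element
        {
            max_diff     = diff;
            ref_value    = ref[i];
            result_value = result[i];
        }
    }
    std::printf("error: %f, max_diff: %f (%f vs %f)\n", error, max_diff, ref_value, result_value);
}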
@@ -819,15 +837,24 @@ int main(int argc, char* argv[])
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
constexpr
index_t
HO
=
4
;
constexpr
index_t
WO
=
4
;
#endif
auto
lower_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
upper_pads
=
Sequence
<
HPad
,
WPad
>
{};
#if CONV_DIRECTION_FWD_DATA
auto
in_nchw_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
wei_kcyx_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
auto
out_nkhw_desc
=
get_convolution_with_padding_output_default_4d_tensor_descriptor
(
in_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
lower_pads
,
upper_pads
);
#elif CONV_DIRECTION_BWD_WEIT
auto
in_nchw_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
C
,
N
,
HI
,
WI
>
{});
auto
wei_kcyx_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
C
,
K
,
Y
,
X
>
{});
auto
out_nkhw_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
K
,
N
,
HO
,
WO
>
{});
#endif
ostream_ConstantTensorDescriptor
(
in_nchw_desc
,
std
::
cout
<<
"in_nchw_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_kcyx_desc
,
std
::
cout
<<
"wei_kcyx_desc: "
);
...
...
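In the forward branch the output descriptor is derived by get_convolution_with_padding_output_default_4d_tensor_descriptor; the WrW branch instead hard-codes HO = WO = 4. For reference, the standard output-length arithmetic behind such derivations is the usual dilated-convolution formula (a hedged one-liner, not the repo's code):

// out = (in + 2*pad - effective_filter) / stride + 1, where the effective
// (dilated) filter length is dilation * (filter - 1) + 1.
constexpr int out_len(int in_len, int pad, int filter, int stride, int dilation)
{
    return (in_len + 2 * pad - (dilation * (filter - 1) + 1)) / stride + 1;
}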
@@ -835,10 +862,19 @@ int main(int argc, char* argv[])
     using in_data_t  = float;
     using out_data_t = float;
 
+#if CONV_DIRECTION_FWD_DATA
     Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
     Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
     Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
     Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
+#elif CONV_DIRECTION_BWD_WEIT
+    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
+    Tensor<out_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
+    Tensor<out_data_t> wei_kcyx_host(make_TensorDescriptor(wei_kcyx_desc));
+    Tensor<in_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
+    Tensor<in_data_t> out_nkhw(make_TensorDescriptor(out_nkhw_desc));
+#endif
 
     std::size_t num_thread = std::thread::hardware_concurrency();
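Note the role reversal in the WrW branch: wei_kcyx is now the device-side result and wei_kcyx_host its CPU reference (both out_data_t), while out_nkhw holds the output-gradient data being fed in (in_data_t); the forward branch's out_nkhw_device disappears because it is the weights that get compared.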
@@ -854,8 +890,14 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
 #if 1
+#if CONV_DIRECTION_FWD_DATA // fwd data
         in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+#elif CONV_DIRECTION_BWD_WEIT // bwd wrw
+        in_nchw.GenerateTensorValue(GeneratorTensor_2{}, num_thread);
+        //out_nkhw_host.GenerateTensorValue(GeneratorTensor_2{}, num_thread);
+        out_nkhw.GenerateTensorValue(GeneratorTensor_2{}, num_thread);
+#endif
 #elif 0
         in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
@@ -891,6 +933,7 @@ int main(int argc, char* argv[])
 #elif 1
     device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw
 #endif
+#if CONV_DIRECTION_FWD_DATA // fwd data
     (in_nchw_desc,
      in_nchw,
      wei_kcyx_desc,
@@ -900,6 +943,17 @@ int main(int argc, char* argv[])
      ConvStrides{},
      ConvDilations{},
      nrepeat);
+#elif CONV_DIRECTION_BWD_WEIT // bwd wrw
+    (in_nchw_desc,
+     in_nchw,
+     out_nkhw_desc,
+     out_nkhw,
+     wei_kcyx_desc,
+     wei_kcyx,
+     ConvDilations{},
+     ConvStrides{},
+     nrepeat);
+#endif
 #elif 0
     device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded
     (in_nchw_desc,
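This call site is where the "swap stride and dilation" comment from the v5 header materializes: the WrW invocation passes ConvDilations{} where the forward one passes ConvStrides{}, and vice versa. In 1-D, the weight gradient is dw[y] = sum_ho in[ho*S + y*D] * dout[ho]; read as a forward convolution over in with filter dout, the filter taps advance by S (the stride acts as a dilation) and the output index advances by D (the dilation acts as a stride). A toy check of that reading (illustrative code, not from the repo):

#include <cstdio>
#include <vector>

int main()
{
    const int S = 2, D = 3, Y = 3, HO = 4;         // stride, dilation, filter, output
    const int HI = (HO - 1) * S + (Y - 1) * D + 1; // minimal input length
    std::vector<double> in(HI), dout(HO), direct(Y, 0.0), as_fwd(Y, 0.0);
    for(int i = 0; i < HI; ++i) in[i] = 0.1 * i + 1.0;
    for(int h = 0; h < HO; ++h) dout[h] = 0.5 * h - 1.0;

    // weight gradient written directly
    for(int y = 0; y < Y; ++y)
        for(int ho = 0; ho < HO; ++ho)
            direct[y] += in[ho * S + y * D] * dout[ho];

    // the same sum read as a forward conv: output index y steps by D (stride),
    // filter tap ho steps by S (dilation)
    for(int y = 0; y < Y; ++y)
        for(int ho = 0; ho < HO; ++ho)
            as_fwd[y] += in[y * D + ho * S] * dout[ho];

    for(int y = 0; y < Y; ++y)
        std::printf("dw[%d]: direct=%.3f as_fwd=%.3f\n", y, direct[y], as_fwd[y]);
}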
@@ -924,6 +978,8 @@ int main(int argc, char* argv[])
     else
 #endif
     {
+#if CONV_DIRECTION_FWD_DATA // fwd data
         host_direct_convolution(in_nchw,
                                 wei_kcyx,
                                 out_nkhw_host,
@@ -931,8 +987,25 @@ int main(int argc, char* argv[])
ConvDilations
{},
lower_pads
,
upper_pads
);
#elif CONV_DIRECTION_BWD_WEIT // bwd wrw
host_direct_convolution
(
in_nchw
,
out_nkhw
,
wei_kcyx_host
,
ConvDilations
{},
ConvStrides
{},
lower_pads
,
upper_pads
);
#endif
}
#if CONV_DIRECTION_FWD_DATA // fwd data
check_error
(
out_nkhw_host
,
out_nkhw_device
);
#elif CONV_DIRECTION_BWD_WEIT // bwd wrw
check_error
(
wei_kcyx_host
,
wei_kcyx
);
#endif
LogRange
(
std
::
cout
<<
"in_nchw : "
,
in_nchw
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"out_nkhw_device : "
,
out_nkhw
.
mData
,
","
)
<<
std
::
endl
;
//LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
...
...
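In the WrW configuration the verification flow is therefore: fill in_nchw and out_nkhw with GeneratorTensor_2, let the device kernel produce wei_kcyx, compute the CPU reference wei_kcyx_host with host_direct_convolution (strides and dilations exchanged), and compare the two with check_error; the in/out tensor contents are then dumped with LogRange for inspection.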