gaoqiong/composable_kernel, commit 5d015452
Authored Jul 06, 2022 by Chaitanya Inumella
Parents: b7fa6bb1, ed3feb4d

Rebased the hipTENSOR development branch with the contraction branch

Showing 20 of 443 changed files, with 223 additions and 3448 deletions.
library/src/obselete_driver_offline/CMakeLists.txt (+0, -37)
library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp (+0, -416)
library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp (+0, -488)
library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp (+0, -549)
library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp (+0, -393)
library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp (+0, -415)
library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp (+0, -532)
library/src/obselete_driver_offline/gemm_driver_offline.cpp (+0, -456)
library/src/tensor_operation_instance/gpu/CMakeLists.txt (+45, -62)
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp (+28, -20)
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp (+15, -8)
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp (+15, -8)
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp (+15, -8)
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp (+15, -8)
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp (+15, -8)
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp (+15, -8)
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp (+15, -8)
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp (+15, -8)
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp (+15, -8)
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp (+15, -8)
library/src/obselete_driver_offline/CMakeLists.txt (deleted file, mode 100644 → 0)
include_directories(BEFORE
    include
    ${PROJECT_SOURCE_DIR}/host/host_tensor/include
    ${PROJECT_SOURCE_DIR}/host/device/include
    ${PROJECT_SOURCE_DIR}/host/solver/include
    ${PROJECT_SOURCE_DIR}/composable_kernel/include
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/driver
    ${PROJECT_SOURCE_DIR}/external/rocm/include
)

set(CONV_FWD_DRIVER_OFFLINE_SOURCE src/conv_fwd_driver_offline.cpp)
set(CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_fwd_driver_offline_nchwc.cpp)
set(CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_add_fwd_driver_offline_nchwc.cpp)
set(CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_maxpool_fwd_driver_offline_nchwc.cpp)
set(CONV_BWD_DRIVER_OFFLINE_SOURCE src/conv_bwd_driver_offline.cpp)
set(CONV_WRW_DRIVER_OFFLINE_SOURCE src/conv_wrw_driver_offline.cpp)
set(GEMM_DRIVER_OFFLINE_SOURCE src/gemm_driver_offline.cpp)

add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_fwd_driver_offline_nchwc ${CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
add_executable(conv_add_fwd_driver_offline_nchwc ${CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
add_executable(conv_maxpool_fwd_driver_offline_nchwc ${CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_wrw_driver_offline ${CONV_WRW_DRIVER_OFFLINE_SOURCE})
add_executable(gemm_driver_offline ${GEMM_DRIVER_OFFLINE_SOURCE})

target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_fwd_driver_offline_nchwc PRIVATE host_tensor)
target_link_libraries(conv_add_fwd_driver_offline_nchwc PRIVATE host_tensor)
target_link_libraries(conv_maxpool_fwd_driver_offline_nchwc PRIVATE host_tensor)
target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_wrw_driver_offline PRIVATE host_tensor)
target_link_libraries(gemm_driver_offline PRIVATE host_tensor)
library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp (deleted file, mode 100644 → 0)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"

#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1

enum ConvForwardAlgo
{
    V5R1NCHWC // 0
};

template <typename TIn,
          typename TWei,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
                                       const Tensor<TWei>& wei,
                                       const Tensor<TOut>& add,
                                       const Tensor<TOut>& bias,
                                       Tensor<TOut>& add_host,
                                       Tensor<TOut>& out_host,
                                       const ConvStrides& conv_strides,
                                       const ConvDilations& conv_dilations,
                                       const InLeftPads& in_left_pads,
                                       const InRightPads&,
                                       const ck::ActivTypeEnum activ_type)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
        double v = 0;

        auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;

        for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
        {
            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[3])
                    {
                        for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
                        {
                            v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
                                 static_cast<const double>(wei(k, c0, y, x, c1));
                        }
                    }
                }
            }
        }

        v += bias(k0, k1);
        v = activ(v, activ_type);

        const int hox2 = ho * 2;
        const int wox2 = wo * 2;

        out_host(n, k0, ho, wo, k1) = v;

        add_host(n, k0, hox2, wox2, k1)         = v + add(n, k0, hox2, wox2, k1);
        add_host(n, k0, hox2, wox2 + 1, k1)     = v + add(n, k0, hox2, wox2 + 1, k1);
        add_host(n, k0, hox2 + 1, wox2, k1)     = v + add(n, k0, hox2 + 1, wox2, k1);
        add_host(n, k0, hox2 + 1, wox2 + 1, k1) = v + add(n, k0, hox2 + 1, wox2 + 1, k1);
    };

    make_ParallelTensorFunctor(f_nchw,
                               out_host.mDesc.GetLengths()[0],
                               out_host.mDesc.GetLengths()[1],
                               out_host.mDesc.GetLengths()[2],
                               out_host.mDesc.GetLengths()[3],
                               out_host.mDesc.GetLengths()[4])(
        std::thread::hardware_concurrency());
}

int main(int argc, char* argv[])
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    constexpr auto I6 = Number<6>{};
    constexpr auto I7 = Number<7>{};

#if USE_DYNAMIC_MODE
    // dynamic mode
    if(argc != 23)
    {
        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
        printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        exit(1);
    }

    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;

    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
    const bool do_verification = std::stoi(argv[2]);
    const int init_method      = std::stoi(argv[3]);
    const bool do_log          = std::stoi(argv[4]);
    const int nrepeat          = std::stoi(argv[5]);

    const index_t N  = std::stoi(argv[6]);
    const index_t K0 = std::stoi(argv[7]);
    const index_t K1 = std::stoi(argv[8]);
    const index_t C0 = std::stoi(argv[9]);
    const index_t C1 = std::stoi(argv[10]);
    const index_t Y  = std::stoi(argv[11]);
    const index_t X  = std::stoi(argv[12]);
    const index_t Hi = std::stoi(argv[13]);
    const index_t Wi = std::stoi(argv[14]);

    const index_t conv_stride_h   = std::stoi(argv[15]);
    const index_t conv_stride_w   = std::stoi(argv[16]);
    const index_t conv_dilation_h = std::stoi(argv[17]);
    const index_t conv_dilation_w = std::stoi(argv[18]);
    const index_t in_left_pad_h   = std::stoi(argv[19]);
    const index_t in_left_pad_w   = std::stoi(argv[20]);
    const index_t in_right_pad_h  = std::stoi(argv[21]);
    const index_t in_right_pad_w  = std::stoi(argv[22]);

    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const index_t XEff = (X - 1) * conv_dilation_w + 1;

    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;

    const auto Hox2 = Ho * 2;
    const auto Wox2 = Wo * 2;
#else
    // static mode
    if(argc < 6)
    {
        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
        exit(1);
    }

    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
    const bool do_verification = std::stoi(argv[2]);
    const int init_method      = std::stoi(argv[3]);
    const bool do_log          = std::stoi(argv[4]);
    const int nrepeat          = std::stoi(argv[5]);

    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;

#if 0
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<1080>{};
    constexpr auto Wi = Number<1920>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K1 = Number<8>{};
    constexpr auto K0 = Number<8>{};
#elif 0
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<540>{};
    constexpr auto Wi = Number<960>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#elif 0
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<270>{};
    constexpr auto Wi = Number<480>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#elif 1
    constexpr auto N  = Number<128>{};
    constexpr auto Hi = Number<135>{};
    constexpr auto Wi = Number<240>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#elif 1
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<32>{};
    constexpr auto Wi = Number<32>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K1 = Number<8>{};
    constexpr auto K0 = Number<8>{};
#endif

    constexpr auto conv_stride_h   = I1;
    constexpr auto conv_stride_w   = I1;
    constexpr auto conv_dilation_h = I1;
    constexpr auto conv_dilation_w = I1;
    constexpr auto in_left_pad_h   = I1;
    constexpr auto in_left_pad_w   = I1;
    constexpr auto in_right_pad_h  = I1;
    constexpr auto in_right_pad_w  = I1;

    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;

    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;

    constexpr auto Hox2 = Number<Ho * 2>{};
    constexpr auto Wox2 = Number<Wo * 2>{};
#endif

#if 0
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;
#elif 1
    using in_data_t  = half_t;
    using acc_data_t = float;
    using out_data_t = half_t;
#elif 1
    using in_data_t  = int8_t;
    using acc_data_t = int32_t;
    using out_data_t = int8_t;
#endif

    std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
        add_lengths_host(5), bias_lengths_host(2);

    in_lengths_host[0] = static_cast<std::size_t>(N);
    in_lengths_host[1] = static_cast<std::size_t>(C0);
    in_lengths_host[2] = static_cast<std::size_t>(Hi);
    in_lengths_host[3] = static_cast<std::size_t>(Wi);
    in_lengths_host[4] = static_cast<std::size_t>(C1);

    wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
    wei_lengths_host[1] = static_cast<std::size_t>(C0);
    wei_lengths_host[2] = static_cast<std::size_t>(Y);
    wei_lengths_host[3] = static_cast<std::size_t>(X);
    wei_lengths_host[4] = static_cast<std::size_t>(C1);

    out_lengths_host[0] = static_cast<std::size_t>(N);
    out_lengths_host[1] = static_cast<std::size_t>(K0);
    out_lengths_host[2] = static_cast<std::size_t>(Ho);
    out_lengths_host[3] = static_cast<std::size_t>(Wo);
    out_lengths_host[4] = static_cast<std::size_t>(K1);

    add_lengths_host[0] = static_cast<std::size_t>(N);
    add_lengths_host[1] = static_cast<std::size_t>(K0);
    add_lengths_host[2] = static_cast<std::size_t>(Hox2);
    add_lengths_host[3] = static_cast<std::size_t>(Wox2);
    add_lengths_host[4] = static_cast<std::size_t>(K1);

    bias_lengths_host[0] = static_cast<std::size_t>(K0);
    bias_lengths_host[1] = static_cast<std::size_t>(K1);

    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<in_data_t> add(add_lengths_host);
    Tensor<in_data_t> add_device(add_lengths_host);
    Tensor<in_data_t> add_host(add_lengths_host);
    Tensor<out_data_t> bias(bias_lengths_host);
    Tensor<out_data_t> out_host(out_lengths_host);

    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");
    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

    std::size_t num_thread = 1;

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 2:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 3:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 4:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 5:
        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);

        auto gen_wei = [](auto... is) {
            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
        };
        wei.GenerateTensorValue(gen_wei, num_thread);
    }

    bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
    add.GenerateTensorValue(GeneratorTensor_1{}, num_thread);

    auto f_make_for_device_nchwc = [&]() {
        const auto in_lengths_dev     = make_tuple(N, C0, Hi, Wi, C1);
        const auto wei_lengths_dev    = make_tuple(K0 * K1, C0, Y, X, C1);
        const auto add_lengths_dev    = make_tuple(N, K0, Hox2, Wox2, K1);
        const auto out_lengths_dev    = make_tuple(N, K0, Ho, Wo, K1);
        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          add_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

#if USE_CONV_FWD_V5R1_NCHWC
    if(algo == ConvForwardAlgo::V5R1NCHWC)
    {
        const auto tmp = f_make_for_device_nchwc();

        device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<
            in_data_t,
            acc_data_t,
            out_data_t,
            activ_type>(tmp[I0], // in_lengths_dev
                        tmp[I1], // wei_lengths_dev
                        tmp[I2], // add_lengths_dev
                        tmp[I3], // out_lengths_dev
                        tmp[I4], // conv_strides_dev
                        tmp[I5], // conv_dilations_dev
                        tmp[I6], // in_left_pads_dev
                        tmp[I7], // in_right_pads_dev
                        in,
                        wei,
                        bias,
                        add,
                        add_device,
                        nrepeat);
    }
#endif

    if(do_verification)
    {
        host_direct_convolution_add_nchwc(in,
                                          wei,
                                          add,
                                          bias,
                                          add_host,
                                          out_host,
                                          make_tuple(conv_stride_h, conv_stride_w),
                                          make_tuple(conv_dilation_h, conv_dilation_w),
                                          make_tuple(in_left_pad_h, in_left_pad_w),
                                          make_tuple(in_right_pad_h, in_right_pad_w),
                                          activ_type);

        ck::utils::check_err(add_device.mData, add_host.mData);

        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "add_host: ", add_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "add_device: ", add_device.mData, ",") << std::endl;
        }
    }
}
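All of the deleted drivers derive the output spatial size from the same arithmetic seen above: the effective (dilated) filter extent is (Y - 1) * dilation + 1, and the output size follows from input size, padding, and stride. A minimal standalone sketch of that formula; the helper name conv_out_size and the values in main are illustrative, not from the commit:

#include <cstdio>

// Effective filter extent after dilation is (filter - 1) * dilation + 1;
// the output size then mirrors the drivers' Ho/Wo computation.
int conv_out_size(int in_size, int filter, int stride, int dilation, int pad_l, int pad_r)
{
    const int eff = (filter - 1) * dilation + 1;
    return (in_size + pad_l + pad_r - eff) / stride + 1;
}

int main()
{
    // With the static-mode defaults above (135x240 input, 3x3 filter,
    // stride 1, dilation 1, pad 1 per side) the spatial size is unchanged.
    std::printf("Ho = %d\n", conv_out_size(135, 3, 1, 1, 1, 1)); // 135
    std::printf("Wo = %d\n", conv_out_size(240, 3, 1, 1, 1, 1)); // 240
    return 0;
}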
library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp (deleted file, mode 100644 → 0)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp"

#define USE_MODE 1
#define USE_CONV_BWD_V4R1_XDL_NHWC 0
#define USE_CONV_BWD_V4R1R2_XDL_NHWC 1

enum ConvTensorLayout
{
    NCHW,
    NHWC,
    CHWN,
    NCHWc,
    NHWCc
};

enum ConvBackwardDataAlgo
{
    V4R1XDLNHWC,   // 0
    V4R1R2XDLNHWC, // 1
};

template <typename TIn,
          typename TWei,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_convolution_backward_data(Tensor<TIn>& in,
                                    const Tensor<TWei>& wei,
                                    const Tensor<TOut>& out,
                                    const ConvStrides& conv_strides,
                                    const ConvDilations& conv_dilations,
                                    const InLeftPads& in_left_pads,
                                    const InRightPads& /* in_right_pads */,
                                    const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
        std::size_t K  = wei.mDesc.GetLengths()[I0];
        std::size_t Y  = wei.mDesc.GetLengths()[I2];
        std::size_t X  = wei.mDesc.GetLengths()[I3];
        std::size_t Ho = out.mDesc.GetLengths()[I2];
        std::size_t Wo = out.mDesc.GetLengths()[I3];

        double v = 0;

        for(int y = 0; y < Y; ++y)
        {
            int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
            if(h_tmp % conv_strides[I0] == 0)
            {
                int ho = h_tmp / conv_strides[I0];
                if(ho >= 0 && ho < Ho)
                {
                    for(int x = 0; x < X; ++x)
                    {
                        int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
                        if(w_tmp % conv_strides[I1] == 0)
                        {
                            int wo = w_tmp / conv_strides[I1];
                            if(wo >= 0 && wo < Wo)
                            {
                                for(int k = 0; k < K; ++k)
                                {
                                    v += out(n, k, ho, wo) * wei(k, c, y, x);
                                }
                            }
                        }
                    }
                }
            }
        }
        in(n, c, hi, wi) = v;
    };

    auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
        std::size_t K  = wei.mDesc.GetLengths()[I0];
        std::size_t Y  = wei.mDesc.GetLengths()[I1];
        std::size_t X  = wei.mDesc.GetLengths()[I2];
        std::size_t Ho = out.mDesc.GetLengths()[I1];
        std::size_t Wo = out.mDesc.GetLengths()[I2];

        double v = 0;

        for(int y = 0; y < Y; ++y)
        {
            int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
            if(h_tmp % conv_strides[I0] == 0)
            {
                int ho = h_tmp / conv_strides[I0];
                if(ho >= 0 && ho < Ho)
                {
                    for(int x = 0; x < X; ++x)
                    {
                        int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
                        if(w_tmp % conv_strides[I1] == 0)
                        {
                            int wo = w_tmp / conv_strides[I1];
                            if(wo >= 0 && wo < Wo)
                            {
                                for(int k = 0; k < K; ++k)
                                {
                                    v += out(n, ho, wo, k) * wei(k, y, x, c);
                                }
                            }
                        }
                    }
                }
            }
        }
        in(n, hi, wi, c) = v;
    };

    if(layout == ConvTensorLayout::NCHW)
    {
        make_ParallelTensorFunctor(f_nchw,
                                   in.mDesc.GetLengths()[0],
                                   in.mDesc.GetLengths()[1],
                                   in.mDesc.GetLengths()[2],
                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
    }
    else if(layout == ConvTensorLayout::NHWC)
    {
        make_ParallelTensorFunctor(f_nhwc,
                                   in.mDesc.GetLengths()[0],
                                   in.mDesc.GetLengths()[1],
                                   in.mDesc.GetLengths()[2],
                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
    }
    else
    {
        throw std::runtime_error("wrong! not supported layout");
    }
}

int main(int argc, char* argv[])
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    constexpr auto I6 = Number<6>{};

#if USE_MODE
    // dynamic mode
    if(argc != 22)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
        exit(1);
    }

    const ConvTensorLayout layout   = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
    const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(std::stoi(argv[2]));
    const bool do_verification     = std::stoi(argv[3]);
    const int init_method          = std::stoi(argv[4]);
    const bool do_log              = std::stoi(argv[5]);
    const int nrepeat              = std::stoi(argv[6]);

    const index_t N  = std::stoi(argv[7]);
    const index_t K  = std::stoi(argv[8]);
    const index_t C  = std::stoi(argv[9]);
    const index_t Y  = std::stoi(argv[10]);
    const index_t X  = std::stoi(argv[11]);
    const index_t Hi = std::stoi(argv[12]);
    const index_t Wi = std::stoi(argv[13]);

    const index_t conv_stride_h   = std::stoi(argv[14]);
    const index_t conv_stride_w   = std::stoi(argv[15]);
    const index_t conv_dilation_h = std::stoi(argv[16]);
    const index_t conv_dilation_w = std::stoi(argv[17]);
    const index_t in_left_pad_h   = std::stoi(argv[18]);
    const index_t in_left_pad_w   = std::stoi(argv[19]);
    const index_t in_right_pad_h  = std::stoi(argv[20]);
    const index_t in_right_pad_w  = std::stoi(argv[21]);

    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const index_t XEff = (X - 1) * conv_dilation_w + 1;

    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
#else
    // static mode
    if(argc < 7)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        exit(1);
    }

    const ConvTensorLayout layout   = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
    const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(std::stoi(argv[2]));
    const bool do_verification     = std::stoi(argv[3]);
    const int init_method          = std::stoi(argv[4]);
    const bool do_log              = std::stoi(argv[5]);
    const int nrepeat              = std::stoi(argv[6]);

    constexpr auto N  = Number<128>{};
    constexpr auto C  = Number<192>{};
    constexpr auto Hi = Number<71>{};
    constexpr auto Wi = Number<71>{};
    constexpr auto K  = Number<256>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};

    constexpr auto conv_stride_h   = I2;
    constexpr auto conv_stride_w   = I2;
    constexpr auto conv_dilation_h = I1;
    constexpr auto conv_dilation_w = I1;
    constexpr auto in_left_pad_h   = I1;
    constexpr auto in_left_pad_w   = I1;
    constexpr auto in_right_pad_h  = I1;
    constexpr auto in_right_pad_w  = I1;

    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;

    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
#endif

#if 0
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;
#elif 1
    using in_data_t  = half_t;
    using acc_data_t = float;
    using out_data_t = half_t;
#endif

    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);

    if(layout == ConvTensorLayout::NCHW)
    {
        in_lengths_host[0] = static_cast<std::size_t>(N);
        in_lengths_host[1] = static_cast<std::size_t>(C);
        in_lengths_host[2] = static_cast<std::size_t>(Hi);
        in_lengths_host[3] = static_cast<std::size_t>(Wi);

        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(C);
        wei_lengths_host[2] = static_cast<std::size_t>(Y);
        wei_lengths_host[3] = static_cast<std::size_t>(X);

        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(K);
        out_lengths_host[2] = static_cast<std::size_t>(Ho);
        out_lengths_host[3] = static_cast<std::size_t>(Wo);
    }
    else if(layout == ConvTensorLayout::NHWC)
    {
        in_lengths_host[0] = static_cast<std::size_t>(N);
        in_lengths_host[1] = static_cast<std::size_t>(Hi);
        in_lengths_host[2] = static_cast<std::size_t>(Wi);
        in_lengths_host[3] = static_cast<std::size_t>(C);

        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(Y);
        wei_lengths_host[2] = static_cast<std::size_t>(X);
        wei_lengths_host[3] = static_cast<std::size_t>(C);

        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(Ho);
        out_lengths_host[2] = static_cast<std::size_t>(Wo);
        out_lengths_host[3] = static_cast<std::size_t>(K);
    }
    else
    {
        throw std::runtime_error("wrong! not implemented");
    }

    Tensor<in_data_t> in_host(in_lengths_host);
    Tensor<in_data_t> in_device(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<out_data_t> out(out_lengths_host);

    std::cout << "layout: " << layout << std::endl;
    ostream_HostTensorDescriptor(in_host.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(out.mDesc, std::cout << "out: ");
    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

    std::size_t num_thread = 1;

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
        break;
    case 2:
        out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
        break;
    case 3:
        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
        break;
    case 4:
        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
        break;
    case 5:
        out.GenerateTensorValue(GeneratorTensor_3<out_data_t>{0.0, 1.0}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.5, 0.5}, num_thread);
        break;
    default:
        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{1, 5}, num_thread);

        auto gen_wei = [](auto... is) {
            return GeneratorTensor_2<in_data_t>{1, 5}(is...) *
                   GeneratorTensor_Checkboard{}(is...);
        };
        wei.GenerateTensorValue(gen_wei, num_thread);
    }

    auto f_make_for_device_nhwc = [&]() {
#if USE_MODE
        const auto in_lengths_dev     = make_tuple(N, Hi, Wi, C);
        const auto wei_lengths_dev    = make_tuple(K, Y, X, C);
        const auto out_lengths_dev    = make_tuple(N, Ho, Wo, K);
        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
#else
        const auto in_lengths_dev =
            make_tuple(Number<N>{}, Number<Hi>{}, Number<Wi>{}, Number<C>{});
        const auto wei_lengths_dev =
            make_tuple(Number<K>{}, Number<Y>{}, Number<X>{}, Number<C>{});
        const auto out_lengths_dev =
            make_tuple(Number<N>{}, Number<Ho>{}, Number<Wo>{}, Number<K>{});
        const auto conv_strides_dev =
            make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
        const auto conv_dilations_dev =
            make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
        const auto in_left_pads_dev =
            make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
        const auto in_right_pads_dev =
            make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
#endif

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

#if USE_CONV_BWD_V4R1_XDL_NHWC
    if(algo == ConvBackwardDataAlgo::V4R1XDLNHWC)
    {
        if(layout != ConvTensorLayout::NHWC)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nhwc();

        device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk<in_data_t,
                                                                                  acc_data_t,
                                                                                  out_data_t>(
            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in_device, wei, out,
            nrepeat);
    }
#endif

#if USE_CONV_BWD_V4R1R2_XDL_NHWC
    if(algo == ConvBackwardDataAlgo::V4R1R2XDLNHWC)
    {
        if(layout != ConvTensorLayout::NHWC)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nhwc();

        if(Y == 1 && X == 1 && in_left_pad_h == 0 && in_left_pad_w == 0 && in_right_pad_h == 0 &&
           in_right_pad_w == 0)
        {
            device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1<
                in_data_t,
                acc_data_t,
                out_data_t>(tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
                            in_device, wei, out, nrepeat);
        }
        else
        {
#if 1
            device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk<
                in_data_t,
                acc_data_t,
                out_data_t>(tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
                            in_device, wei, out, nrepeat);
#endif
        }
    }
#endif

    if(do_verification)
    {
        host_convolution_backward_data(in_host,
                                       wei,
                                       out,
                                       make_tuple(conv_stride_h, conv_stride_w),
                                       make_tuple(conv_dilation_h, conv_dilation_w),
                                       make_tuple(in_left_pad_h, in_left_pad_w),
                                       make_tuple(in_right_pad_h, in_right_pad_w),
                                       layout);

        ck::utils::check_err(in_device.mData, in_host.mData);

        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "out : ", out.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "in_host : ", in_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "in_device: ", in_device.mData, ",") << std::endl;
        }
    }
}
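The reference host_convolution_backward_data above inverts the forward index map hi = ho * stride + y * dilation - pad: for each input pixel it accumulates only those filter taps where hi + pad - y * dilation is non-negative, divisible by the stride, and lands inside the output. A minimal sketch of that membership test; the function name contributing_output_row and the values in main are illustrative, not from the commit:

#include <cstdio>

// Returns the output row 'ho' that reaches input row 'hi' through filter
// tap 'y', or -1 if none does (the same test as the h_tmp % conv_strides
// check in the reference above).
int contributing_output_row(int hi, int y, int stride, int dilation, int pad_l, int Ho)
{
    const int h_tmp = hi + pad_l - y * dilation;
    if(h_tmp < 0 || h_tmp % stride != 0)
        return -1;
    const int ho = h_tmp / stride;
    return (ho < Ho) ? ho : -1;
}

int main()
{
    // Static-mode defaults above: stride 2, dilation 1, pad 1, Hi = 71,
    // so Ho = (71 + 1 + 1 - 3) / 2 + 1 = 36. With stride 2 only every
    // other tap can contribute to a given input row.
    for(int y = 0; y < 3; ++y)
        std::printf("hi=10, y=%d -> ho=%d\n", y, contributing_output_row(10, y, 2, 1, 1, 36));
    return 0;
}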
library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp (deleted file, mode 100644 → 0)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"

#define USE_DYNAMIC_MODE 1
#define USE_CONV_FWD_V4R4_NCHW 0
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1

enum ConvTensorLayout
{
    NCHW,
    NHWC,
    CHWN,
    NCHWc,
    NHWCc
};

enum ConvForwardAlgo
{
    V4R4NCHW,      // 0
    V4R4R2NHWC,    // 1
    V6R1NCHW,      // 2
    V4R4R2XDLNCHW, // 3
    V4R4R4XDLNHWC  // 4
};

template <typename TIn,
          typename TWei,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_convolution_forward(const Tensor<TIn>& in,
                              const Tensor<TWei>& wei,
                              Tensor<TOut>& out,
                              const ConvStrides& conv_strides,
                              const ConvDilations& conv_dilations,
                              const InLeftPads& in_left_pads,
                              const InRightPads&,
                              const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
        double v = 0;
        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
        {
            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[3])
                    {
                        if constexpr(is_same<TIn, bhalf_t>::value)
                        {
                            v += ck::type_convert<float>(in(n, c, hi, wi)) *
                                 ck::type_convert<float>(wei(k, c, y, x));
                        }
                        else
                        {
                            v += static_cast<const double>(in(n, c, hi, wi)) *
                                 static_cast<const double>(wei(k, c, y, x));
                        }
                    }
                }
            }
        }

        if constexpr(is_same<TOut, bhalf_t>::value)
        {
            out(n, k, ho, wo) = ck::type_convert<bhalf_t>(static_cast<float>(v));
        }
        else
        {
            out(n, k, ho, wo) = v;
        }
    };

    auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
        double v = 0;
        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
        {
            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[2])
                    {
                        if constexpr(is_same<TIn, bhalf_t>::value)
                        {
                            v += ck::type_convert<float>(in(n, hi, wi, c)) *
                                 ck::type_convert<float>(wei(k, y, x, c));
                        }
                        else
                        {
                            v += static_cast<const double>(in(n, hi, wi, c)) *
                                 static_cast<const double>(wei(k, y, x, c));
                        }
                    }
                }
            }
        }

        if constexpr(is_same<TOut, bhalf_t>::value)
        {
            out(n, ho, wo, k) = ck::type_convert<bhalf_t>(static_cast<float>(v));
        }
        else
        {
            out(n, ho, wo, k) = v;
        }
    };

    if(layout == ConvTensorLayout::NCHW)
    {
        make_ParallelTensorFunctor(f_nchw,
                                   out.mDesc.GetLengths()[0],
                                   out.mDesc.GetLengths()[1],
                                   out.mDesc.GetLengths()[2],
                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
    }
    else if(layout == ConvTensorLayout::NHWC)
    {
        make_ParallelTensorFunctor(f_nhwc,
                                   out.mDesc.GetLengths()[0],
                                   out.mDesc.GetLengths()[1],
                                   out.mDesc.GetLengths()[2],
                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
    }
    else
    {
        throw std::runtime_error("wrong! not supported layout");
    }
}

int main(int argc, char* argv[])
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    constexpr auto I6 = Number<6>{};

#if USE_DYNAMIC_MODE
    // dynamic mode
    if(argc != 22)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
        exit(1);
    }

    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(std::stoi(argv[2]));
    const bool do_verification   = std::stoi(argv[3]);
    const int init_method        = std::stoi(argv[4]);
    const bool do_log            = std::stoi(argv[5]);
    const int nrepeat            = std::stoi(argv[6]);

    const index_t N  = std::stoi(argv[7]);
    const index_t K  = std::stoi(argv[8]);
    const index_t C  = std::stoi(argv[9]);
    const index_t Y  = std::stoi(argv[10]);
    const index_t X  = std::stoi(argv[11]);
    const index_t Hi = std::stoi(argv[12]);
    const index_t Wi = std::stoi(argv[13]);

    const index_t conv_stride_h   = std::stoi(argv[14]);
    const index_t conv_stride_w   = std::stoi(argv[15]);
    const index_t conv_dilation_h = std::stoi(argv[16]);
    const index_t conv_dilation_w = std::stoi(argv[17]);
    const index_t in_left_pad_h   = std::stoi(argv[18]);
    const index_t in_left_pad_w   = std::stoi(argv[19]);
    const index_t in_right_pad_h  = std::stoi(argv[20]);
    const index_t in_right_pad_w  = std::stoi(argv[21]);

    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const index_t XEff = (X - 1) * conv_dilation_w + 1;

    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
#else
    // static mode
    if(argc < 7)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        exit(1);
    }

    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(std::stoi(argv[2]));
    const bool do_verification   = std::stoi(argv[3]);
    const int init_method        = std::stoi(argv[4]);
    const bool do_log            = std::stoi(argv[5]);
    const int nrepeat            = std::stoi(argv[6]);

    constexpr auto N  = Number<128>{};
    constexpr auto C  = Number<192>{};
    constexpr auto Hi = Number<71>{};
    constexpr auto Wi = Number<71>{};
    constexpr auto K  = Number<256>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};

    constexpr auto conv_stride_h   = I1;
    constexpr auto conv_stride_w   = I1;
    constexpr auto conv_dilation_h = I1;
    constexpr auto conv_dilation_w = I1;
    constexpr auto in_left_pad_h   = I1;
    constexpr auto in_left_pad_w   = I1;
    constexpr auto in_right_pad_h  = I1;
    constexpr auto in_right_pad_w  = I1;

    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;

    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
#endif

#if 1
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;
#elif 1
    using in_data_t  = half_t;
    using acc_data_t = float;
    using out_data_t = half_t;
#elif 0
    using in_data_t  = bhalf_t;
    using acc_data_t = float;
    using out_data_t = bhalf_t;
#elif 1
    using in_data_t  = int8_t;
    using acc_data_t = int32_t;
    using out_data_t = int8_t;
#endif

    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);

    if(layout == ConvTensorLayout::NCHW)
    {
        in_lengths_host[0] = static_cast<std::size_t>(N);
        in_lengths_host[1] = static_cast<std::size_t>(C);
        in_lengths_host[2] = static_cast<std::size_t>(Hi);
        in_lengths_host[3] = static_cast<std::size_t>(Wi);

        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(C);
        wei_lengths_host[2] = static_cast<std::size_t>(Y);
        wei_lengths_host[3] = static_cast<std::size_t>(X);

        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(K);
        out_lengths_host[2] = static_cast<std::size_t>(Ho);
        out_lengths_host[3] = static_cast<std::size_t>(Wo);
    }
    else if(layout == ConvTensorLayout::NHWC)
    {
        in_lengths_host[0] = static_cast<std::size_t>(N);
        in_lengths_host[1] = static_cast<std::size_t>(Hi);
        in_lengths_host[2] = static_cast<std::size_t>(Wi);
        in_lengths_host[3] = static_cast<std::size_t>(C);

        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(Y);
        wei_lengths_host[2] = static_cast<std::size_t>(X);
        wei_lengths_host[3] = static_cast<std::size_t>(C);

        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(Ho);
        out_lengths_host[2] = static_cast<std::size_t>(Wo);
        out_lengths_host[3] = static_cast<std::size_t>(K);
    }
    else
    {
        throw std::runtime_error("wrong! not implemented");
    }

    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<out_data_t> out_host(out_lengths_host);
    Tensor<out_data_t> out_device(out_lengths_host);

    std::cout << "layout: " << layout << std::endl;
    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

    std::size_t num_thread = 1;

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
        break;
    case 2:
        in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
        break;
    case 3:
        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
        break;
    case 4:
        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
        break;
    case 5:
        in.GenerateTensorValue(GeneratorTensor_3<in_data_t>{0.0, 1.0}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.5, 0.5}, num_thread);
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{1, 5}, num_thread);

        auto gen_wei = [](auto... is) {
            return GeneratorTensor_2<in_data_t>{1, 5}(is...) *
                   GeneratorTensor_Checkboard{}(is...);
        };
        wei.GenerateTensorValue(gen_wei, num_thread);
    }

    auto f_make_for_device_nchw = [&]() {
        const auto in_lengths_dev     = make_tuple(N, C, Hi, Wi);
        const auto wei_lengths_dev    = make_tuple(K, C, Y, X);
        const auto out_lengths_dev    = make_tuple(N, K, Ho, Wo);
        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

    auto f_make_for_device_nhwc = [&]() {
        const auto in_lengths_dev     = make_tuple(N, Hi, Wi, C);
        const auto wei_lengths_dev    = make_tuple(K, Y, X, C);
        const auto out_lengths_dev    = make_tuple(N, Ho, Wo, K);
        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

#if USE_CONV_FWD_V4R4_NCHW
    if(algo == ConvForwardAlgo::V4R4NCHW)
    {
        if(layout != ConvTensorLayout::NCHW)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nchw();

        device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<in_data_t,
                                                                           acc_data_t,
                                                                           out_data_t>(
            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device,
            nrepeat);
    }
#endif

#if USE_CONV_FWD_V4R4R2_NHWC
    if(algo == ConvForwardAlgo::V4R4R2NHWC)
    {
        if(layout != ConvTensorLayout::NHWC)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nhwc();

        device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk<in_data_t,
                                                                             acc_data_t,
                                                                             out_data_t>(
            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device,
            nrepeat);
    }
#endif

#if USE_CONV_FWD_V6R1_NCHW
    if(algo == ConvForwardAlgo::V6R1NCHW)
    {
        if(layout != ConvTensorLayout::NCHW)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nchw();

        device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<in_data_t,
                                                                           acc_data_t,
                                                                           out_data_t>(
            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device,
            nrepeat);
    }
#endif

#if USE_CONV_FWD_V4R4R2_XDL_NCHW
    if(algo == ConvForwardAlgo::V4R4R2XDLNCHW)
    {
        if(layout != ConvTensorLayout::NCHW)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nchw();

        device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t,
                                                                              acc_data_t,
                                                                              out_data_t>(
            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device,
            nrepeat);
    }
#endif

#if USE_CONV_FWD_V4R4R4_XDL_NHWC
    if(algo == ConvForwardAlgo::V4R4R4XDLNHWC)
    {
        if(layout != ConvTensorLayout::NHWC)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nhwc();

        device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<in_data_t,
                                                                              acc_data_t,
                                                                              out_data_t>(
            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device,
            nrepeat);
    }
#endif

    if(do_verification)
    {
        host_convolution_forward(in,
                                 wei,
                                 out_host,
                                 make_tuple(conv_stride_h, conv_stride_w),
                                 make_tuple(conv_dilation_h, conv_dilation_w),
                                 make_tuple(in_left_pad_h, in_left_pad_w),
                                 make_tuple(in_right_pad_h, in_right_pad_w),
                                 layout);

        ck::utils::check_err(out_device.mData, out_host.mData);

        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
        }
    }
}
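The host reference in this file is the standard naive direct convolution, just dressed in Tensor objects and parallelized with make_ParallelTensorFunctor. A self-contained distillation of the same NCHW loop over plain buffers; the function conv2d_nchw and the tiny shapes in main are illustrative, not part of the commit:

#include <cstdio>
#include <vector>

// Naive NCHW forward convolution, same index math as f_nchw above:
// hi = ho * stride + y * dilation - pad; taps outside the input are skipped.
void conv2d_nchw(const std::vector<double>& in, const std::vector<double>& wei,
                 std::vector<double>& out,
                 int N, int C, int Hi, int Wi, int K, int Y, int X,
                 int Ho, int Wo, int stride, int dilation, int pad)
{
    for(int n = 0; n < N; ++n)
        for(int k = 0; k < K; ++k)
            for(int ho = 0; ho < Ho; ++ho)
                for(int wo = 0; wo < Wo; ++wo)
                {
                    double v = 0;
                    for(int c = 0; c < C; ++c)
                        for(int y = 0; y < Y; ++y)
                            for(int x = 0; x < X; ++x)
                            {
                                const int hi = ho * stride + y * dilation - pad;
                                const int wi = wo * stride + x * dilation - pad;
                                if(hi >= 0 && hi < Hi && wi >= 0 && wi < Wi)
                                    v += in[((n * C + c) * Hi + hi) * Wi + wi] *
                                         wei[((k * C + c) * Y + y) * X + x];
                            }
                    out[((n * K + k) * Ho + ho) * Wo + wo] = v;
                }
}

int main()
{
    // 1x1x3x3 input of ones, one 3x3 filter of ones, stride 1, pad 1:
    // each output element is the count of valid taps in its neighbourhood.
    std::vector<double> in(9, 1.0), wei(9, 1.0), out(9, 0.0);
    conv2d_nchw(in, wei, out, 1, 1, 3, 3, 1, 3, 3, 3, 3, 1, 1, 1);
    std::printf("center = %g, corner = %g\n", out[4], out[0]); // 9, 4
    return 0;
}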
library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp (deleted file, mode 100644 → 0)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1
enum
ConvForwardAlgo
{
V5R1NCHWC
// 0
};
template
<
typename
TIn
,
typename
TWei
,
typename
TOut
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
host_direct_convolution_nchwc
(
const
Tensor
<
TIn
>&
in
,
const
Tensor
<
TWei
>&
wei
,
const
Tensor
<
TOut
>&
bias
,
Tensor
<
TOut
>&
out
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
,
const
ck
::
ActivTypeEnum
activ_type
)
{
using
namespace
ck
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
auto
f_nchw
=
[
&
](
auto
n
,
auto
k0
,
auto
ho
,
auto
wo
,
auto
k1
)
{
double
v
=
0
;
const
int
k
=
k0
*
out
.
mDesc
.
GetLengths
()[
4
]
+
k1
;
for
(
int
c0
=
0
;
c0
<
wei
.
mDesc
.
GetLengths
()[
1
];
++
c0
)
{
for
(
int
y
=
0
;
y
<
wei
.
mDesc
.
GetLengths
()[
2
];
++
y
)
{
int
hi
=
ho
*
conv_strides
[
I0
]
+
y
*
conv_dilations
[
I0
]
-
in_left_pads
[
I0
];
for
(
int
x
=
0
;
x
<
wei
.
mDesc
.
GetLengths
()[
3
];
++
x
)
{
int
wi
=
wo
*
conv_strides
[
I1
]
+
x
*
conv_dilations
[
I1
]
-
in_left_pads
[
I1
];
if
(
hi
>=
0
&&
hi
<
in
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
wi
<
in
.
mDesc
.
GetLengths
()[
3
])
{
for
(
int
c1
=
0
;
c1
<
wei
.
mDesc
.
GetLengths
()[
4
];
++
c1
)
{
v
+=
static_cast
<
const
double
>
(
in
(
n
,
c0
,
hi
,
wi
,
c1
))
*
static_cast
<
const
double
>
(
wei
(
k
,
c0
,
y
,
x
,
c1
));
}
}
}
}
}
v
+=
bias
(
k0
,
k1
);
out
(
n
,
k0
,
ho
,
wo
,
k1
)
=
activ
(
v
,
activ_type
);
};
make_ParallelTensorFunctor
(
f_nchw
,
out
.
mDesc
.
GetLengths
()[
0
],
out
.
mDesc
.
GetLengths
()[
1
],
out
.
mDesc
.
GetLengths
()[
2
],
out
.
mDesc
.
GetLengths
()[
3
],
out
.
mDesc
.
GetLengths
()[
4
])(
std
::
thread
::
hardware_concurrency
());
}
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
#if USE_DYNAMIC_MODE
// dynamic mode
if
(
argc
!=
23
)
{
printf
(
"arg1 to 5: algo, do_verification, init_method, do_log, nrepeat
\n
"
);
printf
(
"rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx
\n
"
);
exit
(
1
);
}
constexpr
ck
::
ActivTypeEnum
activ_type
=
ActivTypeEnum
::
LeakyRelu
;
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
std
::
stoi
(
argv
[
1
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
2
]);
const
int
init_method
=
std
::
stoi
(
argv
[
3
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
4
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
5
]);
const
index_t
N
=
std
::
stoi
(
argv
[
6
]);
const
index_t
K0
=
std
::
stoi
(
argv
[
7
]);
const
index_t
K1
=
std
::
stoi
(
argv
[
8
]);
const
index_t
C0
=
std
::
stoi
(
argv
[
9
]);
const
index_t
C1
=
std
::
stoi
(
argv
[
10
]);
const
index_t
Y
=
std
::
stoi
(
argv
[
11
]);
const
index_t
X
=
std
::
stoi
(
argv
[
12
]);
const
index_t
Hi
=
std
::
stoi
(
argv
[
13
]);
const
index_t
Wi
=
std
::
stoi
(
argv
[
14
]);
const
index_t
conv_stride_h
=
std
::
stoi
(
argv
[
15
]);
const
index_t
conv_stride_w
=
std
::
stoi
(
argv
[
16
]);
const
index_t
conv_dilation_h
=
std
::
stoi
(
argv
[
17
]);
const
index_t
conv_dilation_w
=
std
::
stoi
(
argv
[
18
]);
const
index_t
in_left_pad_h
=
std
::
stoi
(
argv
[
19
]);
const
index_t
in_left_pad_w
=
std
::
stoi
(
argv
[
20
]);
const
index_t
in_right_pad_h
=
std
::
stoi
(
argv
[
21
]);
const
index_t
in_right_pad_w
=
std
::
stoi
(
argv
[
22
]);
const
index_t
YEff
=
(
Y
-
1
)
*
conv_dilation_h
+
1
;
const
index_t
XEff
=
(
X
-
1
)
*
conv_dilation_w
+
1
;
const
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
YEff
)
/
conv_stride_h
+
1
;
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
#else
    // static mode
    if(argc < 6)
    {
        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
        exit(1);
    }

    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
    const bool do_verification = std::stoi(argv[2]);
    const int init_method      = std::stoi(argv[3]);
    const bool do_log          = std::stoi(argv[4]);
    const int nrepeat          = std::stoi(argv[5]);

    // constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::Sigmoid;
    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;

#if 0
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<1080>{};
    constexpr auto Wi = Number<1920>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<1>{};
    constexpr auto K1 = Number<4>{};
#elif 1
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<1080>{};
    constexpr auto Wi = Number<1920>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#elif 0
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<1080>{};
    constexpr auto Wi = Number<1920>{};
    constexpr auto Y  = Number<1>{};
    constexpr auto X  = Number<1>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#elif 0
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<540>{};
    constexpr auto Wi = Number<960>{};
    constexpr auto Y  = Number<1>{};
    constexpr auto X  = Number<1>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#elif 0
    constexpr auto N  = Number<128>{};
    constexpr auto Hi = Number<270>{};
    constexpr auto Wi = Number<480>{};
    constexpr auto Y  = Number<1>{};
    constexpr auto X  = Number<1>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#endif

    constexpr auto conv_stride_h   = I1;
    constexpr auto conv_stride_w   = I1;
    constexpr auto conv_dilation_h = I1;
    constexpr auto conv_dilation_w = I1;

#if 1
    constexpr auto in_left_pad_h  = I1;
    constexpr auto in_left_pad_w  = I1;
    constexpr auto in_right_pad_h = I1;
    constexpr auto in_right_pad_w = I1;
#else
    constexpr auto in_left_pad_h  = I0;
    constexpr auto in_left_pad_w  = I0;
    constexpr auto in_right_pad_h = I0;
    constexpr auto in_right_pad_w = I0;
#endif

    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;

    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
#endif

#if 0
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;
#elif 1
    using in_data_t  = half_t;
    using acc_data_t = float;
    using out_data_t = half_t;
#elif 1
    using in_data_t  = int8_t;
    using acc_data_t = int32_t;
    using out_data_t = int8_t;
#endif

    std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
        bias_lengths_host(2);

    in_lengths_host[0] = static_cast<std::size_t>(N);
    in_lengths_host[1] = static_cast<std::size_t>(C0);
    in_lengths_host[2] = static_cast<std::size_t>(Hi);
    in_lengths_host[3] = static_cast<std::size_t>(Wi);
    in_lengths_host[4] = static_cast<std::size_t>(C1);

    wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
    wei_lengths_host[1] = static_cast<std::size_t>(C0);
    wei_lengths_host[2] = static_cast<std::size_t>(Y);
    wei_lengths_host[3] = static_cast<std::size_t>(X);
    wei_lengths_host[4] = static_cast<std::size_t>(C1);

    out_lengths_host[0] = static_cast<std::size_t>(N);
    out_lengths_host[1] = static_cast<std::size_t>(K0);
    out_lengths_host[2] = static_cast<std::size_t>(Ho);
    out_lengths_host[3] = static_cast<std::size_t>(Wo);
    out_lengths_host[4] = static_cast<std::size_t>(K1);

    bias_lengths_host[0] = static_cast<std::size_t>(K0);
    bias_lengths_host[1] = static_cast<std::size_t>(K1);

    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<out_data_t> bias(bias_lengths_host);
    Tensor<out_data_t> out_host(out_lengths_host);
    Tensor<out_data_t> out_device(out_lengths_host);

    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(bias.mDesc, std::cout << "bias: ");
    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

    std::size_t num_thread = 1;

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 2:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 3:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 4:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 5:
        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
        bias.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);

        auto gen_wei = [](auto... is) {
            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
        };
        wei.GenerateTensorValue(gen_wei, num_thread);
    }
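    // Aside: the GeneratorTensor_* functors come from host_tensor_generator.hpp;
    // judging by their use here, _1 fills a constant, _2 draws integers from a
    // range, _3 draws floats, and _Checkboard alternates sign with index parity.
    // A rough standalone imitation of the checkerboard pattern follows -- an
    // illustration only, not the library's implementation (fenced with #if 0,
    // build separately):
#if 0
#include <cstdio>

int checkerboard(int i0, int i1) { return ((i0 + i1) % 2 == 0) ? 1 : -1; }

int main()
{
    for(int i = 0; i < 2; ++i)
    {
        for(int j = 0; j < 4; ++j)
            std::printf("%+d ", checkerboard(i, j));
        std::printf("\n"); // prints: +1 -1 +1 -1 / -1 +1 -1 +1
    }
    return 0;
}
#endif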
    auto f_make_for_device_nchwc = [&]() {
        const auto in_lengths_dev  = make_tuple(N, C0, Hi, Wi, C1);
        const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
        const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);

        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

#if USE_CONV_FWD_V5R1_NCHWC
    if(algo == ConvForwardAlgo::V5R1NCHWC)
    {
        const auto tmp = f_make_for_device_nchwc();

        device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<in_data_t,
                                                                                    acc_data_t,
                                                                                    out_data_t,
                                                                                    activ_type>(
            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
            in, wei, bias, out_device, nrepeat);
    }
#endif

    if(do_verification)
    {
        host_direct_convolution_nchwc(in,
                                      wei,
                                      bias,
                                      out_host,
                                      make_tuple(conv_stride_h, conv_stride_w),
                                      make_tuple(conv_dilation_h, conv_dilation_w),
                                      make_tuple(in_left_pad_h, in_left_pad_w),
                                      make_tuple(in_right_pad_h, in_right_pad_w),
                                      activ_type);

        ck::utils::check_err(out_device.mData, out_host.mData);

        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "bias: ", bias.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
        }
    }
}
library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
deleted
100644 → 0
View file @
b7fa6bb1
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1
enum ConvForwardAlgo
{
    V5R1NCHWC // 0
};
template <typename TIn,
          typename TWei,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
                                           const Tensor<TWei>& wei,
                                           const Tensor<TOut>& bias,
                                           Tensor<TOut>& out_host,
                                           Tensor<TOut>& max_host,
                                           const ConvStrides& conv_strides,
                                           const ConvDilations& conv_dilations,
                                           const InLeftPads& in_left_pads,
                                           const InRightPads&,
                                           const ck::ActivTypeEnum activ_type)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
        double v = 0;

        auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;

        for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
        {
            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[3])
                    {
                        for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
                        {
                            v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
                                 static_cast<const double>(wei(k, c0, y, x, c1));
                        }
                    }
                }
            }
        }

        v += bias(k0, k1);
        v = activ(v, activ_type);

        out_host(n, k0, ho, wo, k1) = v;
    };

    make_ParallelTensorFunctor(f_nchw,
                               out_host.mDesc.GetLengths()[0],
                               out_host.mDesc.GetLengths()[1],
                               out_host.mDesc.GetLengths()[2],
                               out_host.mDesc.GetLengths()[3],
                               out_host.mDesc.GetLengths()[4])(
        std::thread::hardware_concurrency());

    auto maxpool_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
        auto hx = ho * 2;
        auto wx = wo * 2;

        auto v0 = out_host(n, k0, hx, wx, k1);
        auto v1 = out_host(n, k0, hx, wx + 1, k1);
        auto v2 = out_host(n, k0, hx + 1, wx, k1);
        auto v3 = out_host(n, k0, hx + 1, wx + 1, k1);

        max_host(n, k0, ho, wo, k1) = std::max({v0, v1, v2, v3});
    };

    make_ParallelTensorFunctor(maxpool_nchw,
                               max_host.mDesc.GetLengths()[0],
                               max_host.mDesc.GetLengths()[1],
                               max_host.mDesc.GetLengths()[2],
                               max_host.mDesc.GetLengths()[3],
                               max_host.mDesc.GetLengths()[4])(
        std::thread::hardware_concurrency());
}
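// Aside: the pooling lambda above reads a fixed 2x2 window with stride 2, so it
// implicitly assumes even Ho and Wo (the driver halves them with integer
// division). A self-contained sketch of the same window-max on a plain array
// follows -- illustrative only (fenced with #if 0, build separately):
#if 0
#include <algorithm>
#include <cstdio>

int main()
{
    const int H = 4, W = 4;
    const float x[H * W] = {1, 2,  3,  4,
                            5, 6,  7,  8,
                            9, 10, 11, 12,
                            13, 14, 15, 16};

    for(int ho = 0; ho < H / 2; ++ho)
    {
        for(int wo = 0; wo < W / 2; ++wo)
        {
            const int h = 2 * ho, w = 2 * wo;
            const float m = std::max({x[h * W + w], x[h * W + w + 1],
                                      x[(h + 1) * W + w], x[(h + 1) * W + w + 1]});
            std::printf("%g ", m); // prints: 6 8 / 14 16
        }
        std::printf("\n");
    }
    return 0;
}
#endif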
int main(int argc, char* argv[])
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    constexpr auto I6 = Number<6>{};
    constexpr auto I7 = Number<7>{};

#if USE_DYNAMIC_MODE
    // dynamic mode
    if(argc != 23)
    {
        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
        printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        exit(1);
    }

    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;

    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
    const bool do_verification = std::stoi(argv[2]);
    const int init_method      = std::stoi(argv[3]);
    const bool do_log          = std::stoi(argv[4]);
    const int nrepeat          = std::stoi(argv[5]);

    const index_t N  = std::stoi(argv[6]);
    const index_t K0 = std::stoi(argv[7]);
    const index_t K1 = std::stoi(argv[8]);
    const index_t C0 = std::stoi(argv[9]);
    const index_t C1 = std::stoi(argv[10]);
    const index_t Y  = std::stoi(argv[11]);
    const index_t X  = std::stoi(argv[12]);
    const index_t Hi = std::stoi(argv[13]);
    const index_t Wi = std::stoi(argv[14]);

    const index_t conv_stride_h   = std::stoi(argv[15]);
    const index_t conv_stride_w   = std::stoi(argv[16]);
    const index_t conv_dilation_h = std::stoi(argv[17]);
    const index_t conv_dilation_w = std::stoi(argv[18]);
    const index_t in_left_pad_h   = std::stoi(argv[19]);
    const index_t in_left_pad_w   = std::stoi(argv[20]);
    const index_t in_right_pad_h  = std::stoi(argv[21]);
    const index_t in_right_pad_w  = std::stoi(argv[22]);

    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const index_t XEff = (X - 1) * conv_dilation_w + 1;

    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;

    const index_t Ho_2 = Ho / 2;
    const index_t Wo_2 = Wo / 2;
#else
    // static mode
    if(argc < 6)
    {
        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
        exit(1);
    }

    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
    const bool do_verification = std::stoi(argv[2]);
    const int init_method      = std::stoi(argv[3]);
    const bool do_log          = std::stoi(argv[4]);
    const int nrepeat          = std::stoi(argv[5]);

    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;

#if 1
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<1080>{};
    constexpr auto Wi = Number<1920>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#elif 0
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<1080>{};
    constexpr auto Wi = Number<1920>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<3>{};
    constexpr auto C1 = Number<4>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#elif 0
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<540>{};
    constexpr auto Wi = Number<960>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#elif 0
    constexpr auto N  = Number<128>{};
    constexpr auto Hi = Number<270>{};
    constexpr auto Wi = Number<480>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#endif

    constexpr auto conv_stride_h   = I1;
    constexpr auto conv_stride_w   = I1;
    constexpr auto conv_dilation_h = I1;
    constexpr auto conv_dilation_w = I1;

    constexpr auto in_left_pad_h  = I1;
    constexpr auto in_left_pad_w  = I1;
    constexpr auto in_right_pad_h = I1;
    constexpr auto in_right_pad_w = I1;

    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;

    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;

    constexpr auto Ho_2 = Number<Ho / 2>{};
    constexpr auto Wo_2 = Number<Wo / 2>{};
#endif

#if 0
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;
#elif 1
    using in_data_t  = half_t;
    using acc_data_t = float;
    using out_data_t = half_t;
#elif 1
    using in_data_t  = int8_t;
    using acc_data_t = int32_t;
    using out_data_t = int8_t;
#endif
    std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
        max_lengths_host(5), bias_lengths_host(2);

    in_lengths_host[0] = static_cast<std::size_t>(N);
    in_lengths_host[1] = static_cast<std::size_t>(C0);
    in_lengths_host[2] = static_cast<std::size_t>(Hi);
    in_lengths_host[3] = static_cast<std::size_t>(Wi);
    in_lengths_host[4] = static_cast<std::size_t>(C1);

    wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
    wei_lengths_host[1] = static_cast<std::size_t>(C0);
    wei_lengths_host[2] = static_cast<std::size_t>(Y);
    wei_lengths_host[3] = static_cast<std::size_t>(X);
    wei_lengths_host[4] = static_cast<std::size_t>(C1);

    out_lengths_host[0] = static_cast<std::size_t>(N);
    out_lengths_host[1] = static_cast<std::size_t>(K0);
    out_lengths_host[2] = static_cast<std::size_t>(Ho);
    out_lengths_host[3] = static_cast<std::size_t>(Wo);
    out_lengths_host[4] = static_cast<std::size_t>(K1);

    max_lengths_host[0] = static_cast<std::size_t>(N);
    max_lengths_host[1] = static_cast<std::size_t>(K0);
    max_lengths_host[2] = static_cast<std::size_t>(Ho_2);
    max_lengths_host[3] = static_cast<std::size_t>(Wo_2);
    max_lengths_host[4] = static_cast<std::size_t>(K1);

    bias_lengths_host[0] = static_cast<std::size_t>(K0);
    bias_lengths_host[1] = static_cast<std::size_t>(K1);

    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<out_data_t> bias(bias_lengths_host);
    Tensor<out_data_t> out_device(out_lengths_host);
    Tensor<out_data_t> out_host(out_lengths_host);
    Tensor<in_data_t> max_device(max_lengths_host);
    Tensor<in_data_t> max_host(max_lengths_host);
    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

    std::size_t num_thread = 1;

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 2:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 3:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 4:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 5:
        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);

        auto gen_wei = [](auto... is) {
            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
        };
        wei.GenerateTensorValue(gen_wei, num_thread);
    }

    bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
    auto f_make_for_device_nchwc = [&]() {
        const auto in_lengths_dev  = make_tuple(N, C0, Hi, Wi, C1);
        const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
        const auto max_lengths_dev = make_tuple(N, K0, Ho_2, Wo_2, K1);
        const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);

        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          max_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

#if USE_CONV_FWD_V5R1_NCHWC
    if(algo == ConvForwardAlgo::V5R1NCHWC)
    {
        const auto tmp = f_make_for_device_nchwc();

        device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<
            in_data_t,
            acc_data_t,
            out_data_t,
            activ_type>(tmp[I0], // in_lengths_dev
                        tmp[I1], // wei_lengths_dev
                        tmp[I2], // max_lengths_dev
                        tmp[I3], // out_lengths_dev
                        tmp[I4], // conv_strides_dev
                        tmp[I5], // conv_dilations_dev
                        tmp[I6], // in_left_pads_dev
                        tmp[I7], // in_right_pads_dev
                        in,
                        wei,
                        bias,
                        out_device,
                        max_device,
                        nrepeat);
    }
#endif
    if(do_verification)
    {
        host_direct_convolution_maxpool_nchwc(in,
                                              wei,
                                              bias,
                                              out_host,
                                              max_host,
                                              make_tuple(conv_stride_h, conv_stride_w),
                                              make_tuple(conv_dilation_h, conv_dilation_w),
                                              make_tuple(in_left_pad_h, in_left_pad_w),
                                              make_tuple(in_right_pad_h, in_right_pad_w),
                                              activ_type);

        ck::utils::check_err(out_device.mData, out_host.mData);
        ck::utils::check_err(max_device.mData, max_host.mData);

        if(do_log)
        {
            // LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
            // LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
            // LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") <<
            // std::endl;

            LogRangeAsType<float>(std::cout << "max_host: ", max_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "max_device: ", max_device.mData, ",") << std::endl;
        }
    }
}
library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
deleted
100644 → 0
View file @
b7fa6bb1
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_atomic_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r5_xdlops_atomic_nhwc_kyxc_nhwk.hpp"
enum ConvTensorLayout
{
    NCHW,
    NHWC,
    CHWN,
    NCHWc,
    NHWCc
};

#define USE_DYNAMIC_MODE 1
#define USE_CONV_WRW_V4R4R2_XDL_NCHW 0
#define USE_CONV_WRW_V4R4R4_XDL_NHWC 0
#define USE_CONV_WRW_V4R4R2_XDL_ATOMIC_NCHW 0
#define USE_CONV_WRW_V4R4R4_XDL_ATOMIC_NHWC 0
#define USE_CONV_WRW_V4R4R5_XDL_ATOMIC_NHWC 1

enum ConvBackwardWeightAlgo
{
    V4R4R2XDLNCHW,       // 0
    V4R4R4XDLNHWC,       // 1
    V4R4R2XDLATOMICNCHW, // 2
    V4R4R4XDLATOMICNHWC, // 3
    V4R4R5XDLATOMICNHWC, // 4
};
template <typename TOut,
          typename TIn,
          typename TWei,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_convolution_backward_weight(const Tensor<TOut>& out,
                                      const Tensor<TIn>& in,
                                      Tensor<TWei>& wei,
                                      const ConvStrides& conv_strides,
                                      const ConvDilations& conv_dilations,
                                      const InLeftPads& in_left_pads,
                                      const InRightPads&,
                                      const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
        double v = 0;

        for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
        {
            for(int ho = 0; ho < out.mDesc.GetLengths()[2]; ++ho)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int wo = 0; wo < out.mDesc.GetLengths()[3]; ++wo)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[3])
                    {
                        v += static_cast<const double>(in(n, c, hi, wi)) *
                             static_cast<const double>(out(n, k, ho, wo));
                    }
                }
            }
        }

        wei(k, c, y, x) = v;
    };

    auto f_kyxc = [&](auto k, auto y, auto x, auto c) {
        double v = 0;

        for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
        {
            for(int ho = 0; ho < out.mDesc.GetLengths()[1]; ++ho)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int wo = 0; wo < out.mDesc.GetLengths()[2]; ++wo)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[2])
                    {
                        v += static_cast<const double>(in(n, hi, wi, c)) *
                             static_cast<const double>(out(n, ho, wo, k));
                    }
                }
            }
        }

        wei(k, y, x, c) = v;
    };

    if(layout == ConvTensorLayout::NCHW)
    {
        make_ParallelTensorFunctor(f_kcyx,
                                   wei.mDesc.GetLengths()[0],
                                   wei.mDesc.GetLengths()[1],
                                   wei.mDesc.GetLengths()[2],
                                   wei.mDesc.GetLengths()[3])(
            std::thread::hardware_concurrency());
    }
    else if(layout == ConvTensorLayout::NHWC)
    {
        make_ParallelTensorFunctor(f_kyxc,
                                   wei.mDesc.GetLengths()[0],
                                   wei.mDesc.GetLengths()[1],
                                   wei.mDesc.GetLengths()[2],
                                   wei.mDesc.GetLengths()[3])(
            std::thread::hardware_concurrency());
    }
    else
    {
        throw std::runtime_error("wrong! not supported layout");
    }
}
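// Aside: in words, the reference above computes each weight-gradient entry as
// a reduction over the batch and all output positions,
//     wei(k, c, y, x) = sum over n, ho, wo of in(n, c, hi, wi) * out(n, k, ho, wo),
// with hi = ho * stride + y * dilation - pad (and likewise wi). A minimal
// standalone check of one entry, reduced to 1-D for brevity -- illustrative
// only (fenced with #if 0, build separately):
#if 0
#include <cstdio>

int main()
{
    const int Wi = 5, Wo = 3, stride = 1, dilation = 1, pad = 0, x = 1;
    const double in[Wi]  = {1, 2, 3, 4, 5};
    const double out[Wo] = {1, 1, 1};

    double v = 0;
    for(int wo = 0; wo < Wo; ++wo)
    {
        const int wi = wo * stride + x * dilation - pad;
        if(wi >= 0 && wi < Wi)
            v += in[wi] * out[wo];
    }
    std::printf("wei[x = 1] = %g\n", v); // 2 + 3 + 4 = 9
    return 0;
}
#endif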
int main(int argc, char* argv[])
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    constexpr auto I6 = Number<6>{};

#if USE_DYNAMIC_MODE
    // dynamic mode
    if(argc != 23)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
        printf("additional: desired_grid_size\n");
        exit(1);
    }

    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
    const ConvBackwardWeightAlgo algo =
        static_cast<ConvBackwardWeightAlgo>(std::stoi(argv[2]));
    const bool do_verification = std::stoi(argv[3]);
    const int init_method      = std::stoi(argv[4]);
    const bool do_log          = std::stoi(argv[5]);
    const int nrepeat          = std::stoi(argv[6]);

    const index_t N  = std::stoi(argv[7]);
    const index_t K  = std::stoi(argv[8]);
    const index_t C  = std::stoi(argv[9]);
    const index_t Y  = std::stoi(argv[10]);
    const index_t X  = std::stoi(argv[11]);
    const index_t Hi = std::stoi(argv[12]);
    const index_t Wi = std::stoi(argv[13]);

    const index_t conv_stride_h   = std::stoi(argv[14]);
    const index_t conv_stride_w   = std::stoi(argv[15]);
    const index_t conv_dilation_h = std::stoi(argv[16]);
    const index_t conv_dilation_w = std::stoi(argv[17]);
    const index_t in_left_pad_h   = std::stoi(argv[18]);
    const index_t in_left_pad_w   = std::stoi(argv[19]);
    const index_t in_right_pad_h  = std::stoi(argv[20]);
    const index_t in_right_pad_w  = std::stoi(argv[21]);

    const index_t desired_grid_size = std::stoi(argv[22]);

    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const index_t XEff = (X - 1) * conv_dilation_w + 1;

    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
#else
    // static mode
    if(argc < 7)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        exit(1);
    }

    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
    const ConvBackwardWeightAlgo algo =
        static_cast<ConvBackwardWeightAlgo>(std::stoi(argv[2]));
    const bool do_verification = std::stoi(argv[3]);
    const int init_method      = std::stoi(argv[4]);
    const bool do_log          = std::stoi(argv[5]);
    const int nrepeat          = std::stoi(argv[6]);

    constexpr auto N  = Number<128>{};
    constexpr auto C  = Number<128>{};
    constexpr auto Hi = Number<14>{};
    constexpr auto Wi = Number<14>{};
    constexpr auto K  = Number<256>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};

    constexpr auto conv_stride_h   = I1;
    constexpr auto conv_stride_w   = I1;
    constexpr auto conv_dilation_h = I1;
    constexpr auto conv_dilation_w = I1;

    constexpr auto in_left_pad_h  = I1;
    constexpr auto in_left_pad_w  = I1;
    constexpr auto in_right_pad_h = I1;
    constexpr auto in_right_pad_w = I1;

    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;

    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
#endif
#if 0
    using in_data_t  = float;
    using wei_data_t = float;
    using acc_data_t = float;
    using out_data_t = float;
#elif 1
    using in_data_t  = half_t;
    using out_data_t = half_t;
    using acc_data_t = float;
    using wei_data_t = float;
#elif 1
    using in_data_t  = int8_t;
    using out_data_t = int8_t;
    using acc_data_t = int32_t;
    using wei_data_t = int8_t;
#endif

    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);

    if(layout == ConvTensorLayout::NCHW)
    {
        in_lengths_host[0] = static_cast<std::size_t>(N);
        in_lengths_host[1] = static_cast<std::size_t>(C);
        in_lengths_host[2] = static_cast<std::size_t>(Hi);
        in_lengths_host[3] = static_cast<std::size_t>(Wi);

        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(C);
        wei_lengths_host[2] = static_cast<std::size_t>(Y);
        wei_lengths_host[3] = static_cast<std::size_t>(X);

        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(K);
        out_lengths_host[2] = static_cast<std::size_t>(Ho);
        out_lengths_host[3] = static_cast<std::size_t>(Wo);
    }
    else if(layout == ConvTensorLayout::NHWC)
    {
        in_lengths_host[0] = static_cast<std::size_t>(N);
        in_lengths_host[1] = static_cast<std::size_t>(Hi);
        in_lengths_host[2] = static_cast<std::size_t>(Wi);
        in_lengths_host[3] = static_cast<std::size_t>(C);

        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(Y);
        wei_lengths_host[2] = static_cast<std::size_t>(X);
        wei_lengths_host[3] = static_cast<std::size_t>(C);

        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(Ho);
        out_lengths_host[2] = static_cast<std::size_t>(Wo);
        out_lengths_host[3] = static_cast<std::size_t>(K);
    }
    else
    {
        throw std::runtime_error("wrong! not implemented");
    }
    Tensor<in_data_t> in(in_lengths_host);
    Tensor<wei_data_t> wei_device(wei_lengths_host);
    Tensor<wei_data_t> wei_host(wei_lengths_host);
    Tensor<out_data_t> out(out_lengths_host);

    std::cout << "layout: " << layout << std::endl;
    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei_host.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(out.mDesc, std::cout << "out: ");
    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

    std::size_t num_thread = 1;

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
        out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
        break;
    case 2:
        in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
        break;
    case 3:
        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
        out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
        break;
    case 4:
        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
        break;
    case 5:
        in.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.1, 0.1}, num_thread);
        out.GenerateTensorValue(GeneratorTensor_3<out_data_t>{-0.1, 0.1}, num_thread);
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{1, 5}, num_thread);

        auto gen_out = [](auto... is) {
            return GeneratorTensor_2<out_data_t>{1, 5}(is...) *
                   GeneratorTensor_Checkboard{}(is...);
        };
        out.GenerateTensorValue(gen_out, num_thread);
    }
    auto f_make_for_device_nchw = [&]() {
        const auto in_lengths_dev  = make_tuple(N, C, Hi, Wi);
        const auto wei_lengths_dev = make_tuple(K, C, Y, X);
        const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);

        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

    auto f_make_for_device_nhwc = [&]() {
        const auto in_lengths_dev  = make_tuple(N, Hi, Wi, C);
        const auto wei_lengths_dev = make_tuple(K, Y, X, C);
        const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);

        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

    // set zero to wei_device
    wei_device.GenerateTensorValue(GeneratorTensor_0{}, num_thread);
#if USE_CONV_WRW_V4R4R2_XDL_NCHW
    if(algo == ConvBackwardWeightAlgo::V4R4R2XDLNCHW)
    {
        if(layout != ConvTensorLayout::NCHW)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nchw();

        device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<
            in_data_t,
            wei_data_t,
            acc_data_t,
            out_data_t>(tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
                        in, wei_device, out, nrepeat);
    }
#endif

#if USE_CONV_WRW_V4R4R4_XDL_NHWC
    if(algo == ConvBackwardWeightAlgo::V4R4R4XDLNHWC)
    {
        if(layout != ConvTensorLayout::NHWC)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nhwc();

        device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<
            in_data_t,
            wei_data_t,
            acc_data_t,
            out_data_t>(tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
                        in, wei_device, out, nrepeat);
    }
#endif

#if USE_CONV_WRW_V4R4R2_XDL_ATOMIC_NCHW
    if(algo == ConvBackwardWeightAlgo::V4R4R2XDLATOMICNCHW)
    {
        if(layout != ConvTensorLayout::NCHW)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nchw();

        device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_atomic_nchw_kcyx_nkhw<
            in_data_t,
            wei_data_t,
            acc_data_t,
            out_data_t>(tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
                        in, wei_device, out, desired_grid_size, nrepeat);
    }
#endif

#if USE_CONV_WRW_V4R4R4_XDL_ATOMIC_NHWC
    if(algo == ConvBackwardWeightAlgo::V4R4R4XDLATOMICNHWC)
    {
        if(layout != ConvTensorLayout::NHWC)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nhwc();

        device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk<
            in_data_t,
            wei_data_t,
            acc_data_t,
            out_data_t>(tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
                        in, wei_device, out, desired_grid_size, nrepeat);
    }
#endif

#if USE_CONV_WRW_V4R4R5_XDL_ATOMIC_NHWC
    if(algo == ConvBackwardWeightAlgo::V4R4R5XDLATOMICNHWC)
    {
        if(layout != ConvTensorLayout::NHWC)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nhwc();

        device_convolution_backward_weight_implicit_gemm_v4r4r5_xdlops_atomic_nhwc_kyxc_nhwk<
            in_data_t,
            wei_data_t,
            acc_data_t,
            out_data_t>(tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
                        in, wei_device, out, desired_grid_size, nrepeat);
    }
#endif
    if(do_verification)
    {
        host_convolution_backward_weight(out,
                                         in,
                                         wei_host,
                                         make_tuple(conv_stride_h, conv_stride_w),
                                         make_tuple(conv_dilation_h, conv_dilation_w),
                                         make_tuple(in_left_pad_h, in_left_pad_w),
                                         make_tuple(in_right_pad_h, in_right_pad_w),
                                         layout);

        ck::utils::check_err(wei_device.mData, wei_host.mData);

        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "out: ", out.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "wei_device: ", wei_device.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "wei_host  : ", wei_host.mData, ",") << std::endl;
        }
    }
}
library/src/obselete_driver_offline/gemm_driver_offline.cpp
deleted
100644 → 0
View file @
b7fa6bb1
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_xdlops_mk_kn_mn.hpp"
#include "device_gemm_xdlops_mk_nk_mn.hpp"
#include "device_gemm_xdlops_km_kn_mn.hpp"
#include "device_gemm_xdlops_km_nk_mn.hpp"
#include "device_gemm_xdlops_mk_kn_nm.hpp"
#include "device_gemm_xdlops_mk_nk_nm.hpp"
#include "device_gemm_xdlops_km_kn_nm.hpp"
#include "device_gemm_xdlops_km_nk_nm.hpp"
#define USE_GEMM_XDL_MK_KN_MN 1
#define USE_GEMM_XDL_MK_NK_MN 1
#define USE_GEMM_XDL_KM_KN_MN 1
#define USE_GEMM_XDL_KM_NK_MN 1
#define USE_GEMM_XDL_MK_KN_NM 0
#define USE_GEMM_XDL_MK_NK_NM 0
#define USE_GEMM_XDL_KM_KN_NM 0
#define USE_GEMM_XDL_KM_NK_NM 0
enum struct GemmMatrixLayout
{
    MK_KN_MN, // 0
    MK_NK_MN, // 1
    KM_KN_MN, // 2
    KM_NK_MN, // 3
    MK_KN_NM, // 4
    MK_NK_NM, // 5
    KM_KN_NM, // 6
    KM_NK_NM  // 7
};

enum struct GemmAlgo
{
    Xdl_MK_KN_MN, // 0
    Xdl_MK_NK_MN, // 1
    Xdl_KM_KN_MN, // 2
    Xdl_KM_NK_MN, // 3
    Xdl_MK_KN_NM, // 4
    Xdl_MK_NK_NM, // 5
    Xdl_KM_KN_NM, // 6
    Xdl_KM_NK_NM, // 7
};
template <typename AType, typename BType, typename CType>
void host_gemm(const Tensor<AType>& a,
               const Tensor<BType>& b,
               Tensor<CType>& c,
               const GemmMatrixLayout layout)
{
    if(layout == GemmMatrixLayout::MK_KN_MN)
    {
        auto f_mk_kn_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[1];

            double v = 0;

            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
            }

            c(m, n) = v;
        };

        make_ParallelTensorFunctor(f_mk_kn_mn,
                                   c.mDesc.GetLengths()[0],
                                   c.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::MK_NK_MN)
    {
        auto f_mk_nk_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[1];

            double v = 0;

            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
            }

            c(m, n) = v;
        };

        make_ParallelTensorFunctor(f_mk_nk_mn,
                                   c.mDesc.GetLengths()[0],
                                   c.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_KN_MN)
    {
        auto f_km_kn_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[0];

            double v = 0;

            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
            }

            c(m, n) = v;
        };

        make_ParallelTensorFunctor(f_km_kn_mn,
                                   c.mDesc.GetLengths()[0],
                                   c.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_NK_MN)
    {
        auto f_km_nk_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[0];

            double v = 0;

            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
            }

            c(m, n) = v;
        };

        make_ParallelTensorFunctor(f_km_nk_mn,
                                   c.mDesc.GetLengths()[0],
                                   c.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::MK_KN_NM)
    {
        auto f_mk_kn_nm = [&](auto n, auto m) {
            const int K = a.mDesc.GetLengths()[1];

            double v = 0;

            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
            }

            c(n, m) = v;
        };

        make_ParallelTensorFunctor(f_mk_kn_nm,
                                   c.mDesc.GetLengths()[0],
                                   c.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::MK_NK_NM)
    {
        auto f_mk_nk_nm = [&](auto n, auto m) {
            const int K = a.mDesc.GetLengths()[1];

            double v = 0;

            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
            }

            c(n, m) = v;
        };

        make_ParallelTensorFunctor(f_mk_nk_nm,
                                   c.mDesc.GetLengths()[0],
                                   c.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_KN_NM)
    {
        auto f_km_kn_nm = [&](auto n, auto m) {
            const int K = a.mDesc.GetLengths()[0];

            double v = 0;

            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
            }

            c(n, m) = v;
        };

        make_ParallelTensorFunctor(f_km_kn_nm,
                                   c.mDesc.GetLengths()[0],
                                   c.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_NK_NM)
    {
        auto f_km_nk_nm = [&](auto n, auto m) {
            const int K = a.mDesc.GetLengths()[0];

            double v = 0;

            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
            }

            c(n, m) = v;
        };

        make_ParallelTensorFunctor(f_km_nk_nm,
                                   c.mDesc.GetLengths()[0],
                                   c.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
    }
    else
    {
        throw std::runtime_error("wrong! not supported layout");
    }
}
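// Aside: all eight branches above compute the same C = A * B contraction; the
// layout enum only fixes which operand index is the contraction (K) axis and
// whether C is stored MN or NM. For instance, MK_NK_MN is row-major A times a
// row-major B indexed as (n, k), i.e. effectively A * B^T. A standalone sketch
// follows -- illustrative only (fenced with #if 0, build separately):
#if 0
#include <cstdio>

int main()
{
    const int M = 2, N = 2, K = 3;
    const double a[M * K] = {1, 2, 3, 4, 5, 6}; // A is M x K, row-major
    const double b[N * K] = {1, 0, 1, 0, 1, 0}; // B is N x K, so b(n, k) = B^T(k, n)
    double c[M * N]       = {};

    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
            for(int k = 0; k < K; ++k)
                c[m * N + n] += a[m * K + k] * b[n * K + k];

    std::printf("%g %g\n%g %g\n", c[0], c[1], c[2], c[3]); // 4 2 / 10 5
    return 0;
}
#endif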
int main(int argc, char* argv[])
{
    using namespace ck;

    if(argc != 12)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        printf("rest: M, N, K\n");
        printf("debug_driver_gemm_xdlops_v2r3::M01, debug_driver_gemm_xdlops_v2r3::N01\n");
        exit(1);
    }

    const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[1]));
    const auto algo   = static_cast<GemmAlgo>(std::stoi(argv[2]));

    const bool do_verification = std::stoi(argv[3]);
    const int init_method      = std::stoi(argv[4]);
    const bool do_log          = std::stoi(argv[5]);
    const int nrepeat          = std::stoi(argv[6]);

    const index_t M = std::stoi(argv[7]);
    const index_t N = std::stoi(argv[8]);
    const index_t K = std::stoi(argv[9]);

    debug::debug_driver_gemm_xdlops_v2r3::M01 = std::stoi(argv[10]);
    debug::debug_driver_gemm_xdlops_v2r3::N01 = std::stoi(argv[11]);

#if 0
    using ab_data_t  = float;
    using acc_data_t = float;
    using c_data_t   = float;
#elif 1
    using ab_data_t  = half_t;
    using acc_data_t = float;
    using c_data_t   = half_t;
#elif 1
    using ab_data_t  = int8_t;
    using acc_data_t = int32_t;
    using c_data_t   = int8_t;
#endif
    std::vector<std::size_t> a_lengths_host(2), b_lengths_host(2), c_lengths_host(2);
    std::vector<std::size_t> a_strides_host(2), b_strides_host(2), c_strides_host(2);

    // A
    if(layout == GemmMatrixLayout::MK_KN_MN || layout == GemmMatrixLayout::MK_NK_MN ||
       layout == GemmMatrixLayout::MK_KN_NM || layout == GemmMatrixLayout::MK_NK_NM)
    {
        a_lengths_host[0] = static_cast<std::size_t>(M);
        a_lengths_host[1] = static_cast<std::size_t>(K);
        a_strides_host[0] = static_cast<std::size_t>(K);
        a_strides_host[1] = static_cast<std::size_t>(1);
    }
    else
    {
        a_lengths_host[0] = static_cast<std::size_t>(K);
        a_lengths_host[1] = static_cast<std::size_t>(M);
        a_strides_host[0] = static_cast<std::size_t>(M);
        a_strides_host[1] = static_cast<std::size_t>(1);
    }

    // B
    if(layout == GemmMatrixLayout::MK_NK_MN || layout == GemmMatrixLayout::KM_NK_MN ||
       layout == GemmMatrixLayout::MK_NK_NM || layout == GemmMatrixLayout::KM_NK_NM)
    {
        b_lengths_host[0] = static_cast<std::size_t>(N);
        b_lengths_host[1] = static_cast<std::size_t>(K);
        b_strides_host[0] = static_cast<std::size_t>(K);
        b_strides_host[1] = static_cast<std::size_t>(1);
    }
    else
    {
        b_lengths_host[0] = static_cast<std::size_t>(K);
        b_lengths_host[1] = static_cast<std::size_t>(N);
        b_strides_host[0] = static_cast<std::size_t>(N);
        b_strides_host[1] = static_cast<std::size_t>(1);
    }

    // C
    if(layout == GemmMatrixLayout::MK_KN_MN || layout == GemmMatrixLayout::KM_KN_MN ||
       layout == GemmMatrixLayout::MK_NK_MN || layout == GemmMatrixLayout::KM_NK_MN)
    {
        c_lengths_host[0] = static_cast<std::size_t>(M);
        c_lengths_host[1] = static_cast<std::size_t>(N);
        c_strides_host[0] = static_cast<std::size_t>(N);
        c_strides_host[1] = static_cast<std::size_t>(1);
    }
    else
    {
        c_lengths_host[0] = static_cast<std::size_t>(N);
        c_lengths_host[1] = static_cast<std::size_t>(M);
        c_strides_host[0] = static_cast<std::size_t>(M);
        c_strides_host[1] = static_cast<std::size_t>(1);
    }
    Tensor<ab_data_t> a(a_lengths_host, a_strides_host);
    Tensor<ab_data_t> b(b_lengths_host, b_strides_host);
    Tensor<c_data_t> c_host(c_lengths_host, c_strides_host);
    Tensor<c_data_t> c_device(c_lengths_host, c_strides_host);

    std::cout << "layout: " << layout << std::endl;
    ostream_HostTensorDescriptor(a.mDesc, std::cout << "a: ");
    ostream_HostTensorDescriptor(b.mDesc, std::cout << "b: ");
    ostream_HostTensorDescriptor(c_host.mDesc, std::cout << "c: ");

    std::size_t num_thread = 1;

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        a.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
        b.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
        break;
    case 2:
        a.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
        b.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
        break;
    case 3:
        a.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
        b.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
        break;
    case 4:
        a.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
        b.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
        break;
    default:
        a.GenerateTensorValue(GeneratorTensor_3<ab_data_t>{0.0, 1.0}, num_thread);
        b.GenerateTensorValue(GeneratorTensor_3<ab_data_t>{-0.5, 0.5}, num_thread);
    }
#if USE_GEMM_XDL_MK_KN_MN
    if(algo == GemmAlgo::Xdl_MK_KN_MN)
    {
        if(layout != GemmMatrixLayout::MK_KN_MN)
        {
            throw std::runtime_error("wrong! layout");
        }

        device_gemm_xdlops_mk_kn_mn<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
    }
#endif

#if USE_GEMM_XDL_MK_NK_MN
    if(algo == GemmAlgo::Xdl_MK_NK_MN)
    {
        if(layout != GemmMatrixLayout::MK_NK_MN)
        {
            throw std::runtime_error("wrong! layout");
        }

        device_gemm_xdlops_mk_nk_mn<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
    }
#endif

#if USE_GEMM_XDL_KM_KN_MN
    if(algo == GemmAlgo::Xdl_KM_KN_MN)
    {
        if(layout != GemmMatrixLayout::KM_KN_MN)
        {
            throw std::runtime_error("wrong! layout");
        }

        device_gemm_xdlops_km_kn_mn<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
    }
#endif

#if USE_GEMM_XDL_KM_NK_MN
    if(algo == GemmAlgo::Xdl_KM_NK_MN)
    {
        if(layout != GemmMatrixLayout::KM_NK_MN)
        {
            throw std::runtime_error("wrong! layout");
        }

        device_gemm_xdlops_km_nk_mn<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
    }
#endif

#if USE_GEMM_XDL_MK_KN_NM
    if(algo == GemmAlgo::Xdl_MK_KN_NM)
    {
        if(layout != GemmMatrixLayout::MK_KN_NM)
        {
            throw std::runtime_error("wrong! layout");
        }

        device_gemm_xdlops_mk_kn_nm<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
    }
#endif

#if USE_GEMM_XDL_MK_NK_NM
    if(algo == GemmAlgo::Xdl_MK_NK_NM)
    {
        if(layout != GemmMatrixLayout::MK_NK_NM)
        {
            throw std::runtime_error("wrong! layout");
        }

        device_gemm_xdlops_mk_nk_nm<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
    }
#endif

#if USE_GEMM_XDL_KM_KN_NM
    if(algo == GemmAlgo::Xdl_KM_KN_NM)
    {
        if(layout != GemmMatrixLayout::KM_KN_NM)
        {
            throw std::runtime_error("wrong! layout");
        }

        device_gemm_xdlops_km_kn_nm<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
    }
#endif

#if USE_GEMM_XDL_KM_NK_NM
    if(algo == GemmAlgo::Xdl_KM_NK_NM)
    {
        if(layout != GemmMatrixLayout::KM_NK_NM)
        {
            throw std::runtime_error("wrong! layout");
        }

        device_gemm_xdlops_km_nk_nm<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
    }
#endif
    if(do_verification)
    {
        host_gemm(a, b, c_host, layout);

        ck::utils::check_err(c_device.mData, c_host.mData);

        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "c_host : ", c_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "c_device: ", c_device.mData, ",") << std::endl;
        }
    }
}
library/src/tensor_operation_instance/gpu/CMakeLists.txt
View file @
5d015452
include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/include/ck
    ${PROJECT_SOURCE_DIR}/include/ck/utility
    ${PROJECT_SOURCE_DIR}/include/ck/host_utility
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_description
    ${PROJECT_SOURCE_DIR}/include/ck/tensor
    ${PROJECT_SOURCE_DIR}/include/ck/problem_transform
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/grid
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/block
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
    ${PROJECT_SOURCE_DIR}/external/include/half
)

function(add_instance_library INSTANCE_NAME)
    message("adding instance ${INSTANCE_NAME}")
    add_library(${INSTANCE_NAME} OBJECT ${ARGN})
...
...
@@ -26,43 +6,50 @@ function(add_instance_library INSTANCE_NAME)
endfunction(add_instance_library INSTANCE_NAME)

add_subdirectory(gemm)
add_subdirectory(gemm_bias2d)
add_subdirectory(gemm_bias_relu)
add_subdirectory(gemm_bias_relu_add)
add_subdirectory(gemm_splitk)
add_subdirectory(gemm_bilinear)
add_subdirectory(gemm_add_add_fastgelu)
add_subdirectory(gemm_reduce)
add_subdirectory(gemm_bias_add_reduce)
add_subdirectory(batched_gemm)
add_subdirectory(batched_gemm_reduce)
add_subdirectory(grouped_gemm)
add_subdirectory(contraction_scale)
add_subdirectory(contraction_bilinear)
add_subdirectory(conv1d_fwd)
add_subdirectory(conv2d_fwd)
add_subdirectory(conv3d_fwd)
add_subdirectory(conv2d_fwd_bias_relu)
add_subdirectory(conv2d_fwd_bias_relu_add)
add_subdirectory(conv2d_fwd_bias_relu_atomic_add)
add_subdirectory(conv2d_bwd_data)
add_subdirectory(reduce)
add_subdirectory(convnd_bwd_data)
add_subdirectory(conv2d_bwd_weight)
add_subdirectory(normalization)
add_subdirectory(elementwise)

add_library(device_operations STATIC
    $<TARGET_OBJECTS:device_conv1d_fwd_instance>
    $<TARGET_OBJECTS:device_gemm_instance>
    $<TARGET_OBJECTS:device_gemm_splitk_instance>
    $<TARGET_OBJECTS:device_gemm_bilinear_instance>
    $<TARGET_OBJECTS:device_gemm_add_add_fastgelu_instance>
    $<TARGET_OBJECTS:device_gemm_bias_add_reduce_instance>
    $<TARGET_OBJECTS:device_batched_gemm_instance>
    $<TARGET_OBJECTS:device_conv2d_bwd_data_instance>
    $<TARGET_OBJECTS:device_batched_gemm_reduce_instance>
    $<TARGET_OBJECTS:device_grouped_gemm_instance>
    $<TARGET_OBJECTS:device_contraction_scale_instance>
    $<TARGET_OBJECTS:device_contraction_bilinear_instance>
    $<TARGET_OBJECTS:device_conv2d_fwd_instance>
    $<TARGET_OBJECTS:device_conv3d_fwd_instance>
    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_instance>
    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_add_instance>
    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_atomic_add_instance>
    $<TARGET_OBJECTS:device_gemm_bias_relu_instance>
    $<TARGET_OBJECTS:device_gemm_bias_relu_add_instance>
    $<TARGET_OBJECTS:device_gemm_bias2d_instance>
    $<TARGET_OBJECTS:device_reduce_instance>
    $<TARGET_OBJECTS:device_convnd_bwd_data_instance>
    $<TARGET_OBJECTS:device_conv2d_bwd_weight_instance>
    $<TARGET_OBJECTS:device_normalization_instance>
    $<TARGET_OBJECTS:device_elementwise_instance>
    device_conv2d.cpp
)
add_library(composablekernels::device_operations ALIAS device_operations)
...
...
@@ -70,8 +57,8 @@ add_library(composablekernels::device_operations ALIAS device_operations)
set(DEV_OPS_INC_DIRS
    ${PROJECT_SOURCE_DIR}/include/ck/
    ${PROJECT_SOURCE_DIR}/library/include/ck/
    ${PROJECT_SOURCE_DIR}/external/include/)

target_compile_features(device_operations PUBLIC)
set_target_properties(device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(device_operations PUBLIC
...
...
@@ -87,28 +74,24 @@ target_include_directories(device_operations PUBLIC
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/thread>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/element>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/half>
)
# Once new arches are enabled, make this an option on the main CMake file
# and pass it down here to be exported.
target_compile_options(device_operations PRIVATE
    --offload-arch=gfx908
    --offload-arch=gfx90a
)

# install(TARGETS device_operations LIBRARY DESTINATION lib)
rocm_install(TARGETS device_operations
    EXPORT device_operationsTargets
)
rocm_install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
rocm_install(EXPORT device_operationsTargets
    FILE composable_kerneldevice_operationsTargets.cmake
    NAMESPACE composable_kernel::
    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)
...
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using BF16 = ck::bhalf_t;
using F32  = float;
...
@@ -23,10 +29,10 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
using device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances = std::tuple<
    // clang-format off
    //##################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
    //##################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
    //##################| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
    //##################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
    DeviceBatchedGemmXdl<BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>,
    DeviceBatchedGemmXdl<BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>,
    DeviceBatchedGemmXdl<BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>,
...
@@ -39,13 +45,15 @@ using device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances = std::tuple<
    >;

void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemm<Col,
                                                  Row,
                                                  Row,
                                                  BF16,
                                                  BF16,
                                                  BF16,
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances, device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
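For orientation, here is the first tuple entry from the gkm_gkn file above, re-printed with each parameter group labeled against the column header. The groupings are editorial annotations added here, not text from the commit:

    // First gkm_gkn instance, annotated against the table header above.
    DeviceBatchedGemmXdl<
        BF16, BF16, BF16, F32,                 // AData, BData, CData, AccData types
        Col, Row, Row,                         // ALayout, BLayout, CLayout
        PassThrough, PassThrough, PassThrough, // A, B, C elementwise operations
        256,                                   // BlockSize (threads per workgroup)
        256, 128, 4, 8,                        // MPerBlock, NPerBlock, K0PerBlock, K1
        32, 32, 4, 2,                          // MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave
        S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>,   // A: ThreadClusterLengths_K0_M_K1, ThreadClusterArrangeOrder, SrcAccessOrder
        1, 4, 8, true,                         // A: SrcVectorDim, SrcScalarPerVector, DstScalarPerVector_K1, ABlockLds AddExtraM
        S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>,   // B: ThreadClusterLengths_K0_N_K1, ThreadClusterArrangeOrder, SrcAccessOrder
        1, 2, 8, true,                         // B: SrcVectorDim, SrcScalarPerVector, DstScalarPerVector_K1, BBlockLds AddExtraN
        7, 1>                                  // CThreadTransfer: SrcDstVectorDim, DstScalarPerVector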
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using BF16 = ck::bhalf_t;
using F32  = float;
...
@@ -39,13 +44,15 @@ using device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances = std::tuple<
    >;

void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemm<Col,
                                                  Col,
                                                  Row,
                                                  BF16,
                                                  BF16,
                                                  BF16,
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances, device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
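Every one of these registration functions ends in the same one-liner: add_device_operation_instances(instances, <tuple-type>{}). Below is a minimal sketch of what a helper with that shape has to do, offered as an illustration under the assumption that the shipped helper in add_device_operation_instance.hpp behaves equivalently; it is not a copy of it:

    #include <memory>
    #include <tuple>
    #include <vector>

    // Illustrative helper: default-construct each concrete op type named in the
    // tuple and hand it to the caller behind the polymorphic base-class pointer.
    template <typename BaseOp, typename... Ops>
    void add_device_operation_instances(std::vector<std::unique_ptr<BaseOp>>& instances,
                                        const std::tuple<Ops...>&)
    {
        // C++17 fold expression: one push_back per tuple element type.
        (instances.push_back(std::make_unique<Ops>()), ...);
    }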
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using BF16 = ck::bhalf_t;
using F32  = float;
...
@@ -43,13 +48,15 @@ using device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances = std::tuple<
    >;

void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemm<Row,
                                                  Row,
                                                  Row,
                                                  BF16,
                                                  BF16,
                                                  BF16,
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances, device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
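The bf16 variants differ only in the layout triple passed to DeviceBatchedGemm; the gkm/gkn/gmn-style suffix in each file name encodes it. A sketch of the convention, assuming the usual Row/Col aliases from tensor_layout.hpp:

    #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"

    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;

    // Suffix -> layout (the batch dimension g is always outermost):
    //   a[g, k, m] ("gkm") -> A is Col;   a[g, m, k] ("gmk") -> A is Row
    //   b[g, k, n] ("gkn") -> B is Row;   b[g, n, k] ("gnk") -> B is Col
    //   c[g, m, n] ("gmn") -> C is Row
    using ALayout = Row; // this file: gmk
    using BLayout = Row; // this file: gkn
    using CLayout = Row; // this file: gmn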
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using BF16 = ck::bhalf_t;
using F32  = float;
...
@@ -44,13 +49,15 @@ using device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances = std::tuple<
    >;

void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemm<Row,
                                                  Col,
                                                  Row,
                                                  BF16,
                                                  BF16,
                                                  BF16,
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances, device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
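A hypothetical consumer of the factory just defined (gmk_gnk, i.e. Row x Col -> Row). This sketch is not part of the commit; it assumes the headers used by the instance files make DeviceBatchedGemm and the layout/elementwise types visible. A real caller would go on to build arguments, filter with IsSupportedArgument, and time the survivors:

    #include <iostream>
    #include <memory>
    #include <vector>

    #include "ck/ck.hpp"
    #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
    #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"

    using Row         = ck::tensor_layout::gemm::RowMajor;
    using Col         = ck::tensor_layout::gemm::ColumnMajor;
    using BF16        = ck::bhalf_t;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    // Forward declaration of the factory (normally provided by the instance
    // library; repeated here so the sketch is self-contained).
    namespace ck::tensor_operation::device::instance {
    void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(
        std::vector<std::unique_ptr<DeviceBatchedGemm<Row, Col, Row, BF16, BF16, BF16,
                                                      PassThrough, PassThrough, PassThrough>>>&);
    }

    int main()
    {
        using Base = ck::tensor_operation::device::DeviceBatchedGemm<
            Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>;

        std::vector<std::unique_ptr<Base>> instances;
        ck::tensor_operation::device::instance::
            add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(instances);

        // Each entry is one tuning configuration of DeviceBatchedGemmXdl.
        std::cout << instances.size() << " candidate kernels registered\n";
        return 0;
    }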
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16 = ck::half_t;
using F32 = float;
...
@@ -39,13 +44,15 @@ using device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances = std::tuple<
    >;

void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemm<Col,
                                                  Row,
                                                  Row,
                                                  F16,
                                                  F16,
                                                  F16,
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances, device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16 = ck::half_t;
using F32 = float;
...
@@ -39,13 +44,15 @@ using device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances = std::tuple<
    >;

void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemm<Col,
                                                  Col,
                                                  Row,
                                                  F16,
                                                  F16,
                                                  F16,
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances, device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16 = ck::half_t;
using F32 = float;
...
@@ -48,13 +53,15 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances = std::tuple<
    >;

void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemm<Row,
                                                  Row,
                                                  Row,
                                                  F16,
                                                  F16,
                                                  F16,
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances, device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16 = ck::half_t;
using F32 = float;
...
@@ -44,13 +49,15 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances = std::tuple<
    >;

void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemm<Row,
                                                  Col,
                                                  Row,
                                                  F16,
                                                  F16,
                                                  F16,
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances, device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16 = ck::half_t;
using F32 = float;
...
@@ -39,13 +44,15 @@ using device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances = std::tuple<
    >;

void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemm<Col,
                                                  Row,
                                                  Row,
                                                  F32,
                                                  F32,
                                                  F32,
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances, device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16 = ck::half_t;
using F32 = float;
...
@@ -39,13 +44,15 @@ using device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances = std::tuple<
    >;

void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemm<Col,
                                                  Col,
                                                  Row,
                                                  F32,
                                                  F32,
                                                  F32,
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances, device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16 = ck::half_t;
using F32 = float;
...
@@ -39,13 +44,15 @@ using device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances = std::tuple<
    >;

void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemm<Row,
                                                  Row,
                                                  Row,
                                                  F32,
                                                  F32,
                                                  F32,
                                                  PassThrough,
                                                  PassThrough,
                                                  PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances, device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck