Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
ae98b52a
"docs/git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "f389848010870356b2426f1fd68bb635050f3b80"
Commit
ae98b52a
authored
Aug 07, 2021
by
Chao Liu
Browse files
remove online compilation from CK
parent
cb954213
Changes
54
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1 addition
and
3622 deletions
+1
-3622
CMakeLists.txt
CMakeLists.txt
+1
-4
cmake/AddKernels.cmake
cmake/AddKernels.cmake
+0
-40
cmake/TargetFlags.cmake
cmake/TargetFlags.cmake
+0
-50
host/CMakeLists.txt
host/CMakeLists.txt
+0
-2
host/driver_online/CMakeLists.txt
host/driver_online/CMakeLists.txt
+0
-22
host/driver_online/conv_fwd_driver_online.cpp
host/driver_online/conv_fwd_driver_online.cpp
+0
-453
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp
+0
-395
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
...tion_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
+0
-386
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp
...tion_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp
+0
-389
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
+0
-183
host/online_compile/CMakeLists.txt
host/online_compile/CMakeLists.txt
+0
-168
host/online_compile/addkernels/CMakeLists.txt
host/online_compile/addkernels/CMakeLists.txt
+0
-30
host/online_compile/addkernels/addkernels.cpp
host/online_compile/addkernels/addkernels.cpp
+0
-264
host/online_compile/addkernels/include_inliner.cpp
host/online_compile/addkernels/include_inliner.cpp
+0
-213
host/online_compile/addkernels/include_inliner.hpp
host/online_compile/addkernels/include_inliner.hpp
+0
-142
host/online_compile/addkernels/source_file_desc.hpp
host/online_compile/addkernels/source_file_desc.hpp
+0
-45
host/online_compile/hip_utility/binary_cache.cpp
host/online_compile/hip_utility/binary_cache.cpp
+0
-112
host/online_compile/hip_utility/exec_utils.cpp
host/online_compile/hip_utility/exec_utils.cpp
+0
-93
host/online_compile/hip_utility/handlehip.cpp
host/online_compile/hip_utility/handlehip.cpp
+0
-285
host/online_compile/hip_utility/hip_build_utils.cpp
host/online_compile/hip_utility/hip_build_utils.cpp
+0
-346
No files found.
CMakeLists.txt
View file @
ae98b52a
cmake_minimum_required
(
VERSION 2.8.3
)
cmake_minimum_required
(
VERSION 2.8.3
)
project
(
modular_convolution
)
project
(
composable_kernel
)
list
(
APPEND CMAKE_MODULE_PATH
"
${
PROJECT_SOURCE_DIR
}
/cmake"
)
list
(
APPEND CMAKE_MODULE_PATH
"
${
PROJECT_SOURCE_DIR
}
/cmake"
)
include
(
TargetFlags
)
include
(
AddKernels
)
## C++
## C++
enable_language
(
CXX
)
enable_language
(
CXX
)
set
(
CMAKE_CXX_STANDARD 17
)
set
(
CMAKE_CXX_STANDARD 17
)
...
...
cmake/AddKernels.cmake
deleted
100644 → 0
View file @
cb954213
function
(
add_kernels SRC_DIR KERNEL_FILES
)
set
(
INIT_KERNELS_LIST
)
set
(
KERNELS_DECLS
)
foreach
(
KERNEL_FILE
${
KERNEL_FILES
}
)
if
(
"
${
CMAKE_VERSION
}
"
VERSION_LESS 3.0
)
configure_file
(
${
KERNEL_FILE
}
${
KERNEL_FILE
}
.delete
)
else
()
set_property
(
DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
${
KERNEL_FILE
}
)
endif
()
get_filename_component
(
BASE_NAME
${
KERNEL_FILE
}
NAME_WE
)
string
(
TOUPPER
"
${
BASE_NAME
}
"
KEY_NAME
)
string
(
MAKE_C_IDENTIFIER
"
${
KEY_NAME
}
"
VAR_NAME
)
string
(
APPEND KERNELS_DECLS
"extern const size_t APP_KERNEL_
${
VAR_NAME
}
_SIZE;
\n
"
)
string
(
APPEND KERNELS_DECLS
"extern const unsigned char APP_KERNEL_
${
VAR_NAME
}
[];
\n
"
)
list
(
APPEND INIT_KERNELS_LIST
" {
\"
${
KEY_NAME
}
\"
, std::string(reinterpret_cast<const char*>(APP_KERNEL_
${
VAR_NAME
}
), APP_KERNEL_
${
VAR_NAME
}
_SIZE) }"
)
endforeach
()
string
(
REPLACE
";"
",
\n
"
INIT_KERNELS
"
${
INIT_KERNELS_LIST
}
"
)
configure_file
(
${
SRC_DIR
}
/kernel.cpp.in
${
PROJECT_BINARY_DIR
}
/kernel.cpp
)
endfunction
()
function
(
add_kernel_includes SRC_DIR KERNEL_FILES
)
set
(
INIT_KERNELS_LIST
)
foreach
(
KERNEL_FILE
${
KERNEL_FILES
}
)
if
(
"
${
CMAKE_VERSION
}
"
VERSION_LESS 3.0
)
configure_file
(
${
KERNEL_FILE
}
${
KERNEL_FILE
}
.delete
)
else
()
set_property
(
DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
${
KERNEL_FILE
}
)
endif
()
get_filename_component
(
BASE_NAME
${
KERNEL_FILE
}
NAME_WE
)
get_filename_component
(
FILE_NAME
${
KERNEL_FILE
}
NAME
)
string
(
TOUPPER
"
${
BASE_NAME
}
"
KEY_NAME
)
string
(
MAKE_C_IDENTIFIER
"
${
KEY_NAME
}
"
VAR_NAME
)
list
(
APPEND INIT_KERNELS_LIST
" {
\"
${
FILE_NAME
}
\"
, std::string(reinterpret_cast<const char*>(
${
VAR_NAME
}
),
${
VAR_NAME
}
_SIZE) }"
)
endforeach
()
string
(
REPLACE
";"
",
\n
"
INIT_KERNELS
"
${
INIT_KERNELS_LIST
}
"
)
configure_file
(
${
SRC_DIR
}
/kernel_includes.cpp.in
${
PROJECT_BINARY_DIR
}
/kernel_includes.cpp
)
endfunction
()
cmake/TargetFlags.cmake
deleted
100644 → 0
View file @
cb954213
function
(
get_target_property2 VAR TARGET PROPERTY
)
get_target_property
(
_pflags
${
TARGET
}
${
PROPERTY
}
)
if
(
_pflags
)
set
(
${
VAR
}
${
_pflags
}
PARENT_SCOPE
)
else
()
set
(
${
VAR
}
""
PARENT_SCOPE
)
endif
()
endfunction
()
macro
(
append_flags FLAGS TARGET PROPERTY PREFIX
)
get_target_property2
(
_pflags
${
TARGET
}
${
PROPERTY
}
)
foreach
(
FLAG
${
_pflags
}
)
if
(
TARGET
${
FLAG
}
)
target_flags
(
_pflags2
${
FLAG
}
)
string
(
APPEND
${
FLAGS
}
"
${
_pflags2
}
"
)
else
()
string
(
APPEND
${
FLAGS
}
"
${
PREFIX
}${
FLAG
}
"
)
endif
()
endforeach
()
endmacro
()
macro
(
append_link_flags FLAGS TARGET PROPERTY
)
get_target_property2
(
_pflags
${
TARGET
}
${
PROPERTY
}
)
foreach
(
FLAG
${
_pflags
}
)
if
(
TARGET
${
FLAG
}
)
target_flags
(
_pflags2
${
FLAG
}
)
string
(
APPEND
${
FLAGS
}
"
${
_pflags2
}
"
)
elseif
(
FLAG MATCHES
"^-.*"
)
string
(
APPEND
${
FLAGS
}
"
${
FLAG
}
"
)
elseif
(
EXISTS
${
FLAG
}
)
string
(
APPEND
${
FLAGS
}
"
${
FLAG
}
"
)
else
()
string
(
APPEND
${
FLAGS
}
" -l
${
FLAG
}
"
)
endif
()
endforeach
()
endmacro
()
function
(
target_flags FLAGS TARGET
)
set
(
_flags
)
append_flags
(
_flags
${
TARGET
}
"INTERFACE_COMPILE_OPTIONS"
""
)
append_flags
(
_flags
${
TARGET
}
"INTERFACE_COMPILE_DEFINITIONS"
"-D"
)
append_flags
(
_flags
${
TARGET
}
"INTERFACE_INCLUDE_DIRECTORIES"
"-isystem "
)
append_flags
(
_flags
${
TARGET
}
"INTERFACE_LINK_DIRECTORIES"
"-L "
)
append_flags
(
_flags
${
TARGET
}
"INTERFACE_LINK_OPTIONS"
""
)
append_link_flags
(
_flags
${
TARGET
}
"INTERFACE_LINK_LIBRARIES"
""
)
# message("_flags: ${_flags}")
set
(
${
FLAGS
}
${
_flags
}
PARENT_SCOPE
)
endfunction
()
host/CMakeLists.txt
View file @
ae98b52a
add_subdirectory
(
host_tensor
)
add_subdirectory
(
host_tensor
)
add_subdirectory
(
online_compile
)
add_subdirectory
(
driver_offline
)
add_subdirectory
(
driver_offline
)
add_subdirectory
(
driver_online
)
host/driver_online/CMakeLists.txt
deleted
100644 → 0
View file @
cb954213
include_directories
(
BEFORE
include
${
PROJECT_BINARY_DIR
}
/host/online_compile/include
${
PROJECT_SOURCE_DIR
}
/host/online_compile/include
${
PROJECT_SOURCE_DIR
}
/host/host_tensor/include
${
PROJECT_SOURCE_DIR
}
/host/solver/include
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/utility
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/tensor_description
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/tensor_operation
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/problem_transform
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/driver
${
PROJECT_SOURCE_DIR
}
/external/rocm/include
${
PROJECT_SOURCE_DIR
}
/external/half/include
)
set
(
CONV_FWD_DRIVER_ONLINE_SOURCE conv_fwd_driver_online.cpp
)
add_executable
(
conv_fwd_driver_online
${
CONV_FWD_DRIVER_ONLINE_SOURCE
}
)
target_link_libraries
(
conv_fwd_driver_online PRIVATE host_tensor
)
target_link_libraries
(
conv_fwd_driver_online PRIVATE online_compile
)
host/driver_online/conv_fwd_driver_online.cpp
deleted
100644 → 0
View file @
cb954213
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "handle.hpp"
#include "hipCheck.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_CONV_FWD_V4R4_NCHW 1
#define USE_CONV_FWD_V6R1_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NHWC 1
enum
ConvForwardAlgo
{
V4R4NCHW
,
// 0
V6R1NCHW
,
// 1
V4R4XDLNCHW
,
// 2
V4R4XDLNHWC
// 3
};
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
using
namespace
ck
::
driver
;
using
size_t
=
std
::
size_t
;
hipStream_t
stream
;
online_compile
::
Handle
*
handle
;
MY_HIP_CHECK
(
hipStreamCreate
(
&
stream
));
handle
=
new
online_compile
::
Handle
(
stream
);
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
if
(
argc
!=
22
)
{
printf
(
"arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat
\n
"
);
printf
(
"rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
\n
"
);
exit
(
1
);
}
const
ConvTensorLayout
layout
=
static_cast
<
ConvTensorLayout
>
(
atoi
(
argv
[
1
]));
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
atoi
(
argv
[
2
]));
const
bool
do_verification
=
atoi
(
argv
[
3
]);
const
int
init_method
=
atoi
(
argv
[
4
]);
const
bool
do_log
=
atoi
(
argv
[
5
]);
const
int
nrepeat
=
atoi
(
argv
[
6
]);
const
index_t
N
=
atoi
(
argv
[
7
]);
const
index_t
K
=
atoi
(
argv
[
8
]);
const
index_t
C
=
atoi
(
argv
[
9
]);
const
index_t
Y
=
atoi
(
argv
[
10
]);
const
index_t
X
=
atoi
(
argv
[
11
]);
const
index_t
Hi
=
atoi
(
argv
[
12
]);
const
index_t
Wi
=
atoi
(
argv
[
13
]);
const
index_t
conv_stride_h
=
atoi
(
argv
[
14
]);
const
index_t
conv_stride_w
=
atoi
(
argv
[
15
]);
const
index_t
conv_dilation_h
=
atoi
(
argv
[
16
]);
const
index_t
conv_dilation_w
=
atoi
(
argv
[
17
]);
const
index_t
in_left_pad_h
=
atoi
(
argv
[
18
]);
const
index_t
in_left_pad_w
=
atoi
(
argv
[
19
]);
const
index_t
in_right_pad_h
=
atoi
(
argv
[
20
]);
const
index_t
in_right_pad_w
=
atoi
(
argv
[
21
]);
const
index_t
YEff
=
(
Y
-
1
)
*
conv_dilation_h
+
1
;
const
index_t
XEff
=
(
X
-
1
)
*
conv_dilation_w
+
1
;
const
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
YEff
)
/
conv_stride_h
+
1
;
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
#if 1
using
in_data_t
=
float
;
using
acc_data_t
=
float
;
using
out_data_t
=
float
;
#elif 0
using
in_data_t
=
half_t
;
using
acc_data_t
=
float
;
using
out_data_t
=
half_t
;
#elif 1
using
in_data_t
=
int8_t
;
using
acc_data_t
=
int32_t
;
using
out_data_t
=
int8_t
;
#endif
std
::
vector
<
std
::
size_t
>
in_lengths_host
(
4
),
wei_lengths_host
(
4
),
out_lengths_host
(
4
);
switch
(
layout
)
{
case
ConvTensorLayout
::
NCHW
:
// NCHW
in_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
in_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
C
);
in_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Hi
);
in_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wi
);
wei_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
wei_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
C
);
wei_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Y
);
wei_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
X
);
out_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
out_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K
);
out_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Ho
);
out_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wo
);
break
;
case
ConvTensorLayout
::
NHWC
:
// NHWC
in_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
in_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Hi
);
in_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Wi
);
in_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
C
);
wei_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
wei_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Y
);
wei_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
X
);
wei_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
C
);
out_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
out_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Ho
);
out_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Wo
);
out_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
K
);
break
;
default:
throw
std
::
runtime_error
(
"wrong! not implemented"
);
}
Tensor
<
in_data_t
>
in
(
in_lengths_host
);
Tensor
<
in_data_t
>
wei
(
wei_lengths_host
);
Tensor
<
out_data_t
>
out_host
(
out_lengths_host
);
Tensor
<
out_data_t
>
out_device
(
out_lengths_host
);
std
::
cout
<<
"layout: "
<<
layout
<<
std
::
endl
;
ostream_HostTensorDescriptor
(
in
.
mDesc
,
std
::
cout
<<
"in: "
);
ostream_HostTensorDescriptor
(
wei
.
mDesc
,
std
::
cout
<<
"wei: "
);
ostream_HostTensorDescriptor
(
out_host
.
mDesc
,
std
::
cout
<<
"out: "
);
print_array
(
"InLeftPads"
,
make_tuple
(
in_left_pad_h
,
in_left_pad_w
));
print_array
(
"InRightPads"
,
make_tuple
(
in_right_pad_h
,
in_right_pad_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
switch
(
init_method
)
{
case
0
:
// no initialization
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
break
;
case
2
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
break
;
case
3
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
break
;
case
4
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
break
;
case
5
:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
float
>
{
0.0
,
1.0
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
float
>
{
-
0.5
,
0.5
},
num_thread
);
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
1
,
5
},
num_thread
);
auto
gen_wei
=
[](
auto
...
is
)
{
return
GeneratorTensor_2
{
1
,
5
}(
is
...)
*
GeneratorTensor_Checkboard
{}(
is
...);
};
wei
.
GenerateTensorValue
(
gen_wei
,
num_thread
);
}
auto
f_make_for_device_nchw
=
[
&
]()
{
const
auto
in_lengths_dev
=
make_tuple
(
N
,
C
,
Hi
,
Wi
);
const
auto
wei_lengths_dev
=
make_tuple
(
K
,
C
,
Y
,
X
);
const
auto
out_lengths_dev
=
make_tuple
(
N
,
K
,
Ho
,
Wo
);
return
make_tuple
(
in_lengths_dev
,
wei_lengths_dev
,
out_lengths_dev
);
};
auto
f_make_for_device_nhwc
=
[
&
]()
{
const
auto
in_lengths_dev
=
make_tuple
(
N
,
Hi
,
Wi
,
C
);
const
auto
wei_lengths_dev
=
make_tuple
(
K
,
Y
,
X
,
C
);
const
auto
out_lengths_dev
=
make_tuple
(
N
,
Ho
,
Wo
,
K
);
return
make_tuple
(
in_lengths_dev
,
wei_lengths_dev
,
out_lengths_dev
);
};
const
auto
conv_strides
=
make_tuple
(
conv_stride_h
,
conv_stride_w
);
const
auto
conv_dilations
=
make_tuple
(
conv_dilation_h
,
conv_dilation_w
);
const
auto
in_left_pads
=
make_tuple
(
in_left_pad_h
,
in_left_pad_w
);
const
auto
in_right_pads
=
make_tuple
(
in_right_pad_h
,
in_right_pad_w
);
#if USE_CONV_FWD_V4R4_NCHW
if
(
algo
==
ConvForwardAlgo
::
V4R4NCHW
)
{
if
(
layout
!=
ConvTensorLayout
::
NCHW
)
{
throw
std
::
runtime_error
(
"wrong! layout"
);
}
const
auto
tmp
=
f_make_for_device_nchw
();
tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
*
tunable
=
&
default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
;
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw
<
in_data_t
,
acc_data_t
,
out_data_t
>
(
handle
,
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
,
in
,
wei
,
out_device
,
tunable
,
nrepeat
);
}
#endif
#if USE_CONV_FWD_V6R1_NCHW
if
(
algo
==
ConvForwardAlgo
::
V6R1NCHW
)
{
if
(
layout
!=
ConvTensorLayout
::
NCHW
)
{
throw
std
::
runtime_error
(
"wrong! layout"
);
}
const
auto
tmp
=
f_make_for_device_nchw
();
#if 1
const
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
compile_param
=
{
get_datatype_enum_from_type
<
in_data_t
>::
value
,
get_datatype_enum_from_type
<
acc_data_t
>::
value
,
get_datatype_enum_from_type
<
out_data_t
>::
value
,
256
,
4
,
1
,
128
,
32
,
8
,
4
,
4
,
1
,
{
8
,
2
},
{
8
,
2
},
{
4
,
1
,
1
,
1
,
1
},
{
2
,
1
,
1
,
128
,
1
},
{
4
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
},
{
1
,
4
,
1
,
1
,
1
},
{
8
,
1
,
1
,
32
,
1
},
{
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
},
4
,
true
,
true
};
#elif 0
const
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
compile_param
=
{
get_datatype_enum_from_type
<
in_data_t
>::
value
,
get_datatype_enum_from_type
<
acc_data_t
>::
value
,
get_datatype_enum_from_type
<
out_data_t
>::
value
,
256
,
4
,
2
,
128
,
32
,
8
,
4
,
4
,
1
,
{
8
,
2
},
{
8
,
2
},
{
4
,
1
,
1
,
1
,
2
},
{
2
,
1
,
1
,
128
,
1
},
{
4
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
},
{
1
,
4
,
1
,
1
,
2
},
{
8
,
1
,
1
,
32
,
1
},
{
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
},
4
,
true
,
true
};
#elif 1
const
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
compile_param
=
{
get_datatype_enum_from_type
<
in_data_t
>::
value
,
get_datatype_enum_from_type
<
acc_data_t
>::
value
,
get_datatype_enum_from_type
<
out_data_t
>::
value
,
256
,
4
,
4
,
128
,
32
,
8
,
4
,
4
,
1
,
{
8
,
2
},
{
8
,
2
},
{
4
,
1
,
1
,
1
,
4
},
{
2
,
1
,
1
,
128
,
1
},
{
4
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
},
{
1
,
4
,
1
,
1
,
4
},
{
8
,
1
,
1
,
32
,
1
},
{
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
},
4
,
true
,
true
};
#endif
online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw
<
in_data_t
,
acc_data_t
,
out_data_t
>
(
handle
,
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
,
in
,
wei
,
out_device
,
compile_param
,
nrepeat
);
}
#endif
#if USE_CONV_FWD_V4R4_XDLOPS_NCHW
if
(
algo
==
ConvForwardAlgo
::
V4R4XDLNCHW
)
{
if
(
layout
!=
ConvTensorLayout
::
NCHW
)
{
throw
std
::
runtime_error
(
"wrong! layout"
);
}
const
auto
tmp
=
f_make_for_device_nchw
();
tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
*
tunable
=
&
default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
;
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
<
in_data_t
,
acc_data_t
,
out_data_t
>
(
handle
,
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
,
in
,
wei
,
out_device
,
tunable
,
nrepeat
);
}
#endif
#if USE_CONV_FWD_V4R4_XDLOPS_NHWC
if
(
algo
==
ConvForwardAlgo
::
V4R4XDLNHWC
)
{
if
(
layout
!=
ConvTensorLayout
::
NHWC
)
{
throw
std
::
runtime_error
(
"wrong! layout"
);
}
const
auto
tmp
=
f_make_for_device_nhwc
();
tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
*
tunable
=
&
default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
;
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk
<
in_data_t
,
acc_data_t
,
out_data_t
>
(
handle
,
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
,
in
,
wei
,
out_device
,
tunable
,
nrepeat
);
}
#endif
if
(
do_verification
)
{
host_direct_convolution
(
in
,
wei
,
out_host
,
make_tuple
(
conv_stride_h
,
conv_stride_w
),
make_tuple
(
conv_dilation_h
,
conv_dilation_w
),
make_tuple
(
in_left_pad_h
,
in_left_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
layout
);
check_error
(
out_host
,
out_device
);
#if 0
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
}
#endif
}
delete
handle
;
MY_HIP_CHECK
(
hipStreamDestroy
(
stream
));
}
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp
deleted
100644 → 0
View file @
cb954213
#pragma once
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp"
namespace
detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
{
template
<
typename
TInWei
,
typename
TAcc
,
typename
TOut
>
static
std
::
string
get_network_config_string_from_types
()
{
using
namespace
ck
;
std
::
string
out
;
out
+=
std
::
to_string
(
get_datatype_enum_from_type
<
TInWei
>::
value
)
+
"_"
+
std
::
to_string
(
get_datatype_enum_from_type
<
TAcc
>::
value
)
+
"_"
+
std
::
to_string
(
get_datatype_enum_from_type
<
TOut
>::
value
);
return
(
out
);
};
static
std
::
string
get_network_config_string_from_tunable
(
const
tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
*
pt
)
{
std
::
string
out
(
"TUN_"
);
out
+=
std
::
to_string
(
pt
->
BlockSize
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
MPerBlock
)
+
"x"
+
std
::
to_string
(
pt
->
NPerBlock
)
+
"x"
+
std
::
to_string
(
pt
->
KPerBlock
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
M1PerThread
)
+
"x"
+
std
::
to_string
(
pt
->
N1PerThread
)
+
"x"
+
std
::
to_string
(
pt
->
KPerThread
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
M1N1ThreadClusterM10
)
+
"x"
+
std
::
to_string
(
pt
->
M1N1ThreadClusterN10
)
+
"x"
+
std
::
to_string
(
pt
->
M1N1ThreadClusterM11
)
+
"x"
+
std
::
to_string
(
pt
->
M1N1ThreadClusterN11
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K_M0_M1
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K_M0_M1
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K_M0_M1
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K_M0_M1
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K_M0_M1
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K_M0_M1
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferSrcVectorDim
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferSrcScalarPerVector
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferDstScalarPerVector_M1
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
AThreadTransferSrcResetCoordinateAfterRun
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K_N0_N1
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K_N0_N1
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K_N0_N1
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K_N0_N1
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K_N0_N1
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K_N0_N1
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferSrcVectorDim
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferSrcScalarPerVector
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferDstScalarPerVector_N1
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BThreadTransferSrcResetCoordinateAfterRun
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
2
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
3
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
4
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
5
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
CThreadTransferSrcDstVectorDim
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
CThreadTransferDstScalarPerVector
);
return
(
out
);
};
template
<
typename
TInWei
,
typename
TAcc
,
typename
TOut
>
static
std
::
string
get_definition_string_from_types
()
{
using
namespace
ck
;
std
::
string
out
;
out
+=
" -DCK_PARAM_ABDataTypeEnum="
+
std
::
to_string
(
get_datatype_enum_from_type
<
TInWei
>::
value
)
+
" -DCK_PARAM_AccDataTypeEnum="
+
std
::
to_string
(
get_datatype_enum_from_type
<
TAcc
>::
value
)
+
" -DCK_PARAM_CDataTypeEnum="
+
std
::
to_string
(
get_datatype_enum_from_type
<
TOut
>::
value
);
return
(
out
);
};
static
std
::
string
get_definition_string_from_tunable
(
const
tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
*
pt
)
{
std
::
string
out
;
out
+=
" -DCK_PARAM_BlockSize="
+
std
::
to_string
(
pt
->
BlockSize
);
out
+=
" -DCK_PARAM_MPerBlock="
+
std
::
to_string
(
pt
->
MPerBlock
)
+
" -DCK_PARAM_NPerBlock="
+
std
::
to_string
(
pt
->
NPerBlock
)
+
" -DCK_PARAM_KPerBlock="
+
std
::
to_string
(
pt
->
KPerBlock
);
out
+=
" -DCK_PARAM_M1PerThread="
+
std
::
to_string
(
pt
->
M1PerThread
)
+
" -DCK_PARAM_N1PerThread="
+
std
::
to_string
(
pt
->
N1PerThread
)
+
" -DCK_PARAM_KPerThread="
+
std
::
to_string
(
pt
->
KPerThread
);
out
+=
" -DCK_PARAM_M1N1ThreadClusterM10="
+
std
::
to_string
(
pt
->
M1N1ThreadClusterM10
)
+
" -DCK_PARAM_M1N1ThreadClusterN10="
+
std
::
to_string
(
pt
->
M1N1ThreadClusterN10
)
+
" -DCK_PARAM_M1N1ThreadClusterM11="
+
std
::
to_string
(
pt
->
M1N1ThreadClusterM11
)
+
" -DCK_PARAM_M1N1ThreadClusterN11="
+
std
::
to_string
(
pt
->
M1N1ThreadClusterN11
);
out
+=
" -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1="
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K_M0_M1
[
0
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K_M0_M1
[
1
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K_M0_M1
[
2
]);
out
+=
" -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1="
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K_M0_M1
[
0
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K_M0_M1
[
1
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K_M0_M1
[
2
]);
out
+=
" -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder="
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
2
]);
out
+=
" -DCK_PARAM_ABlockTransferSrcAccessOrder="
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
2
]);
out
+=
" -DCK_PARAM_ABlockTransferSrcVectorDim="
+
std
::
to_string
(
pt
->
ABlockTransferSrcVectorDim
);
out
+=
" -DCK_PARAM_ABlockTransferSrcScalarPerVector="
+
std
::
to_string
(
pt
->
ABlockTransferSrcScalarPerVector
);
out
+=
" -DCK_PARAM_ABlockTransferDstScalarPerVector_M1="
+
std
::
to_string
(
pt
->
ABlockTransferDstScalarPerVector_M1
);
out
+=
" -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun="
+
std
::
to_string
(
pt
->
AThreadTransferSrcResetCoordinateAfterRun
);
out
+=
" -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1="
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K_N0_N1
[
0
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K_N0_N1
[
1
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K_N0_N1
[
2
]);
out
+=
" -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1="
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K_N0_N1
[
0
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K_N0_N1
[
1
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K_N0_N1
[
2
]);
out
+=
" -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder="
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
2
]);
out
+=
" -DCK_PARAM_BBlockTransferSrcAccessOrder="
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
2
]);
out
+=
" -DCK_PARAM_BBlockTransferSrcVectorDim="
+
std
::
to_string
(
pt
->
BBlockTransferSrcVectorDim
);
out
+=
" -DCK_PARAM_BBlockTransferSrcScalarPerVector="
+
std
::
to_string
(
pt
->
BBlockTransferSrcScalarPerVector
);
out
+=
" -DCK_PARAM_BBlockTransferDstScalarPerVector_N1="
+
std
::
to_string
(
pt
->
BBlockTransferDstScalarPerVector_N1
);
out
+=
" -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun="
+
std
::
to_string
(
pt
->
BThreadTransferSrcResetCoordinateAfterRun
);
out
+=
" -DCK_PARAM_CThreadTransferSrcDstAccessOrder="
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
2
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
3
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
4
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
5
]);
out
+=
" -DCK_PARAM_CThreadTransferSrcDstVectorDim="
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstVectorDim
);
out
+=
" -DCK_PARAM_CThreadTransferDstScalarPerVector="
+
std
::
to_string
(
pt
->
CThreadTransferDstScalarPerVector
);
return
(
out
);
};
}
// namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
template
<
typename
TInWei
,
typename
TAcc
,
typename
TOut
,
typename
InLengths
,
typename
WeiLengths
,
typename
OutLengths
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw
(
online_compile
::
Handle
*
handle
,
const
InLengths
&
in_n_c_hi_wi_lengths
,
const
WeiLengths
&
wei_k_c_y_x_lengths
,
const
OutLengths
&
out_n_k_ho_wo_lengths
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
in_right_pads
,
const
Tensor
<
TInWei
>&
in_n_c_hi_wi
,
const
Tensor
<
TInWei
>&
wei_k_c_y_x
,
Tensor
<
TOut
>&
out_n_k_ho_wo
,
const
tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
*
tunable
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
using
namespace
ck
::
driver
;
using
namespace
detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
;
using
size_t
=
std
::
size_t
;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The follow codes are only used for computing the grid_size, hasMainKBlockLoop,
// hasDoubleTailKBlockLoop
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
const
auto
in_n_c_hi_wi_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
in_n_c_hi_wi_lengths
);
const
auto
wei_k_c_y_x_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
wei_k_c_y_x_lengths
);
const
auto
out_n_k_ho_wo_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
out_n_k_ho_wo_lengths
);
const
auto
descs
=
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad
(
wei_k_c_y_x_desc
,
in_n_c_hi_wi_desc
,
out_n_k_ho_wo_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
const
auto
a_k_m_grid_desc
=
descs
[
I0
];
const
auto
c_m_n_grid_desc
=
descs
[
I2
];
const
auto
M
=
c_m_n_grid_desc
.
GetLength
(
I0
);
const
auto
N
=
c_m_n_grid_desc
.
GetLength
(
I1
);
const
auto
K
=
a_k_m_grid_desc
.
GetLength
(
I0
);
const
index_t
grid_size
=
(
M
/
tunable
->
MPerBlock
)
*
(
N
/
tunable
->
NPerBlock
);
const
bool
hasMainKBlockLoop
=
((
K
+
tunable
->
KPerBlock
)
/
(
2
*
tunable
->
KPerBlock
)
>
1
);
const
bool
hasDoubleTailKBlockLoop
=
((
K
/
tunable
->
KPerBlock
)
%
2
==
0
);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// these buffers are usually provided by the user application
DeviceMem
in_n_c_hi_wi_dev_buf
(
sizeof
(
TInWei
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_c_y_x_dev_buf
(
sizeof
(
TInWei
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
DeviceMem
out_n_k_ho_wo_dev_buf
(
sizeof
(
TOut
)
*
out_n_k_ho_wo
.
mDesc
.
GetElementSpace
());
in_n_c_hi_wi_dev_buf
.
ToDevice
(
in_n_c_hi_wi
.
mData
.
data
());
wei_k_c_y_x_dev_buf
.
ToDevice
(
wei_k_c_y_x
.
mData
.
data
());
out_n_k_ho_wo_dev_buf
.
ToDevice
(
out_n_k_ho_wo
.
mData
.
data
());
// these are workspace buffers that should be expressed to the user by the corresponding
// workspace API
DeviceMem
workspace_buf
(
4096
);
void
*
a_k_m0_m1_grid_desc_dev_buf
=
workspace_buf
.
GetDeviceBuffer
();
void
*
b_k_n0_n1_grid_desc_dev_buf
=
static_cast
<
void
*>
(
static_cast
<
unsigned
char
*>
(
workspace_buf
.
GetDeviceBuffer
())
+
1024
);
void
*
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf
=
static_cast
<
void
*>
(
static_cast
<
unsigned
char
*>
(
workspace_buf
.
GetDeviceBuffer
())
+
2048
);
void
*
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf
=
static_cast
<
void
*>
(
static_cast
<
unsigned
char
*>
(
workspace_buf
.
GetDeviceBuffer
())
+
3072
);
const
std
::
vector
<
size_t
>
vld
=
{
static_cast
<
size_t
>
(
tunable
->
BlockSize
),
1
,
1
};
const
std
::
vector
<
size_t
>
vgd1
=
{
static_cast
<
size_t
>
(
tunable
->
BlockSize
),
1
,
1
};
const
std
::
vector
<
size_t
>
vgd2
=
{
static_cast
<
size_t
>
(
grid_size
*
tunable
->
BlockSize
),
1
,
1
};
std
::
string
program_name
=
"dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp"
;
std
::
string
algo_name
=
"implicit_gemm_conv_fwd_v4r4_dlops_nchw"
;
std
::
string
param
=
" -std=c++17 "
;
std
::
string
network_config
;
param
+=
get_definition_string_from_types
<
TInWei
,
TAcc
,
TOut
>
()
+
" "
+
get_definition_string_from_tunable
(
tunable
)
+
" -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP="
+
std
::
to_string
(
hasMainKBlockLoop
)
+
" -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP="
+
std
::
to_string
(
hasDoubleTailKBlockLoop
);
network_config
=
get_network_config_string_from_types
<
TInWei
,
TAcc
,
TOut
>
()
+
"_"
+
get_network_config_string_from_tunable
(
tunable
)
+
"_"
+
std
::
to_string
(
hasMainKBlockLoop
)
+
"_"
+
std
::
to_string
(
hasDoubleTailKBlockLoop
);
std
::
vector
<
float
>
kernel1_times
;
std
::
vector
<
float
>
kernel2_times
;
for
(
index_t
i
=
0
;
i
<
nrepeat
;
++
i
)
{
KernelTimer
timer1
,
timer2
;
std
::
string
kernel_name
;
kernel_name
=
"dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare"
;
auto
network_config_1
=
network_config
+
"_1"
;
timer1
.
Start
();
handle
->
AddKernel
(
algo_name
,
network_config_1
,
program_name
,
kernel_name
,
vld
,
vgd1
,
param
)(
static_cast
<
index_t
>
(
in_n_c_hi_wi_lengths
[
I0
]),
static_cast
<
index_t
>
(
in_n_c_hi_wi_lengths
[
I1
]),
static_cast
<
index_t
>
(
in_n_c_hi_wi_lengths
[
I2
]),
static_cast
<
index_t
>
(
in_n_c_hi_wi_lengths
[
I3
]),
static_cast
<
index_t
>
(
wei_k_c_y_x_lengths
[
I0
]),
static_cast
<
index_t
>
(
wei_k_c_y_x_lengths
[
I2
]),
static_cast
<
index_t
>
(
wei_k_c_y_x_lengths
[
I3
]),
conv_strides
[
I0
],
conv_strides
[
I1
],
conv_dilations
[
I0
],
conv_dilations
[
I1
],
in_left_pads
[
I0
],
in_left_pads
[
I1
],
in_right_pads
[
I0
],
in_right_pads
[
I1
],
a_k_m0_m1_grid_desc_dev_buf
,
b_k_n0_n1_grid_desc_dev_buf
,
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf
,
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf
);
timer1
.
End
();
kernel_name
=
"dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw"
;
auto
network_config_2
=
network_config
+
"_2"
;
timer2
.
Start
();
handle
->
AddKernel
(
algo_name
,
network_config_2
,
program_name
,
kernel_name
,
vld
,
vgd2
,
param
)(
reinterpret_cast
<
const
TInWei
*>
(
wei_k_c_y_x_dev_buf
.
GetDeviceBuffer
()),
reinterpret_cast
<
const
TInWei
*>
(
in_n_c_hi_wi_dev_buf
.
GetDeviceBuffer
()),
reinterpret_cast
<
TOut
*>
(
out_n_k_ho_wo_dev_buf
.
GetDeviceBuffer
()),
(
const
void
*
)(
a_k_m0_m1_grid_desc_dev_buf
),
(
const
void
*
)(
b_k_n0_n1_grid_desc_dev_buf
),
(
const
void
*
)(
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf
),
(
const
void
*
)(
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf
));
timer2
.
End
();
kernel1_times
.
push_back
(
timer1
.
GetElapsedTime
());
kernel2_times
.
push_back
(
timer2
.
GetElapsedTime
());
}
{
auto
ave_time1
=
std
::
accumulate
(
std
::
next
(
kernel1_times
.
begin
()),
kernel1_times
.
end
(),
0.
,
std
::
plus
<
float
>
{})
/
(
nrepeat
-
1
);
auto
ave_time2
=
std
::
accumulate
(
std
::
next
(
kernel2_times
.
begin
()),
kernel2_times
.
end
(),
0.
,
std
::
plus
<
float
>
{})
/
(
nrepeat
-
1
);
const
auto
N
=
in_n_c_hi_wi_lengths
[
I0
];
const
auto
C
=
in_n_c_hi_wi_lengths
[
I1
];
const
auto
K
=
out_n_k_ho_wo_lengths
[
I1
];
const
auto
Ho
=
out_n_k_ho_wo_lengths
[
I2
];
const
auto
Wo
=
out_n_k_ho_wo_lengths
[
I3
];
const
auto
Y
=
wei_k_c_y_x_lengths
[
I2
];
const
auto
X
=
wei_k_c_y_x_lengths
[
I3
];
float
perf
=
(
float
)(
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
)
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
(
ave_time1
+
ave_time2
);
std
::
cout
<<
"Average time : "
<<
ave_time1
+
ave_time2
<<
" ms("
<<
ave_time1
<<
", "
<<
ave_time2
<<
"), "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
};
// copy result back to host
out_n_k_ho_wo_dev_buf
.
FromDevice
(
out_n_k_ho_wo
.
mData
.
data
());
}
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
deleted
100644 → 0
View file @
cb954213
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
namespace
detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
{
template
<
typename
TInWei
,
typename
TAcc
,
typename
TOut
>
static
std
::
string
get_network_config_string_from_types
()
{
using
namespace
ck
;
std
::
string
out
;
out
+=
std
::
to_string
(
get_datatype_enum_from_type
<
TInWei
>::
value
)
+
"_"
+
std
::
to_string
(
get_datatype_enum_from_type
<
TAcc
>::
value
)
+
"_"
+
std
::
to_string
(
get_datatype_enum_from_type
<
TOut
>::
value
);
return
(
out
);
};
static
std
::
string
get_network_config_string_from_tunable
(
const
tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
*
pt
)
{
std
::
string
out
(
"TUN_"
);
out
+=
std
::
to_string
(
pt
->
BlockSize
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
MPerBlock
)
+
"x"
+
std
::
to_string
(
pt
->
NPerBlock
)
+
"x"
+
std
::
to_string
(
pt
->
KPerBlock
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
MPerWave
)
+
"x"
+
std
::
to_string
(
pt
->
NPerWave
)
+
"x"
+
std
::
to_string
(
pt
->
MRepeat
)
+
"x"
+
std
::
to_string
(
pt
->
NRepeat
)
+
"x"
+
std
::
to_string
(
pt
->
K1
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K0_M_K1
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K0_M_K1
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K0_M_K1
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K0_M_K1
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K0_M_K1
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K0_M_K1
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferSrcVectorDim
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferSrcScalarPerVector
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferDstScalarPerVector_K1
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
AThreadTransferSrcResetCoordinateAfterRun
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K0_N_K1
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K0_N_K1
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K0_N_K1
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K0_N_K1
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K0_N_K1
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K0_N_K1
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferSrcVectorDim
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferSrcScalarPerVector
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferDstScalarPerVector_K1
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BThreadTransferSrcResetCoordinateAfterRun
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
2
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
3
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
4
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
5
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
6
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
7
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
CThreadTransferSrcDstVectorDim
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
CThreadTransferDstScalarPerVector
);
return
(
out
);
};
template
<
typename
TInWei
,
typename
TAcc
,
typename
TOut
>
static
std
::
string
get_definition_string_from_types
()
{
using
namespace
ck
;
std
::
string
out
;
out
+=
" -DCK_PARAM_ABDataTypeEnum="
+
std
::
to_string
(
get_datatype_enum_from_type
<
TInWei
>::
value
)
+
" -DCK_PARAM_AccDataTypeEnum="
+
std
::
to_string
(
get_datatype_enum_from_type
<
TAcc
>::
value
)
+
" -DCK_PARAM_CDataTypeEnum="
+
std
::
to_string
(
get_datatype_enum_from_type
<
TOut
>::
value
);
return
(
out
);
};
static
std
::
string
get_definition_string_from_tunable
(
const
tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
*
pt
)
{
std
::
string
out
;
out
+=
" -DCK_PARAM_BlockSize="
+
std
::
to_string
(
pt
->
BlockSize
);
out
+=
" -DCK_PARAM_MPerBlock="
+
std
::
to_string
(
pt
->
MPerBlock
)
+
" -DCK_PARAM_NPerBlock="
+
std
::
to_string
(
pt
->
NPerBlock
)
+
" -DCK_PARAM_KPerBlock="
+
std
::
to_string
(
pt
->
KPerBlock
);
out
+=
" -DCK_PARAM_MPerWave="
+
std
::
to_string
(
pt
->
MPerWave
)
+
" -DCK_PARAM_NPerWave="
+
std
::
to_string
(
pt
->
NPerWave
)
+
" -DCK_PARAM_K1="
+
std
::
to_string
(
pt
->
K1
)
+
" -DCK_PARAM_MRepeat="
+
std
::
to_string
(
pt
->
MRepeat
)
+
" -DCK_PARAM_NRepeat="
+
std
::
to_string
(
pt
->
NRepeat
);
out
+=
" -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1="
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K0_M_K1
[
0
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K0_M_K1
[
1
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K0_M_K1
[
2
]);
out
+=
" -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1="
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K0_M_K1
[
0
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K0_M_K1
[
1
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K0_M_K1
[
2
]);
out
+=
" -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder="
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
2
]);
out
+=
" -DCK_PARAM_ABlockTransferSrcAccessOrder="
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
2
]);
out
+=
" -DCK_PARAM_ABlockTransferSrcVectorDim="
+
std
::
to_string
(
pt
->
ABlockTransferSrcVectorDim
);
out
+=
" -DCK_PARAM_ABlockTransferSrcScalarPerVector="
+
std
::
to_string
(
pt
->
ABlockTransferSrcScalarPerVector
);
out
+=
" -DCK_PARAM_ABlockTransferDstScalarPerVector_K1="
+
std
::
to_string
(
pt
->
ABlockTransferDstScalarPerVector_K1
);
out
+=
" -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun="
+
std
::
to_string
(
pt
->
AThreadTransferSrcResetCoordinateAfterRun
);
out
+=
" -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1="
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K0_N_K1
[
0
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K0_N_K1
[
1
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K0_N_K1
[
2
]);
out
+=
" -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1="
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K0_N_K1
[
0
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K0_N_K1
[
1
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K0_N_K1
[
2
]);
out
+=
" -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder="
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
2
]);
out
+=
" -DCK_PARAM_BBlockTransferSrcAccessOrder="
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
2
]);
out
+=
" -DCK_PARAM_BBlockTransferSrcVectorDim="
+
std
::
to_string
(
pt
->
BBlockTransferSrcVectorDim
);
out
+=
" -DCK_PARAM_BBlockTransferSrcScalarPerVector="
+
std
::
to_string
(
pt
->
BBlockTransferSrcScalarPerVector
);
out
+=
" -DCK_PARAM_BBlockTransferDstScalarPerVector_K1="
+
std
::
to_string
(
pt
->
BBlockTransferDstScalarPerVector_K1
);
out
+=
" -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun="
+
std
::
to_string
(
pt
->
BThreadTransferSrcResetCoordinateAfterRun
);
out
+=
" -DCK_PARAM_CThreadTransferSrcDstAccessOrder="
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
2
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
3
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
4
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
5
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
6
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
7
]);
out
+=
" -DCK_PARAM_CThreadTransferSrcDstVectorDim="
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstVectorDim
);
out
+=
" -DCK_PARAM_CThreadTransferDstScalarPerVector="
+
std
::
to_string
(
pt
->
CThreadTransferDstScalarPerVector
);
return
(
out
);
};
}
// namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
template
<
typename
TInWei
,
typename
TAcc
,
typename
TOut
,
typename
InLengths
,
typename
WeiLengths
,
typename
OutLengths
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
(
online_compile
::
Handle
*
handle
,
const
InLengths
&
in_n_c_hi_wi_lengths
,
const
WeiLengths
&
wei_k_c_y_x_lengths
,
const
OutLengths
&
out_n_k_ho_wo_lengths
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
in_right_pads
,
const
Tensor
<
TInWei
>&
in_n_c_hi_wi
,
const
Tensor
<
TInWei
>&
wei_k_c_y_x
,
Tensor
<
TOut
>&
out_n_k_ho_wo
,
const
tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
*
tunable
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
using
namespace
ck
::
driver
;
using
namespace
detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
;
using
size_t
=
std
::
size_t
;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
const
auto
in_n_c_hi_wi_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
in_n_c_hi_wi_lengths
);
const
auto
wei_k_c_y_x_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
wei_k_c_y_x_lengths
);
const
auto
out_n_k_ho_wo_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
out_n_k_ho_wo_lengths
);
const
auto
n
=
in_n_c_hi_wi_desc
.
GetLength
(
I0
);
const
auto
c
=
in_n_c_hi_wi_desc
.
GetLength
(
I1
);
const
auto
hi
=
in_n_c_hi_wi_desc
.
GetLength
(
I2
);
const
auto
wi
=
in_n_c_hi_wi_desc
.
GetLength
(
I3
);
const
auto
k
=
wei_k_c_y_x_desc
.
GetLength
(
I0
);
const
auto
y
=
wei_k_c_y_x_desc
.
GetLength
(
I2
);
const
auto
x
=
wei_k_c_y_x_desc
.
GetLength
(
I3
);
const
auto
ho
=
out_n_k_ho_wo_desc
.
GetLength
(
I2
);
const
auto
wo
=
out_n_k_ho_wo_desc
.
GetLength
(
I3
);
const
auto
M
=
k
;
const
auto
N
=
n
*
ho
*
wo
;
const
auto
K
=
c
*
y
*
x
;
const
auto
K0
=
K
/
tunable
->
K1
;
const
index_t
grid_size
=
(
M
/
tunable
->
MPerBlock
)
*
(
N
/
tunable
->
NPerBlock
);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// these buffers are usually provided by the user application
DeviceMem
in_n_c_hi_wi_dev_buf
(
sizeof
(
TInWei
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_c_y_x_dev_buf
(
sizeof
(
TInWei
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
DeviceMem
out_n_k_ho_wo_dev_buf
(
sizeof
(
TOut
)
*
out_n_k_ho_wo
.
mDesc
.
GetElementSpace
());
in_n_c_hi_wi_dev_buf
.
ToDevice
(
in_n_c_hi_wi
.
mData
.
data
());
wei_k_c_y_x_dev_buf
.
ToDevice
(
wei_k_c_y_x
.
mData
.
data
());
out_n_k_ho_wo_dev_buf
.
ToDevice
(
out_n_k_ho_wo
.
mData
.
data
());
// these are workspace buffers that should be expressed to the user by the corresponding
// workspace API
DeviceMem
workspace_buf
(
4096
);
void
*
a_k_m0_m1_grid_desc_dev_buf
=
workspace_buf
.
GetDeviceBuffer
();
void
*
b_k_n0_n1_grid_desc_dev_buf
=
static_cast
<
void
*>
(
static_cast
<
unsigned
char
*>
(
workspace_buf
.
GetDeviceBuffer
())
+
1024
);
void
*
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf
=
static_cast
<
void
*>
(
static_cast
<
unsigned
char
*>
(
workspace_buf
.
GetDeviceBuffer
())
+
2048
);
void
*
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf
=
static_cast
<
void
*>
(
static_cast
<
unsigned
char
*>
(
workspace_buf
.
GetDeviceBuffer
())
+
3072
);
const
std
::
vector
<
size_t
>
vld
=
{
static_cast
<
size_t
>
(
tunable
->
BlockSize
),
1
,
1
};
const
std
::
vector
<
size_t
>
vgd1
=
{
static_cast
<
size_t
>
(
tunable
->
BlockSize
),
1
,
1
};
const
std
::
vector
<
size_t
>
vgd2
=
{
static_cast
<
size_t
>
(
grid_size
*
tunable
->
BlockSize
),
1
,
1
};
std
::
string
program_name
=
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp"
;
std
::
string
algo_name
=
"implicit_gemm_conv_fwd_v4r4_xdlops_nchw"
;
std
::
string
param
=
" -std=c++17 "
;
std
::
string
network_config
;
param
+=
get_definition_string_from_types
<
TInWei
,
TAcc
,
TOut
>
()
+
" "
+
" -DCK_USE_AMD_XDLOPS"
+
get_definition_string_from_tunable
(
tunable
);
network_config
=
get_network_config_string_from_types
<
TInWei
,
TAcc
,
TOut
>
()
+
"_"
+
get_network_config_string_from_tunable
(
tunable
);
std
::
vector
<
float
>
kernel1_times
;
std
::
vector
<
float
>
kernel2_times
;
for
(
index_t
i
=
0
;
i
<
nrepeat
;
++
i
)
{
KernelTimer
timer1
,
timer2
;
std
::
string
kernel_name
;
kernel_name
=
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare"
;
auto
network_config_1
=
network_config
+
"_1"
;
timer1
.
Start
();
handle
->
AddKernel
(
algo_name
,
network_config_1
,
program_name
,
kernel_name
,
vld
,
vgd1
,
param
)(
static_cast
<
index_t
>
(
in_n_c_hi_wi_lengths
[
I0
]),
static_cast
<
index_t
>
(
in_n_c_hi_wi_lengths
[
I1
]),
static_cast
<
index_t
>
(
in_n_c_hi_wi_lengths
[
I2
]),
static_cast
<
index_t
>
(
in_n_c_hi_wi_lengths
[
I3
]),
static_cast
<
index_t
>
(
wei_k_c_y_x_lengths
[
I0
]),
static_cast
<
index_t
>
(
wei_k_c_y_x_lengths
[
I2
]),
static_cast
<
index_t
>
(
wei_k_c_y_x_lengths
[
I3
]),
conv_strides
[
I0
],
conv_strides
[
I1
],
conv_dilations
[
I0
],
conv_dilations
[
I1
],
in_left_pads
[
I0
],
in_left_pads
[
I1
],
in_right_pads
[
I0
],
in_right_pads
[
I1
],
a_k_m0_m1_grid_desc_dev_buf
,
b_k_n0_n1_grid_desc_dev_buf
,
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf
,
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf
);
timer1
.
End
();
kernel_name
=
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw"
;
auto
network_config_2
=
network_config
+
"_2"
;
timer2
.
Start
();
handle
->
AddKernel
(
algo_name
,
network_config_2
,
program_name
,
kernel_name
,
vld
,
vgd2
,
param
)(
reinterpret_cast
<
const
TInWei
*>
(
wei_k_c_y_x_dev_buf
.
GetDeviceBuffer
()),
reinterpret_cast
<
const
TInWei
*>
(
in_n_c_hi_wi_dev_buf
.
GetDeviceBuffer
()),
reinterpret_cast
<
TOut
*>
(
out_n_k_ho_wo_dev_buf
.
GetDeviceBuffer
()),
(
const
void
*
)(
a_k_m0_m1_grid_desc_dev_buf
),
(
const
void
*
)(
b_k_n0_n1_grid_desc_dev_buf
),
(
const
void
*
)(
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf
),
(
const
void
*
)(
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf
));
timer2
.
End
();
kernel1_times
.
push_back
(
timer1
.
GetElapsedTime
());
kernel2_times
.
push_back
(
timer2
.
GetElapsedTime
());
}
{
auto
ave_time1
=
std
::
accumulate
(
std
::
next
(
kernel1_times
.
begin
()),
kernel1_times
.
end
(),
0.
,
std
::
plus
<
float
>
{})
/
(
nrepeat
-
1
);
auto
ave_time2
=
std
::
accumulate
(
std
::
next
(
kernel2_times
.
begin
()),
kernel2_times
.
end
(),
0.
,
std
::
plus
<
float
>
{})
/
(
nrepeat
-
1
);
const
auto
N
=
in_n_c_hi_wi_lengths
[
I0
];
const
auto
C
=
in_n_c_hi_wi_lengths
[
I1
];
const
auto
K
=
out_n_k_ho_wo_lengths
[
I1
];
const
auto
Ho
=
out_n_k_ho_wo_lengths
[
I2
];
const
auto
Wo
=
out_n_k_ho_wo_lengths
[
I3
];
const
auto
Y
=
wei_k_c_y_x_lengths
[
I2
];
const
auto
X
=
wei_k_c_y_x_lengths
[
I3
];
float
perf
=
(
float
)(
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
)
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
(
ave_time1
+
ave_time2
);
std
::
cout
<<
"Average time : "
<<
ave_time1
+
ave_time2
<<
" ms("
<<
ave_time1
<<
", "
<<
ave_time2
<<
"), "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
};
// copy result back to host
out_n_k_ho_wo_dev_buf
.
FromDevice
(
out_n_k_ho_wo
.
mData
.
data
());
}
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp
deleted
100644 → 0
View file @
cb954213
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
namespace
detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
{
template
<
typename
TInWei
,
typename
TAcc
,
typename
TOut
>
static
std
::
string
get_network_config_string_from_types
()
{
using
namespace
ck
;
std
::
string
out
;
out
+=
std
::
to_string
(
get_datatype_enum_from_type
<
TInWei
>::
value
)
+
"_"
+
std
::
to_string
(
get_datatype_enum_from_type
<
TAcc
>::
value
)
+
"_"
+
std
::
to_string
(
get_datatype_enum_from_type
<
TOut
>::
value
);
return
(
out
);
};
static
std
::
string
get_network_config_string_from_tunable
(
const
tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
*
pt
)
{
std
::
string
out
(
"TUN_"
);
out
+=
std
::
to_string
(
pt
->
BlockSize
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
MPerBlock
)
+
"x"
+
std
::
to_string
(
pt
->
NPerBlock
)
+
"x"
+
std
::
to_string
(
pt
->
KPerBlock
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
MPerWave
)
+
"x"
+
std
::
to_string
(
pt
->
NPerWave
)
+
"x"
+
std
::
to_string
(
pt
->
MRepeat
)
+
"x"
+
std
::
to_string
(
pt
->
NRepeat
)
+
"x"
+
std
::
to_string
(
pt
->
K1
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K0_M_K1
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K0_M_K1
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K0_M_K1
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K0_M_K1
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K0_M_K1
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K0_M_K1
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferSrcVectorDim
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferSrcScalarPerVector
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
ABlockTransferDstScalarPerVector_K1
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
AThreadTransferSrcResetCoordinateAfterRun
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K0_N_K1
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K0_N_K1
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K0_N_K1
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K0_N_K1
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K0_N_K1
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K0_N_K1
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
2
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferSrcVectorDim
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferSrcScalarPerVector
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BBlockTransferDstScalarPerVector_K1
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
BThreadTransferSrcResetCoordinateAfterRun
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
0
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
1
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
2
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
3
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
4
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
5
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
6
])
+
"x"
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
7
])
+
"_"
;
out
+=
std
::
to_string
(
pt
->
CThreadTransferSrcDstVectorDim
)
+
"_"
;
out
+=
std
::
to_string
(
pt
->
CThreadTransferDstScalarPerVector
);
return
(
out
);
};
template
<
typename
TInWei
,
typename
TAcc
,
typename
TOut
>
static
std
::
string
get_definition_string_from_types
()
{
using
namespace
ck
;
std
::
string
out
;
out
+=
" -DCK_PARAM_ABDataTypeEnum="
+
std
::
to_string
(
get_datatype_enum_from_type
<
TInWei
>::
value
)
+
" -DCK_PARAM_AccDataTypeEnum="
+
std
::
to_string
(
get_datatype_enum_from_type
<
TAcc
>::
value
)
+
" -DCK_PARAM_CDataTypeEnum="
+
std
::
to_string
(
get_datatype_enum_from_type
<
TOut
>::
value
);
return
(
out
);
};
static
std
::
string
get_definition_string_from_tunable
(
const
tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
*
pt
)
{
std
::
string
out
;
out
+=
" -DCK_PARAM_BlockSize="
+
std
::
to_string
(
pt
->
BlockSize
);
out
+=
" -DCK_PARAM_MPerBlock="
+
std
::
to_string
(
pt
->
MPerBlock
)
+
" -DCK_PARAM_NPerBlock="
+
std
::
to_string
(
pt
->
NPerBlock
)
+
" -DCK_PARAM_KPerBlock="
+
std
::
to_string
(
pt
->
KPerBlock
);
out
+=
" -DCK_PARAM_MPerWave="
+
std
::
to_string
(
pt
->
MPerWave
)
+
" -DCK_PARAM_NPerWave="
+
std
::
to_string
(
pt
->
NPerWave
)
+
" -DCK_PARAM_K1="
+
std
::
to_string
(
pt
->
K1
)
+
" -DCK_PARAM_MRepeat="
+
std
::
to_string
(
pt
->
MRepeat
)
+
" -DCK_PARAM_NRepeat="
+
std
::
to_string
(
pt
->
NRepeat
);
out
+=
" -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1="
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K0_M_K1
[
0
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K0_M_K1
[
1
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadSliceLengths_K0_M_K1
[
2
]);
out
+=
" -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1="
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K0_M_K1
[
0
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K0_M_K1
[
1
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterLengths_K0_M_K1
[
2
]);
out
+=
" -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder="
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferThreadClusterArrangeOrder
[
2
]);
out
+=
" -DCK_PARAM_ABlockTransferSrcAccessOrder="
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
ABlockTransferSrcAccessOrder
[
2
]);
out
+=
" -DCK_PARAM_ABlockTransferSrcVectorDim="
+
std
::
to_string
(
pt
->
ABlockTransferSrcVectorDim
);
out
+=
" -DCK_PARAM_ABlockTransferSrcScalarPerVector="
+
std
::
to_string
(
pt
->
ABlockTransferSrcScalarPerVector
);
out
+=
" -DCK_PARAM_ABlockTransferDstScalarPerVector_K1="
+
std
::
to_string
(
pt
->
ABlockTransferDstScalarPerVector_K1
);
out
+=
" -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun="
+
std
::
to_string
(
pt
->
AThreadTransferSrcResetCoordinateAfterRun
);
out
+=
" -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1="
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K0_N_K1
[
0
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K0_N_K1
[
1
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadSliceLengths_K0_N_K1
[
2
]);
out
+=
" -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1="
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K0_N_K1
[
0
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K0_N_K1
[
1
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterLengths_K0_N_K1
[
2
]);
out
+=
" -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder="
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferThreadClusterArrangeOrder
[
2
]);
out
+=
" -DCK_PARAM_BBlockTransferSrcAccessOrder="
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
BBlockTransferSrcAccessOrder
[
2
]);
out
+=
" -DCK_PARAM_BBlockTransferSrcVectorDim="
+
std
::
to_string
(
pt
->
BBlockTransferSrcVectorDim
);
out
+=
" -DCK_PARAM_BBlockTransferSrcScalarPerVector="
+
std
::
to_string
(
pt
->
BBlockTransferSrcScalarPerVector
);
out
+=
" -DCK_PARAM_BBlockTransferDstScalarPerVector_K1="
+
std
::
to_string
(
pt
->
BBlockTransferDstScalarPerVector_K1
);
out
+=
" -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun="
+
std
::
to_string
(
pt
->
BThreadTransferSrcResetCoordinateAfterRun
);
out
+=
" -DCK_PARAM_CThreadTransferSrcDstAccessOrder="
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
0
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
1
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
2
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
3
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
4
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
5
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
6
])
+
","
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstAccessOrder
[
7
]);
out
+=
" -DCK_PARAM_CThreadTransferSrcDstVectorDim="
+
std
::
to_string
(
pt
->
CThreadTransferSrcDstVectorDim
);
out
+=
" -DCK_PARAM_CThreadTransferDstScalarPerVector="
+
std
::
to_string
(
pt
->
CThreadTransferDstScalarPerVector
);
return
(
out
);
};
}
// namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
template
<
typename
TInWei
,
typename
TAcc
,
typename
TOut
,
typename
InLengths
,
typename
WeiLengths
,
typename
OutLengths
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk
(
online_compile
::
Handle
*
handle
,
const
InLengths
&
in_n_hi_wi_c_lengths
,
const
WeiLengths
&
wei_k_y_x_c_lengths
,
const
OutLengths
&
out_n_ho_wo_k_lengths
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
in_right_pads
,
const
Tensor
<
TInWei
>&
in_n_hi_wi_c
,
const
Tensor
<
TInWei
>&
wei_k_y_x_c
,
Tensor
<
TOut
>&
out_n_ho_wo_k
,
const
tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
*
tunable
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
using
namespace
detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
;
using
size_t
=
std
::
size_t
;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The follow codes are only used for computing the grid_size, hasMainKBlockLoop,
// hasDoubleTailKBlockLoop
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
const
auto
in_n_hi_wi_c_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
in_n_hi_wi_c_lengths
);
const
auto
wei_k_y_x_c_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
wei_k_y_x_c_lengths
);
const
auto
out_n_ho_wo_k_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
out_n_ho_wo_k_lengths
);
const
auto
n
=
in_n_hi_wi_c_desc
.
GetLength
(
I0
);
const
auto
hi
=
in_n_hi_wi_c_desc
.
GetLength
(
I1
);
const
auto
wi
=
in_n_hi_wi_c_desc
.
GetLength
(
I2
);
const
auto
c
=
in_n_hi_wi_c_desc
.
GetLength
(
I3
);
const
auto
k
=
wei_k_y_x_c_desc
.
GetLength
(
I0
);
const
auto
y
=
wei_k_y_x_c_desc
.
GetLength
(
I1
);
const
auto
x
=
wei_k_y_x_c_desc
.
GetLength
(
I2
);
const
auto
ho
=
out_n_ho_wo_k_desc
.
GetLength
(
I1
);
const
auto
wo
=
out_n_ho_wo_k_desc
.
GetLength
(
I2
);
const
auto
M
=
k
;
const
auto
N
=
n
*
ho
*
wo
;
const
auto
K
=
c
*
y
*
x
;
const
auto
K0
=
K
/
tunable
->
K1
;
const
index_t
grid_size
=
(
M
/
tunable
->
MPerBlock
)
*
(
N
/
tunable
->
NPerBlock
);
// these buffers are usually provided by the user application
DeviceMem
in_n_hi_wi_c_dev_buf
(
sizeof
(
TInWei
)
*
in_n_hi_wi_c
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_y_x_c_dev_buf
(
sizeof
(
TInWei
)
*
wei_k_y_x_c
.
mDesc
.
GetElementSpace
());
DeviceMem
out_n_ho_wo_k_dev_buf
(
sizeof
(
TOut
)
*
out_n_ho_wo_k
.
mDesc
.
GetElementSpace
());
in_n_hi_wi_c_dev_buf
.
ToDevice
(
in_n_hi_wi_c
.
mData
.
data
());
wei_k_y_x_c_dev_buf
.
ToDevice
(
wei_k_y_x_c
.
mData
.
data
());
out_n_ho_wo_k_dev_buf
.
ToDevice
(
out_n_ho_wo_k
.
mData
.
data
());
// these are workspace buffers that should be expressed to the user by the corresponding
// workspace API
DeviceMem
workspace_buf
(
4096
);
void
*
a_k0_m_k1_grid_desc_dev_buf
=
workspace_buf
.
GetDeviceBuffer
();
void
*
b_k0_n_k1_grid_desc_dev_buf
=
static_cast
<
void
*>
(
static_cast
<
unsigned
char
*>
(
workspace_buf
.
GetDeviceBuffer
())
+
1024
);
void
*
c_m0_m1_m2_n_grid_desc_dev_buf
=
static_cast
<
void
*>
(
static_cast
<
unsigned
char
*>
(
workspace_buf
.
GetDeviceBuffer
())
+
2048
);
void
*
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf
=
static_cast
<
void
*>
(
static_cast
<
unsigned
char
*>
(
workspace_buf
.
GetDeviceBuffer
())
+
3072
);
const
std
::
vector
<
size_t
>
vld
=
{
static_cast
<
size_t
>
(
tunable
->
BlockSize
),
1
,
1
};
const
std
::
vector
<
size_t
>
vgd1
=
{
static_cast
<
size_t
>
(
tunable
->
BlockSize
),
1
,
1
};
const
std
::
vector
<
size_t
>
vgd2
=
{
static_cast
<
size_t
>
(
grid_size
*
tunable
->
BlockSize
),
1
,
1
};
std
::
string
program_name
=
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp"
;
std
::
string
algo_name
=
"implicit_gemm_conv_fwd_v4r4_xdlops_nhwc"
;
std
::
string
param
=
" -std=c++17 "
;
std
::
string
network_config
;
param
+=
get_definition_string_from_types
<
TInWei
,
TAcc
,
TOut
>
()
+
" -DCK_USE_AMD_XDLOPS "
;
param
+=
get_definition_string_from_tunable
(
tunable
);
network_config
=
get_network_config_string_from_types
<
TInWei
,
TAcc
,
TOut
>
()
+
"_"
+
get_network_config_string_from_tunable
(
tunable
);
std
::
vector
<
float
>
kernel1_times
;
std
::
vector
<
float
>
kernel2_times
;
for
(
index_t
i
=
0
;
i
<
nrepeat
;
++
i
)
{
KernelTimer
timer1
,
timer2
;
std
::
string
kernel_name
;
kernel_name
=
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare"
;
auto
network_config_1
=
network_config
+
"_1"
;
timer1
.
Start
();
handle
->
AddKernel
(
algo_name
,
network_config_1
,
program_name
,
kernel_name
,
vld
,
vgd1
,
param
)(
static_cast
<
index_t
>
(
in_n_hi_wi_c_lengths
[
I0
]),
static_cast
<
index_t
>
(
in_n_hi_wi_c_lengths
[
I1
]),
static_cast
<
index_t
>
(
in_n_hi_wi_c_lengths
[
I2
]),
static_cast
<
index_t
>
(
in_n_hi_wi_c_lengths
[
I3
]),
static_cast
<
index_t
>
(
wei_k_y_x_c_lengths
[
I0
]),
static_cast
<
index_t
>
(
wei_k_y_x_c_lengths
[
I1
]),
static_cast
<
index_t
>
(
wei_k_y_x_c_lengths
[
I2
]),
conv_strides
[
I0
],
conv_strides
[
I1
],
conv_dilations
[
I0
],
conv_dilations
[
I1
],
in_left_pads
[
I0
],
in_left_pads
[
I1
],
in_right_pads
[
I0
],
in_right_pads
[
I1
],
a_k0_m_k1_grid_desc_dev_buf
,
b_k0_n_k1_grid_desc_dev_buf
,
c_m0_m1_m2_n_grid_desc_dev_buf
,
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf
);
timer1
.
End
();
kernel_name
=
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk"
;
auto
network_config_2
=
network_config
+
"_2"
;
timer2
.
Start
();
handle
->
AddKernel
(
algo_name
,
network_config_2
,
program_name
,
kernel_name
,
vld
,
vgd2
,
param
)(
reinterpret_cast
<
const
TInWei
*>
(
in_n_hi_wi_c_dev_buf
.
GetDeviceBuffer
()),
reinterpret_cast
<
const
TInWei
*>
(
wei_k_y_x_c_dev_buf
.
GetDeviceBuffer
()),
reinterpret_cast
<
TOut
*>
(
out_n_ho_wo_k_dev_buf
.
GetDeviceBuffer
()),
(
const
void
*
)(
a_k0_m_k1_grid_desc_dev_buf
),
(
const
void
*
)(
b_k0_n_k1_grid_desc_dev_buf
),
(
const
void
*
)(
c_m0_m1_m2_n_grid_desc_dev_buf
),
(
const
void
*
)(
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf
));
timer2
.
End
();
kernel1_times
.
push_back
(
timer1
.
GetElapsedTime
());
kernel2_times
.
push_back
(
timer2
.
GetElapsedTime
());
}
{
auto
ave_time1
=
std
::
accumulate
(
std
::
next
(
kernel1_times
.
begin
()),
kernel1_times
.
end
(),
0.
,
std
::
plus
<
float
>
{})
/
(
nrepeat
-
1
);
auto
ave_time2
=
std
::
accumulate
(
std
::
next
(
kernel2_times
.
begin
()),
kernel2_times
.
end
(),
0.
,
std
::
plus
<
float
>
{})
/
(
nrepeat
-
1
);
const
auto
N
=
in_n_hi_wi_c_lengths
[
I0
];
const
auto
C
=
in_n_hi_wi_c_lengths
[
I3
];
const
auto
Ho
=
out_n_ho_wo_k_lengths
[
I1
];
const
auto
Wo
=
out_n_ho_wo_k_lengths
[
I2
];
const
auto
K
=
out_n_ho_wo_k_lengths
[
I3
];
const
auto
Y
=
wei_k_y_x_c_lengths
[
I1
];
const
auto
X
=
wei_k_y_x_c_lengths
[
I2
];
float
perf
=
(
float
)(
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
)
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time2
;
std
::
cout
<<
"Average time : "
<<
ave_time1
+
ave_time2
<<
" ms("
<<
ave_time1
<<
", "
<<
ave_time2
<<
"), "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
};
// copy result back to host
out_n_ho_wo_k_dev_buf
.
FromDevice
(
out_n_ho_wo_k
.
mData
.
data
());
}
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
deleted
100644 → 0
View file @
cb954213
#pragma once
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "convolution_problem_descriptor.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp"
template
<
typename
TInWei
,
typename
TAcc
,
typename
TOut
,
typename
InLengths
,
typename
WeiLengths
,
typename
OutLengths
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw
(
online_compile
::
Handle
*
handle
,
const
InLengths
&
in_n_c_hi_wi_lengths
,
const
WeiLengths
&
wei_k_c_y_x_lengths
,
const
OutLengths
&
out_n_k_ho_wo_lengths
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
in_right_pads
,
const
Tensor
<
TInWei
>&
in_n_c_hi_wi
,
const
Tensor
<
TInWei
>&
wei_k_c_y_x
,
Tensor
<
TOut
>&
out_n_k_ho_wo
,
const
ck
::
driver
::
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
&
compile_param
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
using
namespace
ck
::
driver
;
using
size_t
=
std
::
size_t
;
std
::
cout
<<
__func__
<<
std
::
endl
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
ConvolutionProblemDescriptor
conv_problem_desc
{
in_n_c_hi_wi_lengths
[
I0
],
out_n_k_ho_wo_lengths
[
I1
],
in_n_c_hi_wi_lengths
[
I1
],
wei_k_c_y_x_lengths
[
I2
],
wei_k_c_y_x_lengths
[
I3
],
in_n_c_hi_wi_lengths
[
I2
],
in_n_c_hi_wi_lengths
[
I3
],
out_n_k_ho_wo_lengths
[
I2
],
out_n_k_ho_wo_lengths
[
I3
],
conv_strides
[
I0
],
conv_strides
[
I1
],
conv_dilations
[
I0
],
conv_dilations
[
I1
],
in_left_pads
[
I0
],
in_left_pads
[
I1
],
in_right_pads
[
I0
],
in_right_pads
[
I1
],
get_datatype_enum_from_type
<
TInWei
>::
value
,
get_datatype_enum_from_type
<
TInWei
>::
value
,
get_datatype_enum_from_type
<
TOut
>::
value
};
if
(
!
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
::
IsValidCompileParameter
(
conv_problem_desc
,
compile_param
))
{
throw
std
::
runtime_error
(
"wrong! IsValidCompileParameter fail"
);
}
DeviceMem
in_n_c_hi_wi_dev_buf
(
sizeof
(
TInWei
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_c_y_x_dev_buf
(
sizeof
(
TInWei
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
DeviceMem
out_n_k_ho_wo_dev_buf
(
sizeof
(
TOut
)
*
out_n_k_ho_wo
.
mDesc
.
GetElementSpace
());
in_n_c_hi_wi_dev_buf
.
ToDevice
(
in_n_c_hi_wi
.
mData
.
data
());
wei_k_c_y_x_dev_buf
.
ToDevice
(
wei_k_c_y_x
.
mData
.
data
());
out_n_k_ho_wo_dev_buf
.
ToDevice
(
out_n_k_ho_wo
.
mData
.
data
());
// workspace is used for save transformed tensor descritpors created by prepare kernel
DeviceMem
workspace_dev_buf
(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
::
GetWorkSpaceSize
(
conv_problem_desc
,
compile_param
));
const
auto
block_size
=
std
::
size_t
(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
::
GetBlockSize
(
conv_problem_desc
,
compile_param
));
const
auto
grid_size
=
std
::
size_t
(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
::
GetGridSize
(
conv_problem_desc
,
compile_param
));
const
std
::
vector
<
size_t
>
vld1
=
{
1
,
1
,
1
};
const
std
::
vector
<
size_t
>
vgd1
=
{
1
,
1
,
1
};
const
std
::
vector
<
size_t
>
vld2
=
{
static_cast
<
size_t
>
(
block_size
),
1
,
1
};
const
std
::
vector
<
size_t
>
vgd2
=
{
static_cast
<
size_t
>
(
grid_size
*
block_size
),
1
,
1
};
std
::
string
program_name
=
"dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp"
;
std
::
string
algo_name
=
"implicit_gemm_conv_fwd_v6r1_dlops_nchw"
;
std
::
string
compile_param_string
=
get_ck_hip_online_compile_common_flag
()
+
compile_param
.
GetCompileParameterString
();
std
::
string
network_config
=
compile_param_string
;
std
::
vector
<
float
>
kernel1_times
;
std
::
vector
<
float
>
kernel2_times
;
for
(
index_t
i
=
0
;
i
<
nrepeat
+
1
;
++
i
)
{
KernelTimer
timer1
,
timer2
;
std
::
string
kernel_name
;
kernel_name
=
"dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare"
;
auto
network_config_1
=
network_config
+
"_1"
;
timer1
.
Start
();
handle
->
AddKernel
(
algo_name
,
network_config_1
,
program_name
,
kernel_name
,
vld1
,
vgd1
,
compile_param_string
)(
static_cast
<
index_t
>
(
in_n_c_hi_wi_lengths
[
I0
]),
static_cast
<
index_t
>
(
in_n_c_hi_wi_lengths
[
I1
]),
static_cast
<
index_t
>
(
in_n_c_hi_wi_lengths
[
I2
]),
static_cast
<
index_t
>
(
in_n_c_hi_wi_lengths
[
I3
]),
static_cast
<
index_t
>
(
wei_k_c_y_x_lengths
[
I0
]),
static_cast
<
index_t
>
(
wei_k_c_y_x_lengths
[
I2
]),
static_cast
<
index_t
>
(
wei_k_c_y_x_lengths
[
I3
]),
conv_strides
[
I0
],
conv_strides
[
I1
],
conv_dilations
[
I0
],
conv_dilations
[
I1
],
in_left_pads
[
I0
],
in_left_pads
[
I1
],
in_right_pads
[
I0
],
in_right_pads
[
I1
],
(
void
*
)(
workspace_dev_buf
.
GetDeviceBuffer
()));
timer1
.
End
();
kernel_name
=
"dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw"
;
auto
network_config_2
=
network_config
+
"_2"
;
timer2
.
Start
();
handle
->
AddKernel
(
algo_name
,
network_config_2
,
program_name
,
kernel_name
,
vld2
,
vgd2
,
compile_param_string
)(
reinterpret_cast
<
const
TInWei
*>
(
wei_k_c_y_x_dev_buf
.
GetDeviceBuffer
()),
reinterpret_cast
<
const
TInWei
*>
(
in_n_c_hi_wi_dev_buf
.
GetDeviceBuffer
()),
reinterpret_cast
<
TOut
*>
(
out_n_k_ho_wo_dev_buf
.
GetDeviceBuffer
()),
(
const
void
*
)(
workspace_dev_buf
.
GetDeviceBuffer
()));
timer2
.
End
();
kernel1_times
.
push_back
(
timer1
.
GetElapsedTime
());
kernel2_times
.
push_back
(
timer2
.
GetElapsedTime
());
}
{
auto
ave_time1
=
std
::
accumulate
(
std
::
next
(
kernel1_times
.
begin
()),
kernel1_times
.
end
(),
0.
,
std
::
plus
<
float
>
{})
/
nrepeat
;
auto
ave_time2
=
std
::
accumulate
(
std
::
next
(
kernel2_times
.
begin
()),
kernel2_times
.
end
(),
0.
,
std
::
plus
<
float
>
{})
/
nrepeat
;
float
perf
=
(
float
)(
conv_problem_desc
.
CalculateFlop
())
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
(
ave_time1
+
ave_time2
);
std
::
cout
<<
"Average time : "
<<
ave_time1
+
ave_time2
<<
" ms("
<<
ave_time1
<<
", "
<<
ave_time2
<<
"), "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
};
// copy result back to host
out_n_k_ho_wo_dev_buf
.
FromDevice
(
out_n_k_ho_wo
.
mData
.
data
());
}
host/online_compile/CMakeLists.txt
deleted
100644 → 0
View file @
cb954213
set
(
CMAKE_CXX_COMPILER /opt/rocm/llvm/bin/clang++
)
## for online-compiling of HIP kernels
set
(
OLC_HIP_COMPILER
${
CMAKE_CXX_COMPILER
}
CACHE PATH
""
)
## reset to avoid the C++ options from the parent project
set
(
CMAKE_CXX_FLAGS
""
)
message
(
"Compiling options for library and kernels:
${
CMAKE_CXX_FLAGS
}
"
)
# look for and register clang-offload-bundler
if
(
OLC_HIP_COMPILER MATCHES
".*clang
\\
+
\\
+$"
)
find_program
(
OLC_OFFLOADBUNDLER_BIN clang-offload-bundler
PATH_SUFFIXES bin
PATHS
/opt/rocm/llvm
${
CMAKE_INSTALL_PREFIX
}
/llvm
)
endif
()
if
(
OLC_OFFLOADBUNDLER_BIN
)
message
(
STATUS
"clang-offload-bundler found:
${
OLC_OFFLOADBUNDLER_BIN
}
"
)
set
(
OLC_OFFLOADBUNDLER_BIN
"
${
OLC_OFFLOADBUNDLER_BIN
}
"
)
else
()
# look for and register extractkernel
message
(
STATUS
"clang-offload-bundler not found"
)
find_program
(
EXTRACTKERNEL_BIN extractkernel
PATH_SUFFIXES bin
PATHS
/opt/rocm/hip
/opt/rocm/hcc
/opt/rocm
${
CMAKE_INSTALL_PREFIX
}
/hip
${
CMAKE_INSTALL_PREFIX
}
/hcc
${
CMAKE_INSTALL_PREFIX
}
)
if
(
EXTRACTKERNEL_BIN
)
message
(
STATUS
"extractkernel found:
${
EXTRACTKERNEL_BIN
}
"
)
set
(
EXTRACTKERNEL_BIN
"
${
EXTRACTKERNEL_BIN
}
"
)
else
()
message
(
FATAL_ERROR
"extractkernel not found"
)
endif
()
endif
()
option
(
Boost_USE_STATIC_LIBS
"Use boost static libraries"
OFF
)
set
(
BOOST_COMPONENTS filesystem
)
add_definitions
(
-DBOOST_ALL_NO_LIB=1
)
find_package
(
Boost REQUIRED COMPONENTS
${
BOOST_COMPONENTS
}
)
# HIP is always required
find_package
(
hip REQUIRED PATHS /opt/rocm
)
message
(
STATUS
"Build with HIP
${
hip_VERSION
}
"
)
target_flags
(
HIP_COMPILER_FLAGS hip::device
)
# Remove cuda arch flags
string
(
REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+
""
HIP_COMPILER_FLAGS
"
${
HIP_COMPILER_FLAGS
}
"
)
string
(
REGEX REPLACE --offload-arch=[a-z0-9]+
""
HIP_COMPILER_FLAGS
"
${
HIP_COMPILER_FLAGS
}
"
)
set
(
OLC_hip_VERSION_MAJOR
"
${
hip_VERSION_MAJOR
}
"
)
set
(
OLC_hip_VERSION_MINOR
"
${
hip_VERSION_MINOR
}
"
)
set
(
OLC_hip_VERSION_PATCH
"
${
hip_VERSION_PATCH
}
"
)
option
(
ENABLE_DEBUG
"Build to enable debugging"
ON
)
if
(
ENABLE_DEBUG
)
set
(
OLC_DEBUG 1
)
else
()
set
(
OLC_DEBUG 0
)
endif
()
configure_file
(
"
${
PROJECT_SOURCE_DIR
}
/host/online_compile/include/config.h.in"
"
${
PROJECT_BINARY_DIR
}
/host/online_compile/include/config.h"
)
include_directories
(
BEFORE
${
PROJECT_BINARY_DIR
}
/host/online_compile/include
)
message
(
STATUS
"Hip compiler flags:
${
HIP_COMPILER_FLAGS
}
"
)
## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels
set
(
HIP_COMPILER_FLAGS
"
${
HIP_COMPILER_FLAGS
}
${
HIP_ONLINE_COMPILER_FLAGS
}
"
)
add_definitions
(
"-DHIP_COMPILER_FLAGS=
${
HIP_COMPILER_FLAGS
}
"
)
file
(
GLOB_RECURSE COMPOSABLE_KERNEL_INCLUDE_1
"
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/*/*.hpp"
)
file
(
GLOB COMPOSABLE_KERNEL_INCLUDE_2
"
${
PROJECT_SOURCE_DIR
}
/external/rocm/include/bfloat16_dev.hpp"
)
set
(
MCONV_KERNEL_INCLUDES
${
COMPOSABLE_KERNEL_INCLUDE_1
}
${
COMPOSABLE_KERNEL_INCLUDE_2
}
)
file
(
GLOB_RECURSE MCONV_KERNELS
"
${
PROJECT_SOURCE_DIR
}
/composable_kernel/src/kernel_wrapper/*.cpp"
)
add_kernels
(
${
CMAKE_CURRENT_SOURCE_DIR
}
"
${
MCONV_KERNELS
}
"
)
add_kernel_includes
(
${
CMAKE_CURRENT_SOURCE_DIR
}
"
${
MCONV_KERNEL_INCLUDES
}
"
)
set
(
ONLINE_COMPILATION_SOURCE
${
PROJECT_BINARY_DIR
}
/kernel.cpp
${
PROJECT_BINARY_DIR
}
/kernel_includes.cpp
)
include_directories
(
BEFORE
${
PROJECT_BINARY_DIR
}
/host/online_compile/include
include
)
set
(
OLC_HIP_UTILITY_CPPS
hip_utility/logger.cpp
hip_utility/tmp_dir.cpp
hip_utility/md5.cpp
hip_utility/exec_utils.cpp
hip_utility/target_properties.cpp
hip_utility/handlehip.cpp
hip_utility/kernel_build_params.cpp
hip_utility/hip_build_utils.cpp
hip_utility/hipoc_program.cpp
hip_utility/hipoc_kernel.cpp
hip_utility/kernel_cache.cpp
hip_utility/binary_cache.cpp
)
list
(
APPEND OLC_SOURCES
${
OLC_HIP_UTILITY_CPPS
}
${
OLC_HIP_UTILITY_HEADERS
}
)
## addkernels provide the tool to create inlined kernels in one header
add_subdirectory
(
addkernels
)
function
(
inline_kernels_src KERNELS KERNEL_INCLUDES
)
set
(
KERNEL_SRC_HPP_FILENAME batch_all.cpp.hpp
)
set
(
KERNEL_SRC_HPP_PATH
${
PROJECT_BINARY_DIR
}
/inlined_kernels/
${
KERNEL_SRC_HPP_FILENAME
}
)
set
(
KERNEL_SRC_CPP_PATH
${
PROJECT_BINARY_DIR
}
/inlined_kernels/batch_all.cpp
)
add_custom_command
(
OUTPUT
${
KERNEL_SRC_HPP_PATH
}
WORKING_DIRECTORY
${
CMAKE_CURRENT_SOURCE_DIR
}
DEPENDS addkernels
${
KERNELS
}
${
KERNEL_INCLUDES
}
COMMAND $<TARGET_FILE:addkernels> -target
${
KERNEL_SRC_HPP_PATH
}
-extern -source
${
KERNELS
}
COMMENT
"Inlining All kernels"
)
configure_file
(
kernels_batch.cpp.in
${
KERNEL_SRC_CPP_PATH
}
)
list
(
APPEND OLC_SOURCES
${
KERNEL_SRC_CPP_PATH
}
${
KERNEL_SRC_HPP_PATH
}
)
set
(
OLC_SOURCES
${
OLC_SOURCES
}
PARENT_SCOPE
)
endfunction
()
inline_kernels_src
(
"
${
MCONV_KERNELS
}
"
"
${
MCONV_KERNEL_INCLUDES
}
"
)
list
(
APPEND ONLINE_COMPILATION_SOURCE
${
OLC_SOURCES
}
${
PROJECT_BINARY_DIR
}
/olc_kernel_includes.h
)
add_custom_command
(
OUTPUT
${
PROJECT_BINARY_DIR
}
/olc_kernel_includes.h
WORKING_DIRECTORY
${
CMAKE_CURRENT_SOURCE_DIR
}
DEPENDS addkernels
${
MCONV_KERNEL_INCLUDES
}
COMMAND $<TARGET_FILE:addkernels> -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ -target
${
PROJECT_BINARY_DIR
}
/olc_kernel_includes.h -source
${
MCONV_KERNEL_INCLUDES
}
COMMENT
"Inlining HIP kernel includes"
)
## the library target
add_library
(
online_compile SHARED
${
ONLINE_COMPILATION_SOURCE
}
)
target_include_directories
(
online_compile PRIVATE
${
CMAKE_CURRENT_SOURCE_DIR
}
/online_compile/include/
)
target_include_directories
(
online_compile PRIVATE
${
PROJECT_BINARY_DIR
}
)
target_include_directories
(
online_compile PRIVATE
${
PROJECT_SOURCE_DIR
}
/external/half/include/
)
target_link_libraries
(
online_compile PRIVATE hip::device
)
target_link_libraries
(
online_compile INTERFACE hip::host
)
target_link_libraries
(
online_compile PRIVATE Boost::filesystem
)
target_compile_features
(
online_compile PUBLIC
)
set_target_properties
(
online_compile PROPERTIES POSITION_INDEPENDENT_CODE ON
)
install
(
TARGETS online_compile LIBRARY DESTINATION lib
)
host/online_compile/addkernels/CMakeLists.txt
deleted
100644 → 0
View file @
cb954213
################################################################################
#
# MIT License
#
# Copyright (c) 2017 Advanced Micro Devices, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
################################################################################
set
(
ADD_KERNELS_SOURCE include_inliner.cpp addkernels.cpp
)
add_executable
(
addkernels EXCLUDE_FROM_ALL
${
ADD_KERNELS_SOURCE
}
)
host/online_compile/addkernels/addkernels.cpp
deleted
100644 → 0
View file @
cb954213
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "include_inliner.hpp"
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
void
Bin2Hex
(
std
::
istream
&
source
,
std
::
ostream
&
target
,
const
std
::
string
&
variable
,
bool
nullTerminate
,
size_t
bufferSize
,
size_t
lineSize
)
{
source
.
seekg
(
0
,
std
::
ios
::
end
);
std
::
unique_ptr
<
unsigned
char
[]
>
buffer
(
new
unsigned
char
[
bufferSize
]);
std
::
streamoff
sourceSize
=
source
.
tellg
();
std
::
streamoff
blockStart
=
0
;
if
(
variable
.
length
()
!=
0
)
{
target
<<
"extern const size_t "
<<
variable
<<
"_SIZE;"
<<
std
::
endl
;
target
<<
"extern const unsigned char "
<<
variable
<<
"[];"
<<
std
::
endl
;
target
<<
"const size_t "
<<
variable
<<
"_SIZE = "
<<
std
::
setbase
(
10
)
<<
sourceSize
<<
";"
<<
std
::
endl
;
target
<<
"const unsigned char "
<<
variable
<<
"[] = {"
<<
std
::
endl
;
}
target
<<
std
::
setbase
(
16
)
<<
std
::
setfill
(
'0'
);
source
.
seekg
(
0
,
std
::
ios
::
beg
);
while
(
blockStart
<
sourceSize
)
{
source
.
read
(
reinterpret_cast
<
char
*>
(
buffer
.
get
()),
bufferSize
);
std
::
streamoff
pos
=
source
.
tellg
();
std
::
streamoff
blockSize
=
(
pos
<
0
?
sourceSize
:
pos
)
-
blockStart
;
std
::
streamoff
i
=
0
;
while
(
i
<
blockSize
)
{
size_t
j
=
i
;
size_t
end
=
std
::
min
<
size_t
>
(
i
+
lineSize
,
blockSize
);
for
(;
j
<
end
;
j
++
)
target
<<
"0x"
<<
std
::
setw
(
2
)
<<
static_cast
<
unsigned
>
(
buffer
[
j
])
<<
","
;
target
<<
std
::
endl
;
i
=
end
;
}
blockStart
+=
blockSize
;
}
if
(
nullTerminate
)
target
<<
"0x00,"
<<
std
::
endl
;
if
(
variable
.
length
()
!=
0
)
{
target
<<
"};"
<<
std
::
endl
;
}
}
void
PrintHelp
()
{
std
::
cout
<<
"Usage: bin2hex {<option>}"
<<
std
::
endl
;
std
::
cout
<<
"Option format: -<option name>[ <option value>]"
<<
std
::
endl
;
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
"Options:"
<<
std
::
endl
;
std
::
cout
<<
"[REQUIRED] -s[ource] {<path to file>}: files to be processed. Must be last argument."
<<
std
::
endl
;
std
::
cout
<<
" -t[arget] <path>: target file. Default: std out."
<<
std
::
endl
;
std
::
cout
<<
" -l[ine-size] <number>: bytes in one line. Default: 16."
<<
std
::
endl
;
std
::
cout
<<
" -b[uffer] <number>: read buffer size. Default: 512."
<<
std
::
endl
;
std
::
cout
<<
" -g[uard] <string>: guard name. Default: no guard"
<<
std
::
endl
;
std
::
cout
<<
" -n[o-recurse] : dont expand include files recursively. Default: off"
<<
std
::
endl
;
}
[[
gnu
::
noreturn
]]
void
WrongUsage
(
const
std
::
string
&
error
)
{
std
::
cout
<<
"Wrong usage: "
<<
error
<<
std
::
endl
;
std
::
cout
<<
std
::
endl
;
PrintHelp
();
std
::
exit
(
1
);
}
[[
gnu
::
noreturn
]]
void
UnknownArgument
(
const
std
::
string
&
arg
)
{
std
::
ostringstream
ss
;
ss
<<
"unknown argument - "
<<
arg
;
WrongUsage
(
ss
.
str
());
}
void
Process
(
const
std
::
string
&
sourcePath
,
std
::
ostream
&
target
,
size_t
bufferSize
,
size_t
lineSize
,
bool
recurse
,
bool
as_extern
)
{
std
::
string
fileName
(
sourcePath
);
std
::
string
extension
,
root
;
std
::
stringstream
inlinerTemp
;
auto
extPos
=
fileName
.
rfind
(
'.'
);
auto
slashPos
=
fileName
.
rfind
(
'/'
);
if
(
extPos
!=
std
::
string
::
npos
)
{
extension
=
fileName
.
substr
(
extPos
+
1
);
fileName
=
fileName
.
substr
(
0
,
extPos
);
}
if
(
slashPos
!=
std
::
string
::
npos
)
{
root
=
fileName
.
substr
(
0
,
slashPos
+
1
);
fileName
=
fileName
.
substr
(
slashPos
+
1
);
}
std
::
string
variable
(
fileName
);
std
::
ifstream
sourceFile
(
sourcePath
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
std
::
istream
*
source
=
&
sourceFile
;
if
(
!
sourceFile
.
good
())
{
std
::
cerr
<<
"File not found: "
<<
sourcePath
<<
std
::
endl
;
std
::
exit
(
1
);
}
const
auto
is_asm
=
extension
==
"s"
;
const
auto
is_cl
=
extension
==
"cl"
;
const
auto
is_hip
=
extension
==
"cpp"
;
const
auto
is_header
=
extension
==
"hpp"
;
if
(
is_asm
||
is_cl
||
is_hip
||
is_header
)
{
IncludeInliner
inliner
;
try
{
if
(
is_asm
)
inliner
.
Process
(
sourceFile
,
inlinerTemp
,
root
,
sourcePath
,
".include"
,
false
,
recurse
);
else
if
(
is_cl
||
is_header
)
inliner
.
Process
(
sourceFile
,
inlinerTemp
,
root
,
sourcePath
,
"#include"
,
true
,
recurse
);
else
if
(
is_hip
)
inliner
.
Process
(
sourceFile
,
inlinerTemp
,
root
,
sourcePath
,
"<#not_include>"
,
true
,
false
);
}
catch
(
const
InlineException
&
ex
)
{
std
::
cerr
<<
ex
.
What
()
<<
std
::
endl
;
std
::
cerr
<<
ex
.
GetTrace
()
<<
std
::
endl
;
std
::
exit
(
1
);
}
source
=
&
inlinerTemp
;
}
std
::
transform
(
variable
.
begin
(),
variable
.
end
(),
variable
.
begin
(),
::
toupper
);
if
(
as_extern
&&
variable
.
length
()
!=
0
)
{
variable
=
"APP_KERNEL_"
+
variable
;
}
Bin2Hex
(
*
source
,
target
,
variable
,
true
,
bufferSize
,
lineSize
);
}
int
main
(
int
argsn
,
char
**
args
)
{
if
(
argsn
==
1
)
{
PrintHelp
();
return
2
;
}
std
::
string
guard
;
size_t
bufferSize
=
512
;
size_t
lineSize
=
16
;
std
::
ofstream
targetFile
;
std
::
ostream
*
target
=
&
std
::
cout
;
bool
recurse
=
true
;
bool
as_extern
=
false
;
int
i
=
0
;
while
(
++
i
<
argsn
&&
**
args
!=
'-'
)
{
std
::
string
arg
(
args
[
i
]
+
1
);
std
::
transform
(
arg
.
begin
(),
arg
.
end
(),
arg
.
begin
(),
::
tolower
);
if
(
arg
==
"s"
||
arg
==
"source"
)
{
if
(
guard
.
length
()
>
0
)
{
*
target
<<
"#ifndef "
<<
guard
<<
std
::
endl
;
*
target
<<
"#define "
<<
guard
<<
std
::
endl
;
}
*
target
<<
"#include <cstddef>"
<<
std
::
endl
;
while
(
++
i
<
argsn
)
{
Process
(
args
[
i
],
*
target
,
bufferSize
,
lineSize
,
recurse
,
as_extern
);
}
if
(
guard
.
length
()
>
0
)
{
*
target
<<
"#endif"
<<
std
::
endl
;
}
return
0
;
}
else
if
(
arg
==
"t"
||
arg
==
"target"
)
{
targetFile
.
open
(
args
[
++
i
],
std
::
ios
::
out
);
target
=
&
targetFile
;
}
else
if
(
arg
==
"l"
||
arg
==
"line-size"
)
lineSize
=
std
::
stol
(
args
[
++
i
]);
else
if
(
arg
==
"b"
||
arg
==
"buffer"
)
bufferSize
=
std
::
stol
(
args
[
++
i
]);
else
if
(
arg
==
"g"
||
arg
==
"guard"
)
guard
=
args
[
++
i
];
else
if
(
arg
==
"n"
||
arg
==
"no-recurse"
)
recurse
=
false
;
else
if
(
arg
==
"e"
||
arg
==
"extern"
)
as_extern
=
true
;
else
UnknownArgument
(
arg
);
}
WrongUsage
(
"source key is required"
);
}
host/online_compile/addkernels/include_inliner.cpp
deleted
100644 → 0
View file @
cb954213
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <algorithm>
#include <exception>
#include <fstream>
#include <sstream>
#ifdef _WIN32
#include <windows.h>
#endif
#ifdef __linux__
#include <linux/limits.h>
#include <cstdlib>
#endif // !WIN32
#include "include_inliner.hpp"
namespace
PathHelpers
{
static
int
GetMaxPath
()
{
#ifdef _WIN32
return
MAX_PATH
;
#else
return
PATH_MAX
;
#endif
}
static
std
::
string
GetAbsolutePath
(
const
std
::
string
&
path
)
{
std
::
string
result
(
GetMaxPath
(),
' '
);
#ifdef _WIN32
const
auto
retval
=
GetFullPathName
(
path
.
c_str
(),
result
.
size
(),
&
result
[
0
],
nullptr
);
if
(
retval
==
0
)
return
""
;
#else
auto
*
const
retval
=
realpath
(
path
.
c_str
(),
&
result
[
0
]);
if
(
retval
==
nullptr
)
return
""
;
#endif
return
result
;
}
}
// namespace PathHelpers
std
::
string
IncludeFileExceptionBase
::
What
()
const
{
std
::
ostringstream
ss
;
ss
<<
GetMessage
()
<<
": <"
<<
_file
<<
">"
;
return
ss
.
str
();
}
void
IncludeInliner
::
Process
(
std
::
istream
&
input
,
std
::
ostream
&
output
,
const
std
::
string
&
root
,
const
std
::
string
&
file_name
,
const
std
::
string
&
directive
,
bool
allow_angle_brackets
,
bool
recurse
)
{
ProcessCore
(
input
,
output
,
root
,
file_name
,
0
,
directive
,
allow_angle_brackets
,
recurse
);
}
void
IncludeInliner
::
ProcessCore
(
std
::
istream
&
input
,
std
::
ostream
&
output
,
const
std
::
string
&
root
,
const
std
::
string
&
file_name
,
int
line_number
,
const
std
::
string
&
directive
,
bool
allow_angle_brackets
,
bool
recurse
)
{
if
(
_include_depth
>=
include_depth_limit
)
throw
InlineStackOverflowException
(
GetIncludeStackTrace
(
0
));
_include_depth
++
;
_included_stack_head
=
std
::
make_shared
<
SourceFileDesc
>
(
file_name
,
_included_stack_head
,
line_number
);
auto
current_line
=
0
;
auto
next_include_optional
=
false
;
while
(
!
input
.
eof
())
{
std
::
string
line
;
std
::
string
word
;
std
::
getline
(
input
,
line
);
std
::
istringstream
line_parser
(
line
);
line_parser
>>
word
;
current_line
++
;
std
::
transform
(
word
.
begin
(),
word
.
end
(),
word
.
begin
(),
::
tolower
);
const
auto
include_optional
=
next_include_optional
;
next_include_optional
=
false
;
if
(
!
word
.
empty
()
&&
word
==
"//inliner-include-optional"
)
{
if
(
include_optional
)
throw
IncludeExpectedException
(
GetIncludeStackTrace
(
current_line
));
next_include_optional
=
true
;
continue
;
}
if
(
!
word
.
empty
()
&&
word
==
directive
&&
recurse
)
{
auto
first_quote_pos
=
line
.
find
(
'"'
,
static_cast
<
int
>
(
line_parser
.
tellg
())
+
1
);
std
::
string
::
size_type
second_quote_pos
;
if
(
first_quote_pos
!=
std
::
string
::
npos
)
{
second_quote_pos
=
line
.
find
(
'"'
,
first_quote_pos
+
1
);
if
(
second_quote_pos
==
std
::
string
::
npos
)
throw
WrongInlineDirectiveException
(
GetIncludeStackTrace
(
current_line
));
}
else
{
if
(
!
allow_angle_brackets
)
throw
WrongInlineDirectiveException
(
GetIncludeStackTrace
(
current_line
));
first_quote_pos
=
line
.
find
(
'<'
,
static_cast
<
int
>
(
line_parser
.
tellg
())
+
1
);
if
(
first_quote_pos
==
std
::
string
::
npos
)
throw
WrongInlineDirectiveException
(
GetIncludeStackTrace
(
current_line
));
second_quote_pos
=
line
.
find
(
'>'
,
first_quote_pos
+
1
);
if
(
second_quote_pos
==
std
::
string
::
npos
)
throw
WrongInlineDirectiveException
(
GetIncludeStackTrace
(
current_line
));
}
const
std
::
string
include_file_path
=
line
.
substr
(
first_quote_pos
+
1
,
second_quote_pos
-
first_quote_pos
-
1
);
const
std
::
string
abs_include_file_path
(
PathHelpers
::
GetAbsolutePath
(
root
+
"/"
+
include_file_path
));
// NOLINT
if
(
abs_include_file_path
.
empty
())
{
if
(
include_optional
)
continue
;
throw
IncludeNotFoundException
(
include_file_path
,
GetIncludeStackTrace
(
current_line
));
}
std
::
ifstream
include_file
(
abs_include_file_path
,
std
::
ios
::
in
);
if
(
!
include_file
.
good
())
throw
IncludeCantBeOpenedException
(
include_file_path
,
GetIncludeStackTrace
(
current_line
));
ProcessCore
(
include_file
,
output
,
root
,
include_file_path
,
current_line
,
directive
,
allow_angle_brackets
,
recurse
);
}
else
{
if
(
include_optional
)
throw
IncludeExpectedException
(
GetIncludeStackTrace
(
current_line
));
if
(
output
.
tellp
()
>
0
)
output
<<
std
::
endl
;
output
<<
line
;
}
}
auto
prev_file
=
_included_stack_head
->
included_from
;
_included_stack_head
=
prev_file
;
_include_depth
--
;
}
std
::
string
IncludeInliner
::
GetIncludeStackTrace
(
int
line
)
{
std
::
ostringstream
ss
;
if
(
_included_stack_head
==
nullptr
)
return
""
;
auto
item
=
_included_stack_head
;
ss
<<
" "
<<
item
->
path
<<
":"
<<
line
;
while
(
item
->
included_from
!=
nullptr
)
{
ss
<<
std
::
endl
<<
" from "
<<
item
->
included_from
->
path
<<
":"
<<
item
->
included_line
;
item
=
item
->
included_from
;
}
return
ss
.
str
();
}
host/online_compile/addkernels/include_inliner.hpp
deleted
100644 → 0
View file @
cb954213
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef SOURCE_INLINER_HPP
#define SOURCE_INLINER_HPP
#include "source_file_desc.hpp"
#include <ostream>
#include <memory>
#include <stack>
class
InlineException
:
public
std
::
exception
{
public:
InlineException
(
const
std
::
string
&
trace
)
:
_trace
(
trace
)
{}
virtual
std
::
string
What
()
const
=
0
;
const
std
::
string
&
GetTrace
()
const
{
return
_trace
;
}
private:
std
::
string
_trace
;
};
class
InlineStackOverflowException
:
public
InlineException
{
public:
InlineStackOverflowException
(
const
std
::
string
&
trace
)
:
InlineException
(
trace
)
{}
std
::
string
What
()
const
override
{
return
"Include stack depth limit has been reached, possible circle includes"
;
}
};
class
IncludeExpectedException
:
public
InlineException
{
public:
IncludeExpectedException
(
const
std
::
string
&
trace
)
:
InlineException
(
trace
)
{}
std
::
string
What
()
const
override
{
return
"Include directive expected"
;
}
};
class
WrongInlineDirectiveException
:
public
InlineException
{
public:
WrongInlineDirectiveException
(
const
std
::
string
&
trace
)
:
InlineException
(
trace
)
{}
std
::
string
What
()
const
override
{
return
"Include directive has wrong format"
;
}
};
class
IncludeFileExceptionBase
:
public
InlineException
{
public:
IncludeFileExceptionBase
(
const
std
::
string
&
file
,
const
std
::
string
&
trace
)
:
InlineException
(
trace
),
_file
(
file
)
{
}
std
::
string
What
()
const
override
;
virtual
std
::
string
GetMessage
()
const
=
0
;
private:
std
::
string
_file
;
};
class
IncludeNotFoundException
:
public
IncludeFileExceptionBase
{
public:
IncludeNotFoundException
(
const
std
::
string
&
file
,
const
std
::
string
&
trace
)
:
IncludeFileExceptionBase
(
file
,
trace
)
{
}
std
::
string
GetMessage
()
const
override
{
return
"Include file not found (if it is optional put //inliner-include-optional on line "
"before it)"
;
}
};
class
IncludeCantBeOpenedException
:
public
IncludeFileExceptionBase
{
public:
IncludeCantBeOpenedException
(
const
std
::
string
&
file
,
const
std
::
string
&
trace
)
:
IncludeFileExceptionBase
(
file
,
trace
)
{
}
std
::
string
GetMessage
()
const
override
{
return
"Can not open include file"
;
}
};
class
IncludeInliner
{
public:
int
include_depth_limit
=
256
;
void
Process
(
std
::
istream
&
input
,
std
::
ostream
&
output
,
const
std
::
string
&
root
,
const
std
::
string
&
file_name
,
const
std
::
string
&
directive
,
bool
allow_angle_brackets
,
bool
recurse
);
std
::
string
GetIncludeStackTrace
(
int
line
);
private:
int
_include_depth
=
0
;
std
::
shared_ptr
<
SourceFileDesc
>
_included_stack_head
=
nullptr
;
void
ProcessCore
(
std
::
istream
&
input
,
std
::
ostream
&
output
,
const
std
::
string
&
root
,
const
std
::
string
&
file_name
,
int
line_number
,
const
std
::
string
&
directive
,
bool
allow_angle_brackets
,
bool
recurse
);
};
#endif // !SOURCE_INLINER_HPP
host/online_compile/addkernels/source_file_desc.hpp
deleted
100644 → 0
View file @
cb954213
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef SOURCE_FILE_DESC_HPP
#define SOURCE_FILE_DESC_HPP
#include <string>
#include <memory>
class
SourceFileDesc
{
public:
const
std
::
string
path
;
int
included_line
;
std
::
shared_ptr
<
SourceFileDesc
>
included_from
;
SourceFileDesc
(
const
std
::
string
&
path_
,
std
::
shared_ptr
<
SourceFileDesc
>
from
,
int
line
)
:
path
(
path_
),
included_line
(
line
),
included_from
(
from
)
{
}
};
#endif // SOURCE_FILE_DESC_HPP
host/online_compile/hip_utility/binary_cache.cpp
deleted
100644 → 0
View file @
cb954213
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <binary_cache.hpp>
#include <handle.hpp>
#include <md5.hpp>
#include <env.hpp>
#include <stringutils.hpp>
#include <logger.hpp>
#include <target_properties.hpp>
#include <boost/filesystem.hpp>
#include <fstream>
#include <iostream>
namespace
online_compile
{
OLC_DECLARE_ENV_VAR
(
OLC_DISABLE_CACHE
)
OLC_DECLARE_ENV_VAR
(
HOME
)
static
boost
::
filesystem
::
path
ComputeCachePath
()
{
const
char
*
home_dir
=
GetStringEnv
(
HOME
{});
if
(
home_dir
==
nullptr
||
home_dir
==
std
::
string
(
"/"
)
||
home_dir
==
std
::
string
(
""
))
{
home_dir
=
"/tmp"
;
}
auto
p
=
boost
::
filesystem
::
path
{
home_dir
}
/
"_hip_binary_kernels_"
;
if
(
!
boost
::
filesystem
::
exists
(
p
))
boost
::
filesystem
::
create_directories
(
p
);
return
p
;
}
boost
::
filesystem
::
path
GetCachePath
()
{
static
const
boost
::
filesystem
::
path
user_path
=
ComputeCachePath
();
return
user_path
;
}
static
bool
IsCacheDisabled
()
{
return
online_compile
::
IsEnabled
(
OLC_DISABLE_CACHE
{});
}
boost
::
filesystem
::
path
GetCacheFile
(
const
std
::
string
&
device
,
const
std
::
string
&
name
,
const
std
::
string
&
args
)
{
// std::string filename = (is_kernel_str ? online_compile::md5(name) : name) + ".o";
std
::
string
filename
=
name
+
".o"
;
return
GetCachePath
()
/
online_compile
::
md5
(
device
+
":"
+
args
)
/
filename
;
}
boost
::
filesystem
::
path
LoadBinary
(
const
TargetProperties
&
target
,
const
size_t
num_cu
,
const
std
::
string
&
name
,
const
std
::
string
&
args
)
{
if
(
online_compile
::
IsCacheDisabled
())
return
{};
(
void
)
num_cu
;
auto
f
=
GetCacheFile
(
target
.
DbId
(),
name
,
args
);
if
(
boost
::
filesystem
::
exists
(
f
))
{
return
f
.
string
();
}
else
{
return
{};
}
}
void
SaveBinary
(
const
boost
::
filesystem
::
path
&
binary_path
,
const
TargetProperties
&
target
,
const
std
::
string
&
name
,
const
std
::
string
&
args
)
{
if
(
online_compile
::
IsCacheDisabled
())
{
boost
::
filesystem
::
remove
(
binary_path
);
}
else
{
auto
p
=
GetCacheFile
(
target
.
DbId
(),
name
,
args
);
boost
::
filesystem
::
create_directories
(
p
.
parent_path
());
boost
::
filesystem
::
rename
(
binary_path
,
p
);
}
}
}
// namespace online_compile
host/online_compile/hip_utility/exec_utils.cpp
deleted
100644 → 0
View file @
cb954213
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <exec_utils.hpp>
#include <manage_ptr.hpp>
#include <istream>
#include <ostream>
#include <string>
#include <cstdio>
#include <array>
#include <cassert>
#ifdef __linux__
#include <unistd.h>
#include <cstdio>
#include <sys/wait.h>
#endif // __linux__
namespace
online_compile
{
namespace
exec
{
int
Run
(
const
std
::
string
&
p
,
std
::
istream
*
in
,
std
::
ostream
*
out
)
{
#ifdef __linux__
const
auto
redirect_stdin
=
(
in
!=
nullptr
);
const
auto
redirect_stdout
=
(
out
!=
nullptr
);
assert
(
!
(
redirect_stdin
&&
redirect_stdout
));
const
auto
file_mode
=
redirect_stdout
?
"r"
:
"w"
;
OLC_MANAGE_PTR
(
FILE
*
,
pclose
)
pipe
{
popen
(
p
.
c_str
(),
file_mode
)};
if
(
!
pipe
)
throw
std
::
runtime_error
(
"online_compile::exec::Run(): popen("
+
p
+
", "
+
file_mode
+
") failed"
);
if
(
redirect_stdin
||
redirect_stdout
)
{
std
::
array
<
char
,
1024
>
buffer
{};
if
(
redirect_stdout
)
{
while
(
feof
(
pipe
.
get
())
==
0
)
if
(
fgets
(
buffer
.
data
(),
buffer
.
size
(),
pipe
.
get
())
!=
nullptr
)
*
out
<<
buffer
.
data
();
}
else
{
while
(
!
in
->
eof
())
{
in
->
read
(
buffer
.
data
(),
buffer
.
size
()
-
1
);
buffer
[
in
->
gcount
()]
=
0
;
if
(
fputs
(
buffer
.
data
(),
pipe
.
get
())
==
EOF
)
throw
std
::
runtime_error
(
"online_compile::exec::Run(): fputs() failed"
);
}
}
}
auto
status
=
pclose
(
pipe
.
release
());
return
WEXITSTATUS
(
status
);
#else
(
void
)
p
;
(
void
)
in
;
(
void
)
out
;
return
-
1
;
#endif // __linux__
}
}
// namespace exec
}
// namespace online_compile
host/online_compile/hip_utility/handlehip.cpp
deleted
100644 → 0
View file @
cb954213
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <handle.hpp>
#include <binary_cache.hpp>
#include <env.hpp>
#include <kernel_cache.hpp>
#include <stringutils.hpp>
#include <target_properties.hpp>
#include <hipCheck.hpp>
#include <write_file.hpp>
#include <boost/filesystem.hpp>
#include <boost/lexical_cast.hpp>
#ifndef _WIN32
#include <unistd.h>
#endif
#include <algorithm>
#include <cassert>
#include <chrono>
#include <thread>
OLC_DECLARE_ENV_VAR
(
OLC_DEVICE_CU
)
namespace
online_compile
{
std
::
size_t
GetAvailableMemory
()
{
size_t
free
,
total
;
MY_HIP_CHECK
(
hipMemGetInfo
(
&
free
,
&
total
));
return
free
;
}
int
get_device_id
()
// Get random device
{
int
device
;
MY_HIP_CHECK
(
hipGetDevice
(
&
device
));
return
device
;
}
void
set_device
(
int
id
)
{
MY_HIP_CHECK
(
hipSetDevice
(
id
));
}
int
set_default_device
()
{
int
n
;
MY_HIP_CHECK
(
hipGetDeviceCount
(
&
n
));
// Pick device based on process id
auto
pid
=
::
getpid
();
assert
(
pid
>
0
);
set_device
(
pid
%
n
);
return
(
pid
%
n
);
}
struct
HandleImpl
{
using
StreamPtr
=
std
::
shared_ptr
<
typename
std
::
remove_pointer
<
hipStream_t
>::
type
>
;
HandleImpl
()
{}
StreamPtr
create_stream
()
{
hipStream_t
result
;
MY_HIP_CHECK
(
hipStreamCreate
(
&
result
));
return
StreamPtr
{
result
,
&
hipStreamDestroy
};
}
static
StreamPtr
reference_stream
(
hipStream_t
s
)
{
return
StreamPtr
{
s
,
null_deleter
{}};
}
std
::
string
get_device_name
()
const
{
hipDeviceProp_t
props
;
MY_HIP_CHECK
(
hipGetDeviceProperties
(
&
props
,
device
));
const
std
::
string
name
(
props
.
gcnArchName
);
return
name
;
}
StreamPtr
stream
=
nullptr
;
int
device
=
-
1
;
KernelCache
cache
;
TargetProperties
target_properties
;
};
Handle
::
Handle
(
hipStream_t
stream
)
:
impl
(
new
HandleImpl
())
{
this
->
impl
->
device
=
get_device_id
();
if
(
stream
==
nullptr
)
this
->
impl
->
stream
=
HandleImpl
::
reference_stream
(
nullptr
);
else
this
->
impl
->
stream
=
HandleImpl
::
reference_stream
(
stream
);
this
->
impl
->
target_properties
.
Init
(
this
);
}
Handle
::
Handle
()
:
impl
(
new
HandleImpl
())
{
this
->
impl
->
device
=
get_device_id
();
this
->
impl
->
stream
=
HandleImpl
::
reference_stream
(
nullptr
);
this
->
impl
->
target_properties
.
Init
(
this
);
}
Handle
::~
Handle
()
{}
void
Handle
::
SetStream
(
hipStream_t
streamID
)
const
{
this
->
impl
->
stream
=
HandleImpl
::
reference_stream
(
streamID
);
this
->
impl
->
target_properties
.
Init
(
this
);
}
hipStream_t
Handle
::
GetStream
()
const
{
return
impl
->
stream
.
get
();
}
KernelInvoke
Handle
::
AddKernel
(
const
std
::
string
&
algorithm
,
const
std
::
string
&
network_config
,
const
std
::
string
&
program_name
,
const
std
::
string
&
kernel_name
,
const
std
::
vector
<
size_t
>&
vld
,
const
std
::
vector
<
size_t
>&
vgd
,
const
std
::
string
&
params
,
std
::
size_t
cache_index
)
const
{
auto
obj
=
this
->
impl
->
cache
.
AddKernel
(
*
this
,
algorithm
,
network_config
,
program_name
,
kernel_name
,
vld
,
vgd
,
params
,
cache_index
);
return
this
->
Run
(
obj
);
}
void
Handle
::
ClearKernels
(
const
std
::
string
&
algorithm
,
const
std
::
string
&
network_config
)
const
{
this
->
impl
->
cache
.
ClearKernels
(
algorithm
,
network_config
);
}
const
std
::
vector
<
Kernel
>&
Handle
::
GetKernelsImpl
(
const
std
::
string
&
algorithm
,
const
std
::
string
&
network_config
)
const
{
return
this
->
impl
->
cache
.
GetKernels
(
algorithm
,
network_config
);
}
bool
Handle
::
HasKernel
(
const
std
::
string
&
algorithm
,
const
std
::
string
&
network_config
)
const
{
return
this
->
impl
->
cache
.
HasKernels
(
algorithm
,
network_config
);
}
KernelInvoke
Handle
::
Run
(
Kernel
k
)
const
{
return
k
.
Invoke
(
this
->
GetStream
());
}
Program
Handle
::
LoadProgram
(
const
std
::
string
&
program_name
,
std
::
string
params
)
const
{
if
((
!
online_compile
::
EndsWith
(
program_name
,
".mlir-cpp"
))
&&
(
!
online_compile
::
EndsWith
(
program_name
,
".mlir"
)))
{
params
+=
" -mcpu="
+
this
->
GetTargetProperties
().
Name
();
}
auto
hsaco
=
online_compile
::
LoadBinary
(
this
->
GetTargetProperties
(),
this
->
GetMaxComputeUnits
(),
program_name
,
params
);
if
(
hsaco
.
empty
())
{
auto
p
=
HIPOCProgram
{
program_name
,
params
,
this
->
GetTargetProperties
()};
auto
path
=
online_compile
::
GetCachePath
()
/
boost
::
filesystem
::
unique_path
();
if
(
p
.
IsCodeObjectInMemory
())
online_compile
::
WriteFile
(
p
.
GetCodeObjectBlob
(),
path
);
else
boost
::
filesystem
::
copy_file
(
p
.
GetCodeObjectPathname
(),
path
);
online_compile
::
SaveBinary
(
path
,
this
->
GetTargetProperties
(),
program_name
,
params
);
return
p
;
}
else
{
return
HIPOCProgram
{
program_name
,
hsaco
};
}
}
bool
Handle
::
HasProgram
(
const
std
::
string
&
program_name
,
const
std
::
string
&
params
)
const
{
return
this
->
impl
->
cache
.
HasProgram
(
program_name
,
params
);
}
void
Handle
::
AddProgram
(
Program
prog
,
const
std
::
string
&
program_name
,
const
std
::
string
&
params
)
const
{
this
->
impl
->
cache
.
AddProgram
(
prog
,
program_name
,
params
);
}
void
Handle
::
Finish
()
const
{
MY_HIP_CHECK
(
hipStreamSynchronize
(
this
->
GetStream
()));
}
std
::
size_t
Handle
::
GetLocalMemorySize
()
const
{
int
result
;
MY_HIP_CHECK
(
hipDeviceGetAttribute
(
&
result
,
hipDeviceAttributeMaxSharedMemoryPerBlock
,
this
->
impl
->
device
));
return
result
;
}
std
::
size_t
Handle
::
GetGlobalMemorySize
()
const
{
size_t
result
;
MY_HIP_CHECK
(
hipDeviceTotalMem
(
&
result
,
this
->
impl
->
device
));
return
result
;
}
std
::
size_t
Handle
::
GetMaxComputeUnits
()
const
{
int
result
;
const
char
*
const
num_cu
=
online_compile
::
GetStringEnv
(
OLC_DEVICE_CU
{});
if
(
num_cu
!=
nullptr
&&
strlen
(
num_cu
)
>
0
)
{
return
boost
::
lexical_cast
<
std
::
size_t
>
(
num_cu
);
}
MY_HIP_CHECK
(
hipDeviceGetAttribute
(
&
result
,
hipDeviceAttributeMultiprocessorCount
,
this
->
impl
->
device
));
return
result
;
}
std
::
size_t
Handle
::
GetWavefrontWidth
()
const
{
hipDeviceProp_t
props
{};
MY_HIP_CHECK
(
hipGetDeviceProperties
(
&
props
,
this
->
impl
->
device
));
auto
result
=
static_cast
<
size_t
>
(
props
.
warpSize
);
return
result
;
}
std
::
string
Handle
::
GetDeviceNameImpl
()
const
{
return
this
->
impl
->
get_device_name
();
}
std
::
string
Handle
::
GetDeviceName
()
const
{
return
this
->
impl
->
target_properties
.
Name
();
}
const
TargetProperties
&
Handle
::
GetTargetProperties
()
const
{
return
this
->
impl
->
target_properties
;
}
std
::
ostream
&
Handle
::
Print
(
std
::
ostream
&
os
)
const
{
os
<<
"stream: "
<<
this
->
impl
->
stream
<<
", device_id: "
<<
this
->
impl
->
device
;
return
os
;
}
}
// namespace online_compile
host/online_compile/hip_utility/hip_build_utils.cpp
deleted
100644 → 0
View file @
cb954213
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <hip_build_utils.hpp>
#include <stringutils.hpp>
#include <tmp_dir.hpp>
#include <env.hpp>
#include <target_properties.hpp>
#include <write_file.hpp>
#include <exec_utils.hpp>
#include <logger.hpp>
#include <config.h>
#include <boost/optional.hpp>
#include <sstream>
#include <string>
#include <stdexcept>
#include <iostream>
OLC_DECLARE_ENV_VAR
(
OLC_DEBUG_HIP_VERBOSE
)
OLC_DECLARE_ENV_VAR
(
OLC_DEBUG_HIP_DUMP
)
#define OLC_HIP_COMPILER "/opt/rocm/llvm/bin/clang++"
namespace
online_compile
{
bool
IsHccCompiler
()
{
static
const
auto
isHcc
=
EndsWith
(
OLC_HIP_COMPILER
,
"hcc"
);
return
isHcc
;
}
bool
IsHipClangCompiler
()
{
static
const
auto
isClangXX
=
EndsWith
(
OLC_HIP_COMPILER
,
"clang++"
);
return
isClangXX
;
}
namespace
{
inline
bool
ProduceCoV3
()
{
// Otherwise, let's enable CO v3 for HIP kernels since ROCm 3.0.
return
(
HipCompilerVersion
()
>=
external_tool_version_t
{
3
,
0
,
-
1
});
}
/// Returns option for enabling/disabling CO v3 generation for the compiler
/// that builds HIP kernels, depending on compiler version etc.
inline
const
std
::
string
&
GetCoV3Option
(
const
bool
enable
)
{
/// \note PR #2166 uses the "--hcc-cov3" option when isHCC is true.
/// It's unclear why... HCC included in ROCm 2.8 does not support it,
/// perhaps it suits for some older HCC?
///
/// These options are Ok for ROCm 3.0:
static
const
std
::
string
option_enable
{
"-mcode-object-v3"
};
static
const
std
::
string
no_option
{};
if
(
enable
)
return
option_enable
;
else
return
no_option
;
}
}
// namespace
static
boost
::
filesystem
::
path
HipBuildImpl
(
boost
::
optional
<
TmpDir
>&
tmp_dir
,
const
std
::
string
&
filename
,
std
::
string
src
,
std
::
string
params
,
const
TargetProperties
&
target
,
const
bool
testing_mode
,
const
bool
sources_already_reside_on_filesystem
)
{
#ifdef __linux__
// Write out the include files
// Let's assume includes are overkill for feature tests & optimize'em out.
if
(
!
testing_mode
)
{
auto
inc_list
=
GetHipKernelIncList
();
auto
inc_path
=
tmp_dir
->
path
;
boost
::
filesystem
::
create_directories
(
inc_path
);
for
(
auto
inc_file
:
inc_list
)
{
auto
inc_src
=
GetKernelInc
(
inc_file
);
WriteFile
(
inc_src
,
inc_path
/
inc_file
);
}
}
// Sources produced by MLIR-cpp already reside in tmp dir.
if
(
!
sources_already_reside_on_filesystem
)
{
src
+=
"
\n
int main() {}
\n
"
;
WriteFile
(
src
,
tmp_dir
->
path
/
filename
);
}
// cppcheck-suppress unreadVariable
const
LcOptionTargetStrings
lots
(
target
);
auto
env
=
std
::
string
(
""
);
if
(
IsHccCompiler
())
{
params
+=
" -amdgpu-target="
+
target
.
Name
();
params
+=
" "
+
GetCoV3Option
(
ProduceCoV3
());
}
else
if
(
IsHipClangCompiler
())
{
if
(
params
.
find
(
"-std="
)
==
std
::
string
::
npos
)
params
+=
" --std=c++11"
;
if
(
HipCompilerVersion
()
<
external_tool_version_t
{
4
,
1
,
0
})
params
+=
" --cuda-gpu-arch="
+
lots
.
device
;
else
params
+=
" --cuda-gpu-arch="
+
lots
.
device
+
lots
.
xnack
;
params
+=
" --cuda-device-only"
;
params
+=
" -c"
;
params
+=
" -O3 "
;
}
params
+=
" -Wno-unused-command-line-argument -I. "
;
params
+=
OLC_STRINGIZE
(
HIP_COMPILER_FLAGS
);
if
(
IsHccCompiler
())
{
env
+=
std
::
string
(
"KMOPTLLC=
\"
-mattr=+enable-ds128 "
);
if
(
HipCompilerVersion
()
>=
external_tool_version_t
{
2
,
8
,
0
})
env
+=
" --amdgpu-spill-vgpr-to-agpr=0"
;
env
+=
'\"'
;
}
else
if
(
IsHipClangCompiler
())
{
params
+=
" -mllvm --amdgpu-spill-vgpr-to-agpr=0"
;
params
+=
" -mllvm -amdgpu-early-inline-all=true"
;
params
+=
" -mllvm -amdgpu-function-calls=false"
;
}
if
(
online_compile
::
IsEnabled
(
OLC_DEBUG_HIP_VERBOSE
{}))
{
params
+=
" -v"
;
}
if
(
online_compile
::
IsEnabled
(
OLC_DEBUG_HIP_DUMP
{}))
{
if
(
IsHccCompiler
())
{
params
+=
" -gline-tables-only"
;
env
+=
" KMDUMPISA=1"
;
env
+=
" KMDUMPLLVM=1"
;
}
else
if
(
IsHipClangCompiler
())
{
params
+=
" -gline-tables-only"
;
params
+=
" -save-temps"
;
}
}
// hip version
params
+=
std
::
string
(
" -DHIP_PACKAGE_VERSION_FLAT="
)
+
std
::
to_string
(
HIP_PACKAGE_VERSION_FLAT
);
params
+=
" "
;
auto
bin_file
=
tmp_dir
->
path
/
(
filename
+
".o"
);
// compile
const
std
::
string
redirector
=
testing_mode
?
" 1>/dev/null 2>&1"
:
""
;
tmp_dir
->
Execute
(
env
+
std
::
string
(
" "
)
+
OLC_HIP_COMPILER
,
params
+
filename
+
" -o "
+
bin_file
.
string
()
+
redirector
);
if
(
!
boost
::
filesystem
::
exists
(
bin_file
))
throw
std
::
runtime_error
(
filename
+
" failed to compile"
);
#ifdef EXTRACTKERNEL_BIN
if
(
IsHccCompiler
())
{
// call extract kernel
tmp_dir
->
Execute
(
EXTRACTKERNEL_BIN
,
" -i "
+
bin_file
.
string
());
auto
hsaco
=
std
::
find_if
(
boost
::
filesystem
::
directory_iterator
{
tmp_dir
->
path
},
{},
[](
auto
entry
)
{
return
(
entry
.
path
().
extension
()
==
".hsaco"
);
});
if
(
hsaco
==
boost
::
filesystem
::
directory_iterator
{})
{
fdt_log
(
LogLevel
::
Info
,
"HipBuild"
,
"failed to find *.hsaco in "
)
<<
hsaco
->
path
().
string
()
<<
std
::
endl
;
}
return
hsaco
->
path
();
}
#endif
return
bin_file
;
#else
(
void
)
filename
;
(
void
)
params
;
throw
std
::
runtimer_error
(
"HIP kernels are only supported in Linux"
);
#endif
}
boost
::
filesystem
::
path
HipBuild
(
boost
::
optional
<
TmpDir
>&
tmp_dir
,
const
std
::
string
&
filename
,
std
::
string
src
,
std
::
string
params
,
const
TargetProperties
&
target
,
const
bool
sources_already_reside_on_filesystem
)
{
return
HipBuildImpl
(
tmp_dir
,
filename
,
src
,
params
,
target
,
false
,
sources_already_reside_on_filesystem
);
}
void
bin_file_to_str
(
const
boost
::
filesystem
::
path
&
file
,
std
::
string
&
buf
)
{
std
::
ifstream
bin_file_ptr
(
file
.
string
().
c_str
(),
std
::
ios
::
binary
);
std
::
ostringstream
bin_file_strm
;
bin_file_strm
<<
bin_file_ptr
.
rdbuf
();
buf
=
bin_file_strm
.
str
();
}
static
external_tool_version_t
HipCompilerVersionImpl
()
{
external_tool_version_t
version
;
if
(
IsHccCompiler
())
{
const
std
::
string
path
(
OLC_HIP_COMPILER
);
const
std
::
string
mandatory_prefix
(
"(based on HCC "
);
do
{
if
(
path
.
empty
()
||
!
std
::
ifstream
(
path
).
good
())
break
;
std
::
stringstream
out
;
if
(
online_compile
::
exec
::
Run
(
path
+
" --version"
,
nullptr
,
&
out
)
!=
0
)
break
;
std
::
string
line
;
while
(
!
out
.
eof
())
{
std
::
getline
(
out
,
line
);
fdt_log
()
<<
line
;
auto
begin
=
line
.
find
(
mandatory_prefix
);
if
(
begin
==
std
::
string
::
npos
)
continue
;
begin
+=
mandatory_prefix
.
size
();
int
v3
,
v2
,
v1
=
v2
=
v3
=
-
1
;
char
c2
,
c1
=
c2
=
'X'
;
std
::
istringstream
iss
(
line
.
substr
(
begin
));
iss
>>
v1
>>
c1
>>
v2
>>
c2
>>
v3
;
if
(
!
iss
.
fail
()
&&
v1
>=
0
)
{
version
.
major
=
v1
;
if
(
c1
==
'.'
&&
v2
>=
0
)
{
version
.
minor
=
v2
;
if
(
c2
==
'.'
&&
v3
>=
0
)
version
.
patch
=
v3
;
}
}
break
;
}
}
while
(
false
);
}
else
{
#ifdef HIP_PACKAGE_VERSION_MAJOR
fdt_log
(
LogLevel
::
Info
,
"HipCompilerVersion"
,
"Read version information from HIP package..."
);
version
.
major
=
HIP_PACKAGE_VERSION_MAJOR
;
#ifdef HIP_PACKAGE_VERSION_MINOR
version
.
minor
=
HIP_PACKAGE_VERSION_MINOR
;
#else
version
.
minor
=
0
;
#endif
#ifdef HIP_PACKAGE_VERSION_PATCH
version
.
patch
=
HIP_PACKAGE_VERSION_PATCH
;
#else
version
.
patch
=
0
;
#endif
#else // HIP_PACKAGE_VERSION_MAJOR is not defined. CMake failed to find HIP package.
fdt_log
(
LogLevel
::
Info
,
"HipCompilerVersion"
,
"...assuming 3.2.0 (hip-clang RC)"
);
version
.
major
=
3
;
version
.
minor
=
2
;
version
.
patch
=
0
;
#endif
}
fdt_log
()
<<
version
.
major
<<
'.'
<<
version
.
minor
<<
'.'
<<
version
.
patch
<<
std
::
endl
;
return
version
;
}
external_tool_version_t
HipCompilerVersion
()
{
// NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables)
static
auto
once
=
HipCompilerVersionImpl
();
return
once
;
}
bool
operator
>
(
const
external_tool_version_t
&
lhs
,
const
external_tool_version_t
&
rhs
)
{
if
(
lhs
.
major
>
rhs
.
major
)
return
true
;
else
if
(
lhs
.
major
==
rhs
.
major
)
{
if
(
lhs
.
minor
>
rhs
.
minor
)
return
true
;
else
if
(
lhs
.
minor
==
rhs
.
minor
)
return
(
lhs
.
patch
>
rhs
.
patch
);
else
return
false
;
}
else
return
false
;
}
bool
operator
<
(
const
external_tool_version_t
&
lhs
,
const
external_tool_version_t
&
rhs
)
{
return
rhs
>
lhs
;
}
bool
operator
>=
(
const
external_tool_version_t
&
lhs
,
const
external_tool_version_t
&
rhs
)
{
return
!
(
lhs
<
rhs
);
}
bool
operator
<=
(
const
external_tool_version_t
&
lhs
,
const
external_tool_version_t
&
rhs
)
{
return
!
(
lhs
>
rhs
);
}
}
// namespace online_compile
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment