"docs/git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "f389848010870356b2426f1fd68bb635050f3b80"
Commit ae98b52a authored by Chao Liu

remove online compilation from CK

parent cb954213
cmake_minimum_required(VERSION 2.8.3)
project(modular_convolution)
project(composable_kernel)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
include(TargetFlags)
include(AddKernels)
## C++
enable_language(CXX)
set(CMAKE_CXX_STANDARD 17)
......
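# add_kernels(): declares the embedded kernel byte arrays (APP_KERNEL_<NAME> / APP_KERNEL_<NAME>_SIZE)
# and generates ${PROJECT_BINARY_DIR}/kernel.cpp from kernel.cpp.in with a kernel-name -> source table.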
function(add_kernels SRC_DIR KERNEL_FILES)
set(INIT_KERNELS_LIST)
set(KERNELS_DECLS)
foreach(KERNEL_FILE ${KERNEL_FILES})
if("${CMAKE_VERSION}" VERSION_LESS 3.0)
configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete)
else()
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE})
endif()
get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE)
string(TOUPPER "${BASE_NAME}" KEY_NAME)
string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME)
string(APPEND KERNELS_DECLS "extern const size_t APP_KERNEL_${VAR_NAME}_SIZE;\n")
string(APPEND KERNELS_DECLS "extern const unsigned char APP_KERNEL_${VAR_NAME}[];\n")
list(APPEND INIT_KERNELS_LIST " { \"${KEY_NAME}\", std::string(reinterpret_cast<const char*>(APP_KERNEL_${VAR_NAME}), APP_KERNEL_${VAR_NAME}_SIZE) }")
endforeach()
string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}")
configure_file(${SRC_DIR}/kernel.cpp.in ${PROJECT_BINARY_DIR}/kernel.cpp)
endfunction()
function(add_kernel_includes SRC_DIR KERNEL_FILES)
set(INIT_KERNELS_LIST)
foreach(KERNEL_FILE ${KERNEL_FILES})
if("${CMAKE_VERSION}" VERSION_LESS 3.0)
configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete)
else()
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE})
endif()
get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE)
get_filename_component(FILE_NAME ${KERNEL_FILE} NAME)
string(TOUPPER "${BASE_NAME}" KEY_NAME)
string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME)
list(APPEND INIT_KERNELS_LIST " { \"${FILE_NAME}\", std::string(reinterpret_cast<const char*>(${VAR_NAME}), ${VAR_NAME}_SIZE) }")
endforeach()
string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}")
configure_file(${SRC_DIR}/kernel_includes.cpp.in ${PROJECT_BINARY_DIR}/kernel_includes.cpp)
endfunction()
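# get_target_property2(): like get_target_property(), but returns an empty string instead of
# <VAR>-NOTFOUND when the property is not set.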
function(get_target_property2 VAR TARGET PROPERTY)
get_target_property(_pflags ${TARGET} ${PROPERTY})
if(_pflags)
set(${VAR} ${_pflags} PARENT_SCOPE)
else()
set(${VAR} "" PARENT_SCOPE)
endif()
endfunction()
macro(append_flags FLAGS TARGET PROPERTY PREFIX)
get_target_property2(_pflags ${TARGET} ${PROPERTY})
foreach(FLAG ${_pflags})
if(TARGET ${FLAG})
target_flags(_pflags2 ${FLAG})
string(APPEND ${FLAGS} " ${_pflags2}")
else()
string(APPEND ${FLAGS} " ${PREFIX}${FLAG}")
endif()
endforeach()
endmacro()
macro(append_link_flags FLAGS TARGET PROPERTY)
get_target_property2(_pflags ${TARGET} ${PROPERTY})
foreach(FLAG ${_pflags})
if(TARGET ${FLAG})
target_flags(_pflags2 ${FLAG})
string(APPEND ${FLAGS} " ${_pflags2}")
elseif(FLAG MATCHES "^-.*")
string(APPEND ${FLAGS} " ${FLAG}")
elseif(EXISTS ${FLAG})
string(APPEND ${FLAGS} " ${FLAG}")
else()
string(APPEND ${FLAGS} " -l${FLAG}")
endif()
endforeach()
endmacro()
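# target_flags(): flattens a target's INTERFACE_* usage requirements (compile options/definitions,
# include and link directories, link options/libraries) into a single command-line string,
# recursing into any linked CMake targets.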
function(target_flags FLAGS TARGET)
set(_flags)
append_flags(_flags ${TARGET} "INTERFACE_COMPILE_OPTIONS" "")
append_flags(_flags ${TARGET} "INTERFACE_COMPILE_DEFINITIONS" "-D")
append_flags(_flags ${TARGET} "INTERFACE_INCLUDE_DIRECTORIES" "-isystem ")
append_flags(_flags ${TARGET} "INTERFACE_LINK_DIRECTORIES" "-L ")
append_flags(_flags ${TARGET} "INTERFACE_LINK_OPTIONS" "")
append_link_flags(_flags ${TARGET} "INTERFACE_LINK_LIBRARIES" "")
# message("_flags: ${_flags}")
set(${FLAGS} ${_flags} PARENT_SCOPE)
endfunction()
add_subdirectory(host_tensor)
add_subdirectory(online_compile)
add_subdirectory(driver_offline)
add_subdirectory(driver_online)
include_directories(BEFORE
include
${PROJECT_BINARY_DIR}/host/online_compile/include
${PROJECT_SOURCE_DIR}/host/online_compile/include
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/host/solver/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/composable_kernel/include/driver
${PROJECT_SOURCE_DIR}/external/rocm/include
${PROJECT_SOURCE_DIR}/external/half/include
)
set(CONV_FWD_DRIVER_ONLINE_SOURCE conv_fwd_driver_online.cpp)
add_executable(conv_fwd_driver_online ${CONV_FWD_DRIVER_ONLINE_SOURCE})
target_link_libraries(conv_fwd_driver_online PRIVATE host_tensor)
target_link_libraries(conv_fwd_driver_online PRIVATE online_compile)
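// conv_fwd_driver_online.cpp: command-line driver that runs, times and (optionally) verifies the
// online-compiled forward-convolution kernels.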
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <thread>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "handle.hpp"
#include "hipCheck.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_CONV_FWD_V4R4_NCHW 1
#define USE_CONV_FWD_V6R1_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NHWC 1
enum ConvForwardAlgo
{
V4R4NCHW, // 0
V6R1NCHW, // 1
V4R4XDLNCHW, // 2
V4R4XDLNHWC // 3
};
int main(int argc, char* argv[])
{
using namespace ck;
using namespace ck::driver;
using size_t = std::size_t;
hipStream_t stream;
online_compile::Handle* handle;
MY_HIP_CHECK(hipStreamCreate(&stream));
handle = new online_compile::Handle(stream);
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
if(argc != 22)
{
printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
exit(1);
}
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(atoi(argv[2]));
const bool do_verification = atoi(argv[3]);
const int init_method = atoi(argv[4]);
const bool do_log = atoi(argv[5]);
const int nrepeat = atoi(argv[6]);
const index_t N = atoi(argv[7]);
const index_t K = atoi(argv[8]);
const index_t C = atoi(argv[9]);
const index_t Y = atoi(argv[10]);
const index_t X = atoi(argv[11]);
const index_t Hi = atoi(argv[12]);
const index_t Wi = atoi(argv[13]);
const index_t conv_stride_h = atoi(argv[14]);
const index_t conv_stride_w = atoi(argv[15]);
const index_t conv_dilation_h = atoi(argv[16]);
const index_t conv_dilation_w = atoi(argv[17]);
const index_t in_left_pad_h = atoi(argv[18]);
const index_t in_left_pad_w = atoi(argv[19]);
const index_t in_right_pad_h = atoi(argv[20]);
const index_t in_right_pad_w = atoi(argv[21]);
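// effective (dilated) filter sizes and the resulting output spatial sizes:
// Ho = (Hi + LeftPy + RightPy - ((Y - 1) * Dy + 1)) / Sy + 1, and likewise for Wo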
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
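// choose the problem data types at compile time: fp32 (currently enabled), fp16 or int8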
#if 1
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 0
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
switch(layout)
{
case ConvTensorLayout::NCHW:
// NCHW
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(C);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
break;
case ConvTensorLayout::NHWC:
// NHWC
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(Hi);
in_lengths_host[2] = static_cast<std::size_t>(Wi);
in_lengths_host[3] = static_cast<std::size_t>(C);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(Y);
wei_lengths_host[2] = static_cast<std::size_t>(X);
wei_lengths_host[3] = static_cast<std::size_t>(C);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(Ho);
out_lengths_host[2] = static_cast<std::size_t>(Wo);
out_lengths_host[3] = static_cast<std::size_t>(K);
break;
default: throw std::runtime_error("wrong! not implemented");
}
Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host);
Tensor<out_data_t> out_host(out_lengths_host);
Tensor<out_data_t> out_device(out_lengths_host);
std::cout << "layout: " << layout << std::endl;
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency();
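// fill the input and weight tensors; init_method selects constant or random (integer / float) values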
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
auto f_make_for_device_nchw = [&]() {
const auto in_lengths_dev = make_tuple(N, C, Hi, Wi);
const auto wei_lengths_dev = make_tuple(K, C, Y, X);
const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);
return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev);
};
auto f_make_for_device_nhwc = [&]() {
const auto in_lengths_dev = make_tuple(N, Hi, Wi, C);
const auto wei_lengths_dev = make_tuple(K, Y, X, C);
const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev);
};
const auto conv_strides = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads = make_tuple(in_right_pad_h, in_right_pad_w);
#if USE_CONV_FWD_V4R4_NCHW
if(algo == ConvForwardAlgo::V4R4NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw;
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
#if USE_CONV_FWD_V6R1_NCHW
if(algo == ConvForwardAlgo::V6R1NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
#if 1
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
get_datatype_enum_from_type<in_data_t>::value,
get_datatype_enum_from_type<acc_data_t>::value,
get_datatype_enum_from_type<out_data_t>::value,
256,
4,
1,
128,
32,
8,
4,
4,
1,
{8, 2},
{8, 2},
{4, 1, 1, 1, 1},
{2, 1, 1, 128, 1},
{4, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
{1, 4, 1, 1, 1},
{8, 1, 1, 32, 1},
{1, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
4,
true,
true};
#elif 0
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
get_datatype_enum_from_type<in_data_t>::value,
get_datatype_enum_from_type<acc_data_t>::value,
get_datatype_enum_from_type<out_data_t>::value,
256,
4,
2,
128,
32,
8,
4,
4,
1,
{8, 2},
{8, 2},
{4, 1, 1, 1, 2},
{2, 1, 1, 128, 1},
{4, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
{1, 4, 1, 1, 2},
{8, 1, 1, 32, 1},
{1, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
4,
true,
true};
#elif 1
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
get_datatype_enum_from_type<in_data_t>::value,
get_datatype_enum_from_type<acc_data_t>::value,
get_datatype_enum_from_type<out_data_t>::value,
256,
4,
4,
128,
32,
8,
4,
4,
1,
{8, 2},
{8, 2},
{4, 1, 1, 1, 4},
{2, 1, 1, 128, 1},
{4, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
{1, 4, 1, 1, 4},
{8, 1, 1, 32, 1},
{1, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
4,
true,
true};
#endif
online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
compile_param,
nrepeat);
}
#endif
#if USE_CONV_FWD_V4R4_XDLOPS_NCHW
if(algo == ConvForwardAlgo::V4R4XDLNCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
#if USE_CONV_FWD_V4R4_XDLOPS_NHWC
if(algo == ConvForwardAlgo::V4R4XDLNHWC)
{
if(layout != ConvTensorLayout::NHWC)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nhwc();
tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable =
&default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk;
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
if(do_verification)
{
host_direct_convolution(in,
wei,
out_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
layout);
check_error(out_host, out_device);
#if 0
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
}
#endif
}
delete handle;
MY_HIP_CHECK(hipStreamDestroy(stream));
}
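// online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp:
// host-side launcher that builds the compile parameters for, and times, the online-compiled
// v4r4 dlops NCHW forward-convolution kernels.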
#pragma once
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
using namespace ck;
std::string out;
out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
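// encode the tunable parameters into a human-readable string; together with the type string it
// forms the network_config passed to Handle::AddKernel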
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
{
std::string out("TUN_");
out += std::to_string(pt->BlockSize) + "_";
out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
std::to_string(pt->KPerBlock) + "_";
out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" +
std::to_string(pt->KPerThread) + "_";
out += std::to_string(pt->M1N1ThreadClusterM10) + "x" +
std::to_string(pt->M1N1ThreadClusterN10) + "x" +
std::to_string(pt->M1N1ThreadClusterM11) + "x" +
std::to_string(pt->M1N1ThreadClusterN11) + "_";
out += std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->ABlockTransferDstScalarPerVector_M1) + "_";
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->BBlockTransferDstScalarPerVector_N1) + "_";
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_";
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
using namespace ck;
std::string out;
out +=
" -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
" -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
" -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
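// translate the tunable parameters into the -DCK_PARAM_* definitions appended to the compiler
// options for the online-compiled kernel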
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
" -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) +
" -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) +
" -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread);
out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) +
" -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) +
" -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) +
" -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1=" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1=" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
std::to_string(pt->ABlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_M1=" +
std::to_string(pt->ABlockTransferDstScalarPerVector_M1);
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1=" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1=" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
std::to_string(pt->BBlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_N1=" +
std::to_string(pt->BBlockTransferDstScalarPerVector_N1);
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]);
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
std::to_string(pt->CThreadTransferSrcDstVectorDim);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
} // namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
online_compile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace ck::driver;
using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw;
using size_t = std::size_t;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The following code is only used to compute grid_size, hasMainKBlockLoop and
// hasDoubleTailKBlockLoop
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
const auto descs =
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto a_k_m_grid_desc = descs[I0];
const auto c_m_n_grid_desc = descs[I2];
const auto M = c_m_n_grid_desc.GetLength(I0);
const auto N = c_m_n_grid_desc.GetLength(I1);
const auto K = a_k_m_grid_desc.GetLength(I0);
const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
const bool hasMainKBlockLoop = ((K + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1);
const bool hasDoubleTailKBlockLoop = ((K / tunable->KPerBlock) % 2 == 0);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// these buffers are usually provided by the user application
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
// these are workspace buffers that should be exposed to the user through the corresponding
// workspace API
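// (the *_prepare kernel fills the four 1 KiB slots at offsets 0/1024/2048/3072 with the grid
// descriptors and the block-id to block-cluster adaptor that the main kernel then consumes)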
DeviceMem workspace_buf(4096);
void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_k_n0_n1_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_dlops_nchw";
std::string param = " -std=c++17 ";
std::string network_config;
param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " " +
get_definition_string_from_tunable(tunable) +
" -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
" -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
get_network_config_string_from_tunable(tunable) + "_" +
std::to_string(hasMainKBlockLoop) + "_" +
std::to_string(hasDoubleTailKBlockLoop);
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_k_m0_m1_grid_desc_dev_buf,
b_k_n0_n1_grid_desc_dev_buf,
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf,
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(a_k_m0_m1_grid_desc_dev_buf),
(const void*)(b_k_n0_n1_grid_desc_dev_buf),
(const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf),
(const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
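// report the average kernel times, excluding the first iteration (warm-up, which typically
// includes the online compilation of the kernels)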
{
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
const auto N = in_n_c_hi_wi_lengths[I0];
const auto C = in_n_c_hi_wi_lengths[I1];
const auto K = out_n_k_ho_wo_lengths[I1];
const auto Ho = out_n_k_ho_wo_lengths[I2];
const auto Wo = out_n_k_ho_wo_lengths[I3];
const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3];
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
};
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
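// online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp:
// host-side launcher for the online-compiled v4r4 xdlops NCHW forward-convolution kernels.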
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
using namespace ck;
std::string out;
out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt)
{
std::string out("TUN_");
out += std::to_string(pt->BlockSize) + "_";
out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
std::to_string(pt->KPerBlock) + "_";
out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" +
std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" +
std::to_string(pt->K1) + "_";
out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_";
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_";
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_";
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
using namespace ck;
std::string out;
out +=
" -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
" -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
" -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
" -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) +
" -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) +
" -DCK_PARAM_K1=" + std::to_string(pt->K1) +
" -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) +
" -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
std::to_string(pt->ABlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" +
std::to_string(pt->ABlockTransferDstScalarPerVector_K1);
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
std::to_string(pt->BBlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" +
std::to_string(pt->BBlockTransferDstScalarPerVector_K1);
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]);
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
std::to_string(pt->CThreadTransferSrcDstVectorDim);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
online_compile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace ck::driver;
using namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;
using size_t = std::size_t;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
const auto n = in_n_c_hi_wi_desc.GetLength(I0);
const auto c = in_n_c_hi_wi_desc.GetLength(I1);
const auto hi = in_n_c_hi_wi_desc.GetLength(I2);
const auto wi = in_n_c_hi_wi_desc.GetLength(I3);
const auto k = wei_k_c_y_x_desc.GetLength(I0);
const auto y = wei_k_c_y_x_desc.GetLength(I2);
const auto x = wei_k_c_y_x_desc.GetLength(I3);
const auto ho = out_n_k_ho_wo_desc.GetLength(I2);
const auto wo = out_n_k_ho_wo_desc.GetLength(I3);
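// implicit-GEMM sizes for NCHW forward convolution: M = k, N = n * ho * wo, K = c * y * x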
const auto M = k;
const auto N = n * ho * wo;
const auto K = c * y * x;
const auto K0 = K / tunable->K1;
const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// these buffers are usually provided by the user application
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
// these are workspace buffers that should be exposed to the user through the corresponding
// workspace API
DeviceMem workspace_buf(4096);
void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_k_n0_n1_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nchw";
std::string param = " -std=c++17 ";
std::string network_config;
param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " " + " -DCK_USE_AMD_XDLOPS" +
get_definition_string_from_tunable(tunable);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
get_network_config_string_from_tunable(tunable);
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_k_m0_m1_grid_desc_dev_buf,
b_k_n0_n1_grid_desc_dev_buf,
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf,
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(a_k_m0_m1_grid_desc_dev_buf),
(const void*)(b_k_n0_n1_grid_desc_dev_buf),
(const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf),
(const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
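// as above, the first (warm-up) iteration is excluded from the averages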
{
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
const auto N = in_n_c_hi_wi_lengths[I0];
const auto C = in_n_c_hi_wi_lengths[I1];
const auto K = out_n_k_ho_wo_lengths[I1];
const auto Ho = out_n_k_ho_wo_lengths[I2];
const auto Wo = out_n_k_ho_wo_lengths[I3];
const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3];
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
};
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
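// online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp:
// host-side launcher for the online-compiled v4r4 xdlops NHWC forward-convolution kernels.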
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
using namespace ck;
std::string out;
out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt)
{
std::string out("TUN_");
out += std::to_string(pt->BlockSize) + "_";
out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
std::to_string(pt->KPerBlock) + "_";
out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" +
std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" +
std::to_string(pt->K1) + "_";
out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_";
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_";
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_";
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
using namespace ck;
std::string out;
out +=
" -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
" -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
" -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
" -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) +
" -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) +
" -DCK_PARAM_K1=" + std::to_string(pt->K1) +
" -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) +
" -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
std::to_string(pt->ABlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" +
std::to_string(pt->ABlockTransferDstScalarPerVector_K1);
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
std::to_string(pt->BBlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" +
std::to_string(pt->BBlockTransferDstScalarPerVector_K1);
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]);
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
std::to_string(pt->CThreadTransferSrcDstVectorDim);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
online_compile::Handle* handle,
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
const OutLengths& out_n_ho_wo_k_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_hi_wi_c,
const Tensor<TInWei>& wei_k_y_x_c,
Tensor<TOut>& out_n_ho_wo_k,
const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk;
using size_t = std::size_t;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // The following code is only used for computing grid_size, hasMainKBlockLoop and
    // hasDoubleTailKBlockLoop
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto in_n_hi_wi_c_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths);
const auto wei_k_y_x_c_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths);
const auto out_n_ho_wo_k_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths);
const auto n = in_n_hi_wi_c_desc.GetLength(I0);
const auto hi = in_n_hi_wi_c_desc.GetLength(I1);
const auto wi = in_n_hi_wi_c_desc.GetLength(I2);
const auto c = in_n_hi_wi_c_desc.GetLength(I3);
const auto k = wei_k_y_x_c_desc.GetLength(I0);
const auto y = wei_k_y_x_c_desc.GetLength(I1);
const auto x = wei_k_y_x_c_desc.GetLength(I2);
const auto ho = out_n_ho_wo_k_desc.GetLength(I1);
const auto wo = out_n_ho_wo_k_desc.GetLength(I2);
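    // Implicit-GEMM dimensions used below: M = k, N = n * ho * wo, K = c * y * x, with
    // K = K0 * K1 (K1 is a tunable). Each workgroup computes one MPerBlock x NPerBlock tile
    // of the GEMM output, so grid_size = (M / MPerBlock) * (N / NPerBlock). For example
    // (hypothetical numbers), M = 256, N = 16384, MPerBlock = NPerBlock = 128 gives
    // grid_size = 2 * 128 = 256 workgroups.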
const auto M = k;
const auto N = n * ho * wo;
const auto K = c * y * x;
const auto K0 = K / tunable->K1;
const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
// these buffers are usually provided by the user application
DeviceMem in_n_hi_wi_c_dev_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_k_y_x_c_dev_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem out_n_ho_wo_k_dev_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
in_n_hi_wi_c_dev_buf.ToDevice(in_n_hi_wi_c.mData.data());
wei_k_y_x_c_dev_buf.ToDevice(wei_k_y_x_c.mData.data());
out_n_ho_wo_k_dev_buf.ToDevice(out_n_ho_wo_k.mData.data());
    // these are workspace buffers that should be exposed to the user through the
    // corresponding workspace API
DeviceMem workspace_buf(4096);
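    // The single 4 KiB workspace is split into four 1 KiB slots below; the prepare kernel
    // writes the transformed grid descriptors and the block-id-to-tile adaptor into these
    // slots, and the main GEMM kernel reads them back.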
void* a_k0_m_k1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_k0_n_k1_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_m0_m1_m2_n_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
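    // vld is the workgroup (local) size; vgd1 launches a single workgroup for the prepare
    // kernel, while vgd2 launches grid_size workgroups of BlockSize threads for the GEMM.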
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nhwc";
std::string param = " -std=c++17 ";
std::string network_config;
param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " -DCK_USE_AMD_XDLOPS ";
param += get_definition_string_from_tunable(tunable);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
get_network_config_string_from_tunable(tunable);
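    // "param" is forwarded to the online compiler, so every tunable above becomes a
    // -DCK_PARAM_* macro; "network_config" identifies the compiled kernel in the handle's
    // kernel cache.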
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
static_cast<index_t>(in_n_hi_wi_c_lengths[I0]),
static_cast<index_t>(in_n_hi_wi_c_lengths[I1]),
static_cast<index_t>(in_n_hi_wi_c_lengths[I2]),
static_cast<index_t>(in_n_hi_wi_c_lengths[I3]),
static_cast<index_t>(wei_k_y_x_c_lengths[I0]),
static_cast<index_t>(wei_k_y_x_c_lengths[I1]),
static_cast<index_t>(wei_k_y_x_c_lengths[I2]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_k0_m_k1_grid_desc_dev_buf,
b_k0_n_k1_grid_desc_dev_buf,
c_m0_m1_m2_n_grid_desc_dev_buf,
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
reinterpret_cast<const TInWei*>(in_n_hi_wi_c_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(wei_k_y_x_c_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_ho_wo_k_dev_buf.GetDeviceBuffer()),
(const void*)(a_k0_m_k1_grid_desc_dev_buf),
(const void*)(b_k0_n_k1_grid_desc_dev_buf),
(const void*)(c_m0_m1_m2_n_grid_desc_dev_buf),
(const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
{
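        // The first sample includes the online compilation of the kernels, so it is
        // excluded from the average (accumulate starts at the second element and the sum
        // is divided by nrepeat - 1).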
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
const auto N = in_n_hi_wi_c_lengths[I0];
const auto C = in_n_hi_wi_c_lengths[I3];
const auto Ho = out_n_ho_wo_k_lengths[I1];
const auto Wo = out_n_ho_wo_k_lengths[I2];
const auto K = out_n_ho_wo_k_lengths[I3];
const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
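        // flop count of the convolution: 2 * N * K * Ho * Wo * C * Y * X (one multiply and
        // one add per MAC); dividing GFlop by milliseconds yields TFlop/s.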
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time2;
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
};
// copy result back to host
out_n_ho_wo_k_dev_buf.FromDevice(out_n_ho_wo_k.mData.data());
}
#pragma once
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "convolution_problem_descriptor.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
online_compile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const ck::driver::CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param,
ck::index_t nrepeat)
{
using namespace ck;
using namespace ck::driver;
using size_t = std::size_t;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
ConvolutionProblemDescriptor conv_problem_desc{in_n_c_hi_wi_lengths[I0],
out_n_k_ho_wo_lengths[I1],
in_n_c_hi_wi_lengths[I1],
wei_k_c_y_x_lengths[I2],
wei_k_c_y_x_lengths[I3],
in_n_c_hi_wi_lengths[I2],
in_n_c_hi_wi_lengths[I3],
out_n_k_ho_wo_lengths[I2],
out_n_k_ho_wo_lengths[I3],
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
get_datatype_enum_from_type<TInWei>::value,
get_datatype_enum_from_type<TInWei>::value,
get_datatype_enum_from_type<TOut>::value};
if(!ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::IsValidCompileParameter(conv_problem_desc,
compile_param))
{
throw std::runtime_error("wrong! IsValidCompileParameter fail");
}
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
    // the workspace is used to save the transformed tensor descriptors created by the
    // prepare kernel
DeviceMem workspace_dev_buf(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetWorkSpaceSize(conv_problem_desc, compile_param));
const auto block_size = std::size_t(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetBlockSize(conv_problem_desc, compile_param));
const auto grid_size = std::size_t(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetGridSize(conv_problem_desc, compile_param));
const std::vector<size_t> vld1 = {1, 1, 1};
const std::vector<size_t> vgd1 = {1, 1, 1};
const std::vector<size_t> vld2 = {static_cast<size_t>(block_size), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * block_size), 1, 1};
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v6r1_dlops_nchw";
std::string compile_param_string =
get_ck_hip_online_compile_common_flag() + compile_param.GetCompileParameterString();
std::string network_config = compile_param_string;
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat + 1; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name,
network_config_1,
program_name,
kernel_name,
vld1,
vgd1,
compile_param_string)(static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
(void*)(workspace_dev_buf.GetDeviceBuffer()));
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name,
network_config_2,
program_name,
kernel_name,
vld2,
vgd2,
compile_param_string)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(workspace_dev_buf.GetDeviceBuffer()));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
{
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
nrepeat;
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
nrepeat;
float perf = (float)(conv_problem_desc.CalculateFlop()) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
};
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
set(CMAKE_CXX_COMPILER /opt/rocm/llvm/bin/clang++)
## for online compilation of HIP kernels
set(OLC_HIP_COMPILER ${CMAKE_CXX_COMPILER} CACHE PATH "")
## reset to avoid inheriting C++ options from the parent project
set(CMAKE_CXX_FLAGS "")
message("Compiling options for library and kernels: ${CMAKE_CXX_FLAGS}")
# look for and register clang-offload-bundler
if(OLC_HIP_COMPILER MATCHES ".*clang\\+\\+$")
find_program(OLC_OFFLOADBUNDLER_BIN clang-offload-bundler
PATH_SUFFIXES bin
PATHS
/opt/rocm/llvm
${CMAKE_INSTALL_PREFIX}/llvm
)
endif()
if(OLC_OFFLOADBUNDLER_BIN)
message(STATUS "clang-offload-bundler found: ${OLC_OFFLOADBUNDLER_BIN}")
set(OLC_OFFLOADBUNDLER_BIN "${OLC_OFFLOADBUNDLER_BIN}")
else()
# look for and register extractkernel
message(STATUS "clang-offload-bundler not found")
find_program(EXTRACTKERNEL_BIN extractkernel
PATH_SUFFIXES bin
PATHS
/opt/rocm/hip
/opt/rocm/hcc
/opt/rocm
${CMAKE_INSTALL_PREFIX}/hip
${CMAKE_INSTALL_PREFIX}/hcc
${CMAKE_INSTALL_PREFIX}
)
if(EXTRACTKERNEL_BIN)
message(STATUS "extractkernel found: ${EXTRACTKERNEL_BIN}")
set(EXTRACTKERNEL_BIN "${EXTRACTKERNEL_BIN}")
else()
message(FATAL_ERROR "extractkernel not found")
endif()
endif()
option(Boost_USE_STATIC_LIBS "Use boost static libraries" OFF)
set(BOOST_COMPONENTS filesystem)
add_definitions(-DBOOST_ALL_NO_LIB=1)
find_package(Boost REQUIRED COMPONENTS ${BOOST_COMPONENTS})
# HIP is always required
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
target_flags(HIP_COMPILER_FLAGS hip::device)
# Remove cuda arch flags
string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REGEX REPLACE --offload-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
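# The GPU arch flag is re-added at kernel build time for the actual target device: the
# online compiler appends --cuda-gpu-arch / -mcpu for the device queried from the handle
# (see hip_build_utils.cpp and handle.cpp further below).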
set(OLC_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}")
set(OLC_hip_VERSION_MINOR "${hip_VERSION_MINOR}")
set(OLC_hip_VERSION_PATCH "${hip_VERSION_PATCH}")
option(ENABLE_DEBUG "Build to enable debugging" ON)
if(ENABLE_DEBUG)
set(OLC_DEBUG 1)
else()
set(OLC_DEBUG 0)
endif()
configure_file("${PROJECT_SOURCE_DIR}/host/online_compile/include/config.h.in" "${PROJECT_BINARY_DIR}/host/online_compile/include/config.h")
include_directories(BEFORE
${PROJECT_BINARY_DIR}/host/online_compile/include
)
message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels
set(HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS} ${HIP_ONLINE_COMPILER_FLAGS}")
add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}")
file(GLOB_RECURSE COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/*/*.hpp")
file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp")
set(MCONV_KERNEL_INCLUDES
${COMPOSABLE_KERNEL_INCLUDE_1}
${COMPOSABLE_KERNEL_INCLUDE_2}
)
file(GLOB_RECURSE MCONV_KERNELS "${PROJECT_SOURCE_DIR}/composable_kernel/src/kernel_wrapper/*.cpp")
add_kernels(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNELS}")
add_kernel_includes(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNEL_INCLUDES}")
set(ONLINE_COMPILATION_SOURCE
${PROJECT_BINARY_DIR}/kernel.cpp
${PROJECT_BINARY_DIR}/kernel_includes.cpp
)
include_directories(BEFORE
${PROJECT_BINARY_DIR}/host/online_compile/include
include
)
set(OLC_HIP_UTILITY_CPPS
hip_utility/logger.cpp
hip_utility/tmp_dir.cpp
hip_utility/md5.cpp
hip_utility/exec_utils.cpp
hip_utility/target_properties.cpp
hip_utility/handlehip.cpp
hip_utility/kernel_build_params.cpp
hip_utility/hip_build_utils.cpp
hip_utility/hipoc_program.cpp
hip_utility/hipoc_kernel.cpp
hip_utility/kernel_cache.cpp
hip_utility/binary_cache.cpp
)
list(APPEND OLC_SOURCES ${OLC_HIP_UTILITY_CPPS} ${OLC_HIP_UTILITY_HEADERS})
## addkernels provides the tool to inline kernels into a single header
add_subdirectory(addkernels)
function(inline_kernels_src KERNELS KERNEL_INCLUDES)
set(KERNEL_SRC_HPP_FILENAME batch_all.cpp.hpp)
set(KERNEL_SRC_HPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/${KERNEL_SRC_HPP_FILENAME})
set(KERNEL_SRC_CPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/batch_all.cpp)
add_custom_command(
OUTPUT ${KERNEL_SRC_HPP_PATH}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS addkernels ${KERNELS} ${KERNEL_INCLUDES}
COMMAND $<TARGET_FILE:addkernels> -target ${KERNEL_SRC_HPP_PATH} -extern -source ${KERNELS}
COMMENT "Inlining All kernels"
)
configure_file(kernels_batch.cpp.in ${KERNEL_SRC_CPP_PATH})
list(APPEND OLC_SOURCES ${KERNEL_SRC_CPP_PATH} ${KERNEL_SRC_HPP_PATH})
set(OLC_SOURCES ${OLC_SOURCES} PARENT_SCOPE)
endfunction()
inline_kernels_src("${MCONV_KERNELS}" "${MCONV_KERNEL_INCLUDES}")
list(APPEND ONLINE_COMPILATION_SOURCE ${OLC_SOURCES} ${PROJECT_BINARY_DIR}/olc_kernel_includes.h)
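## olc_kernel_includes.h embeds every kernel include header as a byte array so it can be
## written back out to a temporary directory at online-compile time; the inlining is done
## by the addkernels tool built below (see its -guard/-target/-source options).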
add_custom_command(
OUTPUT ${PROJECT_BINARY_DIR}/olc_kernel_includes.h
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS addkernels ${MCONV_KERNEL_INCLUDES}
COMMAND $<TARGET_FILE:addkernels> -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ -target ${PROJECT_BINARY_DIR}/olc_kernel_includes.h -source ${MCONV_KERNEL_INCLUDES}
COMMENT "Inlining HIP kernel includes"
)
## the library target
add_library(online_compile SHARED ${ONLINE_COMPILATION_SOURCE})
target_include_directories(online_compile PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/online_compile/include/)
target_include_directories(online_compile PRIVATE ${PROJECT_BINARY_DIR})
target_include_directories(online_compile PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/)
target_link_libraries(online_compile PRIVATE hip::device)
target_link_libraries(online_compile INTERFACE hip::host)
target_link_libraries(online_compile PRIVATE Boost::filesystem)
target_compile_features(online_compile PUBLIC)
set_target_properties(online_compile PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS online_compile LIBRARY DESTINATION lib)
################################################################################
#
# MIT License
#
# Copyright (c) 2017 Advanced Micro Devices, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
################################################################################
set(ADD_KERNELS_SOURCE include_inliner.cpp addkernels.cpp)
add_executable(addkernels EXCLUDE_FROM_ALL ${ADD_KERNELS_SOURCE})
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "include_inliner.hpp"
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
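// Bin2Hex dumps an input stream as C source: an unsigned char array named "variable" plus a
// <variable>_SIZE constant, "lineSize" bytes per output line, with an optional trailing
// 0x00 terminator.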
void Bin2Hex(std::istream& source,
std::ostream& target,
const std::string& variable,
bool nullTerminate,
size_t bufferSize,
size_t lineSize)
{
source.seekg(0, std::ios::end);
std::unique_ptr<unsigned char[]> buffer(new unsigned char[bufferSize]);
std::streamoff sourceSize = source.tellg();
std::streamoff blockStart = 0;
if(variable.length() != 0)
{
target << "extern const size_t " << variable << "_SIZE;" << std::endl;
target << "extern const unsigned char " << variable << "[];" << std::endl;
target << "const size_t " << variable << "_SIZE = " << std::setbase(10) << sourceSize << ";"
<< std::endl;
target << "const unsigned char " << variable << "[] = {" << std::endl;
}
target << std::setbase(16) << std::setfill('0');
source.seekg(0, std::ios::beg);
while(blockStart < sourceSize)
{
source.read(reinterpret_cast<char*>(buffer.get()), bufferSize);
std::streamoff pos = source.tellg();
std::streamoff blockSize = (pos < 0 ? sourceSize : pos) - blockStart;
std::streamoff i = 0;
while(i < blockSize)
{
size_t j = i;
size_t end = std::min<size_t>(i + lineSize, blockSize);
for(; j < end; j++)
target << "0x" << std::setw(2) << static_cast<unsigned>(buffer[j]) << ",";
target << std::endl;
i = end;
}
blockStart += blockSize;
}
if(nullTerminate)
target << "0x00," << std::endl;
if(variable.length() != 0)
{
target << "};" << std::endl;
}
}
void PrintHelp()
{
std::cout << "Usage: bin2hex {<option>}" << std::endl;
std::cout << "Option format: -<option name>[ <option value>]" << std::endl;
std::cout << std::endl;
std::cout << "Options:" << std::endl;
std::cout
<< "[REQUIRED] -s[ource] {<path to file>}: files to be processed. Must be last argument."
<< std::endl;
std::cout << " -t[arget] <path>: target file. Default: std out." << std::endl;
std::cout << " -l[ine-size] <number>: bytes in one line. Default: 16." << std::endl;
std::cout << " -b[uffer] <number>: read buffer size. Default: 512." << std::endl;
std::cout << " -g[uard] <string>: guard name. Default: no guard" << std::endl;
std::cout << " -n[o-recurse] : dont expand include files recursively. Default: off"
<< std::endl;
}
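// Example invocation (hypothetical paths), mirroring the add_custom_command in the CMake
// files above:
//   addkernels -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ \
//              -target olc_kernel_includes.h -source foo.hpp bar.hpp
// which writes FOO / BAR byte arrays wrapped in the include guard to olc_kernel_includes.h.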
[[gnu::noreturn]] void WrongUsage(const std::string& error)
{
std::cout << "Wrong usage: " << error << std::endl;
std::cout << std::endl;
PrintHelp();
std::exit(1);
}
[[gnu::noreturn]] void UnknownArgument(const std::string& arg)
{
std::ostringstream ss;
ss << "unknown argument - " << arg;
WrongUsage(ss.str());
}
void Process(const std::string& sourcePath,
std::ostream& target,
size_t bufferSize,
size_t lineSize,
bool recurse,
bool as_extern)
{
std::string fileName(sourcePath);
std::string extension, root;
std::stringstream inlinerTemp;
auto extPos = fileName.rfind('.');
auto slashPos = fileName.rfind('/');
if(extPos != std::string::npos)
{
extension = fileName.substr(extPos + 1);
fileName = fileName.substr(0, extPos);
}
if(slashPos != std::string::npos)
{
root = fileName.substr(0, slashPos + 1);
fileName = fileName.substr(slashPos + 1);
}
std::string variable(fileName);
std::ifstream sourceFile(sourcePath, std::ios::in | std::ios::binary);
std::istream* source = &sourceFile;
if(!sourceFile.good())
{
std::cerr << "File not found: " << sourcePath << std::endl;
std::exit(1);
}
const auto is_asm = extension == "s";
const auto is_cl = extension == "cl";
const auto is_hip = extension == "cpp";
const auto is_header = extension == "hpp";
if(is_asm || is_cl || is_hip || is_header)
{
IncludeInliner inliner;
try
{
if(is_asm)
inliner.Process(
sourceFile, inlinerTemp, root, sourcePath, ".include", false, recurse);
else if(is_cl || is_header)
inliner.Process(
sourceFile, inlinerTemp, root, sourcePath, "#include", true, recurse);
else if(is_hip)
inliner.Process(
sourceFile, inlinerTemp, root, sourcePath, "<#not_include>", true, false);
}
catch(const InlineException& ex)
{
std::cerr << ex.What() << std::endl;
std::cerr << ex.GetTrace() << std::endl;
std::exit(1);
}
source = &inlinerTemp;
}
std::transform(variable.begin(), variable.end(), variable.begin(), ::toupper);
if(as_extern && variable.length() != 0)
{
variable = "APP_KERNEL_" + variable;
}
Bin2Hex(*source, target, variable, true, bufferSize, lineSize);
}
int main(int argsn, char** args)
{
if(argsn == 1)
{
PrintHelp();
return 2;
}
std::string guard;
size_t bufferSize = 512;
size_t lineSize = 16;
std::ofstream targetFile;
std::ostream* target = &std::cout;
bool recurse = true;
bool as_extern = false;
int i = 0;
    while(++i < argsn && *args[i] == '-')
{
std::string arg(args[i] + 1);
std::transform(arg.begin(), arg.end(), arg.begin(), ::tolower);
if(arg == "s" || arg == "source")
{
if(guard.length() > 0)
{
*target << "#ifndef " << guard << std::endl;
*target << "#define " << guard << std::endl;
}
*target << "#include <cstddef>" << std::endl;
while(++i < argsn)
{
Process(args[i], *target, bufferSize, lineSize, recurse, as_extern);
}
if(guard.length() > 0)
{
*target << "#endif" << std::endl;
}
return 0;
}
else if(arg == "t" || arg == "target")
{
targetFile.open(args[++i], std::ios::out);
target = &targetFile;
}
else if(arg == "l" || arg == "line-size")
lineSize = std::stol(args[++i]);
else if(arg == "b" || arg == "buffer")
bufferSize = std::stol(args[++i]);
else if(arg == "g" || arg == "guard")
guard = args[++i];
else if(arg == "n" || arg == "no-recurse")
recurse = false;
else if(arg == "e" || arg == "extern")
as_extern = true;
else
UnknownArgument(arg);
}
WrongUsage("source key is required");
}
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <algorithm>
#include <exception>
#include <fstream>
#include <sstream>
#ifdef _WIN32
#include <windows.h>
#endif
#ifdef __linux__
#include <linux/limits.h>
#include <cstdlib>
#endif // !WIN32
#include "include_inliner.hpp"
namespace PathHelpers {
static int GetMaxPath()
{
#ifdef _WIN32
return MAX_PATH;
#else
return PATH_MAX;
#endif
}
static std::string GetAbsolutePath(const std::string& path)
{
std::string result(GetMaxPath(), ' ');
#ifdef _WIN32
const auto retval = GetFullPathName(path.c_str(), result.size(), &result[0], nullptr);
if(retval == 0)
return "";
#else
auto* const retval = realpath(path.c_str(), &result[0]);
if(retval == nullptr)
return "";
#endif
return result;
}
} // namespace PathHelpers
std::string IncludeFileExceptionBase::What() const
{
std::ostringstream ss;
ss << GetMessage() << ": <" << _file << ">";
return ss.str();
}
void IncludeInliner::Process(std::istream& input,
std::ostream& output,
const std::string& root,
const std::string& file_name,
const std::string& directive,
bool allow_angle_brackets,
bool recurse)
{
ProcessCore(input, output, root, file_name, 0, directive, allow_angle_brackets, recurse);
}
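// ProcessCore scans the input line by line: a line starting with "directive" followed by a
// quoted (or, if allowed, angle-bracketed) path is replaced by the inlined contents of that
// file (recursively when "recurse" is set); a preceding "//inliner-include-optional" line
// marks the next include as optional, so a missing file is skipped instead of throwing.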
void IncludeInliner::ProcessCore(std::istream& input,
std::ostream& output,
const std::string& root,
const std::string& file_name,
int line_number,
const std::string& directive,
bool allow_angle_brackets,
bool recurse)
{
if(_include_depth >= include_depth_limit)
throw InlineStackOverflowException(GetIncludeStackTrace(0));
_include_depth++;
_included_stack_head =
std::make_shared<SourceFileDesc>(file_name, _included_stack_head, line_number);
auto current_line = 0;
auto next_include_optional = false;
while(!input.eof())
{
std::string line;
std::string word;
std::getline(input, line);
std::istringstream line_parser(line);
line_parser >> word;
current_line++;
std::transform(word.begin(), word.end(), word.begin(), ::tolower);
const auto include_optional = next_include_optional;
next_include_optional = false;
if(!word.empty() && word == "//inliner-include-optional")
{
if(include_optional)
throw IncludeExpectedException(GetIncludeStackTrace(current_line));
next_include_optional = true;
continue;
}
if(!word.empty() && word == directive && recurse)
{
auto first_quote_pos = line.find('"', static_cast<int>(line_parser.tellg()) + 1);
std::string::size_type second_quote_pos;
if(first_quote_pos != std::string::npos)
{
second_quote_pos = line.find('"', first_quote_pos + 1);
if(second_quote_pos == std::string::npos)
throw WrongInlineDirectiveException(GetIncludeStackTrace(current_line));
}
else
{
if(!allow_angle_brackets)
throw WrongInlineDirectiveException(GetIncludeStackTrace(current_line));
first_quote_pos = line.find('<', static_cast<int>(line_parser.tellg()) + 1);
if(first_quote_pos == std::string::npos)
throw WrongInlineDirectiveException(GetIncludeStackTrace(current_line));
second_quote_pos = line.find('>', first_quote_pos + 1);
if(second_quote_pos == std::string::npos)
throw WrongInlineDirectiveException(GetIncludeStackTrace(current_line));
}
const std::string include_file_path =
line.substr(first_quote_pos + 1, second_quote_pos - first_quote_pos - 1);
const std::string abs_include_file_path(
PathHelpers::GetAbsolutePath(root + "/" + include_file_path)); // NOLINT
if(abs_include_file_path.empty())
{
if(include_optional)
continue;
throw IncludeNotFoundException(include_file_path,
GetIncludeStackTrace(current_line));
}
std::ifstream include_file(abs_include_file_path, std::ios::in);
if(!include_file.good())
throw IncludeCantBeOpenedException(include_file_path,
GetIncludeStackTrace(current_line));
ProcessCore(include_file,
output,
root,
include_file_path,
current_line,
directive,
allow_angle_brackets,
recurse);
}
else
{
if(include_optional)
throw IncludeExpectedException(GetIncludeStackTrace(current_line));
if(output.tellp() > 0)
output << std::endl;
output << line;
}
}
auto prev_file = _included_stack_head->included_from;
_included_stack_head = prev_file;
_include_depth--;
}
std::string IncludeInliner::GetIncludeStackTrace(int line)
{
std::ostringstream ss;
if(_included_stack_head == nullptr)
return "";
auto item = _included_stack_head;
ss << " " << item->path << ":" << line;
while(item->included_from != nullptr)
{
ss << std::endl << " from " << item->included_from->path << ":" << item->included_line;
item = item->included_from;
}
return ss.str();
}
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef SOURCE_INLINER_HPP
#define SOURCE_INLINER_HPP
#include "source_file_desc.hpp"
#include <ostream>
#include <memory>
#include <stack>
class InlineException : public std::exception
{
public:
InlineException(const std::string& trace) : _trace(trace) {}
virtual std::string What() const = 0;
const std::string& GetTrace() const { return _trace; }
private:
std::string _trace;
};
class InlineStackOverflowException : public InlineException
{
public:
InlineStackOverflowException(const std::string& trace) : InlineException(trace) {}
std::string What() const override
{
return "Include stack depth limit has been reached, possible circle includes";
}
};
class IncludeExpectedException : public InlineException
{
public:
IncludeExpectedException(const std::string& trace) : InlineException(trace) {}
std::string What() const override { return "Include directive expected"; }
};
class WrongInlineDirectiveException : public InlineException
{
public:
WrongInlineDirectiveException(const std::string& trace) : InlineException(trace) {}
std::string What() const override { return "Include directive has wrong format"; }
};
class IncludeFileExceptionBase : public InlineException
{
public:
IncludeFileExceptionBase(const std::string& file, const std::string& trace)
: InlineException(trace), _file(file)
{
}
std::string What() const override;
virtual std::string GetMessage() const = 0;
private:
std::string _file;
};
class IncludeNotFoundException : public IncludeFileExceptionBase
{
public:
IncludeNotFoundException(const std::string& file, const std::string& trace)
: IncludeFileExceptionBase(file, trace)
{
}
std::string GetMessage() const override
{
return "Include file not found (if it is optional put //inliner-include-optional on line "
"before it)";
}
};
class IncludeCantBeOpenedException : public IncludeFileExceptionBase
{
public:
IncludeCantBeOpenedException(const std::string& file, const std::string& trace)
: IncludeFileExceptionBase(file, trace)
{
}
std::string GetMessage() const override { return "Can not open include file"; }
};
class IncludeInliner
{
public:
int include_depth_limit = 256;
void Process(std::istream& input,
std::ostream& output,
const std::string& root,
const std::string& file_name,
const std::string& directive,
bool allow_angle_brackets,
bool recurse);
std::string GetIncludeStackTrace(int line);
private:
int _include_depth = 0;
std::shared_ptr<SourceFileDesc> _included_stack_head = nullptr;
void ProcessCore(std::istream& input,
std::ostream& output,
const std::string& root,
const std::string& file_name,
int line_number,
const std::string& directive,
bool allow_angle_brackets,
bool recurse);
};
#endif // !SOURCE_INLINER_HPP
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef SOURCE_FILE_DESC_HPP
#define SOURCE_FILE_DESC_HPP
#include <string>
#include <memory>
class SourceFileDesc
{
public:
const std::string path;
int included_line;
std::shared_ptr<SourceFileDesc> included_from;
SourceFileDesc(const std::string& path_, std::shared_ptr<SourceFileDesc> from, int line)
: path(path_), included_line(line), included_from(from)
{
}
};
#endif // SOURCE_FILE_DESC_HPP
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <binary_cache.hpp>
#include <handle.hpp>
#include <md5.hpp>
#include <env.hpp>
#include <stringutils.hpp>
#include <logger.hpp>
#include <target_properties.hpp>
#include <boost/filesystem.hpp>
#include <fstream>
#include <iostream>
namespace online_compile {
OLC_DECLARE_ENV_VAR(OLC_DISABLE_CACHE)
OLC_DECLARE_ENV_VAR(HOME)
static boost::filesystem::path ComputeCachePath()
{
const char* home_dir = GetStringEnv(HOME{});
if(home_dir == nullptr || home_dir == std::string("/") || home_dir == std::string(""))
{
home_dir = "/tmp";
}
auto p = boost::filesystem::path{home_dir} / "_hip_binary_kernels_";
if(!boost::filesystem::exists(p))
boost::filesystem::create_directories(p);
return p;
}
boost::filesystem::path GetCachePath()
{
static const boost::filesystem::path user_path = ComputeCachePath();
return user_path;
}
static bool IsCacheDisabled() { return online_compile::IsEnabled(OLC_DISABLE_CACHE{}); }
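// Cached code objects live at <cache path>/<md5(device ":" args)>/<kernel name>.o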
boost::filesystem::path
GetCacheFile(const std::string& device, const std::string& name, const std::string& args)
{
// std::string filename = (is_kernel_str ? online_compile::md5(name) : name) + ".o";
std::string filename = name + ".o";
return GetCachePath() / online_compile::md5(device + ":" + args) / filename;
}
boost::filesystem::path LoadBinary(const TargetProperties& target,
const size_t num_cu,
const std::string& name,
const std::string& args)
{
if(online_compile::IsCacheDisabled())
return {};
(void)num_cu;
auto f = GetCacheFile(target.DbId(), name, args);
if(boost::filesystem::exists(f))
{
return f.string();
}
else
{
return {};
}
}
void SaveBinary(const boost::filesystem::path& binary_path,
const TargetProperties& target,
const std::string& name,
const std::string& args)
{
if(online_compile::IsCacheDisabled())
{
boost::filesystem::remove(binary_path);
}
else
{
auto p = GetCacheFile(target.DbId(), name, args);
boost::filesystem::create_directories(p.parent_path());
boost::filesystem::rename(binary_path, p);
}
}
} // namespace online_compile
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <exec_utils.hpp>
#include <manage_ptr.hpp>
#include <istream>
#include <ostream>
#include <string>
#include <cstdio>
#include <array>
#include <cassert>
#ifdef __linux__
#include <unistd.h>
#include <cstdio>
#include <sys/wait.h>
#endif // __linux__
namespace online_compile {
namespace exec {
int Run(const std::string& p, std::istream* in, std::ostream* out)
{
#ifdef __linux__
const auto redirect_stdin = (in != nullptr);
const auto redirect_stdout = (out != nullptr);
assert(!(redirect_stdin && redirect_stdout));
const auto file_mode = redirect_stdout ? "r" : "w";
OLC_MANAGE_PTR(FILE*, pclose) pipe{popen(p.c_str(), file_mode)};
if(!pipe)
throw std::runtime_error("online_compile::exec::Run(): popen(" + p + ", " + file_mode +
") failed");
if(redirect_stdin || redirect_stdout)
{
std::array<char, 1024> buffer{};
if(redirect_stdout)
{
while(feof(pipe.get()) == 0)
if(fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr)
*out << buffer.data();
}
else
{
while(!in->eof())
{
in->read(buffer.data(), buffer.size() - 1);
buffer[in->gcount()] = 0;
if(fputs(buffer.data(), pipe.get()) == EOF)
throw std::runtime_error("online_compile::exec::Run(): fputs() failed");
}
}
}
auto status = pclose(pipe.release());
return WEXITSTATUS(status);
#else
(void)p;
(void)in;
(void)out;
return -1;
#endif // __linux__
}
} // namespace exec
} // namespace online_compile
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <handle.hpp>
#include <binary_cache.hpp>
#include <env.hpp>
#include <kernel_cache.hpp>
#include <stringutils.hpp>
#include <target_properties.hpp>
#include <hipCheck.hpp>
#include <write_file.hpp>
#include <boost/filesystem.hpp>
#include <boost/lexical_cast.hpp>
#ifndef _WIN32
#include <unistd.h>
#endif
#include <algorithm>
#include <cassert>
#include <chrono>
#include <thread>
OLC_DECLARE_ENV_VAR(OLC_DEVICE_CU)
namespace online_compile {
std::size_t GetAvailableMemory()
{
size_t free, total;
MY_HIP_CHECK(hipMemGetInfo(&free, &total));
return free;
}
int get_device_id() // Get the current device id
{
int device;
MY_HIP_CHECK(hipGetDevice(&device));
return device;
}
void set_device(int id) { MY_HIP_CHECK(hipSetDevice(id)); }
int set_default_device()
{
int n;
MY_HIP_CHECK(hipGetDeviceCount(&n));
// Pick device based on process id
auto pid = ::getpid();
assert(pid > 0);
set_device(pid % n);
return (pid % n);
}
struct HandleImpl
{
using StreamPtr = std::shared_ptr<typename std::remove_pointer<hipStream_t>::type>;
HandleImpl() {}
StreamPtr create_stream()
{
hipStream_t result;
MY_HIP_CHECK(hipStreamCreate(&result));
return StreamPtr{result, &hipStreamDestroy};
}
static StreamPtr reference_stream(hipStream_t s) { return StreamPtr{s, null_deleter{}}; }
std::string get_device_name() const
{
hipDeviceProp_t props;
MY_HIP_CHECK(hipGetDeviceProperties(&props, device));
const std::string name(props.gcnArchName);
return name;
}
StreamPtr stream = nullptr;
int device = -1;
KernelCache cache;
TargetProperties target_properties;
};
Handle::Handle(hipStream_t stream) : impl(new HandleImpl())
{
this->impl->device = get_device_id();
if(stream == nullptr)
this->impl->stream = HandleImpl::reference_stream(nullptr);
else
this->impl->stream = HandleImpl::reference_stream(stream);
this->impl->target_properties.Init(this);
}
Handle::Handle() : impl(new HandleImpl())
{
this->impl->device = get_device_id();
this->impl->stream = HandleImpl::reference_stream(nullptr);
this->impl->target_properties.Init(this);
}
Handle::~Handle() {}
void Handle::SetStream(hipStream_t streamID) const
{
this->impl->stream = HandleImpl::reference_stream(streamID);
this->impl->target_properties.Init(this);
}
hipStream_t Handle::GetStream() const { return impl->stream.get(); }
KernelInvoke Handle::AddKernel(const std::string& algorithm,
const std::string& network_config,
const std::string& program_name,
const std::string& kernel_name,
const std::vector<size_t>& vld,
const std::vector<size_t>& vgd,
const std::string& params,
std::size_t cache_index) const
{
auto obj = this->impl->cache.AddKernel(
*this, algorithm, network_config, program_name, kernel_name, vld, vgd, params, cache_index);
return this->Run(obj);
}
void Handle::ClearKernels(const std::string& algorithm, const std::string& network_config) const
{
this->impl->cache.ClearKernels(algorithm, network_config);
}
const std::vector<Kernel>& Handle::GetKernelsImpl(const std::string& algorithm,
const std::string& network_config) const
{
return this->impl->cache.GetKernels(algorithm, network_config);
}
bool Handle::HasKernel(const std::string& algorithm, const std::string& network_config) const
{
return this->impl->cache.HasKernels(algorithm, network_config);
}
KernelInvoke Handle::Run(Kernel k) const { return k.Invoke(this->GetStream()); }
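// LoadProgram first looks for a cached code object matching (target, program, params); on a
// miss it compiles the program online, copies the resulting code object into the cache via
// SaveBinary, and returns the freshly built program.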
Program Handle::LoadProgram(const std::string& program_name, std::string params) const
{
if((!online_compile::EndsWith(program_name, ".mlir-cpp")) &&
(!online_compile::EndsWith(program_name, ".mlir")))
{
params += " -mcpu=" + this->GetTargetProperties().Name();
}
auto hsaco = online_compile::LoadBinary(
this->GetTargetProperties(), this->GetMaxComputeUnits(), program_name, params);
if(hsaco.empty())
{
auto p = HIPOCProgram{program_name, params, this->GetTargetProperties()};
auto path = online_compile::GetCachePath() / boost::filesystem::unique_path();
if(p.IsCodeObjectInMemory())
online_compile::WriteFile(p.GetCodeObjectBlob(), path);
else
boost::filesystem::copy_file(p.GetCodeObjectPathname(), path);
online_compile::SaveBinary(path, this->GetTargetProperties(), program_name, params);
return p;
}
else
{
return HIPOCProgram{program_name, hsaco};
}
}
bool Handle::HasProgram(const std::string& program_name, const std::string& params) const
{
return this->impl->cache.HasProgram(program_name, params);
}
void Handle::AddProgram(Program prog,
const std::string& program_name,
const std::string& params) const
{
this->impl->cache.AddProgram(prog, program_name, params);
}
void Handle::Finish() const { MY_HIP_CHECK(hipStreamSynchronize(this->GetStream())); }
std::size_t Handle::GetLocalMemorySize() const
{
int result;
MY_HIP_CHECK(hipDeviceGetAttribute(
&result, hipDeviceAttributeMaxSharedMemoryPerBlock, this->impl->device));
return result;
}
std::size_t Handle::GetGlobalMemorySize() const
{
size_t result;
MY_HIP_CHECK(hipDeviceTotalMem(&result, this->impl->device));
return result;
}
std::size_t Handle::GetMaxComputeUnits() const
{
int result;
const char* const num_cu = online_compile::GetStringEnv(OLC_DEVICE_CU{});
if(num_cu != nullptr && strlen(num_cu) > 0)
{
return boost::lexical_cast<std::size_t>(num_cu);
}
MY_HIP_CHECK(
hipDeviceGetAttribute(&result, hipDeviceAttributeMultiprocessorCount, this->impl->device));
return result;
}
std::size_t Handle::GetWavefrontWidth() const
{
hipDeviceProp_t props{};
MY_HIP_CHECK(hipGetDeviceProperties(&props, this->impl->device));
auto result = static_cast<size_t>(props.warpSize);
return result;
}
std::string Handle::GetDeviceNameImpl() const { return this->impl->get_device_name(); }
std::string Handle::GetDeviceName() const { return this->impl->target_properties.Name(); }
const TargetProperties& Handle::GetTargetProperties() const
{
return this->impl->target_properties;
}
std::ostream& Handle::Print(std::ostream& os) const
{
os << "stream: " << this->impl->stream << ", device_id: " << this->impl->device;
return os;
}
} // namespace online_compile
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <hip_build_utils.hpp>
#include <stringutils.hpp>
#include <tmp_dir.hpp>
#include <env.hpp>
#include <target_properties.hpp>
#include <write_file.hpp>
#include <exec_utils.hpp>
#include <logger.hpp>
#include <config.h>
#include <boost/optional.hpp>
#include <sstream>
#include <string>
#include <stdexcept>
#include <iostream>
OLC_DECLARE_ENV_VAR(OLC_DEBUG_HIP_VERBOSE)
OLC_DECLARE_ENV_VAR(OLC_DEBUG_HIP_DUMP)
#define OLC_HIP_COMPILER "/opt/rocm/llvm/bin/clang++"
namespace online_compile {
bool IsHccCompiler()
{
static const auto isHcc = EndsWith(OLC_HIP_COMPILER, "hcc");
return isHcc;
}
bool IsHipClangCompiler()
{
static const auto isClangXX = EndsWith(OLC_HIP_COMPILER, "clang++");
return isClangXX;
}
namespace {
inline bool ProduceCoV3()
{
// Otherwise, let's enable CO v3 for HIP kernels since ROCm 3.0.
return (HipCompilerVersion() >= external_tool_version_t{3, 0, -1});
}
/// Returns option for enabling/disabling CO v3 generation for the compiler
/// that builds HIP kernels, depending on compiler version etc.
inline const std::string& GetCoV3Option(const bool enable)
{
/// \note PR #2166 uses the "--hcc-cov3" option when isHCC is true.
/// It's unclear why... HCC included in ROCm 2.8 does not support it,
/// perhaps it suits for some older HCC?
///
/// These options are Ok for ROCm 3.0:
static const std::string option_enable{"-mcode-object-v3"};
static const std::string no_option{};
if(enable)
return option_enable;
else
return no_option;
}
} // namespace
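// HipBuildImpl writes the kernel source and its bundled include files into a temporary
// directory, invokes OLC_HIP_COMPILER (for hip-clang: --cuda-device-only -c -O3 plus the
// configure-time HIP_COMPILER_FLAGS) to produce a code object, and returns the path to that
// object; on the legacy HCC path the kernel is additionally extracted with extractkernel.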
static boost::filesystem::path HipBuildImpl(boost::optional<TmpDir>& tmp_dir,
const std::string& filename,
std::string src,
std::string params,
const TargetProperties& target,
const bool testing_mode,
const bool sources_already_reside_on_filesystem)
{
#ifdef __linux__
// Write out the include files
    // Let's assume includes are overkill for feature tests and optimize them out.
if(!testing_mode)
{
auto inc_list = GetHipKernelIncList();
auto inc_path = tmp_dir->path;
boost::filesystem::create_directories(inc_path);
for(auto inc_file : inc_list)
{
auto inc_src = GetKernelInc(inc_file);
WriteFile(inc_src, inc_path / inc_file);
}
}
// Sources produced by MLIR-cpp already reside in tmp dir.
if(!sources_already_reside_on_filesystem)
{
src += "\nint main() {}\n";
WriteFile(src, tmp_dir->path / filename);
}
// cppcheck-suppress unreadVariable
const LcOptionTargetStrings lots(target);
auto env = std::string("");
if(IsHccCompiler())
{
params += " -amdgpu-target=" + target.Name();
params += " " + GetCoV3Option(ProduceCoV3());
}
else if(IsHipClangCompiler())
{
if(params.find("-std=") == std::string::npos)
params += " --std=c++11";
if(HipCompilerVersion() < external_tool_version_t{4, 1, 0})
params += " --cuda-gpu-arch=" + lots.device;
else
params += " --cuda-gpu-arch=" + lots.device + lots.xnack;
params += " --cuda-device-only";
params += " -c";
params += " -O3 ";
}
params += " -Wno-unused-command-line-argument -I. ";
params += OLC_STRINGIZE(HIP_COMPILER_FLAGS);
if(IsHccCompiler())
{
env += std::string("KMOPTLLC=\"-mattr=+enable-ds128 ");
if(HipCompilerVersion() >= external_tool_version_t{2, 8, 0})
env += " --amdgpu-spill-vgpr-to-agpr=0";
env += '\"';
}
else if(IsHipClangCompiler())
{
params += " -mllvm --amdgpu-spill-vgpr-to-agpr=0";
params += " -mllvm -amdgpu-early-inline-all=true";
params += " -mllvm -amdgpu-function-calls=false";
}
if(online_compile::IsEnabled(OLC_DEBUG_HIP_VERBOSE{}))
{
params += " -v";
}
if(online_compile::IsEnabled(OLC_DEBUG_HIP_DUMP{}))
{
if(IsHccCompiler())
{
params += " -gline-tables-only";
env += " KMDUMPISA=1";
env += " KMDUMPLLVM=1";
}
else if(IsHipClangCompiler())
{
params += " -gline-tables-only";
params += " -save-temps";
}
}
// hip version
params +=
std::string(" -DHIP_PACKAGE_VERSION_FLAT=") + std::to_string(HIP_PACKAGE_VERSION_FLAT);
params += " ";
auto bin_file = tmp_dir->path / (filename + ".o");
// compile
const std::string redirector = testing_mode ? " 1>/dev/null 2>&1" : "";
tmp_dir->Execute(env + std::string(" ") + OLC_HIP_COMPILER,
params + filename + " -o " + bin_file.string() + redirector);
if(!boost::filesystem::exists(bin_file))
throw std::runtime_error(filename + " failed to compile");
#ifdef EXTRACTKERNEL_BIN
if(IsHccCompiler())
{
// call extract kernel
tmp_dir->Execute(EXTRACTKERNEL_BIN, " -i " + bin_file.string());
auto hsaco = std::find_if(boost::filesystem::directory_iterator{tmp_dir->path},
boost::filesystem::directory_iterator{},
[](auto entry) { return (entry.path().extension() == ".hsaco"); });
if(hsaco == boost::filesystem::directory_iterator{})
{
// Do not dereference the end iterator; report the directory that was searched.
fdt_log(LogLevel::Info, "HipBuild", "failed to find *.hsaco in ")
<< tmp_dir->path.string() << std::endl;
throw std::runtime_error(filename + ": extractkernel produced no .hsaco file");
}
return hsaco->path();
}
#endif
return bin_file;
#else
(void)filename;
(void)params;
throw std::runtime_error("HIP kernels are only supported in Linux");
#endif
}
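// Public entry point: builds a HIP kernel with testing mode disabled.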
boost::filesystem::path HipBuild(boost::optional<TmpDir>& tmp_dir,
const std::string& filename,
std::string src,
std::string params,
const TargetProperties& target,
const bool sources_already_reside_on_filesystem)
{
return HipBuildImpl(
tmp_dir, filename, src, params, target, false, sources_already_reside_on_filesystem);
}
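// Reads an entire binary file into the given string buffer.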
void bin_file_to_str(const boost::filesystem::path& file, std::string& buf)
{
std::ifstream bin_file_ptr(file.string().c_str(), std::ios::binary);
std::ostringstream bin_file_strm;
bin_file_strm << bin_file_ptr.rdbuf();
buf = bin_file_strm.str();
}
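// Probes the HIP compiler version: for HCC by parsing "--version" output, for
// hip-clang from the HIP package version macros provided at configure time
// (falling back to 3.2.0 when they are not defined).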
static external_tool_version_t HipCompilerVersionImpl()
{
external_tool_version_t version;
if(IsHccCompiler())
{
const std::string path(OLC_HIP_COMPILER);
const std::string mandatory_prefix("(based on HCC ");
do
{
if(path.empty() || !std::ifstream(path).good())
break;
std::stringstream out;
if(online_compile::exec::Run(path + " --version", nullptr, &out) != 0)
break;
std::string line;
while(std::getline(out, line))
{
fdt_log() << line;
auto begin = line.find(mandatory_prefix);
if(begin == std::string::npos)
continue;
begin += mandatory_prefix.size();
int v1 = -1, v2 = -1, v3 = -1;
char c1 = 'X', c2 = 'X';
std::istringstream iss(line.substr(begin));
iss >> v1 >> c1 >> v2 >> c2 >> v3;
if(!iss.fail() && v1 >= 0)
{
version.major = v1;
if(c1 == '.' && v2 >= 0)
{
version.minor = v2;
if(c2 == '.' && v3 >= 0)
version.patch = v3;
}
}
break;
}
} while(false);
}
else
{
#ifdef HIP_PACKAGE_VERSION_MAJOR
fdt_log(
LogLevel::Info, "HipCompilerVersion", "Read version information from HIP package...");
version.major = HIP_PACKAGE_VERSION_MAJOR;
#ifdef HIP_PACKAGE_VERSION_MINOR
version.minor = HIP_PACKAGE_VERSION_MINOR;
#else
version.minor = 0;
#endif
#ifdef HIP_PACKAGE_VERSION_PATCH
version.patch = HIP_PACKAGE_VERSION_PATCH;
#else
version.patch = 0;
#endif
#else // HIP_PACKAGE_VERSION_MAJOR is not defined. CMake failed to find HIP package.
fdt_log(LogLevel::Info, "HipCompilerVersion", "...assuming 3.2.0 (hip-clang RC)");
version.major = 3;
version.minor = 2;
version.patch = 0;
#endif
}
fdt_log() << version.major << '.' << version.minor << '.' << version.patch << std::endl;
return version;
}
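// Cached accessor: the compiler version is probed only once per process.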
external_tool_version_t HipCompilerVersion()
{
// NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables)
static auto once = HipCompilerVersionImpl();
return once;
}
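// Version comparison is lexicographic over (major, minor, patch); the remaining
// relational operators below are derived from operator>.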
bool operator>(const external_tool_version_t& lhs, const external_tool_version_t& rhs)
{
if(lhs.major > rhs.major)
return true;
else if(lhs.major == rhs.major)
{
if(lhs.minor > rhs.minor)
return true;
else if(lhs.minor == rhs.minor)
return (lhs.patch > rhs.patch);
else
return false;
}
else
return false;
}
bool operator<(const external_tool_version_t& lhs, const external_tool_version_t& rhs)
{
return rhs > lhs;
}
bool operator>=(const external_tool_version_t& lhs, const external_tool_version_t& rhs)
{
return !(lhs < rhs);
}
bool operator<=(const external_tool_version_t& lhs, const external_tool_version_t& rhs)
{
return !(lhs > rhs);
}
} // namespace online_compile