Unverified commit 12649254 authored by Chao Liu, committed by GitHub

reorganize files to prepare for MIOpen integration (#51)

* change olc cmake

* adding online compile to fwd-v4r5r2

* update scripts

* rename fwd-v4r5r2 to fwd-v6r1

* clean up
parent fbdf4332
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r5r2_nchw_kcyx_nkhw.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "driver_dynamic_contraction_v1r2.hpp"
template <typename TInWei,
@@ -14,7 +14,7 @@ template <typename TInWei,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
void device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw(
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
@@ -43,11 +43,11 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
const auto in_n_c_hi_wi_desc =
const auto in_desc_n_c_hi_wi =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc =
const auto wei_desc_k_c_y_x =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc =
const auto out_desc_n_k_ho_wo =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
#if 1
@@ -58,32 +58,32 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
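// presumably [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32, by
// analogy with the fp16 branch below; cdata = 64, BlockSize = 256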
constexpr index_t GN0 = 4;
constexpr index_t GK1 = 1;
constexpr index_t GemmGM1PerBlockGM11 = 128;
constexpr index_t GemmGN1PerBlockGN11 = 32;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GM1PerBlockGM11 = 128;
constexpr index_t GN1PerBlockGN11 = 32;
constexpr index_t GK0PerBlock = 8;
constexpr index_t GemmM1PerThreadM111 = 4;
constexpr index_t GemmN1PerThreadN111 = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t BM1PerThreadBM11 = 4;
constexpr index_t BN1PerThreadBN11 = 4;
constexpr index_t BK0PerThread = 1;
constexpr index_t GemmM11N11ThreadClusterM1101 = 2;
constexpr index_t GemmM11N11ThreadClusterN1101 = 2;
constexpr index_t GemmM11N11ThreadClusterM1100 = 8;
constexpr index_t GemmM11N11ThreadClusterN1100 = 8;
constexpr index_t BM10BN10ThreadClusterBM100 = 8;
constexpr index_t BM10BN10ThreadClusterBN100 = 8;
constexpr index_t BM10BN10ThreadClusterBM101 = 2;
constexpr index_t BM10BN10ThreadClusterBN101 = 2;
using GemmABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using GemmABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>;
using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>;
using GemmABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using GemmABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 1>;
using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 1>;
using GemmBBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 1>;
using GemmBBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>;
using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 1>;
using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>;
using GemmBBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using GemmBBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
constexpr index_t GemmCThreadTransferDstScalarPerVector_BN1 = 1;
constexpr index_t CThreadTransferDstScalarPerVector_BN1 = 1;
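// naming: each Sequence<...> 5-tuple indexes the dimensions spelled out in its
// suffix: [GK0, GM0, GM10, GM11, GK1] on the A side and [GK0, GN0, GN10, GN11, GK1]
// on the B side; e.g. a thread slice of Sequence<4, 1, 1, 1, 1> means each
// thread copies 4 elements along GK0.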
#elif 1
// [8, 1, 128, 2] * [8, 4, 32, 2] = [1, 128, 4, 32] for fp16
// cdata = 64, BlockSize = 256
@@ -92,48 +92,48 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
constexpr index_t GN0 = 4;
constexpr index_t GK1 = 2;
constexpr index_t GemmGM1PerBlockGM11 = 128;
constexpr index_t GemmGN1PerBlockGN11 = 32;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GM1PerBlockGM11 = 128;
constexpr index_t GN1PerBlockGN11 = 32;
constexpr index_t GK0PerBlock = 8;
constexpr index_t GemmM1PerThreadM111 = 4;
constexpr index_t GemmN1PerThreadN111 = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t BM1PerThreadBM11 = 4;
constexpr index_t BN1PerThreadBN11 = 4;
constexpr index_t BK0PerThread = 1;
constexpr index_t GemmM11N11ThreadClusterM1101 = 2;
constexpr index_t GemmM11N11ThreadClusterN1101 = 2;
constexpr index_t GemmM11N11ThreadClusterM1100 = 8;
constexpr index_t GemmM11N11ThreadClusterN1100 = 8;
constexpr index_t BM10BN10ThreadClusterBM100 = 8;
constexpr index_t BM10BN10ThreadClusterBN100 = 8;
constexpr index_t BM10BN10ThreadClusterBM101 = 2;
constexpr index_t BM10BN10ThreadClusterBN101 = 2;
using GemmABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 2>;
using GemmABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>;
using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 2>;
using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>;
using GemmABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using GemmABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 2>;
using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 2>;
using GemmBBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 2>;
using GemmBBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>;
using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 2>;
using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>;
using GemmBBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using GemmBBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 2>;
using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 2>;
constexpr index_t GemmCThreadTransferDstScalarPerVector_BN1 = 1;
constexpr index_t CThreadTransferDstScalarPerVector_BN1 = 1;
#endif
const auto descs =
transform_forward_convolution_into_contraction_v4r5r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
Number<GN0>{},
Number<GK1>{});
const auto wei_gk0_gm0_gm1_gk1_grid_desc = descs[I0];
const auto in_gk0_gn0_gn1_gk1_grid_desc = descs[I1];
const auto out_gm0_gm1_gn0_gn1_grid_desc = descs[I2];
transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_desc_k_c_y_x,
in_desc_n_c_hi_wi,
out_desc_n_k_ho_wo,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
Number<GN0>{},
Number<GK1>{});
const auto wei_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
const auto in_grid_desc_gk0_gn0_gn1_gk1 = descs[I1];
const auto out_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
// HACK: hacks that control index calculation when iterating over the A, B, and C matrices
constexpr auto wei_grid_iterator_hacks =
@@ -189,36 +189,36 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
TAcc,
TOut,
InMemoryDataOperation::Set,
decltype(wei_gk0_gm0_gm1_gk1_grid_desc),
decltype(in_gk0_gn0_gn1_gk1_grid_desc),
decltype(out_gm0_gm1_gn0_gn1_grid_desc),
GemmGM1PerBlockGM11,
GemmGN1PerBlockGN11,
GemmKPerBlock,
GemmM1PerThreadM111,
GemmN1PerThreadN111,
GemmKPerThread,
GemmM11N11ThreadClusterM1100,
GemmM11N11ThreadClusterN1100,
GemmM11N11ThreadClusterM1101,
GemmM11N11ThreadClusterN1101,
GemmABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
GemmABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
decltype(wei_grid_desc_gk0_gm0_gm1_gk1),
decltype(in_grid_desc_gk0_gn0_gn1_gk1),
decltype(out_grid_desc_gm0_gm1_gn0_gn1),
GM1PerBlockGM11,
GN1PerBlockGN11,
GK0PerBlock,
BM1PerThreadBM11,
BN1PerThreadBN11,
BK0PerThread,
BM10BN10ThreadClusterBM100,
BM10BN10ThreadClusterBN100,
BM10BN10ThreadClusterBM101,
BM10BN10ThreadClusterBN101,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
Sequence<1, 2, 3, 0, 4>, // ABlockTransferThreadClusterArrangeOrder
Sequence<3, 2, 1, 0, 4>, // ABlockTransferSrcAccessOrder
GemmABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
GemmABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
Sequence<0, 1, 2, 3, 4>, // ABlockTransferSrcVectorTensorContiguousDimOrder
GemmBBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
GemmBBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
Sequence<0, 4, 1, 2, 3>, // BBlockTransferThreadClusterArrangeOrder
Sequence<4, 3, 2, 0, 1>, // BBlockTransferSrcAccessOrder
GemmBBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
GemmBBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
Sequence<0, 1, 2, 3, 4>, // BBlockTransferSrcVectorTensorContiguousDimOrder
Sequence<3, 4, 5, 0, 1, 2>, // CThreadTransferSrcDstAccessOrder
5, // CThreadTransferSrcDstVectorDim
GemmCThreadTransferDstScalarPerVector_BN1,
CThreadTransferDstScalarPerVector_BN1,
decltype(wei_grid_iterator_hacks),
decltype(in_grid_iterator_hacks),
decltype(out_grid_iterator_hacks),
@@ -227,9 +227,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
wei_gk0_gm0_gm1_gk1_grid_desc,
in_gk0_gn0_gn1_gk1_grid_desc,
out_gm0_gm1_gn0_gn1_grid_desc,
wei_grid_desc_gk0_gm0_gm1_gk1,
in_grid_desc_gk0_gn0_gn1_gk1,
out_grid_desc_gm0_gm1_gn0_gn1,
wei_grid_iterator_hacks,
in_grid_iterator_hacks,
out_grid_iterator_hacks,
@@ -238,7 +238,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
nrepeat);
float perf = (float)calculate_convolution_flops(
in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) /
in_desc_n_c_hi_wi, wei_desc_k_c_y_x, out_desc_n_k_ho_wo) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
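// unit check: flops / (1000^3) gives GFlop, and GFlop per millisecond equals
// TFlop/s, matching the unit printed below.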
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
...
include_directories(BEFORE
include
${PROJECT_BINARY_DIR}/host/online_compilation/include
${PROJECT_SOURCE_DIR}/host/online_compilation/include
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/composable_kernel/include/driver
${PROJECT_SOURCE_DIR}/external/rocm/include
${PROJECT_SOURCE_DIR}/external/half/include
)
set(CONV_FWD_DRIVER_ONLINE_SOURCE conv_fwd_driver_online.cpp)
add_executable(conv_fwd_driver_online ${CONV_FWD_DRIVER_ONLINE_SOURCE})
target_link_libraries(conv_fwd_driver_online PRIVATE host_tensor)
target_link_libraries(conv_fwd_driver_online PRIVATE online_compilation)
@@ -12,26 +12,22 @@
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp"
#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#include "handle.hpp"
#include "hipCheck.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_CONV_FWD_V4R4_NCHW 1
#define USE_CONV_FWD_V4R5_NCHW 1
#define USE_CONV_FWD_V6R1_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NHWC 1
#include "conv_tunables.hpp"
#include "handle.hpp"
#include "hipCheck.hpp"
enum ConvForwardAlgo
{
V4R4NCHW, // 0
V4R5NCHW, // 1
V6R1NCHW, // 1
V4R4XDLNCHW, // 2
V4R4XDLNHWC // 3
};
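In the driver, the algorithm is chosen at run time from an integer command-line argument that maps onto this enum; a minimal sketch of that mapping (the exact argv position is an assumption, not taken from this commit):

    // hypothetical: argv[1] carries the algorithm index (0..3, per the enum above)
    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::atoi(argv[1]));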
@@ -94,15 +90,17 @@ int main(int argc, char* argv[])
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
#if 1
constexpr index_t in_vector_size = 1;
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
constexpr index_t in_vector_size = 16;
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
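// note: each branch pairs the element type with a suitably wide accumulator
// (float accumulates in float, half_t in float, int8_t in int32_t) so the
// GEMM reduction does not overflow or lose precision.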
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
@@ -230,9 +228,9 @@ int main(int argc, char* argv[])
tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw;
device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_olc<in_data_t,
acc_data_t,
out_data_t>(
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw<in_data_t,
acc_data_t,
out_data_t>(
handle,
tmp[I0],
tmp[I1],
@@ -249,8 +247,8 @@ int main(int argc, char* argv[])
}
#endif
#if USE_CONV_FWD_V4R5_NCHW
if(algo == ConvForwardAlgo::V4R5NCHW)
#if USE_CONV_FWD_V6R1_NCHW
if(algo == ConvForwardAlgo::V6R1NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
@@ -259,12 +257,11 @@
const auto tmp = f_make_for_device_nchw();
tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw;
const auto tunable = tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw{};
device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc<in_data_t,
acc_data_t,
out_data_t>(
online_device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw<in_data_t,
acc_data_t,
out_data_t>(
handle,
tmp[I0],
tmp[I1],
@@ -294,22 +291,22 @@ int main(int argc, char* argv[])
tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;
device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_olc<in_data_t,
acc_data_t,
out_data_t>(
handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
@@ -326,22 +323,22 @@ int main(int argc, char* argv[])
tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable =
&default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk;
device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_olc<in_data_t,
acc_data_t,
out_data_t>(
handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
...
#ifndef CONV_TUNABLE_FWD_V4R4_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_NCHW_KCYX_NKHW_HPP
struct tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
{
int32_t BlockSize;
int32_t MPerBlock;
int32_t NPerBlock;
int32_t KPerBlock;
int32_t M1PerThread;
int32_t N1PerThread;
int32_t KPerThread;
int32_t M1N1ThreadClusterM10;
int32_t M1N1ThreadClusterN10;
int32_t M1N1ThreadClusterM11;
int32_t M1N1ThreadClusterN11;
std::array<int32_t, 3> ABlockTransferThreadSliceLengths_K_M0_M1;
std::array<int32_t, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
std::array<int32_t, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> ABlockTransferSrcAccessOrder;
int32_t ABlockTransferSrcVectorDim;
int32_t ABlockTransferSrcScalarPerVector;
int32_t ABlockTransferDstScalarPerVector_M1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 3> BBlockTransferThreadSliceLengths_K_N0_N1;
std::array<int32_t, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
std::array<int32_t, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> BBlockTransferSrcAccessOrder;
int32_t BBlockTransferSrcVectorDim;
int32_t BBlockTransferSrcScalarPerVector;
int32_t BBlockTransferDstScalarPerVector_N1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 6> CThreadTransferSrcDstAccessOrder;
int32_t CThreadTransferSrcDstVectorDim;
int32_t CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw = {
    256,                    // BlockSize
    128, 128, 8,            // MPerBlock, NPerBlock, KPerBlock
    4, 4, 1,                // M1PerThread, N1PerThread, KPerThread
    8, 8, 2, 2,             // M1N1ThreadClusterM10/N10/M11/N11
    {4, 1, 1}, {2, 1, 128}, // ABlockTransferThreadSliceLengths_K_M0_M1, ...ThreadClusterLengths_K_M0_M1
    {2, 1, 0}, {2, 1, 0},   // ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder
    0, 4, 1, false,         // ABlockTransferSrcVectorDim, ...SrcScalarPerVector, ...DstScalarPerVector_M1, AThreadTransferSrcResetCoordinateAfterRun
    {4, 1, 1}, {2, 1, 128}, // BBlockTransferThreadSliceLengths_K_N0_N1, ...ThreadClusterLengths_K_N0_N1
    {0, 1, 2}, {0, 1, 2},   // BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder
    2, 1, 1, false,         // BBlockTransferSrcVectorDim, ...SrcScalarPerVector, ...DstScalarPerVector_N1, BThreadTransferSrcResetCoordinateAfterRun
    {3, 4, 5, 0, 1, 2}, 5, 1}; // CThreadTransferSrcDstAccessOrder, ...SrcDstVectorDim, ...DstScalarPerVector
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
struct tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
{
int32_t BlockSize;
int32_t MPerBlock;
int32_t NPerBlock;
int32_t KPerBlock;
int32_t MPerWave;
int32_t NPerWave;
int32_t K1;
int32_t MRepeat;
int32_t NRepeat;
std::array<int32_t, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
std::array<int32_t, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
std::array<int32_t, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> ABlockTransferSrcAccessOrder;
int32_t ABlockTransferSrcVectorDim;
int32_t ABlockTransferSrcScalarPerVector;
int32_t ABlockTransferDstScalarPerVector_K1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
std::array<int32_t, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
std::array<int32_t, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> BBlockTransferSrcAccessOrder;
int32_t BBlockTransferSrcVectorDim;
int32_t BBlockTransferSrcScalarPerVector;
int32_t BBlockTransferDstScalarPerVector_K1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 8> CThreadTransferSrcDstAccessOrder;
int32_t CThreadTransferSrcDstVectorDim;
int32_t CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw = {
256, // BlockSize
128, // MPerBlock,
128, // NPerBlock,
4, // KPerBlock,
32, // MPerWave,
32, // NPerWave,
4, // K1,
2, // MRepeat,
2, // NRepeat,
{1, 2, 4}, // ABlockTransferThreadSliceLengths_K0_M_K1,
{4, 64, 1}, // ABlockTransferThreadClusterLengths_K0_M_K1,
{1, 0, 2}, // ABlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // ABlockTransferSrcAccessOrder,
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector,
4, // ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
{1, 2, 4}, // BBlockTransferThreadSliceLengths_K0_N_K1,
{4, 64, 1}, // BBlockTransferThreadClusterLengths_K0_N_K1,
{0, 2, 1}, // BBlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // BBlockTransferSrcAccessOrder,
1, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_K1
false, // BThreadTransferSrcResetCoordinateAfterRun
{3, 0, 1, 2, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
7, // CThreadTransferSrcDstVectorDim,
1 // CThreadTransferDstScalarPerVector
};
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
struct tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
{
int32_t BlockSize;
int32_t MPerBlock;
int32_t NPerBlock;
int32_t KPerBlock;
int32_t MPerWave;
int32_t NPerWave;
int32_t K1;
int32_t MRepeat;
int32_t NRepeat;
std::array<int32_t, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
std::array<int32_t, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
std::array<int32_t, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> ABlockTransferSrcAccessOrder;
int32_t ABlockTransferSrcVectorDim;
int32_t ABlockTransferSrcScalarPerVector;
int32_t ABlockTransferDstScalarPerVector_K1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
std::array<int32_t, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
std::array<int32_t, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> BBlockTransferSrcAccessOrder;
int32_t BBlockTransferSrcVectorDim;
int32_t BBlockTransferSrcScalarPerVector;
int32_t BBlockTransferDstScalarPerVector_K1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 8> CThreadTransferSrcDstAccessOrder;
int32_t CThreadTransferSrcDstVectorDim;
int32_t CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk = {
256, // BlockSize
128, // MPerBlock,
128, // NPerBlock,
4, // KPerBlock,
32, // MPerWave,
32, // NPerWave,
4, // K1,
2, // MRepeat,
2, // NRepeat,
{1, 2, 4}, // ABlockTransferThreadSliceLengths_K0_M_K1,
{4, 64, 1}, // ABlockTransferThreadClusterLengths_K0_M_K1,
{1, 0, 2}, // ABlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // ABlockTransferSrcAccessOrder,
2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector,
4, // ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
{1, 2, 4}, // BBlockTransferThreadSliceLengths_K0_N_K1,
{4, 64, 1}, // BBlockTransferThreadClusterLengths_K0_N_K1,
{1, 0, 2}, // BBlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // BBlockTransferSrcAccessOrder,
2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_K1
false, // BThreadTransferSrcResetCoordinateAfterRun
{2, 3, 0, 1, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
7, // CThreadTransferSrcDstVectorDim,
1 // CThreadTransferDstScalarPerVector
};
#endif
#ifndef CONV_TUNABLE_FWD_V6R1_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V6R1_NCHW_KCYX_NKHW_HPP
struct tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw
{
int32_t BlockSize = 256;
int32_t GN0 = 4;
int32_t GK1 = 1;
int32_t GM1PerBlockGM11 = 128;
int32_t GN1PerBlockGN11 = 32;
int32_t GK0PerBlock = 8;
int32_t BM1PerThreadBM11 = 4;
int32_t BN1PerThreadBN11 = 4;
int32_t BK0PerThread = 1;
int32_t BM10BN10ThreadClusterBM100 = 2;
int32_t BM10BN10ThreadClusterBN100 = 2;
int32_t BM10BN10ThreadClusterBM101 = 8;
int32_t BM10BN10ThreadClusterBN101 = 8;
std::array<int32_t, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = {4, 1, 1, 1, 1};
std::array<int32_t, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = {
2, 1, 1, 128, 1};
std::array<int32_t, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
4, 1, 1, 1, 1};
std::array<int32_t, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
1, 1, 1, 1, 1};
std::array<int32_t, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = {1, 4, 1, 1, 1};
std::array<int32_t, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = {
8, 1, 1, 32, 1};
std::array<int32_t, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
1, 1, 1, 1, 1};
std::array<int32_t, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
1, 1, 1, 1, 1};
int32_t CThreadTransferDstScalarPerVector = 1;
};
#endif
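Because tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw relies on in-class default member initializers (unlike the v4r4-style tunables above, which ship a static default instance), callers can value-initialize it and override individual knobs. A minimal sketch, with the overridden field chosen purely for illustration:

    // sketch: start from the defaults, then switch GK1 to the fp16-friendly value
    tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw tunable{};
    tunable.GK1 = 2; // hypothetical override; the default is 1
    // the online driver below takes the tunable by const reference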
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "olc_driver_common.hpp"
#include "conv_tunables.hpp"
#include "handle.hpp"
#include "conv_tunable_fwd_v4r4_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw {
@@ -211,7 +209,7 @@ template <typename TInWei,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_olc(
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
olCompile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
...
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "olc_driver_common.hpp"
#include "conv_tunables.hpp"
#include "handle.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw {
@@ -208,7 +206,7 @@ template <typename TInWei,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_olc(
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
olCompile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
...
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "olc_driver_common.hpp"
#include "conv_tunables.hpp"
#include "handle.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk {
@@ -209,7 +207,7 @@ template <typename TInWei,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_olc(
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
olCompile::Handle* handle,
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
...
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r5_nchw_kcyx_nkhw.hpp"
#include "olc_driver_common.hpp"
#include "conv_tunables.hpp"
#include "handle.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "conv_tunable_fwd_v6r1_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw {
namespace detail_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
std::string out;
std::string out("DAT_");
out += static_cast<char>(Driver::get_typeid_from_type<TInWei>()) +
static_cast<char>(Driver::get_typeid_from_type<TAcc>()) +
@@ -24,80 +22,97 @@ static std::string get_network_config_string_from_types()
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* pt)
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw& tunable)
{
std::string out("TUN_");
out += std::to_string(pt->BlockSize) + "_";
out += std::to_string(pt->GM1PerBlockGM11) + "x" + std::to_string(pt->GN1PerBlockGN11) + "x" +
std::to_string(pt->KPerBlock) + "_";
out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" +
std::to_string(pt->KPerThread) + "_";
out += std::to_string(pt->M1N1ThreadClusterM10) + "x" +
std::to_string(pt->M1N1ThreadClusterN10) + "x" +
std::to_string(pt->M1N1ThreadClusterM11) + "x" +
std::to_string(pt->M1N1ThreadClusterN11) + "_";
out += std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[0]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[1]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[2]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[3]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[2]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[3]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[3]) + "_";
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[3]) + "_";
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->ABlockTransferDstScalarPerVector_GM11) + "_";
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[0]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[1]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[2]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[3]);
out += std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[2]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[3]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[3]) + "_";
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[3]) + "_";
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->BBlockTransferDstScalarPerVector_GN11) + "_";
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_";
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
out += std::to_string(tunable.BlockSize) + "_";
out += std::to_string(tunable.GN0) + "x" + std::to_string(tunable.GK1) + "_";
out += std::to_string(tunable.GM1PerBlockGM11) + "x" + std::to_string(tunable.GN1PerBlockGN11) +
"x" + std::to_string(tunable.GK0PerBlock) + "_";
out += std::to_string(tunable.BM1PerThreadBM11) + "x" +
std::to_string(tunable.BN1PerThreadBN11) + "x" + std::to_string(tunable.BK0PerThread) +
"_";
out += std::to_string(tunable.BM10BN10ThreadClusterBM100) + "x" +
std::to_string(tunable.BM10BN10ThreadClusterBN100) + "x" +
std::to_string(tunable.BM10BN10ThreadClusterBM101) + "x" +
std::to_string(tunable.BM10BN10ThreadClusterBN101) + "_";
out += std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4]) + "_";
out +=
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4]) + "_";
out += std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
"_";
out += std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
"_";
out += std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4]) + "_";
out +=
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4]) + "_";
out += std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
"_";
out += std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
"_";
out += std::to_string(tunable.CThreadTransferDstScalarPerVector);
return (out);
};
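For reference, feeding the default-constructed v6r1 tunable through this function produces the cache key below (derived by hand from the concatenation above, so treat it as illustrative):

    // TUN_256_4x1_128x32x8_4x4x1_2x2x8x8_4x1x1x1x1_2x1x1x128x1_4x1x1x1x1_1x1x1x1x1_1x4x1x1x1_8x1x1x32x1_1x1x1x1x1_1x1x1x1x1_1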
@@ -108,114 +123,120 @@ static std::string get_definition_string_from_types()
std::string out;
out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TInWei>()) +
" -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
" -DCK_PARAM_ACC_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
" -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TOut>());
return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* pt)
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw& tunable)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
out += " -DCK_PARAM_GM1PerBlockGM11=" + std::to_string(pt->GM1PerBlockGM11) +
" -DCK_PARAM_GN1PerBlockGN11=" + std::to_string(pt->GN1PerBlockGN11) +
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) +
" -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) +
" -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread);
out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) +
" -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) +
" -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) +
" -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11=" +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[0]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[1]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[2]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[3]);
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11=" +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[2]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[3]);
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[3]);
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[3]);
out += " -DCK_PARAM_BlockSize=" + std::to_string(tunable.BlockSize);
out += " -DCK_PARAM_GN0=" + std::to_string(tunable.GN0);
out += " -DCK_PARAM_GK1=" + std::to_string(tunable.GK1);
out += " -DCK_PARAM_GM1PerBlockGM11=" + std::to_string(tunable.GM1PerBlockGM11) +
" -DCK_PARAM_GN1PerBlockGN11=" + std::to_string(tunable.GN1PerBlockGN11) +
" -DCK_PARAM_GK0PerBlock=" + std::to_string(tunable.GK0PerBlock);
out += " -DCK_PARAM_BM1PerThreadBM11=" + std::to_string(tunable.BM1PerThreadBM11) +
" -DCK_PARAM_BN1PerThreadBN11=" + std::to_string(tunable.BN1PerThreadBN11) +
" -DCK_PARAM_BK0PerThread=" + std::to_string(tunable.BK0PerThread);
out += " -DCK_PARAM_BM10BN10ThreadClusterBM100=" +
std::to_string(tunable.BM10BN10ThreadClusterBM100) +
" -DCK_PARAM_BM10BN10ThreadClusterBN100=" +
std::to_string(tunable.BM10BN10ThreadClusterBN100) +
" -DCK_PARAM_BM10BN10ThreadClusterBM101=" +
std::to_string(tunable.BM10BN10ThreadClusterBM101) +
" -DCK_PARAM_BM10BN10ThreadClusterBN101=" +
std::to_string(tunable.BM10BN10ThreadClusterBN101);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4]);
out +=
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
std::to_string(pt->ABlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_GM11=" +
std::to_string(pt->ABlockTransferDstScalarPerVector_GM11);
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11=" +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[0]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[1]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[2]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[3]);
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11=" +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[2]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[3]);
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[3]);
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[3]);
" -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4]);
out += " -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]);
out += " -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4]);
out +=
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
std::to_string(pt->BBlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_GN11=" +
std::to_string(pt->BBlockTransferDstScalarPerVector_GN11);
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]);
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
std::to_string(pt->CThreadTransferSrcDstVectorDim);
" -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4]);
out += " -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]);
out += " -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(pt->CThreadTransferDstScalarPerVector);
std::to_string(tunable.CThreadTransferDstScalarPerVector);
return (out);
};
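Combined with get_definition_string_from_types, the default tunable expands into a compiler option list that begins roughly as follows (reconstructed by hand from the concatenation above; illustrative, not captured from an actual run):

    // -DCK_PARAM_BlockSize=256 -DCK_PARAM_GN0=4 -DCK_PARAM_GK1=1
    // -DCK_PARAM_GM1PerBlockGM11=128 -DCK_PARAM_GN1PerBlockGN11=32 -DCK_PARAM_GK0PerBlock=8
    // -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=4,1,1,1,1 ...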
} // namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw
} // namespace detail_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw
template <typename TInWei,
typename TAcc,
@@ -227,7 +248,7 @@ template <typename TInWei,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw(
olCompile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
@@ -239,15 +260,13 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* tunable,
const tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw& tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw;
using namespace detail_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw;
using size_t = std::size_t;
constexpr index_t N0 = 4; // this cannot be a tunable so far
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The following code is only used to compute grid_size, hasMainKBlockLoop, and
// hasDoubleTailKBlockLoop
@@ -264,25 +283,27 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
const auto descs = transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad<N0>(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto descs =
transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
tunable.GN0,
tunable.GK1);
const auto a_gk_gm0_gm1_grid_desc = descs[I0];
const auto c_gm0_gm1_gn0_gn1_grid_desc = descs[I2];
const auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
const auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
const auto GM1 = c_gm0_gm1_gn0_gn1_grid_desc.GetLength(I1);
const auto GN1 = c_gm0_gm1_gn0_gn1_grid_desc.GetLength(I3);
const auto GK = a_gk_gm0_gm1_grid_desc.GetLength(I0);
const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1);
const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3);
const auto GK = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0);
const index_t grid_size = (GM1 / tunable->GM1PerBlockGM11) * (GN1 / tunable->GN1PerBlockGN11);
const bool hasMainKBlockLoop = ((GK + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1);
const bool hasDoubleTailKBlockLoop = ((GK / tunable->KPerBlock) % 2 == 0);
const index_t grid_size = (GM1 / tunable.GM1PerBlockGM11) * (GN1 / tunable.GN1PerBlockGN11);
const bool hasMainKBlockLoop = ((GK + tunable.GK0PerBlock) / (2 * tunable.GK0PerBlock) > 1);
const bool hasDoubleTailKBlockLoop = ((GK / tunable.GK0PerBlock) % 2 == 0);
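// worked example (hypothetical problem size): with GM1 = 1024 and GN1 = 256,
// grid_size = (1024 / 128) * (256 / 32) = 8 * 8 = 64 workgroups; with GK = 64,
// hasMainKBlockLoop = ((64 + 8) / (2 * 8) > 1) = true and
// hasDoubleTailKBlockLoop = ((64 / 8) % 2 == 0) = true.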
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -299,20 +320,20 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
// workspace API
DeviceMem workspace_buf(4096);
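// the 4096-byte workspace is carved into four 1024-byte slots, one per
// device-side descriptor; the *_prepare kernel below fills them and the main
// kernel then reads them back.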
void* a_gk_gm0_gm10_gm11_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_gk_gn0_gn10_gn11_grid_desc_dev_buf =
void* a_grid_desc_gk0_gm0_gm10_gm11_gk1_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_grid_desc_gk0_gn0_gn10_gn11_gk1_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf =
void* c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf =
void* c_grid_block_cluster_blockid_to_gm10_gn10_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
const std::vector<size_t> vld = {static_cast<size_t>(tunable.BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable.BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable.BlockSize), 1, 1};
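// vgd1 launches a single workgroup for the *_prepare kernel; vgd2 sizes the
// main launch in work-items (grid_size * BlockSize) with vld as the workgroup
// size, following the OpenCL-style global/local convention the
// online-compilation Handle presumably expects.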
std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_nchw";
std::string program_name = "dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v6r1_nchw";
std::string param = " -std=c++17 ";
std::string network_config;
@@ -320,10 +341,10 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
param += get_definition_string_from_types<TInWei, TAcc, TOut>() +
" -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
" -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop) +
" -DCK_PARAM_N0=" + std::to_string(N0) + " " +
get_definition_string_from_tunable(tunable);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_V" +
std::to_string(hasDoubleTailKBlockLoop) + "_" + std::to_string(N0) + "_" +
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
std::to_string(hasDoubleTailKBlockLoop) + "_" +
get_network_config_string_from_tunable(tunable);
std::vector<float> kernel1_times;
@@ -334,7 +355,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_prepare";
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
@@ -354,13 +375,13 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_gk_gm0_gm10_gm11_grid_desc_dev_buf,
b_gk_gn0_gn10_gn11_grid_desc_dev_buf,
c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf,
c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf);
a_grid_desc_gk0_gm0_gm10_gm11_gk1_dev_buf,
b_grid_desc_gk0_gn0_gn10_gn11_gk1_dev_buf,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1_dev_buf,
c_grid_block_cluster_blockid_to_gm10_gn10_dev_buf);
timer2.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw";
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
@@ -368,10 +389,10 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(a_gk_gm0_gm10_gm11_grid_desc_dev_buf),
(const void*)(b_gk_gn0_gn10_gn11_grid_desc_dev_buf),
(const void*)(c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf),
(const void*)(c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf));
(const void*)(a_grid_desc_gk0_gm0_gm10_gm11_gk1_dev_buf),
(const void*)(b_grid_desc_gk0_gn0_gn10_gn11_gk1_dev_buf),
(const void*)(c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1_dev_buf),
(const void*)(c_grid_block_cluster_blockid_to_gm10_gn10_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
...
include_directories(BEFORE
include
)
set(HOST_TENSOR_SOURCE
src/host_tensor.cpp;
src/device.cpp;
)
## the library target
add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE})
target_link_libraries(host_tensor PRIVATE hip::device)
target_link_libraries(host_tensor INTERFACE hip::host)
target_compile_features(host_tensor PUBLIC)
set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS host_tensor LIBRARY DESTINATION lib)
@@ -2,7 +2,8 @@
#define DEVICE_HPP
#include <memory>
#include "config.hpp"
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
struct DeviceMem
{
@@ -30,7 +31,6 @@ struct KernelTimer
std::unique_ptr<KernelTimerImpl> impl;
};
#if CK_DEVICE_BACKEND_AMD
using device_stream_t = hipStream_t;
template <typename... Args, typename F>
@@ -83,44 +83,4 @@ float launch_and_time_kernel(F kernel,
return timer.GetElapsedTime() / nrepeat;
}
#elif CK_DEVICE_BACKEND_NVIDIA
using device_stream_t = cudaStream_t;
template <typename... Args, typename F>
void launch_kernel(F kernel,
dim3 grid_dim,
dim3 block_dim,
std::size_t lds_byte,
cudaStream_t stream_id,
Args... args)
{
const void* f = reinterpret_cast<const void*>(kernel);
void* p_args[] = {&args...};
cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, lds_byte, stream_id);
}
template <typename... Args, typename F>
float launch_and_time_kernel(F kernel,
dim3 grid_dim,
dim3 block_dim,
std::size_t lds_byte,
cudaStream_t stream_id,
Args... args)
{
KernelTimer timer;
const void* f = reinterpret_cast<const void*>(kernel);
void* p_args[] = {&args...};
timer.Start();
cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, lds_byte, stream_id);
timer.End();
return timer.GetElapsedTime();
}
#endif
#endif