"research/object_detection/g3doc/preparing_inputs.md" did not exist on "477ed41e7e4e8a8443bc633846eb01e2182dc68a"
Commit 3dd0cc31 authored by Jing Zhang's avatar Jing Zhang
Browse files

merge

parents c21521a1 198593d5
......@@ -362,173 +362,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
const decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
const decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
const decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
const decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
const decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
const decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
const decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
const decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
const decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
const decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
const decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
const decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) /
......@@ -564,111 +397,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
true,
true>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
p_out_global);
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
true,
false>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
p_out_global);
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
false,
true>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
p_out_global);
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
false,
false>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
p_out_global);
}
}
......@@ -903,160 +740,9 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
const bool has_main_k_block_loop = (GemmK + GemmKPerBlock) / (2 * GemmKPerBlock) > 1;
const bool has_double_tail_k_block_loop = (GemmK / GemmKPerBlock) % 2 == 0;
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc),
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc),
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
wei_gemmk_gemmm_global_desc,
p_wei_global,
in_gemmk_gemmn_global_desc,
p_in_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc),
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc),
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
wei_gemmk_gemmm_global_desc,
p_wei_global,
in_gemmk_gemmn_global_desc,
p_in_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc),
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc),
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
wei_gemmk_gemmm_global_desc,
p_wei_global,
in_gemmk_gemmn_global_desc,
p_in_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc),
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc),
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
wei_gemmk_gemmm_global_desc,
p_wei_global,
in_gemmk_gemmn_global_desc,
p_in_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
const bool has_double_tail_k_block_loop = (GemmK / GemmKPerBlock) % 2 == 0;
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
......@@ -1072,12 +758,12 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
decltype(wei_gemmk_gemmm_global_desc),
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
decltype(in_gemmk_gemmn_global_desc),
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
......@@ -1087,15 +773,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
wei_gemmk_gemmm_global_desc,
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
in_gemmk_gemmn_global_desc,
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
......@@ -1104,12 +786,12 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
decltype(wei_gemmk_gemmm_global_desc),
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
decltype(in_gemmk_gemmn_global_desc),
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
......@@ -1119,15 +801,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
wei_gemmk_gemmm_global_desc,
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
in_gemmk_gemmn_global_desc,
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
......@@ -1136,12 +814,12 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
decltype(wei_gemmk_gemmm_global_desc),
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
decltype(in_gemmk_gemmn_global_desc),
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
......@@ -1151,15 +829,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
wei_gemmk_gemmm_global_desc,
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
in_gemmk_gemmn_global_desc,
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
......@@ -1168,12 +842,12 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
decltype(wei_gemmk_gemmm_global_desc),
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
decltype(in_gemmk_gemmn_global_desc),
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
......@@ -1183,15 +857,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
wei_gemmk_gemmm_global_desc,
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
in_gemmk_gemmn_global_desc,
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
......@@ -1237,111 +907,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
true,
true>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
p_out_global);
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
true,
false>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
p_out_global);
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
false,
true>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
p_out_global);
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
false,
false>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
p_out_global);
}
}
......@@ -1694,173 +1368,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) /
......@@ -1896,111 +1403,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
true,
true>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
p_out_global);
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
true,
false>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
p_out_global);
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
false,
true>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
p_out_global);
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
false,
false>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
p_out_global);
}
}
......
......@@ -363,171 +363,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
......@@ -561,111 +396,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
true,
true>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
p_out_global);
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
true,
false>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
p_out_global);
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
false,
true>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
p_out_global);
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
false,
false>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
p_out_global);
}
}
......@@ -1017,171 +856,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
......@@ -1215,114 +889,117 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
true,
true>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
p_out_global);
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
true,
false>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
p_out_global);
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
false,
true>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
p_out_global);
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
false,
false>;
launch_kernel(kernel,
launch_kernel(
kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(void __CONSTANT__*)
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
(void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
(void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
p_out_global);
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
......
......@@ -181,7 +181,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
#if 1
// GEMM
using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v3<
BlockSize,
......@@ -372,7 +371,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#endif
}
};
} // namespace ck
......
......@@ -133,12 +133,13 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
constexpr auto EPerBlock = a_block_mtx.GetLength(I0);
constexpr auto KPerThreadSubC = 4;
constexpr auto HPerThreadSubC = 2;
constexpr auto WPerThreadSubC = 2;
constexpr auto HoPerThreadSubC = 2;
constexpr auto WoPerThreadSubC = 2;
static_assert(KPerThread % KPerThreadSubC == 0, "");
static_assert(HPerThread % HPerThreadSubC == 0, "");
static_assert(WPerThread % WPerThreadSubC == 0, "");
static_assert(HPerThread % HoPerThreadSubC == 0, "");
static_assert(WPerThread % WoPerThreadSubC == 0, "");
// thread A, B for GEMM
constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
......@@ -161,8 +162,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v3<decltype(a_thread_mtx),
decltype(b_thread_mtx),
decltype(c_thread_mtx),
HPerThreadSubC,
WPerThreadSubC>{};
HoPerThreadSubC,
WoPerThreadSubC>{};
// loop over k
#pragma unroll
for(index_t e_begin = 0; e_begin < EPerBlock; e_begin += EPerThreadLoop)
......@@ -176,10 +177,10 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
p_a_thread);
#pragma unroll
for(index_t h_begin = 0; h_begin < HPerThread; h_begin += HPerThreadSubC)
for(index_t h_begin = 0; h_begin < HPerThread; h_begin += HoPerThreadSubC)
{
#pragma unroll
for(index_t w_begin = 0; w_begin < WPerThread; w_begin += WPerThreadSubC)
for(index_t w_begin = 0; w_begin < WPerThread; w_begin += WoPerThreadSubC)
{
threadwise_gemm.Run(p_a_thread,
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(
......
......@@ -11,6 +11,47 @@
namespace ck {
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
// pass tensor descriptor by __CONSTANT__ void pointer
// __CONSTANT__ is needed to inform compiler void pointers in the kernel signature are pointing to
// non-modifiable parameter address space, so compiler can enable corresponding optimization
template <typename GridwiseGemm,
typename AGlobalDesc,
typename FloatA,
typename BGlobalDesc,
typename FloatB,
typename CGlobalDesc,
typename FloatC,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void run_gridwise_dynamic_gemm_v1(const void __CONSTANT__* p_a_k_m_global_desc,
const FloatA* __restrict__ p_a_global,
const void __CONSTANT__* p_b_k_n_global_desc,
const FloatB* __restrict__ p_b_global,
const void __CONSTANT__* p_c_m0_m1_n0_n1_global_desc,
FloatC* __restrict__ p_c_global)
{
// first cast void __CONSTANT__* to void*
// second cast void* to Desc*
// the copy constructor of tensor descriptor doesn't take address_space(4)
const auto a_k_m_global_desc =
*reinterpret_cast<const AGlobalDesc*>((const void*)p_a_k_m_global_desc);
const auto b_k_n_global_desc =
*reinterpret_cast<const BGlobalDesc*>((const void*)p_b_k_n_global_desc);
const auto c_m0_m1_n0_n1_global_desc =
*reinterpret_cast<const CGlobalDesc*>((const void*)p_c_m0_m1_n0_n1_global_desc);
GridwiseGemm{}.Run(a_k_m_global_desc,
p_a_global,
b_k_n_global_desc,
p_b_global,
c_m0_m1_n0_n1_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
#endif
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
......@@ -427,7 +468,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1
}
}
// pass tensor descriptor by reference
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const AGlobalDesc& a_k_m_global_desc,
const FloatAB* __restrict__ p_a_global,
......@@ -452,57 +492,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
// pass tensor descriptors by pointers
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const AGlobalDesc* p_a_k_m_global_desc,
const FloatAB* __restrict__ p_a_global,
const BGlobalDesc* p_b_k_n_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc* p_c_m0_m1_n0_n1_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
const auto a_k_m_global_desc = *p_a_k_m_global_desc;
const auto b_k_n_global_desc = *p_b_k_n_global_desc;
const auto c_m0_m1_n0_n1_global_desc = *p_c_m0_m1_n0_n1_global_desc;
Run(a_k_m_global_desc,
p_a_global,
b_k_n_global_desc,
p_b_global,
c_m0_m1_n0_n1_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
// pass tensor descriptors by void*
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const void* p_a_k_m_global_desc,
const FloatAB* __restrict__ p_a_global,
const void* p_b_k_n_global_desc,
const FloatAB* __restrict__ p_b_global,
const void* p_c_m0_m1_n0_n1_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
const auto a_k_m_global_desc = *reinterpret_cast<const AGlobalDesc*>(p_a_k_m_global_desc);
const auto b_k_n_global_desc = *reinterpret_cast<const BGlobalDesc*>(p_b_k_n_global_desc);
const auto c_m0_m1_n0_n1_global_desc =
*reinterpret_cast<const CGlobalDesc*>(p_c_m0_m1_n0_n1_global_desc);
Run(a_k_m_global_desc,
p_a_global,
b_k_n_global_desc,
p_b_global,
c_m0_m1_n0_n1_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
};
} // namespace ck
......
......@@ -272,21 +272,28 @@ __device__ void amd_assembly_outer_product_1x4(int8x8_t a,
int32_t& c2,
int32_t& c3)
{
amd_assembly_outer_product_1x4(a.Vectors(Number<4>{})[Number<0>{}],
b0.Vectors(Number<4>{})[Number<0>{}],
b1.Vectors(Number<4>{})[Number<0>{}],
b2.Vectors(Number<4>{})[Number<0>{}],
b3.Vectors(Number<4>{})[Number<0>{}],
const int8x4_t* p_a_int8x4_t = reinterpret_cast<const int8x4_t*>(&a);
const int8x4_t* p_b0_int8x4_t = reinterpret_cast<const int8x4_t*>(&b0);
const int8x4_t* p_b1_int8x4_t = reinterpret_cast<const int8x4_t*>(&b1);
const int8x4_t* p_b2_int8x4_t = reinterpret_cast<const int8x4_t*>(&b2);
const int8x4_t* p_b3_int8x4_t = reinterpret_cast<const int8x4_t*>(&b3);
amd_assembly_outer_product_1x4(p_a_int8x4_t[0],
p_b0_int8x4_t[0],
p_b1_int8x4_t[0],
p_b2_int8x4_t[0],
p_b3_int8x4_t[0],
c0,
c1,
c2,
c3);
amd_assembly_outer_product_1x4(a.Vectors(Number<4>{})[Number<1>{}],
b0.Vectors(Number<4>{})[Number<1>{}],
b1.Vectors(Number<4>{})[Number<1>{}],
b2.Vectors(Number<4>{})[Number<1>{}],
b3.Vectors(Number<4>{})[Number<1>{}],
amd_assembly_outer_product_1x4(p_a_int8x4_t[1],
p_b0_int8x4_t[1],
p_b1_int8x4_t[1],
p_b2_int8x4_t[1],
p_b3_int8x4_t[1],
c0,
c1,
c2,
......@@ -302,22 +309,30 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
int32_t& c1,
int32_t& c2,
int32_t& c3)
{
amd_assembly_outer_product_1x4(a.Vectors(Number<8>{})[Number<0>{}],
b0.Vectors(Number<8>{})[Number<0>{}],
b1.Vectors(Number<8>{})[Number<0>{}],
b2.Vectors(Number<8>{})[Number<0>{}],
b3.Vectors(Number<8>{})[Number<0>{}],
const int8x8_t* p_a_int8x8_t = reinterpret_cast<const int8x8_t*>(&a);
const int8x8_t* p_b0_int8x8_t = reinterpret_cast<const int8x8_t*>(&b0);
const int8x8_t* p_b1_int8x8_t = reinterpret_cast<const int8x8_t*>(&b1);
const int8x8_t* p_b2_int8x8_t = reinterpret_cast<const int8x8_t*>(&b2);
const int8x8_t* p_b3_int8x8_t = reinterpret_cast<const int8x8_t*>(&b3);
amd_assembly_outer_product_1x4(p_a_int8x8_t[0],
p_b0_int8x8_t[0],
p_b1_int8x8_t[0],
p_b2_int8x8_t[0],
p_b3_int8x8_t[0],
c0,
c1,
c2,
c3);
amd_assembly_outer_product_1x4(a.Vectors(Number<8>{})[Number<1>{}],
b0.Vectors(Number<8>{})[Number<1>{}],
b1.Vectors(Number<8>{})[Number<1>{}],
b2.Vectors(Number<8>{})[Number<1>{}],
b3.Vectors(Number<8>{})[Number<1>{}],
amd_assembly_outer_product_1x4(p_a_int8x8_t[1],
p_b0_int8x8_t[1],
p_b1_int8x8_t[1],
p_b2_int8x8_t[1],
p_b3_int8x8_t[1],
c0,
c1,
c2,
......
......@@ -7,13 +7,20 @@
#endif
#include "bfloat16_dev.hpp"
// address space for kernel parameter
#define __CONSTANT__ __attribute__((address_space(4)))
// device backend
#define CK_DEVICE_BACKEND_AMD 1
// GPU ID
#define CK_AMD_GPU_GFX906 0
#define CK_AMD_GPU_GFX908 0
#if 0
#define CK_AMD_GPU_GFX906 1
#elif 0
#define CK_AMD_GPU_GFX908 1
#elif 1
#define CK_AMD_GPU_GFX1030 1
#endif
// HIP version
#ifndef CK_HIP_VERSION_FLAT
......@@ -29,9 +36,9 @@
#endif
// buffer resourse
#if CK_AMD_GPU_GFX906 || CK_AMD_GPU_GFX908
#if defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif CK_AMD_GPU_GFX1030
#elif defined(CK_AMD_GPU_GFX1030)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#endif
......@@ -104,9 +111,8 @@
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif
// pass tensor descriptor by value, pointer or void*
// pass tensor descriptor by value or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
// hack: have underlying assumption that need to be satsified, otherwise it's a bug
......
......@@ -224,7 +224,6 @@ struct vector_type<T, 16>
__host__ __device__ constexpr auto& Vectors(Number<16>) { return data_.d16x1_; }
};
// fp32
using float2_t = typename vector_type<float, 2>::type;
using float4_t = typename vector_type<float, 4>::type;
......@@ -439,8 +438,8 @@ struct vector_type<int8_t, 16>
// hack for int8x4_t, because compiler does not have native support for int8x4_t
// int8x4_t is defined as int32_t
using int8x4_t = typename vector_type<int8_t, 4>::type;
using int8x8_t = vector_type<int8_t, 8>;
using int8x16_t = vector_type<int8_t, 16>;
using int8x8_t = typename vector_type<int8_t, 8>::type;
using int8x16_t = typename vector_type<int8_t, 16>::type;
// data type conversion
template <typename T>
......
......@@ -175,26 +175,18 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
#endif
constexpr auto conv_driver =
// DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad<
#if 0
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad<
#else
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad<
#endif
BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type,
TAcc,
TOut,
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
BThreadTransferSrcScalarPerVector_W,
CThreadTransferDstScalarPerVector_W>{};
typename vector_type<TInWei, InWeiVectorSize>::type, TAcc, TOut, KPerBlock,
HoPerBlock, WoPerBlock, EPerBlock, KPerThread, HoPerThread, WoPerThread,
EPerThread, ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K, ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K, BThreadTransferSrcScalarPerVector_W,
CThreadTransferDstScalarPerVector_W > {};
conv_driver.Run(wei_k_c0_y_x_desc,
in_n_c0_hi_wi_desc,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment