Commit 6bf45709 authored by Chao Liu's avatar Chao Liu
Browse files

remove passing by pointer* (only use pass by value and void*), clean up

parent af13f822
...@@ -362,157 +362,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad ...@@ -362,157 +362,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
float ave_time = timer.GetElapsedTime() / nrepeat; float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
true,
true>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(const ADesc __CONSTANT__*)reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
(const BDesc __CONSTANT__*)reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
(const CDesc __CONSTANT__*)reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global);
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
true,
false>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(const ADesc __CONSTANT__*)reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
(const BDesc __CONSTANT__*)reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
(const CDesc __CONSTANT__*)reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global);
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
false,
true>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(const ADesc __CONSTANT__*)reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
(const BDesc __CONSTANT__*)reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
(const CDesc __CONSTANT__*)reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global);
}
else
{
const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
ADesc,
FloatAB,
BDesc,
FloatAB,
CDesc,
FloatC,
false,
false>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
(const ADesc __CONSTANT__*)reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
(const BDesc __CONSTANT__*)reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
(const CDesc __CONSTANT__*)reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global);
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc, wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) / out_n_k_ho_wo_global_desc) /
...@@ -1031,7 +880,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad ...@@ -1031,7 +880,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl; << std::endl;
} }
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER #elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc); using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc); using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc); using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
...@@ -1058,278 +907,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad ...@@ -1058,278 +907,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
{ {
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
run_gridwise_operation<gridwise_gemm, ADesc,
decltype(wei_gemmk_gemmm_global_desc)*, FloatAB,
const FloatAB*, BDesc,
decltype(in_gemmk_gemmn_global_desc)*, FloatAB,
const FloatAB*, CDesc,
decltype( FloatC,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*, true,
FloatC*, true>;
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
reinterpret_cast<const ADesc*>( 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), (void __CONSTANT__*)
p_wei_global, wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
reinterpret_cast<const BDesc*>( p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_in_global,
reinterpret_cast<const CDesc*>( (void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()), .GetDeviceBuffer(),
p_out_global, p_out_global);
integral_constant<bool, true>{},
integral_constant<bool, true>{});
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
run_gridwise_operation<gridwise_gemm, ADesc,
decltype(wei_gemmk_gemmm_global_desc)*, FloatAB,
const FloatAB*, BDesc,
decltype(in_gemmk_gemmn_global_desc)*, FloatAB,
const FloatAB*, CDesc,
decltype( FloatC,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*, true,
FloatC*, false>;
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
reinterpret_cast<const ADesc*>( 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), (void __CONSTANT__*)
p_wei_global, wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
reinterpret_cast<const BDesc*>( p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_in_global,
reinterpret_cast<const CDesc*>( (void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()), .GetDeviceBuffer(),
p_out_global, p_out_global);
integral_constant<bool, true>{},
integral_constant<bool, false>{});
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
run_gridwise_operation<gridwise_gemm, ADesc,
decltype(wei_gemmk_gemmm_global_desc)*, FloatAB,
const FloatAB*, BDesc,
decltype(in_gemmk_gemmn_global_desc)*, FloatAB,
const FloatAB*, CDesc,
decltype( FloatC,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*, false,
FloatC*, true>;
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
reinterpret_cast<const ADesc*>( 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), (void __CONSTANT__*)
p_wei_global, wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
reinterpret_cast<const BDesc*>( p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_in_global,
reinterpret_cast<const CDesc*>( (void __CONSTANT__*)
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()), .GetDeviceBuffer(),
p_out_global, p_out_global);
integral_constant<bool, false>{},
integral_constant<bool, true>{});
} }
else else
{ {
const auto kernel = const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
run_gridwise_operation<gridwise_gemm, ADesc,
decltype(wei_gemmk_gemmm_global_desc)*, FloatAB,
const FloatAB*, BDesc,
decltype(in_gemmk_gemmn_global_desc)*, FloatAB,
const FloatAB*, CDesc,
decltype( FloatC,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*, false,
FloatC*, false>;
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_wei_global,
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
p_in_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer(),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const void*,
const FloatAB*,
const void*,
const FloatAB*,
const void*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
} }
...@@ -1682,173 +1368,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1 ...@@ -1682,173 +1368,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
float ave_time = timer.GetElapsedTime() / nrepeat; float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc, wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) / out_n_k_ho_wo_global_desc) /
...@@ -1884,111 +1403,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1 ...@@ -1884,111 +1403,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
{ {
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else else
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
} }
......
...@@ -363,171 +363,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad ...@@ -363,171 +363,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
float ave_time = timer.GetElapsedTime() / nrepeat; float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time; (std::size_t(1000) * 1000 * 1000) / ave_time;
...@@ -561,111 +396,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad ...@@ -561,111 +396,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
{ {
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else else
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
} }
...@@ -1017,171 +856,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1 ...@@ -1017,171 +856,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
float ave_time = timer.GetElapsedTime() / nrepeat; float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time; (std::size_t(1000) * 1000 * 1000) / ave_time;
...@@ -1215,114 +889,117 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1 ...@@ -1215,114 +889,117 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
{ {
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else else
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
} }
timer.End(); timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat; float ave_time = timer.GetElapsedTime() / nrepeat;
......
...@@ -11,70 +11,7 @@ ...@@ -11,70 +11,7 @@
namespace ck { namespace ck {
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE #if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
// pass tensor descriptor by value
template <typename GridwiseGemm,
typename AGlobalDesc,
typename FloatA,
typename BGlobalDesc,
typename FloatB,
typename CGlobalDesc,
typename FloatC,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void run_gridwise_dynamic_gemm_v1(const AGlobalDesc a_k_m_global_desc,
const FloatA* __restrict__ p_a_global,
const BGlobalDesc b_k_n_global_desc,
const FloatB* __restrict__ p_b_global,
const CGlobalDesc c_m0_m1_n0_n1_global_desc,
FloatC* __restrict__ p_c_global)
{
GridwiseGemm{}.Run(a_k_m_global_desc,
p_a_global,
b_k_n_global_desc,
p_b_global,
c_m0_m1_n0_n1_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
// pass tensor descriptor by __CONSTANT__ pointer
// __CONSTANT__ is needed to inform compiler pointers in the kernel signature are pointing to
// non-modifiable parameter address space, so compiler can enable corresponding optimization
template <typename GridwiseGemm,
typename AGlobalDesc,
typename FloatA,
typename BGlobalDesc,
typename FloatB,
typename CGlobalDesc,
typename FloatC,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void
run_gridwise_dynamic_gemm_v1(const AGlobalDesc __CONSTANT__* p_a_k_m_global_desc,
const FloatA* __restrict__ p_a_global,
const BGlobalDesc __CONSTANT__* p_b_k_n_global_desc,
const FloatB* __restrict__ p_b_global,
const CGlobalDesc __CONSTANT__* p_c_m0_m1_n0_n1_global_desc,
FloatC* __restrict__ p_c_global)
{
// cast pointer to address_space(1), because the copy constructor of tensor descriptor is for
// address_space(1)
const auto a_k_m_global_desc = *(const AGlobalDesc*)p_a_k_m_global_desc;
const auto b_k_n_global_desc = *(const BGlobalDesc*)p_b_k_n_global_desc;
const auto c_m0_m1_n0_n1_global_desc = *(const CGlobalDesc*)p_c_m0_m1_n0_n1_global_desc;
GridwiseGemm{}.Run(a_k_m_global_desc,
p_a_global,
b_k_n_global_desc,
p_b_global,
c_m0_m1_n0_n1_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
// pass tensor descriptor by __CONSTANT__ void pointer // pass tensor descriptor by __CONSTANT__ void pointer
// __CONSTANT__ is needed to inform compiler void pointers in the kernel signature are pointing to // __CONSTANT__ is needed to inform compiler void pointers in the kernel signature are pointing to
// non-modifiable parameter address space, so compiler can enable corresponding optimization // non-modifiable parameter address space, so compiler can enable corresponding optimization
......
...@@ -107,10 +107,9 @@ ...@@ -107,10 +107,9 @@
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0 #define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif #endif
// pass tensor descriptor by value, pointer or void* // pass tensor descriptor by value or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 0 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER 0 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 1
// hack: have underlying assumption that need to be satsified, otherwise it's a bug // hack: have underlying assumption that need to be satsified, otherwise it's a bug
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment