"vscode:/vscode.git/clone" did not exist on "1fff527702399165f09dd880be43cfd8b8bae472"
Commit 3dd0cc31 authored by Jing Zhang's avatar Jing Zhang
Browse files

merge

parents c21521a1 198593d5
...@@ -362,173 +362,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad ...@@ -362,173 +362,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
float ave_time = timer.GetElapsedTime() / nrepeat; float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
const decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
const decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
const decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
const decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
const decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
const decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
const decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
const decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
const decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
const decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
const decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
const decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc, wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) / out_n_k_ho_wo_global_desc) /
...@@ -564,111 +397,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad ...@@ -564,111 +397,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
{ {
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else else
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
} }
...@@ -1035,173 +872,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad ...@@ -1035,173 +872,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
float ave_time = timer.GetElapsedTime() / nrepeat; float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc, wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) / out_n_k_ho_wo_global_desc) /
...@@ -1237,111 +907,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad ...@@ -1237,111 +907,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
{ {
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else else
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
} }
...@@ -1694,173 +1368,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1 ...@@ -1694,173 +1368,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
float ave_time = timer.GetElapsedTime() / nrepeat; float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc, wei_k_c_y_x_global_desc,
out_n_k_ho_wo_global_desc) / out_n_k_ho_wo_global_desc) /
...@@ -1896,111 +1403,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1 ...@@ -1896,111 +1403,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
{ {
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else else
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
} }
......
...@@ -363,171 +363,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad ...@@ -363,171 +363,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
float ave_time = timer.GetElapsedTime() / nrepeat; float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time; (std::size_t(1000) * 1000 * 1000) / ave_time;
...@@ -561,111 +396,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad ...@@ -561,111 +396,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
{ {
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else else
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
} }
...@@ -1017,171 +856,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1 ...@@ -1017,171 +856,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
float ave_time = timer.GetElapsedTime() / nrepeat; float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using ADesc = decltype(wei_gemmk_gemmm_global_desc);
using BDesc = decltype(in_gemmk_gemmn_global_desc);
using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc));
DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc));
DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc));
wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc);
in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc);
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice(
&out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc);
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_gemmk_gemmm_global_desc)*,
const FloatAB*,
decltype(in_gemmk_gemmn_global_desc)*,
const FloatAB*,
decltype(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc)*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
reinterpret_cast<const ADesc*>(
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()),
p_wei_global,
reinterpret_cast<const BDesc*>(
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()),
p_in_global,
reinterpret_cast<const CDesc*>(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
.GetDeviceBuffer()),
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time; (std::size_t(1000) * 1000 * 1000) / ave_time;
...@@ -1215,114 +889,117 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1 ...@@ -1215,114 +889,117 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
{ {
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, true>, true,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, true>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, true>>; true>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, true>{}); .GetDeviceBuffer(),
p_out_global);
} }
else else
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_dynamic_gemm_v1<gridwise_gemm,
const void*, ADesc,
const FloatAB*, FloatAB,
const void*, BDesc,
const FloatAB*, FloatAB,
const void*, CDesc,
FloatC*, FloatC,
integral_constant<bool, false>, false,
integral_constant<bool, false>>; false>;
launch_kernel(kernel, launch_kernel(
dim3(GridSize), kernel,
dim3(BlockSize), dim3(GridSize),
0, dim3(BlockSize),
0, 0,
wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), 0,
p_wei_global, (void __CONSTANT__*)
in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(),
p_in_global, p_wei_global,
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(),
.GetDeviceBuffer(), p_in_global,
p_out_global, (void __CONSTANT__*)
integral_constant<bool, false>{}, out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf
integral_constant<bool, false>{}); .GetDeviceBuffer(),
p_out_global);
} }
} }
timer.End(); timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat; float ave_time = timer.GetElapsedTime() / nrepeat;
......
...@@ -181,7 +181,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -181,7 +181,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{})); Sequence<0, 0, 0, 0, 0>{}));
#if 1
// GEMM // GEMM
using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v3< using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v3<
BlockSize, BlockSize,
...@@ -372,7 +371,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -372,7 +371,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl; << std::endl;
} }
#endif
} }
}; };
} // namespace ck } // namespace ck
......
...@@ -133,12 +133,13 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3 ...@@ -133,12 +133,13 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
constexpr auto EPerBlock = a_block_mtx.GetLength(I0); constexpr auto EPerBlock = a_block_mtx.GetLength(I0);
constexpr auto KPerThreadSubC = 4; constexpr auto KPerThreadSubC = 4;
constexpr auto HPerThreadSubC = 2;
constexpr auto WPerThreadSubC = 2; constexpr auto HoPerThreadSubC = 2;
constexpr auto WoPerThreadSubC = 2;
static_assert(KPerThread % KPerThreadSubC == 0, ""); static_assert(KPerThread % KPerThreadSubC == 0, "");
static_assert(HPerThread % HPerThreadSubC == 0, ""); static_assert(HPerThread % HoPerThreadSubC == 0, "");
static_assert(WPerThread % WPerThreadSubC == 0, ""); static_assert(WPerThread % WoPerThreadSubC == 0, "");
// thread A, B for GEMM // thread A, B for GEMM
constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2( constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
...@@ -161,8 +162,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3 ...@@ -161,8 +162,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v3<decltype(a_thread_mtx), constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v3<decltype(a_thread_mtx),
decltype(b_thread_mtx), decltype(b_thread_mtx),
decltype(c_thread_mtx), decltype(c_thread_mtx),
HPerThreadSubC, HoPerThreadSubC,
WPerThreadSubC>{}; WoPerThreadSubC>{};
// loop over k // loop over k
#pragma unroll #pragma unroll
for(index_t e_begin = 0; e_begin < EPerBlock; e_begin += EPerThreadLoop) for(index_t e_begin = 0; e_begin < EPerBlock; e_begin += EPerThreadLoop)
...@@ -176,10 +177,10 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3 ...@@ -176,10 +177,10 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
p_a_thread); p_a_thread);
#pragma unroll #pragma unroll
for(index_t h_begin = 0; h_begin < HPerThread; h_begin += HPerThreadSubC) for(index_t h_begin = 0; h_begin < HPerThread; h_begin += HoPerThreadSubC)
{ {
#pragma unroll #pragma unroll
for(index_t w_begin = 0; w_begin < WPerThread; w_begin += WPerThreadSubC) for(index_t w_begin = 0; w_begin < WPerThread; w_begin += WoPerThreadSubC)
{ {
threadwise_gemm.Run(p_a_thread, threadwise_gemm.Run(p_a_thread,
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple( p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(
......
...@@ -11,6 +11,47 @@ ...@@ -11,6 +11,47 @@
namespace ck { namespace ck {
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
// pass tensor descriptor by __CONSTANT__ void pointer
// __CONSTANT__ is needed to inform compiler void pointers in the kernel signature are pointing to
// non-modifiable parameter address space, so compiler can enable corresponding optimization
template <typename GridwiseGemm,
typename AGlobalDesc,
typename FloatA,
typename BGlobalDesc,
typename FloatB,
typename CGlobalDesc,
typename FloatC,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void run_gridwise_dynamic_gemm_v1(const void __CONSTANT__* p_a_k_m_global_desc,
const FloatA* __restrict__ p_a_global,
const void __CONSTANT__* p_b_k_n_global_desc,
const FloatB* __restrict__ p_b_global,
const void __CONSTANT__* p_c_m0_m1_n0_n1_global_desc,
FloatC* __restrict__ p_c_global)
{
// first cast void __CONSTANT__* to void*
// second cast void* to Desc*
// the copy constructor of tensor descriptor doesn't take address_space(4)
const auto a_k_m_global_desc =
*reinterpret_cast<const AGlobalDesc*>((const void*)p_a_k_m_global_desc);
const auto b_k_n_global_desc =
*reinterpret_cast<const BGlobalDesc*>((const void*)p_b_k_n_global_desc);
const auto c_m0_m1_n0_n1_global_desc =
*reinterpret_cast<const CGlobalDesc*>((const void*)p_c_m0_m1_n0_n1_global_desc);
GridwiseGemm{}.Run(a_k_m_global_desc,
p_a_global,
b_k_n_global_desc,
p_b_global,
c_m0_m1_n0_n1_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
#endif
template <index_t BlockSize, template <index_t BlockSize,
typename FloatAB, typename FloatAB,
typename FloatAcc, typename FloatAcc,
...@@ -427,7 +468,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1 ...@@ -427,7 +468,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1
} }
} }
// pass tensor descriptor by reference
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop> template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const AGlobalDesc& a_k_m_global_desc, __device__ void Run(const AGlobalDesc& a_k_m_global_desc,
const FloatAB* __restrict__ p_a_global, const FloatAB* __restrict__ p_a_global,
...@@ -452,57 +492,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1 ...@@ -452,57 +492,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1
integral_constant<bool, HasMainKBlockLoop>{}, integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{}); integral_constant<bool, HasDoubleTailKBlockLoop>{});
} }
// pass tensor descriptors by pointers
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const AGlobalDesc* p_a_k_m_global_desc,
const FloatAB* __restrict__ p_a_global,
const BGlobalDesc* p_b_k_n_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc* p_c_m0_m1_n0_n1_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
const auto a_k_m_global_desc = *p_a_k_m_global_desc;
const auto b_k_n_global_desc = *p_b_k_n_global_desc;
const auto c_m0_m1_n0_n1_global_desc = *p_c_m0_m1_n0_n1_global_desc;
Run(a_k_m_global_desc,
p_a_global,
b_k_n_global_desc,
p_b_global,
c_m0_m1_n0_n1_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
// pass tensor descriptors by void*
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const void* p_a_k_m_global_desc,
const FloatAB* __restrict__ p_a_global,
const void* p_b_k_n_global_desc,
const FloatAB* __restrict__ p_b_global,
const void* p_c_m0_m1_n0_n1_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
const auto a_k_m_global_desc = *reinterpret_cast<const AGlobalDesc*>(p_a_k_m_global_desc);
const auto b_k_n_global_desc = *reinterpret_cast<const BGlobalDesc*>(p_b_k_n_global_desc);
const auto c_m0_m1_n0_n1_global_desc =
*reinterpret_cast<const CGlobalDesc*>(p_c_m0_m1_n0_n1_global_desc);
Run(a_k_m_global_desc,
p_a_global,
b_k_n_global_desc,
p_b_global,
c_m0_m1_n0_n1_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
}; };
} // namespace ck } // namespace ck
......
...@@ -272,21 +272,28 @@ __device__ void amd_assembly_outer_product_1x4(int8x8_t a, ...@@ -272,21 +272,28 @@ __device__ void amd_assembly_outer_product_1x4(int8x8_t a,
int32_t& c2, int32_t& c2,
int32_t& c3) int32_t& c3)
{ {
amd_assembly_outer_product_1x4(a.Vectors(Number<4>{})[Number<0>{}],
b0.Vectors(Number<4>{})[Number<0>{}], const int8x4_t* p_a_int8x4_t = reinterpret_cast<const int8x4_t*>(&a);
b1.Vectors(Number<4>{})[Number<0>{}], const int8x4_t* p_b0_int8x4_t = reinterpret_cast<const int8x4_t*>(&b0);
b2.Vectors(Number<4>{})[Number<0>{}], const int8x4_t* p_b1_int8x4_t = reinterpret_cast<const int8x4_t*>(&b1);
b3.Vectors(Number<4>{})[Number<0>{}], const int8x4_t* p_b2_int8x4_t = reinterpret_cast<const int8x4_t*>(&b2);
const int8x4_t* p_b3_int8x4_t = reinterpret_cast<const int8x4_t*>(&b3);
amd_assembly_outer_product_1x4(p_a_int8x4_t[0],
p_b0_int8x4_t[0],
p_b1_int8x4_t[0],
p_b2_int8x4_t[0],
p_b3_int8x4_t[0],
c0, c0,
c1, c1,
c2, c2,
c3); c3);
amd_assembly_outer_product_1x4(a.Vectors(Number<4>{})[Number<1>{}], amd_assembly_outer_product_1x4(p_a_int8x4_t[1],
b0.Vectors(Number<4>{})[Number<1>{}], p_b0_int8x4_t[1],
b1.Vectors(Number<4>{})[Number<1>{}], p_b1_int8x4_t[1],
b2.Vectors(Number<4>{})[Number<1>{}], p_b2_int8x4_t[1],
b3.Vectors(Number<4>{})[Number<1>{}], p_b3_int8x4_t[1],
c0, c0,
c1, c1,
c2, c2,
...@@ -302,22 +309,30 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a, ...@@ -302,22 +309,30 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
int32_t& c1, int32_t& c1,
int32_t& c2, int32_t& c2,
int32_t& c3) int32_t& c3)
{ {
amd_assembly_outer_product_1x4(a.Vectors(Number<8>{})[Number<0>{}],
b0.Vectors(Number<8>{})[Number<0>{}], const int8x8_t* p_a_int8x8_t = reinterpret_cast<const int8x8_t*>(&a);
b1.Vectors(Number<8>{})[Number<0>{}], const int8x8_t* p_b0_int8x8_t = reinterpret_cast<const int8x8_t*>(&b0);
b2.Vectors(Number<8>{})[Number<0>{}], const int8x8_t* p_b1_int8x8_t = reinterpret_cast<const int8x8_t*>(&b1);
b3.Vectors(Number<8>{})[Number<0>{}], const int8x8_t* p_b2_int8x8_t = reinterpret_cast<const int8x8_t*>(&b2);
const int8x8_t* p_b3_int8x8_t = reinterpret_cast<const int8x8_t*>(&b3);
amd_assembly_outer_product_1x4(p_a_int8x8_t[0],
p_b0_int8x8_t[0],
p_b1_int8x8_t[0],
p_b2_int8x8_t[0],
p_b3_int8x8_t[0],
c0, c0,
c1, c1,
c2, c2,
c3); c3);
amd_assembly_outer_product_1x4(a.Vectors(Number<8>{})[Number<1>{}], amd_assembly_outer_product_1x4(p_a_int8x8_t[1],
b0.Vectors(Number<8>{})[Number<1>{}], p_b0_int8x8_t[1],
b1.Vectors(Number<8>{})[Number<1>{}], p_b1_int8x8_t[1],
b2.Vectors(Number<8>{})[Number<1>{}], p_b2_int8x8_t[1],
b3.Vectors(Number<8>{})[Number<1>{}], p_b3_int8x8_t[1],
c0, c0,
c1, c1,
c2, c2,
......
...@@ -7,13 +7,20 @@ ...@@ -7,13 +7,20 @@
#endif #endif
#include "bfloat16_dev.hpp" #include "bfloat16_dev.hpp"
// address space for kernel parameter
#define __CONSTANT__ __attribute__((address_space(4)))
// device backend // device backend
#define CK_DEVICE_BACKEND_AMD 1 #define CK_DEVICE_BACKEND_AMD 1
// GPU ID // GPU ID
#define CK_AMD_GPU_GFX906 0 #if 0
#define CK_AMD_GPU_GFX908 0 #define CK_AMD_GPU_GFX906 1
#elif 0
#define CK_AMD_GPU_GFX908 1
#elif 1
#define CK_AMD_GPU_GFX1030 1 #define CK_AMD_GPU_GFX1030 1
#endif
// HIP version // HIP version
#ifndef CK_HIP_VERSION_FLAT #ifndef CK_HIP_VERSION_FLAT
...@@ -29,9 +36,9 @@ ...@@ -29,9 +36,9 @@
#endif #endif
// buffer resourse // buffer resourse
#if CK_AMD_GPU_GFX906 || CK_AMD_GPU_GFX908 #if defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif CK_AMD_GPU_GFX1030 #elif defined(CK_AMD_GPU_GFX1030)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#endif #endif
...@@ -104,9 +111,8 @@ ...@@ -104,9 +111,8 @@
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0 #define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif #endif
// pass tensor descriptor by value, pointer or void* // pass tensor descriptor by value or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
// hack: have underlying assumption that need to be satsified, otherwise it's a bug // hack: have underlying assumption that need to be satsified, otherwise it's a bug
......
...@@ -224,17 +224,16 @@ struct vector_type<T, 16> ...@@ -224,17 +224,16 @@ struct vector_type<T, 16>
__host__ __device__ constexpr auto& Vectors(Number<16>) { return data_.d16x1_; } __host__ __device__ constexpr auto& Vectors(Number<16>) { return data_.d16x1_; }
}; };
// fp32 // fp32
using float2_t = typename vector_type<float, 2>::type; using float2_t = typename vector_type<float, 2>::type;
using float4_t = typename vector_type<float, 4>::type; using float4_t = typename vector_type<float, 4>::type;
using float8_t = typename vector_type<float, 8>::type; using float8_t = typename vector_type<float, 8>::type;
// fp16 // fp16
using half_t = _Float16; using half_t = _Float16;
using half2_t = typename vector_type<half_t, 2>::type; using half2_t = typename vector_type<half_t, 2>::type;
using half4_t = typename vector_type<half_t, 4>::type; using half4_t = typename vector_type<half_t, 4>::type;
using half8_t = typename vector_type<half_t, 8>::type; using half8_t = typename vector_type<half_t, 8>::type;
using half16_t = typename vector_type<half_t, 16>::type; using half16_t = typename vector_type<half_t, 16>::type;
// bfp16 // bfp16
...@@ -439,8 +438,8 @@ struct vector_type<int8_t, 16> ...@@ -439,8 +438,8 @@ struct vector_type<int8_t, 16>
// hack for int8x4_t, because compiler does not have native support for int8x4_t // hack for int8x4_t, because compiler does not have native support for int8x4_t
// int8x4_t is defined as int32_t // int8x4_t is defined as int32_t
using int8x4_t = typename vector_type<int8_t, 4>::type; using int8x4_t = typename vector_type<int8_t, 4>::type;
using int8x8_t = vector_type<int8_t, 8>; using int8x8_t = typename vector_type<int8_t, 8>::type;
using int8x16_t = vector_type<int8_t, 16>; using int8x16_t = typename vector_type<int8_t, 16>::type;
// data type conversion // data type conversion
template <typename T> template <typename T>
......
...@@ -175,26 +175,18 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -175,26 +175,18 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
#endif #endif
constexpr auto conv_driver = constexpr auto conv_driver =
// DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad< #if 0
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad<
#else
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad< DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad<
BlockSize, #endif
typename vector_type<TInWei, InWeiVectorSize>::type, BlockSize,
TAcc, typename vector_type<TInWei, InWeiVectorSize>::type, TAcc, TOut, KPerBlock,
TOut, HoPerBlock, WoPerBlock, EPerBlock, KPerThread, HoPerThread, WoPerThread,
KPerBlock, EPerThread, ABlockTransferThreadSliceLengths_E_K,
HoPerBlock, ABlockTransferThreadClusterLengths_E_K, ABlockTransferSrcScalarPerVector_E,
WoPerBlock, ABlockTransferDstScalarPerVector_K, BThreadTransferSrcScalarPerVector_W,
EPerBlock, CThreadTransferDstScalarPerVector_W > {};
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
BThreadTransferSrcScalarPerVector_W,
CThreadTransferDstScalarPerVector_W>{};
conv_driver.Run(wei_k_c0_y_x_desc, conv_driver.Run(wei_k_c0_y_x_desc,
in_n_c0_hi_wi_desc, in_n_c0_hi_wi_desc,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment