Commit 8ce9fe57 authored by carlushuang

remove useless comments, add several new configs for multi-threading

parent b8ba0239
@@ -213,9 +213,6 @@ struct BlockwiseGemmAvx2_MxN
             auto current_mr = ck::math::min(m_per_block - i_m, m_per_thread);
             param.p_a = &a_block_buf.p_data_[GetABlockStartOffset(a_block_desc, i_m, 0)];
-            // printf("YYYY: %d, i_m:%d, current_mr:%d, %d, %p\n",__LINE__, i_m, current_mr,
-            // GetABlockStartOffset(a_block_desc, i_m, 0), param.p_a);fflush(stdout);
             for(ck::index_t i_n = 0; i_n < n_per_block; i_n += n_per_thread)
             {
                 auto current_nr = ck::math::min(n_per_block - i_n, n_per_thread);
@@ -223,11 +220,6 @@ struct BlockwiseGemmAvx2_MxN
                 param.p_b = &b_block_buf.p_data_[GetBBlockStartOffset(b_block_desc, 0, i_n)];
                 param.p_c = &c_buf.p_data_[GetCBlockStartOffset(c_desc, i_m, i_n)];
-                // printf("YYYY: %d, i_n:%d, current_nr:%d, %d, %p, C:%d, %p\n",__LINE__, i_n,
-                // current_nr, GetBBlockStartOffset(b_block_desc, 0, i_n), param.p_b,
-                // GetCBlockStartOffset(c_desc, i_m, i_n),
-                // param.p_c);fflush(stdout);
                 ThreadwiseGemm_Dispatch::Run(&param, current_mr, current_nr);
             }
         }
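Note: the loop pair above is the micro-tile sweep of one macro tile. A minimal standalone sketch of the same tail-clamping logic (plain C++, with `ck::math::min` swapped for `std::min`; the dispatch itself is elided):

```cpp
#include <algorithm>

// Sweep an m_per_block x n_per_block macro tile in micro tiles of at most
// m_per_thread x n_per_thread; std::min clamps the last, possibly partial,
// tile in each dimension (this mirrors current_mr/current_nr above).
void sweep_macro_tile(int m_per_block, int n_per_block, int m_per_thread, int n_per_thread)
{
    for(int i_m = 0; i_m < m_per_block; i_m += m_per_thread)
    {
        int current_mr = std::min(m_per_block - i_m, m_per_thread);
        for(int i_n = 0; i_n < n_per_block; i_n += n_per_thread)
        {
            int current_nr = std::min(n_per_block - i_n, n_per_thread);
            // dispatch a current_mr x current_nr micro-kernel at (i_m, i_n)
            (void)current_mr;
            (void)current_nr;
        }
    }
}
```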
......
@@ -12,6 +12,7 @@
 #include <utility>
 #include <unistd.h>
 #include <omp.h>
+#include <pthread.h>

 namespace ck {
 namespace cpu {
@@ -193,6 +194,23 @@ struct GridwiseGemmAvx2_MxN
         int total_threads = omp_get_max_threads();

+#if 0
+        if(total_threads > 1){
+#pragma omp parallel
+            {
+                int tid = omp_get_thread_num();
+                cpu_set_t set;
+                CPU_ZERO(&set);
+                CPU_SET(tid, &set);
+
+                if (sched_setaffinity(0, sizeof(set), &set) == -1) {
+                    throw std::runtime_error("wrong! fail to set thread affinity");
+                }
+            }
+        }
+#endif
+
         // TODO: openmp aware ordering
         //
         if constexpr(std::is_same<BlockMNKAccessOrder, ck::Sequence<0, 1, 2>>::value)
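The block added above pins OpenMP thread i to logical CPU i, but the commit keeps it compiled out under `#if 0`. A self-contained sketch of the same Linux/glibc technique; the portable way to get similar pinning is the `OMP_PROC_BIND` / `OMP_PLACES` environment variables, with no code change:

```cpp
#ifndef _GNU_SOURCE
#define _GNU_SOURCE // for cpu_set_t, CPU_ZERO/CPU_SET, sched_setaffinity
#endif
#include <sched.h>
#include <omp.h>
#include <stdexcept>

// Pin each OpenMP worker to the core with the same index as its thread id.
void pin_omp_threads_one_to_one()
{
#pragma omp parallel
    {
        int tid = omp_get_thread_num();
        cpu_set_t set;
        CPU_ZERO(&set);
        CPU_SET(tid, &set);
        // first argument 0 means "the calling thread"
        if(sched_setaffinity(0, sizeof(set), &set) == -1)
            throw std::runtime_error("fail to set thread affinity");
    }
}
```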
@@ -234,8 +252,9 @@ struct GridwiseGemmAvx2_MxN
                                             MemAlignmentByte);
             DeviceAlignedMemCPU b_block_mem(k_per_block * n_per_block * sizeof(FloatB),
                                             MemAlignmentByte);
-            DeviceAlignedMemCPU c_block_mem(m_per_block * n_per_block * sizeof(FloatC),
-                                            MemAlignmentByte);
+            DeviceAlignedMemCPU c_block_mem(
+                UseCLocalBuffer ? (m_per_block * n_per_block * sizeof(FloatC)) : 0,
+                MemAlignmentByte);

             auto a_block_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(
                 reinterpret_cast<FloatA*>(a_block_mem.mpDeviceBuf),
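The change above sizes the C block buffer to zero when `UseCLocalBuffer` is false, since in that case the blockwise GEMM writes straight to the global output tensor (the same change appears again in the second `Run()` overload below). A hypothetical distillation of the pattern, not this file's real signatures:

```cpp
#include <cstring>

// With a local buffer the tile is accumulated privately and copied out once;
// without it, the GEMM targets the global C tensor directly and no buffer is
// ever allocated (hence the "? bytes : 0" size above).
template <bool UseCLocalBuffer>
void run_tile(float* c_grid, float* c_local, int m, int n)
{
    float* c_dst = UseCLocalBuffer ? c_local : c_grid;
    // ... blockwise GEMM accumulates the m x n tile into c_dst ...
    (void)c_dst;
    if constexpr(UseCLocalBuffer)
        std::memcpy(c_grid, c_local, sizeof(float) * m * n); // single copy-out
}
```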
@@ -298,26 +317,9 @@ struct GridwiseGemmAvx2_MxN
                     auto a_block_desc = GetABlockDescriptor(mc_size, kc_size);
                     auto b_block_desc = GetBBlockDescriptor(kc_size, nc_size);
-                    // printf("[tid:%d]==> i_m:%d, i_n:%d, i_k:%d, mc:%d, nc:%d, kc:%d(%d,
-                    // %d)\n", tid, i_mc,
-                    // i_nc, i_kc, mc_size, nc_size, kc_size, KPerBlock, GemmK); fflush(stdout);
                     a_threadwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf);
                     b_threadwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf);
-                    // for(auto i_elem = 0; i_elem < (mc_size * kc_size) ; i_elem++){
-                    //     printf("A ==> %3d : %f(0x%08x)\n", i_elem,
-                    //     (reinterpret_cast<float*>(a_block_buf.p_data_))[i_elem],
-                    //     (reinterpret_cast<uint32_t*>(a_block_buf.p_data_))[i_elem]);
-                    // }
-                    // for(auto i_elem = 0; i_elem < (kc_size * nc_size) ; i_elem++){
-                    //     printf("B ==> %3d : %f(0x%08x)\n", i_elem,
-                    //     (reinterpret_cast<float*>(b_block_buf.p_data_))[i_elem],
-                    //     (reinterpret_cast<uint32_t*>(b_block_buf.p_data_))[i_elem]);
-                    // }
-                    // printf("[%d] 2222 \n",__LINE__);
                     blockwise_gemm.Run(a_block_desc,
                                        a_block_buf,
                                        make_zero_multi_index<a_block_copy_dim>(),
@@ -329,28 +331,13 @@ struct GridwiseGemmAvx2_MxN
                                        make_zero_multi_index<2>(),
                                        i_kc != 0);
-                    // printf("[%d] 2222 \n",__LINE__);
                     if((i_kc + k_per_block) < GemmK)
                     {
                         a_threadwise_copy.MoveSrcSliceWindow(a_grid_desc, a_move_k_step);
                         b_threadwise_copy.MoveSrcSliceWindow(b_grid_desc, b_move_k_step);
                     }
-                    // printf("[%d] 2222 \n",__LINE__);
-                    // for(auto i_elem = 0; i_elem < (10) ; i_elem++){
-                    //     printf("C ==> %3d : %f(0x%08x)\n", i_elem,
-                    //     (reinterpret_cast<float*>(c_block_buf.p_data_))[i_elem],
-                    //     (reinterpret_cast<uint32_t*>(c_block_buf.p_data_))[i_elem]);
-                    // }
                 }
-                // for(auto i_elem = 0; i_elem < (c_block_mem.mMemSize / sizeof(FloatC)) ;
-                // i_elem++){
-                //     printf("C ==> %3d : %f(0x%08x)\n", i_elem,
-                //     (reinterpret_cast<float*>(c_block_buf.p_data_))[i_elem],
-                //     (reinterpret_cast<uint32_t*>(c_block_buf.p_data_))[i_elem]);
-                // }
                 if constexpr(UseCLocalBuffer)
                     c_threadwise_copy.Run(c_block_desc, c_block_buf, c_grid_desc, c_grid_buf);
             }
@@ -396,8 +383,9 @@ struct GridwiseGemmAvx2_MxN
                                             MemAlignmentByte);
             DeviceAlignedMemCPU b_block_mem(k_per_block * n_per_block * sizeof(FloatB),
                                             MemAlignmentByte);
-            DeviceAlignedMemCPU c_block_mem(m_per_block * n_per_block * sizeof(FloatC),
-                                            MemAlignmentByte);
+            DeviceAlignedMemCPU c_block_mem(
+                UseCLocalBuffer ? (m_per_block * n_per_block * sizeof(FloatC)) : 0,
+                MemAlignmentByte);

             auto a_block_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(
                 reinterpret_cast<FloatA*>(a_block_mem.mpDeviceBuf),
......
@@ -349,9 +349,6 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
                 src_offset = i_n * Hi * Wi * C + i_hi * Wi * C + i_wi * C + i_c;
                 i_gemm_k   = idx_k;
-                // printf("[%d] i_wo:%d, i_ho:%d, i_wi:%d, i_hi:%d, src_offset:%d\n",
-                // __LINE__, i_wo, i_ho, i_wi, i_hi, src_offset);
             }
         }
@@ -447,7 +444,6 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
             if(i_ho_itr >= Ho)
             {
                 i_ho_itr = 0;
-                // i_n++;
                 p_src += input_offset_ovf_hi_acc_n;
             }
@@ -468,26 +464,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
             ck::index_t i_wi_itr = i_wi;
             ck::index_t i_hi_itr = i_hi;
-            // printf("[%d] i_m_itr:%d, i_wo_itr:%d, i_ho_itr:%d, i_wi_itr:%d, i_hi_itr:%d,
-            // src_offset:%d, input_offset_acc_wi:%d,
-            // input_offset_ovf_wi_acc_hi:%d,input_offset_ovf_hi_acc_n:%d, %p(%p)\n",
-            // __LINE__, i_m_itr, i_wo_itr, i_ho_itr, i_wi_itr, i_hi_itr,
-            // src_offset, input_offset_acc_wi, input_offset_ovf_wi_acc_hi,
-            // input_offset_ovf_hi_acc_n, src_buf.p_data_, p_src);
-            // printf("%p %p %p, %d, %x, %p\n",src_buf.p_data_, reinterpret_cast<const
-            // float*>(src_buf.p_data_) + 1, reinterpret_cast<const float*>(src_buf.p_data_)
-            // + ck::index_t(-1),
-            // sizeof(src_offset), *reinterpret_cast<uint32_t*>(&src_offset),
-            // reinterpret_cast<const float*>(src_buf.p_data_) + (-1088));
             while(i_m_itr > 0)
             {
-                // printf("[%d] i_m_itr:%d, i_wo_itr:%d, i_ho_itr:%d, i_wi_itr:%d,
-                // i_hi_itr:%d, src_offset:%d -> %p\n",
-                // __LINE__, i_m_itr, i_wo_itr, i_ho_itr, i_wi_itr, i_hi_itr, src_offset,
-                // p_src);
                 if((*reinterpret_cast<uint32_t*>(&i_hi_itr) < Hi) &&
                    (*reinterpret_cast<uint32_t*>(&i_wi_itr) < Wi))
                     avx2_util::memcpy32_avx2(p_dst, p_src, k_per_block);
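The `*reinterpret_cast<uint32_t*>(&i_hi_itr) < Hi` test kept above is a branch-reduction idiom: reading the signed index as unsigned makes negative (padding) coordinates wrap to huge values, so a single comparison covers both `i >= 0` and `i < Hi`. A small sketch of just that trick:

```cpp
#include <cstdint>

// One unsigned comparison replaces "0 <= i && i < extent": a negative i wraps
// to a value >= 2^31, which can never be below a valid tensor extent.
inline bool in_bounds(std::int32_t i, std::uint32_t extent)
{
    return static_cast<std::uint32_t>(i) < extent;
}

// e.g. in_bounds(-1, 270) == false, in_bounds(0, 270) == true
```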
@@ -512,14 +490,11 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
                 {
                     i_ho_itr = 0;
                     i_hi_itr -= Ho * Sy;
-                    // i_n++;
                     p_src += input_offset_ovf_hi_acc_n;
                 }

                 i_m_itr--;
             }
-            // printf("[%d] \n", __LINE__);
         }
         else
         {
@@ -538,8 +513,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
                 ck::index_t i_wi_itr_k = i_wi_itr;
                 ck::index_t i_hi_itr_k = i_hi_itr;
                 ck::index_t i_c_itr_k  = i_c;
-                ck::index_t i_y_itr_k  = i_y;
+                // ck::index_t i_y_itr_k = i_y;
                 ck::index_t i_x_itr_k  = i_x;

                 ck::index_t i_k_itr = k_per_block;
                 while(i_k_itr > 0)
@@ -566,7 +541,7 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
                     if(i_x_itr_k >= Fx)
                     {
                         i_x_itr_k = 0;
-                        i_y_itr_k++;
+                        // i_y_itr_k++;
                         i_wi_itr_k -= Dx * Fx;
                         i_hi_itr_k += Dy;
                         p_src_k += input_offset_ovf_x_acc_y;
@@ -594,7 +569,6 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
                 {
                     i_ho_itr = 0;
                     i_hi_itr -= Ho * Sy;
-                    // i_n++;
                     p_src += input_offset_ovf_hi_acc_n;
                 }
@@ -626,40 +600,27 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
         if constexpr(GemmKSpecialization ==
                      ConvolutionForwardGemmKSpecialization_t::NHWC_GemmKLoopOverC)
         {
-            // c % k_per_block == 0, so every time k_per_block here is the same
-            // ihi = iho * s_stride_h + iy * s_dilation_h - s_pad_h
-            // iwi = iwo * s_stride_w + ix * s_dilation_w - s_pad_w
-            // printf("222222 C:%d, src_offset:%d, i_c:%d, i_x:%d\n", C, src_offset, i_c, i_x);
-            // fflush(stdout);
             // TODO: branch seems weird
             i_c += move_k;
             src_offset += move_k;
-            // printf("3333[%d] src_offset:%d\n", __LINE__, src_offset);
             if(i_c >= C)
             {
                 i_c = 0;
                 i_x++;
                 i_wi += Dx;
                 src_offset += Dx * C - C;
-                // printf("3333[%d] src_offset:%d\n", __LINE__, src_offset);
             }
             if(i_x >= Fx)
             {
                 i_x = 0;
-                i_y++;
+                // i_y++;
                 i_wi = i_wi - Fx * Dx;
                 i_hi += Dy;
                 src_offset += Dy * Wi * C - Fx * Dx * C;
-                // printf("3333[%d] src_offset:%d\n", __LINE__, src_offset);
             }
-            // printf("inp move:%d, i_c:%d, i_hi:%d, i_wi:%d src_offset:%d\n", move_k, i_c,
-            // i_hi, i_wi, src_offset); fflush(stdout);
         }
         else
         {
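The hunk above keeps the GemmK-over-C pointer arithmetic while dropping its debug prints: gemm_k walks channels fastest, then filter x, then filter y, and the offset jumps encode the dilations. A simplified sketch of just the offset bookkeeping (the i_wi/i_hi tracking is omitted), assuming the NHWC layout this struct is specialized for:

```cpp
// Advance the NHWC source offset by move_k along gemm_k = (y, x, c), with c
// fastest. C and Fx are the channel and filter-width extents, Dx/Dy the
// dilations, Wi the input width. Assumes move_k divides C, as the
// NHWC_GemmKLoopOverC specialization requires.
void move_gemm_k_window(int& i_c, int& i_x, int& src_offset,
                        int move_k, int C, int Fx, int Dx, int Dy, int Wi)
{
    i_c += move_k;
    src_offset += move_k;
    if(i_c >= C) // channels wrapped -> next filter column, stride Dx * C
    {
        i_c = 0;
        i_x++;
        src_offset += Dx * C - C;
    }
    if(i_x >= Fx) // columns wrapped -> next filter row, stride Dy * Wi * C
    {
        i_x = 0;
        src_offset += Dy * Wi * C - Fx * Dx * C;
    }
}
```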
......
@@ -28,17 +28,24 @@ DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
 DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment)
     : mMemSize(mem_size), mAlignment(alignment)
 {
-    assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2
+    if(mem_size == 0)
+    {
+        mpDeviceBuf = nullptr;
+    }
+    else
+    {
+        assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2
         void* p1;
         void** p2;
         int offset = alignment - 1 + sizeof(void*);
         p1 = malloc(mem_size + offset);
         assert(p1 != nullptr);
         p2 = reinterpret_cast<void**>((reinterpret_cast<size_t>(p1) + offset) & ~(alignment - 1));
         p2[-1] = p1;
         mpDeviceBuf = reinterpret_cast<void*>(p2);
+    }
 }

 void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }
@@ -51,7 +58,11 @@ void DeviceAlignedMemCPU::FromDevice(void* p) { memcpy(p, mpDeviceBuf, mMemSize)
 void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }

-DeviceAlignedMemCPU::~DeviceAlignedMemCPU() { free((reinterpret_cast<void**>(mpDeviceBuf))[-1]); }
+DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
+{
+    if(mpDeviceBuf != nullptr)
+        free((reinterpret_cast<void**>(mpDeviceBuf))[-1]);
+}

 struct KernelTimerImpl
 {
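For reference, the allocation scheme this class uses (now guarded for the new mem_size == 0 case) is the classic over-allocate-and-stash trick; a standalone sketch:

```cpp
#include <cassert>
#include <cstdlib>

// Allocate alignment - 1 + sizeof(void*) extra bytes, round the pointer up to
// the alignment boundary, and stash the raw malloc() result in the slot just
// below the aligned address -- the same layout DeviceAlignedMemCPU's
// destructor relies on when it frees p[-1].
void* aligned_malloc(std::size_t size, std::size_t alignment)
{
    assert(alignment != 0 && (alignment & (alignment - 1)) == 0); // power of 2
    const std::size_t offset = alignment - 1 + sizeof(void*);
    void* p1 = std::malloc(size + offset);
    if(p1 == nullptr)
        return nullptr;
    void** p2 = reinterpret_cast<void**>(
        (reinterpret_cast<std::size_t>(p1) + offset) & ~(alignment - 1));
    p2[-1] = p1; // remember the original pointer
    return p2;
}

void aligned_free(void* p)
{
    if(p != nullptr)
        std::free(reinterpret_cast<void**>(p)[-1]);
}
```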
......
@@ -55,30 +55,81 @@ static constexpr auto LoopOver_MKN = ck::tensor_operation::cpu::device::LoopOver
     DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1S1P0, GemmKLoopOverC , LoopOver_MKN, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>, \
     DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, DefaultGemmKLoop, LoopOver_MKN, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>, \
     DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1S1P0, DefaultGemmKLoop, LoopOver_MKN, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>
 // clang-format on

 using device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances = std::tuple<
     // clang-format off
     DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 128, 64, 6, 16, true, true, false),
     DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 128, 128, 6, 16, true, true, false),
     DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 128, 256, 128, 6, 16, true, true, false),
-    // DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 144, 128, 4, 24, true, true, false),
     DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 240, 128, 4, 24, true, true, false),
     DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 256, 128, 6, 16, true, true, false),
-    // DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 768, 288, 128, 4, 24, true, true, false),
     DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 768, 320, 128, 6, 16, true, true, false),
     DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 896, 352, 128, 6, 16, true, true, false),
     DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 1024, 416, 128, 6, 16, true, true, false)>;
 // clang-format on

+// use these in single-thread runs where gemm_n is not a multiple of 8
+using device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_local_c_instances = std::tuple<
+    // clang-format off
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 128, 64, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 128, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 128, 256, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 240, 128, 4, 24, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 256, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 768, 320, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 896, 352, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 1024, 416, 128, 6, 16, true, true, true)>;
+// clang-format on
+
+// use these in a multi-threaded environment (a local C buffer is needed to avoid
+// cache-coherence traffic, although sometimes no local C buffer is faster...)
+using device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_mt_instances = std::tuple<
+    // clang-format off
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 48, 24, 128, 4, 24, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 72, 16, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 72, 32, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 96, 32, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 96, 64, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 120, 32, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 120, 64, 128, 6, 16, true, true, true),
+    // DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 128, 64, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 128, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 128, 256, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 240, 128, 4, 24, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 256, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 768, 320, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 896, 352, 128, 6, 16, true, true, true),
+    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 1024, 416, 128, 6, 16, true, true, true)>;
+// clang-format on
 void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
 {
     ck::tensor_operation::device::add_device_operation_instances(
         instances, device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances{});
 }

+void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(
+    std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
+{
+    ck::tensor_operation::device::add_device_operation_instances(
+        instances, device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_local_c_instances{});
+}
+
+void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(
+    std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
+{
+    ck::tensor_operation::device::add_device_operation_instances(
+        instances, device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_mt_instances{});
+}
 } // namespace device_conv2d_fwd_avx2_instance
 } // namespace device
 } // namespace cpu
......
 #pragma once

 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
 #include "tensor_layout.hpp"
 #include "device_tensor.hpp"
 #include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
 #include "element_wise_operation_cpu.hpp"
 #include "reference_conv_fwd.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace cpu {
 namespace device {
 namespace device_conv2d_fwd_avx2_instance {

 void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
     std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+
+void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+
 } // namespace device_conv2d_fwd_avx2_instance
 } // namespace device
 } // namespace cpu
 } // namespace tensor_operation
 } // namespace ck

 namespace ck {
 namespace profiler {

 #define AVX2_DATA_ALIGNMENT

 template <int NDimSpatial,
           typename InDataType,
           typename WeiDataType,
           typename OutDataType,
           typename InLayout,
           typename WeiLayout,
           typename OutLayout>
 void profile_conv_cpu_fwd_impl(int do_verification,
                                int init_method,
                                bool do_log,
                                int nrepeat,
                                ck::index_t N,
                                ck::index_t K,
                                ck::index_t C,
                                std::vector<ck::index_t> input_spatial_lengths,
                                std::vector<ck::index_t> filter_spatial_lengths,
                                std::vector<ck::index_t> output_spatial_lengths,
                                std::vector<ck::index_t> conv_filter_strides,
                                std::vector<ck::index_t> conv_filter_dilations,
                                std::vector<ck::index_t> input_left_pads,
                                std::vector<ck::index_t> input_right_pads)
 {
     const ck::index_t Y = filter_spatial_lengths[0];
     const ck::index_t X = filter_spatial_lengths[1];

     const ck::index_t Hi = input_spatial_lengths[0];
     const ck::index_t Wi = input_spatial_lengths[1];

     const ck::index_t Ho = output_spatial_lengths[0];
     const ck::index_t Wo = output_spatial_lengths[1];

     auto f_host_tensor_descriptor =
         [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
             if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
                          is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
                          is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
             {
                 return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
                                             std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
             }
             else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
                               is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
                               is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
             {
                 return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
                                             std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
             }
         };

     Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
     Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
     Tensor<OutDataType> out_n_k_ho_wo_host_result(
         f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
     Tensor<OutDataType> out_n_k_ho_wo_device_result(
         f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));

     std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
     std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
     std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;

     switch(init_method)
     {
     case 0: break;
     case 1:
         in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
         wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
         break;
     default:
         in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
         wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
     }

     using InElementOp  = ck::tensor_operation::cpu::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
     using OutElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;

     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
     const auto out_element_op = OutElementOp{};

     if(do_verification)
     {
         using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                                       WeiDataType,
                                                                                       OutDataType,
                                                                                       InElementOp,
                                                                                       WeiElementOp,
                                                                                       OutElementOp>;

         auto ref_conv     = ReferenceConvFwdInstance{};
         auto ref_invoker  = ref_conv.MakeInvoker();
         auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
                                                   wei_k_c_y_x,
                                                   out_n_k_ho_wo_host_result,
                                                   conv_filter_strides,
                                                   conv_filter_dilations,
                                                   input_left_pads,
                                                   input_right_pads,
                                                   in_element_op,
                                                   wei_element_op,
                                                   out_element_op);

         ref_invoker.Run(ref_argument);
     }

     DeviceAlignedMemCPU in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace(),
                                       AVX2_DATA_ALIGNMENT);
     DeviceAlignedMemCPU wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace(),
                                        AVX2_DATA_ALIGNMENT);
     DeviceAlignedMemCPU out_device_buf(sizeof(OutDataType) *
                                            out_n_k_ho_wo_device_result.mDesc.GetElementSpace(),
                                        AVX2_DATA_ALIGNMENT);

     in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
     wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());

     memcpy(in_device_buf.mpDeviceBuf, in_n_c_hi_wi.mData.data(), in_device_buf.mMemSize);
     memcpy(wei_device_buf.mpDeviceBuf, wei_k_c_y_x.mData.data(), wei_device_buf.mMemSize);

     using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;

     using DeviceConvFwdNoOpPtr =
         ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;

     // add device Conv instances
     std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;

     ck::tensor_operation::cpu::device::device_conv2d_fwd_instance::
         add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);

     if(conv_ptrs.size() <= 0)
     {
         throw std::runtime_error("wrong! no device Conv instance found");
     }

     std::string best_conv_name;
     float best_ave_time   = 0;
     float best_gflops     = 0;
     float best_gb_per_sec = 0;

     // profile device Conv instances
     for(auto& conv_ptr : conv_ptrs)
     {
         auto argument_ptr = conv_ptr->MakeArgumentPointer(
             static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
             static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
             static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
             N,
             K,
             C,
             input_spatial_lengths,
             filter_spatial_lengths,
             output_spatial_lengths,
             conv_filter_strides,
             conv_filter_dilations,
             input_left_pads,
             input_right_pads,
             in_element_op,
             wei_element_op,
             out_element_op);

         auto invoker_ptr = conv_ptr->MakeInvokerPointer();

         if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             std::string conv_name = conv_ptr->GetTypeString();

             float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);

             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;

             std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
                                     sizeof(WeiDataType) * (K * C * Y * X) +
                                     sizeof(OutDataType) * (N * K * Ho * Wo);

             float gflops = static_cast<float>(flop) / 1.E6 / ave_time;

             float gb_per_sec = num_btype / 1.E6 / ave_time;

             std::cout << "Perf: " << ave_time << " ms, " << gflops << " GFlops, " << gb_per_sec
                       << " GB/s, " << conv_name << std::endl;

             if(gflops > best_gflops)
             {
                 best_conv_name  = conv_name;
                 best_gflops     = gflops;
                 best_ave_time   = ave_time;
                 best_gb_per_sec = gb_per_sec;
             }

             if(do_verification)
             {
                 memcpy(out_n_k_ho_wo_device_result.mData.data(),
                        out_device_buf.mpDeviceBuf,
                        out_device_buf.mMemSize);

                 check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);

                 if(do_log)
                 {
                     LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
                         << std::endl;
                     LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
                         << std::endl;
                     LogRangeAsType<float>(
                         std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
                         << std::endl;
                     LogRangeAsType<float>(
                         std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
                         << std::endl;
                 }
             }
         }
     }

     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gflops << " GFlops, "
               << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
 }

 } // namespace profiler
 } // namespace ck
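A quick check of the performance arithmetic in profile_conv_cpu_fwd_impl above, since the 1.E6 divisors look odd at first glance: flop = 2 * N * K * Ho * Wo * C * Y * X counts one multiply and one add per MAC, and ave_time is in milliseconds, so GFLOPS = flop / (ave_time * 1e-3) / 1e9 = flop / 1.E6 / ave_time, which is exactly the expression used; the same divisor turns bytes per millisecond into GB/s.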
@@ -26,6 +26,12 @@ using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
 void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
     std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);

+void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+
+void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+
 } // namespace device_conv2d_fwd_avx2_instance
 } // namespace device
 } // namespace cpu
@@ -300,8 +306,22 @@ int main(int argc, char* argv[])
                  ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
                  ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
     {
-        ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
-            add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
+        if(omp_get_max_threads() > 1)
+        {
+            ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
+                add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(conv_ptrs);
+            ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
+                add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
+        }
+        else
+        {
+            if(K % 8 == 0)
+                ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
+                    add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
+            else
+                ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
+                    add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(conv_ptrs);
+        }
     }

     if(conv_ptrs.size() <= 0)
......
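The dispatch above routes single-thread cases with K % 8 != 0 to the local-C instances, consistent with the "gemm_n is not a multiple of 8" comment (for NHWC forward conv, gemm_n is the output-channel count K, and an AVX2 ymm register holds 8 floats). Purely as a hypothetical illustration of why ragged widths complicate direct output stores, here is the standard masked-tail idiom; this is not claimed to be this library's kernel code:

```cpp
#include <immintrin.h>

// Copy a row of n floats with full 8-wide AVX2 stores, finishing a ragged
// tail (n % 8 != 0) with a masked load/store instead of scalar code.
void copy_row_avx2(float* dst, const float* src, int n)
{
    int i = 0;
    for(; i + 8 <= n; i += 8)
        _mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));

    if(int rem = n - i)
    {
        alignas(32) int lane_mask[8] = {0, 0, 0, 0, 0, 0, 0, 0};
        for(int j = 0; j < rem; ++j)
            lane_mask[j] = -1; // sign bit set => lane enabled
        const __m256i m =
            _mm256_load_si256(reinterpret_cast<const __m256i*>(lane_mask));
        _mm256_maskstore_ps(dst + i, m, _mm256_maskload_ps(src + i, m));
    }
}
```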