Commit 4908fe3f authored by Chao Liu

tweak on amd

parent a9b2b1dc
@@ -335,15 +335,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer

             Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

-#if 0
-            blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-            // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{},
-            // True);
-            p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
-#else
             blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
-            blockwise_wei_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
-#endif
+            p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);

             __syncthreads();
@@ -367,14 +360,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer

                 Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
                 Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

-#if 0
-                blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-                // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-                p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
-#else
                 blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
-                blockwise_wei_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
-#endif
+                p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);

                 __syncthreads();
@@ -447,7 +434,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer

                 out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                     k_thread_data_on_global, 0, b_thread_data_on_global, 0);

-#if 0
+#if 1
             ThreadwiseGenericTensorSliceCopy_v1r2<
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
@@ -469,8 +456,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 7,
                 7,
                 1,
-                1>(
-                {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
+                1>({0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
                 .Run(p_out_thread, p_out_thread_on_global);
 #endif
         }
...
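Note on the v4r1 hunks above: the `#if 0/#else` experiment is collapsed and only one of the two ways of advancing the weight read position is kept. Because the weight slicing window only ever slides along the E dimension, its source offset can be tracked as a plain pointer bumped by `EPerBlock * stride(E)` per iteration, instead of calling `MoveSrcSlicingWindow` on the copier (the "tweak on amd" of the commit message). Below is a standalone sketch of why the two forms address the same memory; all names and sizes are illustrative, not the kernel's:

```cpp
// Sketch: for a packed [E, K] weight tensor, moving a read window down by
// EPerBlock rows is the same as advancing the base pointer by
// EPerBlock * stride(E) elements.
#include <cassert>
#include <vector>

int main()
{
    constexpr int E = 8, K = 4, EPerBlock = 2;
    std::vector<float> wei(E * K);
    for(int i = 0; i < E * K; ++i)
        wei[i] = static_cast<float>(i);

    const int stride_e = K; // stride of dimension E in a packed [E, K] layout

    const float* p_window = wei.data();
    for(int e = 0; e < E; e += EPerBlock)
    {
        // computing the window origin from its index ...
        assert(&wei[e * stride_e] == p_window);
        // ... agrees with the incremental pointer bump the commit switches to
        p_window += EPerBlock * stride_e;
    }
    return 0;
}
```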
@@ -244,6 +244,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer

         // zero out threadwise output
         threadwise_matrix_set_zero(c_k0k1_b0b1_thread_mtx_desc, p_out_thread);

+        const Float* p_wei_block_on_global = p_wei_global;
+
         // LDS double buffer: preload data into LDS
         {
             blockwise_in_copy.Run(p_in_global, p_in_block_double);
@@ -273,13 +275,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer

             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

             blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
-            blockwise_wei_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
+            p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0];

             __syncthreads();

             // LDS double buffer: load next data from device mem
             blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-            blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer);
+            blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global,
+                                                     p_wei_register_buffer);

             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
@@ -297,13 +300,13 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer

                 // even iteration
                 blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
-                blockwise_wei_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
+                p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0];

                 __syncthreads();

                 // LDS double buffer: load next data from device mem
                 blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-                blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer);
+                blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer);

                 // LDS double buffer: GEMM on current data
                 blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
...
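The v4r4 hunks apply the same pointer-bump pattern inside the LDS double-buffer main loop: `p_wei_block_on_global` starts at `p_wei_global` and is advanced once per E-block, and `RunLoadRegisterBuffer` now reads from the advanced pointer rather than the tensor base. For orientation, here is a standalone host-side sketch of the pipeline that the odd/even iterations implement: prefetch chunk i+1 into registers while computing on the buffer that already holds chunk i, then ping-pong. All names and sizes are illustrative:

```cpp
// Sketch of an LDS-style double buffer: overlap the next load with the
// current compute by alternating between two buffers.
#include <cstdio>
#include <vector>

int main()
{
    constexpr int nChunk = 4, chunkSize = 8;
    std::vector<float> global(nChunk * chunkSize, 1.0f);

    float lds[2][chunkSize]; // stand-in for the two LDS buffers
    float reg[chunkSize];    // stand-in for the register staging buffer

    // preload chunk 0 into buffer 0
    for(int j = 0; j < chunkSize; ++j)
        lds[0][j] = global[j];

    float acc = 0.0f;
    for(int i = 0; i + 1 < nChunk; ++i)
    {
        // load next chunk from "global" into registers (hides load latency)
        for(int j = 0; j < chunkSize; ++j)
            reg[j] = global[(i + 1) * chunkSize + j];

        // "GEMM" on the buffer holding the current chunk
        for(int j = 0; j < chunkSize; ++j)
            acc += lds[i % 2][j];

        // store the prefetched registers into the other buffer
        for(int j = 0; j < chunkSize; ++j)
            lds[(i + 1) % 2][j] = reg[j];
    }
    // tail: compute on the last chunk
    for(int j = 0; j < chunkSize; ++j)
        acc += lds[(nChunk - 1) % 2][j];

    std::printf("%f\n", acc); // 4 chunks x 8 elements of 1.0 -> 32
}
```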
@@ -237,7 +237,10 @@ struct MergedTensorCoordinate

         index_t normal_offset_diff = 0;

         static_for<0, nDim, 1>{}([&](auto idim) {
+            if(step_sizes[idim] != 0)
+            {
                 this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
+            }
         });

         return *this;
@@ -249,7 +252,10 @@ struct MergedTensorCoordinate

         static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

         static_for<0, nDim, 1>{}([&](auto idim) {
+            if(step_sizes[idim] != 0)
+            {
                 this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
+            }
         });

         return *this;
...
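The guard added to `MergedTensorCoordinate` skips `MoveOnDimension` for dimensions whose step is zero. Step vectors such as `Sequence<EPerBlock, 0, 0, 0>` are mostly zeros, so this avoids per-dimension offset arithmetic for the untouched dimensions. A minimal standalone sketch of the pattern; the `static_for_` here is a simplified stand-in for the library's `static_for`:

```cpp
// Sketch: unrolled compile-time loop over dimensions, moving only where the
// step is nonzero (the guard this commit adds).
#include <cstdio>
#include <type_traits>

// minimal stand-in for the library's static_for
template <int I, int N, class F>
void static_for_(F f)
{
    if constexpr(I < N)
    {
        f(std::integral_constant<int, I>{});
        static_for_<I + 1, N>(f);
    }
}

int main()
{
    constexpr int nDim   = 4;
    int step_sizes[nDim] = {2, 0, 0, 1}; // step vectors are mostly zeros
    int moved            = 0;

    static_for_<0, nDim>([&](auto idim) {
        if(step_sizes[idim] != 0) // skip no-op dimensions
        {
            ++moved; // stands in for this->MoveOnDimension(idim, ...)
        }
    });

    std::printf("moved on %d of %d dimensions\n", moved, nDim);
}
```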
@@ -402,6 +402,19 @@ struct BlockwiseGenericTensorSliceCopy_v1

             });
         });
     }

+    template <class T, bool PositiveDirection>
+    __device__ void
+    MoveSrcSlicingWindow(T step_sizes,
+                         integral_constant<bool, PositiveDirection> positive_direction)
+    {
+        static_for<0, nDim, 1>{}([&](auto idim) {
+            if(step_sizes[idim] != 0)
+            {
+                MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction);
+            }
+        });
+    }
 };

 template <index_t BlockSize,
@@ -502,21 +515,6 @@ struct BlockwiseGenericTensorSliceCopy_v2

     private:
     using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));

-#if 0
-    using ThreadwiseLoad =
-        ThreadwiseGenericTensorSliceCopy_v2<SrcDesc,
-                                            RegisterBufferDesc,
-                                            SrcCoordinate,
-                                            NormalTensorCoordinate<RegisterBufferDesc>,
-                                            SubLengths>;
-
-    using ThreadwiseStore =
-        ThreadwiseGenericTensorSliceCopy_v2<RegisterBufferDesc,
-                                            DstDesc,
-                                            NormalTensorCoordinate<RegisterBufferDesc>,
-                                            DstCoordinate,
-                                            SubLengths>;
-#else
     using ThreadwiseLoad =
         ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
                                               RegisterBufferDesc,
@@ -542,7 +540,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
                                               DstVectorAccessDim,
                                               1,
                                               DstDataPerAccess>;
-#endif

     ThreadwiseLoad mThreadwiseLoad;
     ThreadwiseStore mThreadwiseStore;
 };
...
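The new `MoveSrcSlicingWindow` on `BlockwiseGenericTensorSliceCopy_v1` gives v1 the same step-vector interface that v2 already exposes, which is what lets the gridwise kernels above call `MoveSrcSlicingWindow(Sequence<...>{}, True)` without caring which copy version is in use. A standalone sketch of how a compile-time step vector decomposes into per-dimension moves; the `Sequence` here is a simplified stand-in for the library's type:

```cpp
// Sketch: dispatch a compile-time step vector as one move per nonzero
// dimension, mirroring the method added in this commit.
#include <cstdio>

// minimal stand-in for the library's Sequence
template <int... Is>
struct Sequence
{
    static constexpr int size       = sizeof...(Is);
    static constexpr int data[size] = {Is...};
};

template <class Steps>
void MoveSrcSlicingWindow(Steps, bool positive_direction)
{
    for(int idim = 0; idim < Steps::size; ++idim)
    {
        if(Steps::data[idim] != 0) // skip no-op dimensions, as in the commit
        {
            std::printf("move dim %d by %+d\n",
                        idim,
                        positive_direction ? Steps::data[idim] : -Steps::data[idim]);
        }
    }
}

int main()
{
    constexpr int EPerBlock = 8;
    // mirrors the call sites in the gridwise kernels above
    MoveSrcSlicingWindow(Sequence<EPerBlock, 0, 0, 0>{}, true);
}
```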
@@ -594,7 +594,6 @@ struct ThreadwiseGenericTensorSliceCopy_v2

     DstCoordinate mDstSliceOrigin;
 };

-#if 1
 // This threadwise copy allows vector access of src and dst.
 // It allows the dimensions of vector access to be different on src and dst.
 // It also allows the vector size to be different on src and dst.
@@ -623,6 +622,49 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
                                                           DstCoordinate dst_slice_origin)
         : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
     {
+        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
+                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
+                          nDim == SrcDimAccessOrder::GetSize() &&
+                          nDim == DstDimAccessOrder::GetSize(),
+                      "wrong! # of dimensions not the same");
+
+        static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
+                          is_valid_sequence_map<DstDimAccessOrder>::value,
+                      "wrong! map is not valid");
+
+        static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
+                          SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
+                      "wrong! cannot evenly divide");
+
+        // check vectorized memory access
+        constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
+        constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
+
+        static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
+            [&](auto fwd) {
+                static_assert(
+                    (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
+                    "wrong! vectorized access is allowed only if stride == 1");
+            })
+            .Else([&](auto fwd) {
+                static_assert(
+                    (fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
+                     SrcDataPerAccess == 1),
+                    "wrong! vectorized access is allowed only if stride == 1");
+            });
+
+        static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
+            [&](auto fwd) {
+                static_assert(
+                    (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
+                    "wrong! vectorized access is allowed only if stride == 1");
+            })
+            .Else([&](auto fwd) {
+                static_assert(
+                    (fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
+                     DstDataPerAccess == 1),
+                    "wrong! vectorized access is allowed only if stride == 1");
+            });
     }

     __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1()
@@ -725,9 +767,6 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1

                     constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
                         src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);

-                    constexpr index_t buffer_offset =
-                        buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
-
                     p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
                 });
             });
@@ -900,7 +939,6 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1

     SrcCoordinate mSrcSliceOrigin;
     DstCoordinate mDstSliceOrigin;
 };
-#endif

 } // namespace ck
 #endif
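The constructor checks added to `ThreadwiseGenericTensorSliceCopy_v2r1` enforce, at compile time, that dimension counts and access-order maps are consistent and that vectorized access is only requested along a dimension that is contiguous in memory: a W-wide vector load from a dimension with stride greater than 1 would span non-adjacent addresses. A standalone sketch of that last constraint; the template is illustrative, not the library's:

```cpp
// Sketch: vectorized access of width DataPerAccess > 1 is only legal along a
// dimension whose stride is 1, which is what the new static_asserts encode.
template <int Stride, int DataPerAccess>
struct VectorAccessCheck
{
    static_assert(Stride == 1 || DataPerAccess == 1,
                  "wrong! vectorized access is allowed only if stride == 1");
};

int main()
{
    VectorAccessCheck<1, 4> vector_ok{}; // contiguous dim: 4-wide access OK
    VectorAccessCheck<8, 1> scalar_ok{}; // strided dim: scalar access only
    // VectorAccessCheck<8, 4> bad{};    // rejected at compile time, by design
    (void)vector_ok;
    (void)scalar_ok;
}
```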
@@ -54,7 +54,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,

     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 1
+#if 0
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 128;
@@ -89,7 +89,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

     constexpr index_t OutThreadCopyDataPerAccess_B = 1;
-#elif 0 // debug
+#elif 1
+    // 1x1 filter, 8x8 image
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 128;
@@ -112,7 +113,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
     using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]

-    constexpr index_t InBlockCopyDataPerAccess_B = 1;
+    constexpr index_t InBlockCopyDataPerAccess_B = 4;

     using WeiBlockCopySubLengths_E_K     = Sequence<4, 1>;
     using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
@@ -123,9 +124,9 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

-    constexpr index_t OutThreadCopyDataPerAccess_B = 1;
-#elif 1
-    // 1x1 filter, 8x8 image
+    constexpr index_t OutThreadCopyDataPerAccess_B = 4;
+#elif 0
+    // 1x1 filter, 14x14 image
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 128;
@@ -142,13 +143,13 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmDataPerReadA = 4;
     constexpr index_t GemmDataPerReadB = 4;

-    using InBlockCopySubLengths_E_B     = Sequence<1, 4>;
-    using InBlockCopyClusterLengths_E_B = Sequence<8, 32>;
+    using InBlockCopySubLengths_E_B     = Sequence<2, 2>;
+    using InBlockCopyClusterLengths_E_B = Sequence<4, 64>;

     using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
     using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
     using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

-    constexpr index_t InBlockCopyDataPerAccess_B = 4;
+    constexpr index_t InBlockCopyDataPerAccess_B = 2;

     using WeiBlockCopySubLengths_E_K     = Sequence<4, 1>;
     using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
@@ -159,7 +160,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

-    constexpr index_t OutThreadCopyDataPerAccess_B = 4;
+    constexpr index_t OutThreadCopyDataPerAccess_B = 2;
 #endif

     constexpr index_t B = N * Ho * Wo;
...
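The retuned parameters above keep the usual tiling invariants: per-thread `SubLengths` times thread `ClusterLengths` must exactly cover the `[EPerBlock, BPerBlock]` block, the cluster must equal `BlockSize`, and `InBlockCopyDataPerAccess_B` (the vector width along B) must divide the per-thread sub-length along B. For example, `Sequence<2, 2>` sub-lengths with `Sequence<4, 64>` clusters cover an 8 x 128 block with 256 threads. A standalone sketch that checks these invariants for the new configuration; the values are copied from the diff, the assert messages are mine:

```cpp
// Sketch: compile-time sanity checks for the retuned blockwise-copy
// parameters of the active configuration.
#include <cstdio>

int main()
{
    // block tile and new copy parameters, taken from the diff above
    constexpr int EPerBlock = 8, BPerBlock = 128, BlockSize = 256;

    constexpr int SubLengths_E = 2, SubLengths_B = 2; // per-thread work
    constexpr int Cluster_E = 4, Cluster_B = 64;      // thread arrangement
    constexpr int DataPerAccess_B = 2;                // vector width along B

    static_assert(SubLengths_E * Cluster_E == EPerBlock, "E not fully covered");
    static_assert(SubLengths_B * Cluster_B == BPerBlock, "B not fully covered");
    static_assert(Cluster_E * Cluster_B == BlockSize, "cluster must equal BlockSize");
    static_assert(SubLengths_B % DataPerAccess_B == 0,
                  "vector width must divide per-thread sub-length along B");

    std::printf("[%d x %d] block tile covered by %d threads\n",
                EPerBlock, BPerBlock, Cluster_E * Cluster_B);
}
```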
@@ -87,7 +87,7 @@ int main(int argc, char* argv[])

     constexpr index_t WPad = 0;
 #elif 0
     // 3x3, 34x34
-    constexpr index_t N = 128;
+    constexpr index_t N = 64;
     constexpr index_t C  = 256;
     constexpr index_t HI = 34;
     constexpr index_t WI = 34;
@@ -228,7 +228,7 @@ int main(int argc, char* argv[])

     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;
@@ -244,7 +244,7 @@ int main(int argc, char* argv[])

     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 1x1 filter, 17x17 input
     // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76%
     constexpr index_t N = 128;
@@ -379,7 +379,7 @@ int main(int argc, char* argv[])
 #elif 0
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,
...