"vscode:/vscode.git/clone" did not exist on "2a6b6b555cb0bf24d26e43a1734a4867bd89082e"
Commit bf975428 authored by Chao Liu's avatar Chao Liu
Browse files

add lds double buffer to nchw padded v4r1 and v4r4

parent 2c93b305
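For orientation, the pattern behind the new _lds_double_buffer kernels: LDS (shared memory) is split into two tile buffers so the global-memory fetch of the next K-tile overlaps with the math on the current one. A minimal, self-contained HIP-style sketch, not the committed kernel; the tile sizes, helper functions, and the reduction it computes are hypothetical stand-ins (the real kernels also stage loads through registers via the blockwise copies changed below):

// Hypothetical sketch of LDS double buffering, NOT the kernel added by this
// commit: two shared-memory tiles alternate so the fetch of tile k0+1
// overlaps with the math on tile k0.
#include <hip/hip_runtime.h>

constexpr int TileK  = 8;
constexpr int TileMN = 64;

// assumed helper: cooperatively copy one K-tile from global memory into LDS
__device__ void load_tile(float* lds, const float* p_global, int k0)
{
    for(int i = threadIdx.x; i < TileK * TileMN; i += blockDim.x)
        lds[i] = p_global[k0 * TileK * TileMN + i];
}

// assumed helper: accumulate a partial result from one LDS-resident tile
__device__ void accumulate_tile(const float* lds, float& acc)
{
    for(int k = 0; k < TileK; ++k)
        acc += lds[k * TileMN + threadIdx.x];
}

__global__ void reduce_k_lds_double_buffer(const float* p_global, float* p_out, int num_k_tiles)
{
    __shared__ float lds[2][TileK * TileMN];

    float acc = 0;

    load_tile(lds[0], p_global, 0); // prologue: fetch the first tile
    __syncthreads();

    for(int k0 = 0; k0 + 1 < num_k_tiles; ++k0)
    {
        load_tile(lds[(k0 + 1) % 2], p_global, k0 + 1); // prefetch next tile
        accumulate_tile(lds[k0 % 2], acc);              // compute current tile
        __syncthreads(); // prefetch must be complete before the buffer is read
    }
    accumulate_tile(lds[(num_k_tiles - 1) % 2], acc); // epilogue: last tile

    p_out[blockIdx.x * blockDim.x + threadIdx.x] = acc;
}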
@@ -59,7 +59,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};
-        constexpr auto I5 = Number<5>{};

         constexpr auto True = integral_constant<bool, true>{};
@@ -330,7 +329,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};
-        constexpr auto I5 = Number<5>{};

         constexpr auto True = integral_constant<bool, true>{};
...
@@ -25,14 +25,14 @@ namespace ck {
 // repeat-length on the merged dimension need to be 1. These sanity checks are performed
 // in constructor of BlockwiseGenericTensorSliceCopy_v1
 template <index_t BlockSize,
-          class SrcDesc,
-          class DstDesc,
-          class SliceLengths,
-          class SubLengths,
-          class ThreadClusterLengths,
-          class ThreadClusterArrangeOrder,
-          class SrcDimAccessOrder,
-          class DstDimAccessOrder,
+          typename SrcDesc,
+          typename DstDesc,
+          typename SliceLengths,
+          typename SubLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
           index_t SrcVectorAccessDim,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
@@ -204,7 +204,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         return GetRegisterBufferDescriptor().GetElementSpace();
     }

-    template <class TData>
+    template <typename TData>
     __device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
                                           TData* __restrict__ p_buffer) const
     {
@@ -260,7 +260,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
     }

-    template <class TData>
+    template <typename TData>
     __device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
                                            TData* __restrict__ p_dst) const
     {
@@ -315,7 +315,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
     }

-    template <class TData>
+    template <typename TData>
     __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
     {
         TData p_buffer[GetRegisterBufferSize()];
@@ -406,7 +406,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
     }

-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
     MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
@@ -423,14 +423,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
 // Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
 // memory layout (ordering of dimensions) can be different between src and dst.
 template <index_t BlockSize,
-          class SrcDesc,
-          class DstDesc,
-          class SliceLengths,
-          class SubLengths,
-          class ThreadClusterLengths,
-          class ThreadClusterArrangeOrder,
-          class SrcDimAccessOrder,
-          class DstDimAccessOrder,
+          typename SrcDesc,
+          typename DstDesc,
+          typename SliceLengths,
+          typename SubLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
           index_t SrcVectorAccessDim,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
@@ -482,19 +482,19 @@ struct BlockwiseGenericTensorSliceCopy_v2
         return RegisterBufferDesc::GetElementSpace();
     }

-    template <class TData>
+    template <typename TData>
     __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
     {
         mThreadwiseLoad.Run(p_src, p_buffer);
     }

-    template <class TData>
+    template <typename TData>
     __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
     {
         mThreadwiseStore.Run(p_buffer, p_dst);
     }

-    template <class TData>
+    template <typename TData>
     __device__ void Run(const TData* p_src, TData* p_dst) const
     {
         TData p_buffer[GetRegisterBufferSize()];
@@ -503,14 +503,14 @@ struct BlockwiseGenericTensorSliceCopy_v2
         mThreadwiseStore.Run(p_buffer, p_dst);
     }

-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
     MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
         mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
     }

-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
     MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
@@ -546,14 +546,14 @@ struct BlockwiseGenericTensorSliceCopy_v2
 // this version use TensorView and TensorCoordinate
 template <index_t BlockSize,
-          class SrcTensor,
-          class DstTensor,
-          class SliceLengths,
-          class SubLengths,
-          class ThreadClusterLengths,
-          class ThreadClusterArrangeOrder,
-          class SrcDimAccessOrder,
-          class DstDimAccessOrder,
+          typename SrcTensor,
+          typename DstTensor,
+          typename SliceLengths,
+          typename SubLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
           index_t SrcVectorAccessDim,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
@@ -622,14 +622,14 @@ struct BlockwiseGenericTensorSliceCopy_v3
         mThreadwiseStore.Run();
     }

-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
     MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
         mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
     }

-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
     MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
@@ -669,14 +669,14 @@ struct BlockwiseGenericTensorSliceCopy_v3
 };

 template <index_t BlockSize,
-          class SrcDesc,
-          class DstDesc,
-          class SliceLengths,
-          class SubLengths,
-          class ThreadClusterLengths,
-          class ThreadClusterArrangeOrder,
-          class SrcDimAccessOrder,
-          class DstDimAccessOrder,
+          typename SrcDesc,
+          typename DstDesc,
+          typename SliceLengths,
+          typename SubLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
           index_t SrcVectorAccessDim,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
@@ -727,19 +727,19 @@ struct BlockwiseGenericTensorSliceCopy_v4
         return RegisterBufferDesc::GetElementSpace();
     }

-    template <class TData>
+    template <typename TData>
     __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
     {
         mThreadwiseLoad.Run(p_src, p_buffer);
     }

-    template <class TData>
+    template <typename TData>
     __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
     {
         mThreadwiseStore.Run(p_buffer, p_dst);
     }

-    template <class TData>
+    template <typename TData>
     __device__ void Run(const TData* p_src, TData* p_dst) const
     {
         TData p_buffer[GetRegisterBufferSize()];
@@ -748,16 +748,18 @@ struct BlockwiseGenericTensorSliceCopy_v4
         mThreadwiseStore.Run(p_buffer, p_dst);
     }

-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
-    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
+    MoveSrcSliceWindow(const T& step_sizes,
+                       integral_constant<bool, PositiveDirection> positive_direction)
     {
         mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
     }

-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
-    MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
+    MoveDstSliceWindow(const T& step_sizes,
+                       integral_constant<bool, PositiveDirection> positive_direction)
     {
         mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
     }
...
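The Run / RunLoadRegisterBuffer / RunStoreRegisterBuffer split kept through all four copy versions above is what a double-buffered mainloop hooks into: the caller can issue the global-to-register load for the next tile, do math on the current LDS tile, and only then commit the registers to LDS. A simplified host-side stand-in of that contract (the ck:: originals are __device__ members driven by tensor descriptors):

#include <cstddef>
#include <vector>

// simplified stand-in for the blockwise-copy contract, not the ck:: class
struct BlockwiseCopy
{
    std::size_t size;

    // stage 1: source -> per-thread register buffer
    void RunLoadRegisterBuffer(const float* p_src, float* p_buffer) const
    {
        for(std::size_t i = 0; i < size; ++i)
            p_buffer[i] = p_src[i];
    }

    // stage 2: register buffer -> destination (LDS in the real kernels)
    void RunStoreRegisterBuffer(const float* p_buffer, float* p_dst) const
    {
        for(std::size_t i = 0; i < size; ++i)
            p_dst[i] = p_buffer[i];
    }

    // Run() is exactly the two stages back to back, as in the diff above;
    // a double-buffered caller invokes them separately to overlap work
    void Run(const float* p_src, float* p_dst) const
    {
        std::vector<float> buffer(size);
        RunLoadRegisterBuffer(p_src, buffer.data());
        RunStoreRegisterBuffer(buffer.data(), p_dst);
    }
};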
@@ -1072,16 +1072,22 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     }

     template <class T, bool PositiveDirection>
-    __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    __device__ void MoveSrcSliceWindow(const T& step_sizes_,
+                                       integral_constant<bool, PositiveDirection>)
     {
+        const auto step_sizes = to_array(step_sizes_);
+
         static_if<PositiveDirection>{}([&](auto) {
-            mSrcSliceOrigin += step_sizes;
+            mSrcSliceOrigin += to_array(step_sizes);
         }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
     }

     template <class T, bool PositiveDirection>
-    __device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    __device__ void MoveDstSliceWindow(const T& step_sizes_,
+                                       integral_constant<bool, PositiveDirection>)
     {
+        const auto step_sizes = to_array(step_sizes_);
+
         static_if<PositiveDirection>{}([&](auto) {
             mDstSliceOrigin += step_sizes;
         }).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
...
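The rewrite above lets the slice-window movers accept step sizes either as a compile-time Sequence or as a runtime Array: to_array() normalizes both into an Array that supports element-wise += and -=, and passing an Array through unchanged also makes the extra to_array(step_sizes) on the += line harmless. A minimal stand-in for the idea; ck's real Sequence, Array, and to_array differ in detail:

#include <cstddef>

// simplified stand-in for ck::Array: a fixed-size value container
template <std::size_t N>
struct Array
{
    std::size_t data[N];
};

// simplified stand-in for ck::Sequence: values live purely in the type
template <std::size_t... Is>
struct Sequence
{
};

// already an Array: pass it through (this makes to_array idempotent)
template <std::size_t N>
constexpr Array<N> to_array(const Array<N>& a)
{
    return a;
}

// a Sequence has no storage: materialize its values into an Array
template <std::size_t... Is>
constexpr Array<sizeof...(Is)> to_array(Sequence<Is...>)
{
    return Array<sizeof...(Is)>{{Is...}};
}

static_assert(to_array(Sequence<1, 2, 3>{}).data[1] == 2, "Sequence materialized");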
@@ -136,7 +136,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
         constexpr auto gridwise_conv =
-#if 1
+#if 0
             GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 #else
             GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -4,6 +4,7 @@
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
+#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp"

 template <typename T,
           typename InDesc,
@@ -101,44 +102,49 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
-        constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded<
-            GridSize,
-            BlockSize,
-            T,
-            decltype(in_nchw_desc),
-            decltype(wei_kcyx_desc),
-            decltype(out_nkhw_desc),
-            ConvStrides,
-            ConvDilations,
-            LeftPads,
-            RightPads,
-            BPerBlock,
-            KPerBlock,
-            EPerBlock,
-            GemmNRepeat,
-            GemmMPerThreadSubC,
-            GemmNPerThreadSubC,
-            GemmMLevel0Cluster,
-            GemmNLevel0Cluster,
-            GemmMLevel1Cluster,
-            GemmNLevel1Cluster,
-            GemmKPerThreadLoop,
-            GemmDataPerReadA,
-            GemmDataPerReadB,
-            InBlockCopySubLengths_E_N1_B_N2,
-            InBlockCopyClusterLengths_E_N1_B_N2,
-            InBlockCopyThreadClusterArrangeOrder,
-            InBlockCopySrcAccessOrder,
-            InBlockCopyDstAccessOrder,
-            InBlockCopySrcDataPerRead_B,
-            InBlockCopyDstDataPerWrite_N2,
-            WeiBlockCopySubLengths_E_K,
-            WeiBlockCopyClusterLengths_E_K,
-            WeiBlockCopyThreadClusterArrangeOrder,
-            WeiBlockCopySrcAccessOrder,
-            WeiBlockCopyDstAccessOrder,
-            WeiBlockCopySrcDataPerRead_E,
-            WeiBlockCopyDstDataPerWrite_K>{};
+        constexpr auto gridwise_conv =
+#if 0
+            GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
+#else
+            GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer
+#endif
+            <GridSize,
+             BlockSize,
+             T,
+             decltype(in_nchw_desc),
+             decltype(wei_kcyx_desc),
+             decltype(out_nkhw_desc),
+             ConvStrides,
+             ConvDilations,
+             LeftPads,
+             RightPads,
+             BPerBlock,
+             KPerBlock,
+             EPerBlock,
+             GemmNRepeat,
+             GemmMPerThreadSubC,
+             GemmNPerThreadSubC,
+             GemmMLevel0Cluster,
+             GemmNLevel0Cluster,
+             GemmMLevel1Cluster,
+             GemmNLevel1Cluster,
+             GemmKPerThreadLoop,
+             GemmDataPerReadA,
+             GemmDataPerReadB,
+             InBlockCopySubLengths_E_N1_B_N2,
+             InBlockCopyClusterLengths_E_N1_B_N2,
+             InBlockCopyThreadClusterArrangeOrder,
+             InBlockCopySrcAccessOrder,
+             InBlockCopyDstAccessOrder,
+             InBlockCopySrcDataPerRead_B,
+             InBlockCopyDstDataPerWrite_N2,
+             WeiBlockCopySubLengths_E_K,
+             WeiBlockCopyClusterLengths_E_K,
+             WeiBlockCopyThreadClusterArrangeOrder,
+             WeiBlockCopySrcAccessOrder,
+             WeiBlockCopyDstAccessOrder,
+             WeiBlockCopySrcDataPerRead_E,
+             WeiBlockCopyDstDataPerWrite_K>{};

         float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                    dim3(GridSize),
...
@@ -4,6 +4,7 @@
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"
+#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp"

 template <class T,
           class InDesc,
@@ -166,43 +167,48 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

-    constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded<
-        GridSize,
-        BlockSize,
-        T,
-        decltype(in_nchw_desc),
-        decltype(wei_kcyx_desc),
-        decltype(out_nkhw_desc),
-        ConvStrides,
-        ConvDilations,
-        LeftPads,
-        RightPads,
-        BPerBlock,
-        KPerBlock,
-        EPerBlock,
-        GemmMPerThreadSubC,
-        GemmNPerThreadSubC,
-        GemmMLevel0Cluster,
-        GemmNLevel0Cluster,
-        GemmMLevel1Cluster,
-        GemmNLevel1Cluster,
-        GemmKPerThreadLoop,
-        GemmDataPerReadA,
-        GemmDataPerReadB,
-        InBlockCopySubLengths_E_B,
-        InBlockCopyClusterLengths_E_B,
-        InBlockCopyThreadClusterArrangeOrder,
-        InBlockCopySrcAccessOrder,
-        InBlockCopyDstAccessOrder,
-        InBlockCopyDataPerAccess_B,
-        WeiBlockCopySubLengths_E_K,
-        WeiBlockCopyClusterLengths_E_K,
-        WeiBlockCopyThreadClusterArrangeOrder,
-        WeiBlockCopySrcAccessOrder,
-        WeiBlockCopyDstAccessOrder,
-        WeiBlockCopySrcDataPerRead_E,
-        WeiBlockCopyDstDataPerWrite_K,
-        OutThreadCopyDataPerAccess_B>{};
+    constexpr auto gridwise_conv =
+#if 0
+        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
+#else
+        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer
+#endif
+        <GridSize,
+         BlockSize,
+         T,
+         decltype(in_nchw_desc),
+         decltype(wei_kcyx_desc),
+         decltype(out_nkhw_desc),
+         ConvStrides,
+         ConvDilations,
+         LeftPads,
+         RightPads,
+         BPerBlock,
+         KPerBlock,
+         EPerBlock,
+         GemmMPerThreadSubC,
+         GemmNPerThreadSubC,
+         GemmMLevel0Cluster,
+         GemmNLevel0Cluster,
+         GemmMLevel1Cluster,
+         GemmNLevel1Cluster,
+         GemmKPerThreadLoop,
+         GemmDataPerReadA,
+         GemmDataPerReadB,
+         InBlockCopySubLengths_E_B,
+         InBlockCopyClusterLengths_E_B,
+         InBlockCopyThreadClusterArrangeOrder,
+         InBlockCopySrcAccessOrder,
+         InBlockCopyDstAccessOrder,
+         InBlockCopyDataPerAccess_B,
+         WeiBlockCopySubLengths_E_K,
+         WeiBlockCopyClusterLengths_E_K,
+         WeiBlockCopyThreadClusterArrangeOrder,
+         WeiBlockCopySrcAccessOrder,
+         WeiBlockCopyDstAccessOrder,
+         WeiBlockCopySrcDataPerRead_E,
+         WeiBlockCopyDstDataPerWrite_K,
+         OutThreadCopyDataPerAccess_B>{};

     for(index_t i = 0; i < nrepeat; ++i)
     {
...
@@ -92,8 +92,8 @@ int main(int argc, char* argv[])
     // 3x3, 34x34
     constexpr index_t N = 64;
     constexpr index_t C = 256;
-    constexpr index_t HI = 34;
-    constexpr index_t WI = 34;
+    constexpr index_t HI = 32;
+    constexpr index_t WI = 32;
     constexpr index_t K = 128;
     constexpr index_t Y = 3;
     constexpr index_t X = 3;
@@ -101,8 +101,8 @@ int main(int argc, char* argv[])
     using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;

-    using LeftPads  = Sequence<0, 0>;
-    using RightPads = Sequence<0, 0>;
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
 #elif 0
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
...
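The two driver changes above appear to describe the same convolution problem: the old configuration fed a pre-padded 34x34 input with zero pads, while the new one feeds the raw 32x32 input and lets the padded kernel apply the 1-pixel pads itself. A quick check with the usual output-size formula, assuming the stride 1 and dilation 1 configured above (names here are hypothetical, not the driver's):

constexpr int Hi = 32, LeftPad = 1, RightPad = 1, Y = 3, Stride = 1, Dilation = 1;
constexpr int Ho = (Hi + LeftPad + RightPad - Dilation * (Y - 1) - 1) / Stride + 1;
static_assert(Ho == 32, "same 32x32 output as the old 34x34 zero-pad config");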