Commit e1a67b69 authored by Chao Liu

refactor

parent f7be86b9
@@ -51,7 +51,7 @@ template <index_t GridSize,
           index_t WeiBlockCopyDstDataPerWrite_K>
 struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
 {
-#if 0
+#if 1
     __device__ void Run(const Float* const __restrict__ p_in_global,
                         const Float* const __restrict__ p_wei_global,
                         Float* const __restrict__ p_out_global) const
...
@@ -187,10 +187,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer
         // weight tensor
         // tensor descriptor in device memory, src of blockwise copy
         constexpr auto wei_e_k_global_desc =
+#if 0
             transform_tensor_descriptor(wei_k_c_y_x_global_desc,
                                         make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
                                         make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
                                         make_tuple(Sequence<0>{}, Sequence<1>{}));
+#else // hack
+            make_native_tensor_descriptor_packed(Sequence<K, C * Y * X>{});
+#endif
 
         // tensor descriptor in LDS, dst of blockwise copy
         // be careful of LDS alignment
...
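Why the hack is safe here: the weight tensor is fully packed in K, C, Y, X order, so merging (C, Y, X) into a single length-C*Y*X dimension produces exactly the contiguous addressing that a plain packed 2-D descriptor gives, while skipping the Merge transform's div/mod index math. A minimal host-side sketch of that equivalence, assuming hypothetical sizes and illustrative `offset_4d`/`offset_2d` helpers (not repo APIs):

```cpp
#include <cassert>
#include <cstddef>

// Hypothetical sizes for a packed K x C x Y x X weight tensor.
constexpr std::size_t K = 8, C = 4, Y = 3, X = 3;

// Offset through the original 4-D packed descriptor.
constexpr std::size_t offset_4d(std::size_t k, std::size_t c, std::size_t y, std::size_t x)
{
    return ((k * C + c) * Y + y) * X + x;
}

// Offset through a packed 2-D [K, E = C*Y*X] descriptor, as in the hack.
constexpr std::size_t offset_2d(std::size_t k, std::size_t e) { return k * (C * Y * X) + e; }

int main()
{
    for(std::size_t k = 0; k < K; ++k)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t y = 0; y < Y; ++y)
                for(std::size_t x = 0; x < X; ++x)
                {
                    // Merge<Sequence<C, Y, X>> linearizes (c, y, x) into e.
                    const std::size_t e = (c * Y + y) * X + x;
                    assert(offset_4d(k, c, y, x) == offset_2d(k, e));
                }
}
```

The trade-off is that the native descriptor bakes in the packed-layout assumption; the transform-based version would also honor non-packed strides of `wei_k_c_y_x_global_desc`, which is presumably why the shortcut is flagged as a hack.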
@@ -47,7 +47,7 @@ struct NativeTensorCoordinate
         // mIndex is updated here, but some (or all) of its entries may never be used
         mIndex += idx_diff;
 
-        mOffset += tensor_desc_type::CalculateOffset(idx_diff);
+        mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff);
 
         return *this;
     }
@@ -57,7 +57,7 @@ struct NativeTensorCoordinate
         // mIndex is updated here, but some (or all) of its entries may never be used
         mIndex -= idx_diff;
 
-        mOffset -= tensor_desc_type::CalculateOffset(idx_diff);
+        mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff);
 
         return *this;
     }
...
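The rename is more than cosmetic: the argument is an index delta, not an absolute index. A native (strided) descriptor's offset map is a plain dot product with the strides, which is linear, so the same arithmetic applied to a delta yields the matching offset delta; calling it `CalculateOffsetDiff` states that contract explicitly. A small standalone sketch, assuming an illustrative 2-D strided descriptor (the struct and sizes are hypothetical; only the two method names mirror the diff):

```cpp
#include <array>
#include <cassert>
#include <cstddef>

// Illustrative strided descriptor; the repo's real descriptor is a template.
struct StridedDesc2D
{
    std::array<std::ptrdiff_t, 2> strides;

    // Absolute index -> absolute offset.
    std::ptrdiff_t CalculateOffset(std::array<std::ptrdiff_t, 2> idx) const
    {
        return idx[0] * strides[0] + idx[1] * strides[1];
    }

    // Index delta -> offset delta. The arithmetic coincides with CalculateOffset
    // here only because the offset map is linear with no additive term.
    std::ptrdiff_t CalculateOffsetDiff(std::array<std::ptrdiff_t, 2> idx_diff) const
    {
        return idx_diff[0] * strides[0] + idx_diff[1] * strides[1];
    }
};

int main()
{
    const StridedDesc2D desc{{16, 1}};
    const std::array<std::ptrdiff_t, 2> idx{3, 5}, diff{1, -2};

    // Moving a coordinate by `diff` updates the offset by the offset diff,
    // mirroring `mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff)`.
    assert(desc.CalculateOffset({idx[0] + diff[0], idx[1] + diff[1]}) ==
           desc.CalculateOffset(idx) + desc.CalculateOffsetDiff(diff));
}
```

For a non-linear index-to-offset map (e.g. one involving padding or clamping), offset-of-delta and delta-of-offset would no longer coincide, which is exactly why the two operations deserve distinct names.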
@@ -684,12 +684,10 @@ template <index_t BlockSize,
 struct BlockwiseGenericTensorSliceCopy_v4
 {
     static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
-
-    using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
-    using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
+    using Index = MultiIndex<nDim>;
 
-    __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(SrcCoord src_block_slice_origin,
-                                                            DstCoord dst_block_slice_origin)
+    __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(const Index& src_block_slice_origin,
+                                                            const Index& dst_block_slice_origin)
     {
         static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                           nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
...
@@ -966,8 +966,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
     using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
 
-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(SrcCoord src_slice_origin,
-                                                               DstCoord dst_slice_origin)
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(const Index& src_slice_origin,
+                                                               const Index& dst_slice_origin)
         : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
     {
         static_assert(nDim == SrcDesc::GetNumOfDimension() &&
...
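Both copy classes now take their slice origins as plain MultiIndex values and build the descriptor-specific coordinates internally, so call sites no longer need to name SrcCoord/DstCoord. A minimal sketch of the pattern, with hypothetical `Index`/`Coord` stand-ins for MultiIndex and TensorCoordinate_v2 (not the repo's real types):

```cpp
#include <array>
#include <cstddef>

using Index = std::array<std::size_t, 2>; // stand-in for MultiIndex<nDim>

// Stand-in for a descriptor-specific coordinate (TensorCoordinate_v2<Desc>::type).
struct Coord
{
    Index idx{};
    constexpr explicit Coord(const Index& i) : idx(i) {}
};

struct SliceCopy
{
    // Before: the constructor took SrcCoord/DstCoord, forcing callers to know
    // the coordinate types. After: it takes plain indices and converts here.
    constexpr SliceCopy(const Index& src_origin, const Index& dst_origin)
        : mSrc(src_origin), mDst(dst_origin)
    {
    }

    Coord mSrc;
    Coord mDst;
};

int main()
{
    // Call sites pass raw indices; the coordinate types stay an implementation detail.
    constexpr SliceCopy copy({0, 16}, {8, 0});
    static_assert(copy.mSrc.idx[1] == 16, "origin forwarded into the coordinate");
}
```

This keeps the coordinate machinery an implementation detail of the copy classes and gives the blockwise and threadwise constructors the same argument shape.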