"...composable_kernel-1.git" did not exist on "22d438ae9e02b674e1656263df07799ee06fb466"
Commit 39d92e7d authored by Chao Liu

removing old implementation of tensor descriptor

parent 012b5253
@@ -2,7 +2,6 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP

 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
 #include "ConstantMatrixDescriptor.hpp"
@@ -73,12 +72,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
     constexpr auto True = integral_constant<bool, true>{};

-    constexpr auto in_n_c_hi_wi_global_desc =
-        make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides());
-    constexpr auto wei_k_c_y_x_global_desc =
-        make_native_tensor_descriptor(WeiGlobalDesc::GetLengths(), WeiGlobalDesc::GetStrides());
-    constexpr auto out_n_k_ho_wo_global_desc =
-        make_native_tensor_descriptor(OutGlobalDesc::GetLengths(), OutGlobalDesc::GetStrides());
+    constexpr auto in_n_c_hi_wi_global_desc  = InGlobalDesc{};
+    constexpr auto wei_k_c_y_x_global_desc   = WeiGlobalDesc{};
+    constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{};

     constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
     constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
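For context on what these descriptor objects carry: a native descriptor is just a set of per-dimension lengths and strides, and the `GetLength(I0)`-style queries read them back. Below is a runtime stand-in for that bookkeeping, purely illustrative; the real CK descriptors are compile-time types, and the sizes are made up.

```cpp
#include <array>
#include <cstddef>
#include <iostream>

// Illustrative stand-in for a "native" (lengths + strides) descriptor.
template <std::size_t NDim>
struct NativeDescSketch
{
    std::array<std::size_t, NDim> lengths;
    std::array<std::size_t, NDim> strides;

    std::size_t GetLength(std::size_t i) const { return lengths[i]; }
    std::size_t GetStride(std::size_t i) const { return strides[i]; }

    // offset of a multi-index in linear memory
    std::size_t GetOffset(const std::array<std::size_t, NDim>& idx) const
    {
        std::size_t off = 0;
        for(std::size_t i = 0; i < NDim; ++i)
            off += idx[i] * strides[i];
        return off;
    }
};

// packed layout: stride[i] = product of the lengths to its right
template <std::size_t NDim>
NativeDescSketch<NDim> make_packed(const std::array<std::size_t, NDim>& lengths)
{
    NativeDescSketch<NDim> d{lengths, {}};
    std::size_t s = 1;
    for(std::size_t i = NDim; i-- > 0;)
    {
        d.strides[i] = s;
        s *= lengths[i];
    }
    return d;
}

int main()
{
    auto in_n_c_hi_wi = make_packed<4>({128, 64, 56, 56});        // hypothetical N,C,Hi,Wi
    std::cout << in_n_c_hi_wi.GetOffset({1, 0, 0, 0}) << "\n";    // = C*Hi*Wi = 200704
}
```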
@@ -119,11 +115,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
     constexpr index_t KBlockWork = K / KPerBlock;
     constexpr index_t BBlockWork = B / BPerBlock;

-    constexpr auto block_work_desc =
-        make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});
+    constexpr auto block_work_desc = transform_tensor_descriptor(
+        make_native_tensor_descriptor_packed(Sequence<KBlockWork, BBlockWork>{}),
+        make_tuple(Merge<Sequence<KBlockWork, BBlockWork>>{}),
+        make_tuple(Sequence<0, 1>{}),
+        make_tuple(Sequence<0>{}));

-    const auto block_work_multi_id =
-        block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
+    const auto block_work_multi_id = block_work_desc.CalculateLowerIndex(get_block_1d_id());

     const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock;
     const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
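The new `block_work_desc` merges the (KBlockWork, BBlockWork) grid into a single dimension, so `CalculateLowerIndex` recovers the (k, b) pair from the 1-D block id. A standalone sketch of that decomposition for a packed two-dimensional work grid (illustrative only, not the CK implementation):

```cpp
#include <cassert>
#include <utility>

// Decompose a 1-D block id into (k_block, b_block) for a KBlockWork x BBlockWork grid,
// mirroring what Merge over a packed 2-D descriptor computes.
std::pair<int, int> block_work_from_1d_id(int block_id, int KBlockWork, int BBlockWork)
{
    assert(block_id < KBlockWork * BBlockWork);
    const int k_block = block_id / BBlockWork; // slower-varying dimension
    const int b_block = block_id % BBlockWork; // faster-varying dimension
    return {k_block, b_block};
}

int main()
{
    // e.g. KBlockWork = 4, BBlockWork = 8 (made-up sizes): block 13 -> (1, 5)
    const auto kb = block_work_from_1d_id(13, 4, 8);
    assert(kb.first == 1 && kb.second == 5);
}
```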
@@ -139,7 +137,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
     constexpr auto in_n0_n1_n2_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
         in_n_c_hip_wip_global_desc,
-        make_tuple(Unmerge<Sequence<N0, N1, N2>>{},
+        make_tuple(UnMerge<Sequence<N0, N1, N2>>{},
                    PassThrough<C>{},
                    Embed<Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
                    Embed<Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
@@ -185,21 +183,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                 InBlockCopyDstDataPerWrite_N2>(
                 {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});

-#if 0
     // weight tensor
     //   tensor descriptor in device memory, src of blockwise copy
+    constexpr auto wei_k_e_global_desc =
+        unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);
+
     constexpr auto wei_e_k_global_desc =
-        transform_tensor_descriptor(wei_k_c_y_x_global_desc,
-                                    make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
-                                    make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
-#else // hack
-    constexpr auto wei_e_k_global_desc_old =
-        WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
-    constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor(
-        wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides());
-#endif
+        reorder_tensor_descriptor_given_upper2lower(wei_k_e_global_desc, Sequence<1, 0>{});

     // tensor descriptor in LDS, dst of blockwise copy
     //   be careful of LDS alignment
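The removed `#if 0 / #else` hack is replaced by the new helpers: `unfold_tensor_descriptor` collapses (C, Y, X) into a single E = C*Y*X dimension, and `reorder_tensor_descriptor_given_upper2lower` swaps the remaining two dimensions to obtain the E×K view. A standalone sketch of the lengths/strides this yields for a packed KCYX weight (sizes are illustrative, not the kernel's):

```cpp
#include <array>
#include <cassert>

int main()
{
    // packed K x C x Y x X weight, e.g. K=256, C=128, Y=3, X=3 (made-up sizes)
    const std::array<int, 4> len{256, 128, 3, 3};
    const std::array<int, 4> str{128 * 3 * 3, 3 * 3, 3, 1};

    // unfold dims 1..3 into E = C*Y*X; valid because str[i] == str[i+1] * len[i+1] on that run
    const int E        = len[1] * len[2] * len[3];
    const int e_stride = str[3]; // stride of the last (fastest) unfolded dimension

    // wei_k_e: lengths {K, E}, strides {str[0], e_stride}
    // reorder upper-to-lower {1, 0} -> wei_e_k: lengths {E, K}, strides {e_stride, str[0]}
    const std::array<int, 2> wei_e_k_len{E, len[0]};
    const std::array<int, 2> wei_e_k_str{e_stride, str[0]};

    assert(wei_e_k_len[0] == 1152 && wei_e_k_str[1] == 1152); // E equals the K-stride of a packed weight
}
```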
@@ -340,10 +330,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
             blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                p_in_register_buffer, p_in_block_next);
-            blockwise_wei_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                p_wei_register_buffer, p_wei_block_next);
+            blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
+            blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
         }
     }
@@ -368,10 +356,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
             blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                p_in_register_buffer, p_in_block_double + in_block_space);
-            blockwise_wei_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                p_wei_register_buffer, p_wei_block_double + wei_block_space);
+            blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
+                                                     p_in_block_double + in_block_space);
+            blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
+                                                      p_wei_block_double + wei_block_space);

             // odd iteration
             __syncthreads();
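These calls sit inside the LDS double-buffer loop: while the block-wise GEMM consumes one LDS buffer, the next tile is staged in registers and then stored into the other buffer, with a barrier before the roles swap. A host-side sketch of the ping-pong control flow (no GPU specifics; purely illustrative):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Ping-pong double buffering: compute on buf[cur] while the next tile is staged
// in a register-like buffer and then written into buf[1 - cur].
void double_buffer_loop(const std::vector<float>& src, std::size_t tile, std::size_t num_tiles)
{
    std::vector<float> buf[2] = {std::vector<float>(tile), std::vector<float>(tile)};
    std::vector<float> reg(tile); // plays the role of the register staging buffer

    // preload the first tile into buffer 0 (the "even" buffer)
    std::copy(src.begin(), src.begin() + tile, buf[0].begin());

    for(std::size_t i = 0; i + 1 < num_tiles; ++i)
    {
        const std::size_t cur = i % 2;

        // load the next tile into the register buffer (overlaps with compute on a GPU)
        std::copy(src.begin() + (i + 1) * tile, src.begin() + (i + 2) * tile, reg.begin());

        // compute on buf[cur]: placeholder for blockwise_gemm.Run(...)

        // store next data into the other buffer; a kernel would __syncthreads() before swapping
        std::copy(reg.begin(), reg.end(), buf[1 - cur].begin());
    }
    // a tail iteration would consume the last filled buffer
}

int main()
{
    std::vector<float> src(8 * 16, 1.0f); // 8 tiles of 16 elements, made-up sizes
    double_buffer_loop(src, 16, 8);
}
```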
@@ -393,12 +381,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
             Sequence<GemmMRepeat, GemmMPerThreadSubC, N1, 1, N2>{});

         // output memory layout descriptor in device memory
-        constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc_old =
-            OutGlobalDesc::Fold(I1, Number<K1>{}).Fold(I0, Number<N1>{}, Number<N2>{});
-
-        constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = make_native_tensor_descriptor(
-            out_n0_n1_n2_k0_k1_ho_wo_global_desc_old.GetLengths(),
-            out_n0_n1_n2_k0_k1_ho_wo_global_desc_old.GetStrides());
+        constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = transform_tensor_descriptor(
+            out_n_k_ho_wo_global_desc,
+            make_tuple(UnMerge<Sequence<N0, N1, N2>>{},
+                       UnMerge<Sequence<K0, K1>>{},
+                       PassThrough<Ho>{},
+                       PassThrough<Wo>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}, Sequence<6>{}));

         // output merged global tensor descriptor, dst of threadwise copy
         constexpr auto out_k0_k1_n1_b_n2_global_desc = transform_tensor_descriptor(
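Here the old `Fold`-based construction is replaced by `UnMerge` transforms that split N into (N0, N1, N2) and K into (K0, K1) purely at the index level; the underlying memory layout is untouched. A standalone sketch of the index identity behind such a split (the sub-lengths below are made up):

```cpp
#include <cassert>

// Splitting n into (n0, n1, n2) with n = (n0 * N1 + n1) * N2 + n2 is an UnMerge:
// the split view addresses exactly the same element as the original dimension.
constexpr int merge_back(int n0, int n1, int n2, int N1, int N2)
{
    return (n0 * N1 + n1) * N2 + n2;
}

int main()
{
    constexpr int N1 = 2, N2 = 4; // made-up sub-lengths
    static_assert(merge_back(3, 1, 2, N1, N2) == 3 * N1 * N2 + 1 * N2 + 2, "");
    assert(merge_back(3, 1, 2, N1, N2) == 30);
}
```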
......
@@ -384,7 +384,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
     constexpr auto out_k0_k1_b0_b1_global_desc = transform_tensor_descriptor(
         out_k_b_global_desc,
-        make_tuple(Unmerge<Sequence<K0, K1>>{}, Unmerge<Sequence<B0, B1>>{}),
+        make_tuple(UnMerge<Sequence<K0, K1>>{}, UnMerge<Sequence<B0, B1>>{}),
         make_tuple(Sequence<0>{}, Sequence<1>{}),
         make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
......
@@ -252,7 +252,6 @@ struct Merge
     });

-    // highest dimension, no out-of-bound check
     if(borrow)
     {
         --idx_low_new(0);
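This is the tail of `Merge`'s index-update arithmetic: when a step in the merged (upper) dimension makes a low-dimension digit go negative, the digit wraps by its length and a borrow propagates to the next-higher digit; the highest digit absorbs the final borrow without a bounds check. A standalone mixed-radix sketch of that borrow propagation (illustrative only, not the CK implementation):

```cpp
#include <array>
#include <cassert>
#include <cstddef>

// Apply a signed per-digit diff to a mixed-radix index with borrow propagation,
// mirroring the Merge-style update described above.
template <std::size_t N>
std::array<int, N> apply_diff(std::array<int, N> idx, const std::array<int, N>& diff,
                              const std::array<int, N>& lengths)
{
    bool borrow = false;
    for(std::size_t i = N; i-- > 1;) // low digits: wrap and borrow
    {
        idx[i] += diff[i] - (borrow ? 1 : 0);
        borrow = idx[i] < 0;
        if(borrow)
            idx[i] += lengths[i];
    }
    idx[0] += diff[0];
    if(borrow)
        --idx[0]; // highest dimension, no out-of-bound check
    return idx;
}

int main()
{
    // lengths {4, 3, 5}: index {1, 0, 1} (value 16) plus diff {0, 0, -2} -> {0, 2, 4} (value 14)
    const auto r = apply_diff<3>({1, 0, 1}, {0, 0, -2}, {4, 3, 5});
    assert(r[0] == 0 && r[1] == 2 && r[2] == 4);
}
```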
@@ -273,7 +272,7 @@ struct Merge
 // UpperLengths: Sequence<...>
 template <typename UpperLengths>
-struct Unmerge
+struct UnMerge
 {
     static constexpr index_t nDimLow = 1;
     static constexpr index_t nDimUp  = UpperLengths::Size();
......
@@ -325,14 +325,14 @@ struct TensorCoordinate
     private:
     template <class... Ts>
     __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
     {
         return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
     }

     template <class... Ts>
     __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
     {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
     }
......
@@ -81,7 +81,7 @@ struct NativeTensorCoordinate
     __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }

     private:
-    // mIndex may be saved and update, however, the value of some (or all) of its entries may
+    // mIndex may be saved and updated, however, the value of some (or all) of its entries may
     // never be used. Compiler should be able to remove these entries as well as its calculation
     // as dead code.
     // TODO: make sure compiler indeed remove these dead code
@@ -178,7 +178,8 @@ struct TransformedTensorCoordinate
     }

     private:
-    // mIndexUp may be calculated and update, however, the value of some (or all) of its entries may
+    // mIndexUp may be calculated and updated, however, the value of some (or all) of its entries
+    // may
     // never be used. Compiler should be able to remove these entries as well as its calculation
     // as dead code.
     // TODO: make sure compiler indeed remove these dead code
@@ -192,7 +193,7 @@ struct TensorCoordinate_v2
     private:
     template <typename... Ts>
     __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
         return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());

@@ -200,7 +201,7 @@ struct TensorCoordinate_v2
     template <typename... Ts>
     __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
         return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
......
@@ -299,7 +299,7 @@ struct TransformedTensorDescriptor
         return GetLowerTensorDescriptor().GetElementSpace();
     }

-    // TODO: right now return value is constexpr because use of non-constepxr lambda
+    // TODO: right now return value is not constexpr because use of non-constexpr lambda
     __host__ __device__ static constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up)
     {
         LowerIndex idx_low;
......
@@ -96,6 +96,81 @@ __host__ __device__ constexpr auto
         LowerTensorDescriptor{}, typename sequence_map_inverse<MapUpper2Lower>::type{});
 }
template <typename Lengths, typename Strides>
__host__ __device__ constexpr bool AreDimensionsUnfoldable(Lengths, Strides)
{
static_assert(Lengths::Size() == Strides::Size(), "wrong!");
bool flag = true;
for(index_t i = 0; i < Lengths::Size() - 1; ++i)
{
flag = flag && Strides::At(i) == Strides::At(i + 1) * Lengths::At(i + 1);
}
return flag;
}
// unfold only supports NativeTensorDescriptor, for now
template <index_t FirstUnfoldDim, index_t LastUnfoldDim, typename... Ts>
__host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescriptor<Ts...> desc,
Number<FirstUnfoldDim>,
Number<LastUnfoldDim>)
{
constexpr index_t nDim = desc.GetNumOfDimension();
static_assert(FirstUnfoldDim >= 0 && LastUnfoldDim < nDim && FirstUnfoldDim <= LastUnfoldDim,
"wrong! should have FirstUnfoldDim <= LastUnfoldDim!");
// left and right
constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::type{};
constexpr auto middle =
typename arithmetic_sequence_gen<FirstUnfoldDim, LastUnfoldDim + 1, 1>::type{};
constexpr auto right = typename arithmetic_sequence_gen<LastUnfoldDim + 1, nDim, 1>::type{};
// sanity check: the dimensions to be unfolded must be contiguous (unfoldable)
static_assert(AreDimensionsUnfoldable(desc.GetLengths(middle), desc.GetStrides(middle)),
"wrong! not unfoldable");
// unfolded length, stride
constexpr index_t unfold_length =
reduce_on_sequence(desc.GetLengths(middle), math::multiplies<index_t>{}, Number<1>{});
constexpr index_t unfold_stride = desc.GetStride(Number<LastUnfoldDim>{});
// new lengths, strides
constexpr auto new_lengths =
desc.GetLengths(left).PushBack(Number<unfold_length>{}).PushBack(desc.GetLengths(right));
constexpr auto new_strides =
desc.GetStrides(left).PushBack(Number<unfold_stride>{}).PushBack(desc.GetStrides(right));
return make_native_tensor_descriptor(new_lengths, new_strides);
}
#if 0
template <typename LowerTensorDescriptor,
typename PadDimensionIds,
typename LeftPads,
typename RightPads>
__host__ __device__ constexpr auto
pad_tensor_descriptor(LowerTensorDescriptor, PadLowerDimensionIds, LeftPads, RightPads)
{
constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension();
constexpr auto non_pad_low_dim_ids = xxx;
return transform_tensor_descriptor(
LowerTensorDescriptor{},
make_tuple(Pad<decltype(LowerTensorDescriptor::GetLengths(PadLowerDimensionIds{})),
LeftPads,
RightPads>{})
.PushBack(PassThrough<xxxx>...),
make_tuple(PadLowerDimensionIds{}).PushBack(xxxx),
sequence_to_tuple(typename arithmetic_sequence_gen<0, nDim, 1>::type{}));
}
#endif
 template <typename... NativeDimensions>
 __host__ __device__ void
 print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
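The new `AreDimensionsUnfoldable` helper encodes the usual contiguity requirement for collapsing a run of dimensions: each stride must equal the next dimension's stride times its length. A standalone check with the same condition (the lengths/strides below are made-up examples):

```cpp
#include <array>
#include <cassert>
#include <cstddef>

// A run of dimensions can be folded into one iff stride[i] == stride[i+1] * length[i+1]
// for every adjacent pair in the run (the same condition as AreDimensionsUnfoldable).
template <std::size_t N>
bool are_unfoldable(const std::array<int, N>& len, const std::array<int, N>& str)
{
    for(std::size_t i = 0; i + 1 < N; ++i)
        if(str[i] != str[i + 1] * len[i + 1])
            return false;
    return true;
}

int main()
{
    // packed C,Y,X sub-descriptor of a packed KCYX weight: contiguous, so unfoldable
    assert((are_unfoldable<3>({128, 3, 3}, {9, 3, 1})));
    // a strided view (e.g. after slicing the last dimension) is not
    assert((!are_unfoldable<3>({128, 3, 2}, {9, 3, 1})));
}
```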
......
@@ -738,12 +738,12 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
     {
#if 1
-        mThreadwiseLoad.template Run_generic<TData, SrcAddressSpace, address_space_t::vgpr>(
+        mThreadwiseLoad.template Run_generic<TData, SrcAddressSpace, address_space_t::generic>(
             p_src, p_buffer);
#else
         mThreadwiseLoad.template Run_optimized_src_address_calculation<TData,
                                                                        SrcAddressSpace,
-                                                                       address_space_t::vgpr>(
+                                                                       address_space_t::generic>(
             p_src, p_buffer);
#endif
     }
@@ -752,11 +752,11 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
     {
#if 1
-        mThreadwiseStore.template Run_generic<TData, address_space_t::vgpr, DstAddressSpace>(
+        mThreadwiseStore.template Run_generic<TData, address_space_t::generic, DstAddressSpace>(
             p_buffer, p_dst);
#else
         mThreadwiseStore.template Run_optimized_dst_address_calculation<TData,
-                                                                        address_space_t::vgpr,
+                                                                        address_space_t::generic,
                                                                         DstAddressSpace>(p_buffer,
                                                                                          p_dst);
#endif
......
@@ -37,11 +37,16 @@ typedef float float4_t __attribute__((ext_vector_type(4)));
 typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));

+// data type conversion
 template <class T>
-__device__ void fused_multiply_accumulate(T& d, const T& s0, const T& s1)
+struct type_convert
 {
-    d += s0 * s1;
-}
+    template <class X>
+    __device__ T operator()(X x) const
+    {
+        return static_cast<T>(x);
+    }
+};

 } // namespace ck
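The new `type_convert` functor added here just wraps a `static_cast`, but as a function object it can be passed to element-wise routines where a bare cast expression cannot. A minimal usage sketch with a stand-in of the same shape (illustrative; defined outside the `ck` namespace):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Same shape as the functor added in the diff: cast-any-X-to-T as a callable object.
template <class T>
struct type_convert_sketch
{
    template <class X>
    T operator()(X x) const
    {
        return static_cast<T>(x);
    }
};

int main()
{
    std::vector<int> src{1, 2, 3};
    std::vector<float> dst(src.size());
    std::transform(src.begin(), src.end(), dst.begin(), type_convert_sketch<float>{});
    assert(dst[2] == 3.0f);
}
```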
......
@@ -33,18 +33,12 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};

-#if 1
-    constexpr auto in_nchw_desc  = InDesc{};
-    constexpr auto wei_kcyx_desc = WeiDesc{};
-    constexpr auto out_nkhw_desc = OutDesc{};
-#else
     constexpr auto in_nchw_desc =
         make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides());
     constexpr auto wei_kcyx_desc =
         make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
     constexpr auto out_nkhw_desc =
-        make_native_tensor_descriptor(OutDesc::GetLegnths(), OutDesc::GetStrides());
-#endif
+        make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());

     constexpr index_t N = out_nkhw_desc.GetLength(I0);
     constexpr index_t K = out_nkhw_desc.GetLength(I1);
......
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;

@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<3, 0>;
     using RightPads = Sequence<3, 0>;
-#elif 0
+#elif 1
     // 1x7 filter, 0x3 pad, 17x17 input
     constexpr index_t N = 128;
     constexpr index_t C = 128;
......