Commit e00de11e authored by Chao Liu's avatar Chao Liu
Browse files

changing class to POD

parent f16356d4
......@@ -9,6 +9,7 @@
namespace ck {
#if 1
// GemmM = K
// GemmN = N * Ho * Wo
// GemmK = C * Y * X
......@@ -903,6 +904,73 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
}
}
};
#else
// NOTE(review): this struct lives in the disabled (#else) branch of an "#if 1", so it is
// dead code in this commit.  Its Run() performs no convolution work at all: it only
// constructs a few dynamic tensor transforms and static_asserts that they — and the
// containers they are built from — are trivial types.  It is effectively a compile-time
// regression check for the "changing class to POD" refactor this commit performs.
template <index_t BlockSize,
          typename Float,
          typename AccFloat,
          index_t GemmMPerBlock,
          index_t GemmNPerBlock,
          index_t GemmKPerBlock,
          index_t GemmMPerThread,
          index_t GemmNPerThread,
          index_t GemmKPerThread,
          index_t GemmMLevel0Cluster,
          index_t GemmNLevel0Cluster,
          index_t GemmMLevel1Cluster,
          index_t GemmNLevel1Cluster,
          typename GemmABlockTransferThreadSliceLengths_GemmK_GemmM,
          typename GemmABlockTransferThreadClusterLengths_GemmK_GemmM,
          index_t GemmABlockTransferSrcScalarPerVector_GemmK,
          index_t GemmABlockTransferDstScalarPerVector_GemmM,
          typename GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
          typename GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
          index_t GemmBBlockTransferSrcScalarPerVector_GemmN,
          index_t GemmBBlockTransferDstScalarPerVector_GemmN,
          index_t GemmCThreadTransferDstScalarPerVector_GemmN1>
struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
{
    // Host-side entry point.  In this branch every descriptor/pointer argument except
    // wei_k_c_y_x_global_desc and conv_strides (used only inside decltype) is unused;
    // the body's only effect is the triviality static_asserts below.
    template <typename... Wei, typename... In, typename... Out>
    __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
                      const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
                      const DynamicTensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
                      const MultiIndex<2> conv_strides,
                      const MultiIndex<2> conv_dilations,
                      const MultiIndex<2> in_left_pads,
                      const MultiIndex<2> in_right_pads,
                      const Float* __restrict__ p_wei_global,
                      const Float* __restrict__ p_in_global,
                      Float* __restrict__ p_out_global) const
    {
        // Default-construct representative transforms.  The values are never used;
        // constexpr construction alone proves the default constructors introduced by
        // the POD refactor work at compile time.
        constexpr auto pass = DynamicPassThrough();
        constexpr auto pad = DynamicLeftPad<false>();
        constexpr auto freeze = DynamicFreeze();

        // A fully composed descriptor (one 4-d UnMerge transform) used below to check
        // that DynamicTensorDescriptor as a whole is trivial, not just its parts.
        constexpr auto desc = ck::DynamicTensorDescriptor<ck::Tuple<ck::DynamicUnMerge<4, false>>,
                                                          ck::Tuple<ck::Sequence<0>>,
                                                          ck::Tuple<ck::Sequence<1, 2, 3, 4>>,
                                                          ck::Sequence<1, 2, 3, 4>>();

        // Bottom-up triviality checks: Sequence -> Tuple implementation details ->
        // Tuple -> MultiIndex -> individual transforms -> whole descriptor types.
        // Any failure here means the corresponding class is not yet a POD.
        static_assert(std::is_trivial<Sequence<1>>::value, "wrong");
        static_assert(std::is_trivial<detail::TupleElementKey<0>>::value, "wrong");
        static_assert(
            std::is_trivial<detail::TupleElement<detail::TupleElementKey<0>, index_t>>::value,
            "wrong");
        static_assert(std::is_trivial<detail::TupleImpl<Sequence<0>, index_t>>::value, "wrong");
        static_assert(std::is_trivial<Tuple<index_t>>::value, "wrong");
        static_assert(std::is_trivial<MultiIndex<2>>::value, "wrong");
        static_assert(std::is_trivial<MultiIndex<1>>::value, "wrong");
        static_assert(std::is_trivial<DynamicPassThrough>::value, "wrong");
        static_assert(std::is_trivial<DynamicUnMerge<2>>::value, "wrong");
        static_assert(std::is_trivial<DynamicFreeze>::value, "wrong");
        static_assert(std::is_trivial<remove_cv_t<decltype(desc)>>::value, "wrong");
        // Also check a by-value function parameter type and a deduced descriptor pack
        // instantiation, after stripping cv/reference qualifiers.
        static_assert(std::is_trivial<remove_reference_t<remove_cv_t<decltype(conv_strides)>>>::value, "wrong");
        static_assert(
            std::is_trivial<
                remove_reference_t<remove_cv_t<decltype(wei_k_c_y_x_global_desc)>>>::value,
            "wrong");
    }
};
#endif
} // namespace ck
#endif
......@@ -11,25 +11,15 @@ struct DynamicPassThrough
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const UpperIndex up_lengths_;
UpperIndex up_lengths_;
__host__ __device__ constexpr DynamicPassThrough(const DynamicPassThrough& other)
: up_lengths_{other.up_lengths_}
{
}
__host__ __device__ constexpr DynamicPassThrough(DynamicPassThrough&& other)
: up_lengths_{other.up_lengths_}
{
}
__host__ __device__ constexpr DynamicPassThrough() = default;
__host__ __device__ constexpr DynamicPassThrough(const index_t& low_length)
: up_lengths_{make_multi_index(low_length)}
{
}
__host__ __device__ constexpr DynamicPassThrough() : up_lengths_{0} {}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
......@@ -88,19 +78,11 @@ struct DynamicPad
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const UpperIndex up_lengths_;
const index_t left_pad_;
const index_t right_pad_;
UpperIndex up_lengths_;
index_t left_pad_;
index_t right_pad_;
__host__ __device__ constexpr DynamicPad(const DynamicPad& other)
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
{
}
__host__ __device__ constexpr DynamicPad(DynamicPad&& other)
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
{
}
__host__ __device__ constexpr DynamicPad() = default;
__host__ __device__ constexpr DynamicPad(const index_t& low_length,
const index_t& left_pad,
......@@ -111,8 +93,6 @@ struct DynamicPad
{
}
__host__ __device__ constexpr DynamicPad() : up_lengths_{0}, left_pad_{0}, right_pad_{0} {}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
......@@ -173,26 +153,16 @@ struct DynamicLeftPad
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const UpperIndex up_lengths_;
const index_t left_pad_;
__host__ __device__ constexpr DynamicLeftPad(const DynamicLeftPad& other)
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}
{
}
UpperIndex up_lengths_;
index_t left_pad_;
__host__ __device__ constexpr DynamicLeftPad(DynamicLeftPad&& other)
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}
{
}
__host__ __device__ constexpr DynamicLeftPad() = default;
__host__ __device__ constexpr DynamicLeftPad(const index_t& low_length, const index_t& left_pad)
: up_lengths_{make_multi_index(low_length + left_pad)}, left_pad_{left_pad}
{
}
__host__ __device__ constexpr DynamicLeftPad() : up_lengths_{0}, left_pad_{0} {}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
......@@ -252,23 +222,11 @@ struct DynamicRightPad
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const UpperIndex up_lengths_;
const index_t low_length_;
const index_t right_pad_;
UpperIndex up_lengths_;
index_t low_length_;
index_t right_pad_;
__host__ __device__ constexpr DynamicRightPad(const DynamicRightPad& other)
: up_lengths_{other.up_lengths_},
low_length_{other.low_length_},
right_pad_{other.right_pad_}
{
}
__host__ __device__ constexpr DynamicRightPad(DynamicRightPad&& other)
: up_lengths_{other.up_lengths_},
low_length_{other.low_length_},
right_pad_{other.right_pad_}
{
}
__host__ __device__ constexpr DynamicRightPad() = default;
__host__ __device__ constexpr DynamicRightPad(const index_t& low_length,
const index_t& right_pad)
......@@ -278,10 +236,6 @@ struct DynamicRightPad
{
}
__host__ __device__ constexpr DynamicRightPad() : up_lengths_{0}, low_length_{0}, right_pad_{0}
{
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
......@@ -342,18 +296,10 @@ struct DynamicEmbed
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const UpperIndex up_lengths_;
const UpperIndex coefficients_;
UpperIndex up_lengths_;
UpperIndex coefficients_;
__host__ __device__ constexpr DynamicEmbed(const DynamicEmbed& other)
: up_lengths_{other.up_lengths_}, coefficients_{other.coefficients_}
{
}
__host__ __device__ constexpr DynamicEmbed(DynamicEmbed&& other)
: up_lengths_{other.up_lengths_}, coefficients_{other.coefficients_}
{
}
__host__ __device__ constexpr DynamicEmbed() = default;
__host__ __device__ constexpr DynamicEmbed(const UpperIndex& up_lengths,
const UpperIndex& coefficients)
......@@ -362,19 +308,6 @@ struct DynamicEmbed
static_assert(UpperIndex::Size() == NDimUp, "wrong! # of dimensions not consistent");
}
template <typename UpperLengths, typename Coefficients>
__host__ __device__ constexpr DynamicEmbed(const UpperLengths& up_lengths,
const Coefficients& coefficients)
: up_lengths_{up_lengths}, coefficients_{coefficients}
{
}
__host__ __device__ constexpr DynamicEmbed()
: up_lengths_{make_zero_multi_index<NDimUp>()},
coefficients_{make_zero_multi_index<NDimUp>()}
{
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
......@@ -439,23 +372,11 @@ struct DynamicMerge
using LowerIndex = MultiIndex<NDimLow>;
using UpperIndex = MultiIndex<1>;
const LowerIndex low_lengths_;
const LowerIndex low_lengths_scan_;
const UpperIndex up_lengths_;
__host__ __device__ constexpr DynamicMerge(const DynamicMerge& other)
: low_lengths_{other.low_lengths_},
low_lengths_scan_{other.low_lengths_scan_},
up_lengths_{other.up_lengths_}
{
}
LowerIndex low_lengths_;
LowerIndex low_lengths_scan_;
UpperIndex up_lengths_;
__host__ __device__ constexpr DynamicMerge(DynamicMerge&& other)
: low_lengths_{other.low_lengths_},
low_lengths_scan_{other.low_lengths_scan_},
up_lengths_{other.up_lengths_}
{
}
__host__ __device__ constexpr DynamicMerge() = default;
__host__ __device__ constexpr DynamicMerge(const LowerIndex& low_lengths)
: low_lengths_{low_lengths},
......@@ -467,13 +388,6 @@ struct DynamicMerge
static_assert(LowerIndex::Size() == NDimLow, "wrong!");
}
__host__ __device__ constexpr DynamicMerge()
: low_lengths_{make_zero_multi_index<NDimLow>()},
low_lengths_scan_{make_zero_multi_index<NDimLow>()},
up_lengths_{0}
{
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
......@@ -953,8 +867,10 @@ struct DynamicUnMerge
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const UpperIndex up_lengths_;
const UpperIndex up_lengths_scan_;
UpperIndex up_lengths_;
UpperIndex up_lengths_scan_;
__host__ __device__ constexpr DynamicUnMerge() = default;
__host__ __device__ constexpr DynamicUnMerge(const UpperIndex& up_lengths)
: up_lengths_{up_lengths},
......@@ -963,12 +879,6 @@ struct DynamicUnMerge
{
}
__host__ __device__ constexpr DynamicUnMerge()
: up_lengths_{make_zero_multi_index<NDimUp>()},
up_lengths_scan_{make_zero_multi_index<NDimUp>()}
{
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
......@@ -1035,11 +945,11 @@ struct DynamicFreeze
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<0>;
const index_t low_idx_;
LowerIndex low_idx_;
__host__ __device__ constexpr DynamicFreeze(const index_t& low_idx) : low_idx_{low_idx} {}
__host__ __device__ constexpr DynamicFreeze() = default;
__host__ __device__ constexpr DynamicFreeze() : low_idx_{0} {}
__host__ __device__ constexpr DynamicFreeze(const index_t& low_idx) : low_idx_{low_idx} {}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
......@@ -1054,7 +964,7 @@ struct DynamicFreeze
static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_low(Number<0>{}) = low_idx_;
idx_low = low_idx_;
}
template <typename LowIdxDiff,
......
......@@ -58,6 +58,15 @@ struct DynamicTensorDescriptor
using Coordinate = DynamicTensorCoordinate<ndim_hidden_, VisibleDimensionIds>;
public:
#if 1
__host__ __device__ explicit constexpr DynamicTensorDescriptor()
: DynamicTensorDescriptor(Transforms{}, index_t{0})
{
}
#else
__host__ __device__ constexpr DynamicTensorDescriptor() = default;
#endif
__host__ __device__ explicit constexpr DynamicTensorDescriptor(const Transforms& transforms,
index_t element_space_size)
: transforms_{transforms},
......@@ -71,11 +80,6 @@ struct DynamicTensorDescriptor
// TODO check dependency of dimensions is valid
}
__host__ __device__ explicit constexpr DynamicTensorDescriptor()
: DynamicTensorDescriptor(Transforms{}, index_t{0})
{
}
__host__ __device__ static constexpr index_t GetNumOfDimension()
{
return GetNumOfVisibleDimension();
......@@ -150,10 +154,10 @@ struct DynamicTensorDescriptor
}
// TODO make these private
const Transforms transforms_;
Transforms transforms_;
// TODO maybe hidden_lengths_ should use reference_wrapper (reference to transforms_'s member
// variable lengths_) to save space on stack?
const HiddenIndex hidden_lengths_;
HiddenIndex hidden_lengths_;
};
template <index_t NDimHidden, typename VisibleDimensionIds>
......
......@@ -49,11 +49,11 @@ make_dynamic_naive_tensor_descriptor_packed(const MultiIndex<N>& lengths)
const index_t element_space_size =
container_reduce(lengths, math::multiplies<index_t>{}, index_t{1});
return DynamicTensorDescriptor<decltype(transforms),
decltype(low_dim_hidden_idss),
decltype(up_dim_hidden_idss),
decltype(visible_dim_hidden_ids)>{transforms,
element_space_size};
return DynamicTensorDescriptor<remove_cv_t<decltype(transforms)>,
remove_cv_t<decltype(low_dim_hidden_idss)>,
remove_cv_t<decltype(up_dim_hidden_idss)>,
remove_cv_t<decltype(visible_dim_hidden_ids)>>{
transforms, element_space_size};
}
template <index_t N>
......
......@@ -17,13 +17,7 @@ struct TupleElementKey
template <typename Key, typename Data>
struct TupleElement
{
__host__ __device__ explicit constexpr TupleElement() : mData() {}
#if 0
__host__ __device__ explicit constexpr TupleElement(const TupleElement&) = default;
__host__ __device__ explicit constexpr TupleElement(TupleElement&&) = default;
#endif
__host__ __device__ explicit constexpr TupleElement() = default;
template <typename UData>
__host__ __device__ explicit constexpr TupleElement(const TupleElement<Key, UData>& te)
......@@ -70,38 +64,36 @@ struct TupleImpl;
template <index_t... Is, typename... Xs>
struct TupleImpl<Sequence<Is...>, Xs...> : TupleElement<TupleElementKey<Is>, Xs>...
{
__host__ __device__ explicit constexpr TupleImpl() : TupleElement<TupleElementKey<Is>, Xs>()...
{
static_assert(sizeof...(Is) == sizeof...(Xs), "wrong! inconsistent size");
}
#if 0
__host__ __device__ explicit constexpr TupleImpl(const TupleImpl&) = default;
#if 1
__host__ __device__ explicit constexpr TupleImpl() = default;
__host__ __device__ explicit constexpr TupleImpl(TupleImpl&&) = default;
#endif
template <index_t... Js, typename... Ys>
__host__ __device__ explicit constexpr TupleImpl(const TupleImpl<Sequence<Js...>, Ys...>& y)
: TupleElement<TupleElementKey<Is>, Xs>(
static_cast<const TupleElement<TupleElementKey<Js>, Ys>&>(y))...
template <typename... Ys, typename std::enable_if<sizeof...(Ys) >= 1, bool>::type = false>
__host__ __device__ explicit constexpr TupleImpl(Ys&&... ys)
: TupleElement<TupleElementKey<Is>, Xs>(std::forward<Ys>(ys))...
{
static_assert(sizeof...(Is) == sizeof...(Xs) && sizeof...(Is) == sizeof...(Ys),
"wrong! inconsistent size");
}
#else
__host__ __device__ explicit constexpr TupleImpl() = default;
template <index_t... Js, typename... Ys>
__host__ __device__ explicit constexpr TupleImpl(TupleImpl<Sequence<Js...>, Ys...>&& y)
: TupleElement<TupleElementKey<Is>, Xs>(
static_cast<TupleElement<TupleElementKey<Js>, Ys>&&>(y))...
template <typename Y,
typename std::enable_if<sizeof...(Is) == 1 && sizeof...(Xs) == 1 &&
!is_same<remove_cv_t<Y>, TupleImpl>::value,
bool>::type = false>
__host__ __device__ explicit constexpr TupleImpl(Y&& y)
: TupleElement<TupleElementKey<Is>, Xs>(std::forward<Y>(y))...
{
}
template <typename... Ys, typename std::enable_if<sizeof...(Ys) >= 1, bool>::type = false>
template <typename... Ys, typename std::enable_if<sizeof...(Ys) >= 2, bool>::type = false>
__host__ __device__ explicit constexpr TupleImpl(Ys&&... ys)
: TupleElement<TupleElementKey<Is>, Xs>(std::forward<Ys>(ys))...
{
static_assert(sizeof...(Is) == sizeof...(Xs) && sizeof...(Is) == sizeof...(Ys),
"wrong! inconsistent size");
}
#endif
__host__ __device__ static constexpr index_t Size() { return sizeof...(Xs); }
......@@ -126,34 +118,17 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
using base =
detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(Xs), 1>::type, Xs...>;
__host__ __device__ explicit constexpr Tuple() : base() {}
#if 0
__host__ __device__ constexpr Tuple(const Tuple&) = default;
__host__ __device__ constexpr Tuple(Tuple&&) = default;
#endif
template <typename... Ys,
typename std::enable_if<sizeof...(Ys) == sizeof...(Xs), bool>::type = false>
__host__ __device__ explicit constexpr Tuple(const Tuple<Ys...>& y)
: base(static_cast<
const detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(Ys), 1>::type,
Ys...>&>(y))
{
}
__host__ __device__ constexpr Tuple() = default;
template <typename... Ys,
typename std::enable_if<sizeof...(Ys) == sizeof...(Xs), bool>::type = false>
__host__ __device__ explicit constexpr Tuple(Tuple<Ys...>&& y)
: base(static_cast<
detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(Ys), 1>::type,
Ys...>&&>(y))
template <typename Y,
typename std::enable_if<sizeof...(Xs) == 1 && !is_same<remove_cv_t<Y>, Tuple>::value,
bool>::type = false>
__host__ __device__ explicit constexpr Tuple(Y&& y) : base(std::forward<Y>(y))
{
}
template <typename... Ys,
typename std::enable_if<sizeof...(Ys) == sizeof...(Xs) && sizeof...(Ys) >= 1,
typename std::enable_if<sizeof...(Ys) == sizeof...(Xs) && sizeof...(Ys) >= 2,
bool>::type = false>
__host__ __device__ explicit constexpr Tuple(Ys&&... ys) : base(std::forward<Ys>(ys)...)
{
......
......@@ -11,12 +11,12 @@
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dummy_static_transform.hpp"
#include "device_dummy_dynamic_transform_v1.hpp"
#include "device_dummy_dynamic_transform.hpp"
//#include "device_dummy_static_transform.hpp"
//#include "device_dummy_dynamic_transform_v1.hpp"
//#include "device_dummy_dynamic_transform.hpp"
int main(int argc, char* argv[])
{
......
......@@ -10,7 +10,7 @@ cmake
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \
-D DEVICE_BACKEND="AMD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -gline-tables-only -save-temps=$CWD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment