Commit 52423948 authored by Jehandad Khan

Merge branch 'master' into jd_redux

parents b97af4ec 98a2cfcc
@@ -6,7 +6,7 @@
namespace ck {
template <class Lengths>
__host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
__host__ __device__ constexpr auto calculate_tensor_strides_packed_old(Lengths)
{
return reverse_inclusive_scan_sequence(
Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
@@ -14,12 +14,12 @@ __host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
}
template <class Lengths, index_t Align>
__host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number<Align>)
__host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, Number<Align>)
{
constexpr index_t L_back_align =
Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
return calculate_tensor_strides_packed(
return calculate_tensor_strides_packed_old(
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
}
@@ -96,13 +96,12 @@ struct ConstantTensorDescriptor
__host__ __device__ static constexpr auto GetElementSize()
{
return Number<accumulate_on_sequence(
Lengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
return Number<reduce_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
}
__host__ __device__ static constexpr auto GetElementSpace()
{
constexpr index_t element_space_unaligned = accumulate_on_sequence(
constexpr index_t element_space_unaligned = reduce_on_sequence(
(GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
return Number<element_space_unaligned>{};
@@ -155,7 +154,7 @@ struct ConstantTensorDescriptor
constexpr auto multi_id = Sequence<Is...>{};
return Number<accumulate_on_sequence(
return Number<reduce_on_sequence(
multi_id * GetStrides(), math::plus<index_t>{}, Number<0>{})>{};
}
@@ -178,7 +177,7 @@ struct ConstantTensorDescriptor
{
constexpr auto IDim = IDim_{};
constexpr index_t stride = PackedStrides::Get(IDim);
multi_id.Set(IDim, id / stride);
multi_id(IDim) = id / stride;
id -= multi_id[IDim] * stride;
}
};
@@ -187,12 +186,12 @@ struct ConstantTensorDescriptor
{
Array<index_t, nDim> multi_id;
using PackedStrides = decltype(calculate_tensor_strides_packed(GetLengths()));
using PackedStrides = decltype(calculate_tensor_strides_packed_old(GetLengths()));
// calculate index in each of the dimensions in the order of their dimension
static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id));
multi_id.Set(Number<nDim - 1>{}, id / PackedStrides::Get(Number<nDim - 1>{}));
multi_id(Number<nDim - 1>{}) = id / PackedStrides::Get(Number<nDim - 1>{});
return multi_id;
}
@@ -204,7 +203,7 @@ struct ConstantTensorDescriptor
}
// This function doesn't do carry check on the highest dimension for positive stepping (or
// borrow check on the lowest dimension for negative stepping) , for performance reason. It is
// borrow check on the highest dimension for negative stepping) , for performance reason. It is
// the user's responsibility to make sure the result "new_mutli_id" is not out-of-bound on the
// highest dimension for positive stepping (or on the lowest dimension for negative stepping)
template <bool PositiveDirection>
@@ -304,14 +303,73 @@ struct ConstantTensorDescriptor
GetStrides().PushBack(leaf_tensor::GetStrides()))>{};
}
template <index_t IDimVector, index_t DataPerVector>
struct lambda_IsVectorizationAllowed
{
bool& is_allowed;
__host__ __device__ constexpr lambda_IsVectorizationAllowed(bool& is_allowed_)
: is_allowed(is_allowed_)
{
}
template <index_t IDim_>
__host__ __device__ constexpr void operator()(Number<IDim_>) const
{
constexpr auto IDim = Number<IDim_>{};
if(IDimVector != IDim && Strides::Get(IDim) % DataPerVector != 0)
{
is_allowed = false;
}
}
};
template <index_t IDimVector, index_t DataPerVector>
__host__ __device__ static constexpr bool IsVectorizationAllowed(Number<IDimVector>,
Number<DataPerVector>)
{
bool is_allowed = (Strides{}[IDimVector] == 1 || DataPerVector == 1) &&
Lengths{}[IDimVector] % DataPerVector == 0;
static_for<0, nDim, 1>{}(
lambda_IsVectorizationAllowed<IDimVector, DataPerVector>{is_allowed});
return is_allowed;
}
template <index_t IDim, index_t DataPerVector>
__host__ __device__ static constexpr auto Vectorize(Number<IDim>, Number<DataPerVector>)
{
constexpr auto idim = Number<IDim>{};
constexpr auto data_per_vector = Number<DataPerVector>{};
static_assert(IsVectorizationAllowed(idim, data_per_vector), "wrong!");
using vectorized_lengths =
decltype(Lengths::Modify(Number<IDim>{}, Number<Lengths{}[IDim] / DataPerVector>{}));
using vectorized_strides =
decltype((Strides{} / Number<DataPerVector>{}).Modify(Number<IDim>{}, Number<1>{}));
return ConstantTensorDescriptor<vectorized_lengths, vectorized_strides>{};
}
template <index_t IDim, index_t SliceLen>
__host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
{
using slice_lengths = decltype(Lengths{}.Modify(Number<IDim>{}, Number<SliceLen>{}));
using slice_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLen>{}));
return ConstantTensorDescriptor<slice_lengths, Strides>{};
}
template <index_t... Is>
__host__ __device__ static constexpr auto Slice(Sequence<Is...> slice_lengths)
{
static_assert(slice_lengths.GetSize() == nDim, "wrong!");
return ConstantTensorDescriptor<decltype(slice_lengths), Strides>{};
}
template <index_t IDim, index_t SliceLength, index_t SliceStride>
__host__ __device__ static constexpr auto
StridedSlice(Number<IDim>, Number<SliceLength>, Number<SliceStride>)
@@ -330,7 +388,7 @@ struct ConstantTensorDescriptor
constexpr auto fold_intervals = Sequence<FoldIntervals...>{};
constexpr index_t fold_intervals_product =
accumulate_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});
reduce_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});
constexpr auto unfold_length = GetLength(Number<IDim>{});
constexpr auto unfold_stride = GetStride(Number<IDim>{});
@@ -388,7 +446,7 @@ struct ConstantTensorDescriptor
static_assert(Type::Extract(middle).AreDimensionsContinuous(), "wrong! not unfoldable");
// unfolded length, stride
constexpr index_t unfold_length = accumulate_on_sequence(
constexpr index_t unfold_length = reduce_on_sequence(
GetLengths().Extract(middle), math::multiplies<index_t>{}, Number<1>{});
constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
@@ -409,7 +467,7 @@ struct ConstantTensorDescriptor
__host__ __device__ static constexpr auto Pack()
{
using packed_strides = decltype(calculate_tensor_strides_packed(Lengths{}));
using packed_strides = decltype(calculate_tensor_strides_packed_old(Lengths{}));
return ConstantTensorDescriptor<Lengths, packed_strides>{};
}
@@ -431,7 +489,7 @@ struct ConstantTensorDescriptor
template <class Lengths>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths)
{
using Strides = decltype(calculate_tensor_strides_packed(Lengths{}));
using Strides = decltype(calculate_tensor_strides_packed_old(Lengths{}));
return ConstantTensorDescriptor<Lengths, Strides>{};
}
@@ -444,7 +502,7 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
template <class Lengths, index_t Align>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>)
{
using Strides = decltype(calculate_tensor_strides_aligned(Lengths{}, Number<Align>{}));
using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number<Align>{}));
return ConstantTensorDescriptor<Lengths, Strides>{};
}
#ifndef CK_DIMENSION_HPP
#define CK_DIMENSION_HPP
#include "common_header.hpp"
namespace ck {
template <index_t Length>
struct Dimension
{
__host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
};
template <index_t Length, index_t Stride>
struct NativeDimension
{
__host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
__host__ __device__ static constexpr auto GetStride() { return Number<Stride>{}; }
__host__ __device__ static constexpr index_t CalculateOffset(index_t i) { return i * Stride; }
__host__ __device__ static constexpr index_t CalculateOffsetDiff(index_t i_diff)
{
return i_diff * Stride;
}
};
} // namespace ck
#endif
#ifndef CK_TENSOR_COORDINATE_HELPER_HPP
#define CK_TENSOR_COORDINATE_HELPER_HPP
#include "tensor_coordiante_v2.hpp"
namespace ck {
template <typename TensorDesc>
__host__ __device__ constexpr auto
make_tensor_coordinate_v2(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx)
{
return typename TensorCoordinate<TensorDesc>::type(idx);
}
} // namespace ck
#endif
#ifndef CK_TENSOR_VIEW_HPP
#define CK_TENSOR_VIEW_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_coordinate_deprecated.hpp"
namespace ck {
// TensorDesc is ConstantTensorDescriptor or ConstantMergedTensorDescriptor
template <class TensorDesc, class TData>
struct NormalTensorView
{
using type = NormalTensorView;
using tensor_desc_type = TensorDesc;
using coordinate_type = typename NormalTensorCoordinate_deprecated<TensorDesc>::type;
using data_type = TData;
static constexpr auto nDim = TensorDesc::GetNumOfDimension();
__host__ __device__ constexpr NormalTensorView(TData* p_data) : mpData{p_data} {}
__host__ __device__ constexpr NormalTensorView() : NormalTensorView{nullptr} {}
__host__ __device__ static constexpr auto GetNumOfDimension() { return nDim; }
__host__ __device__ static constexpr auto GetLengths() { return TensorDesc::GetLengths(); }
__host__ __device__ const TData& operator[](coordinate_type coord) const
{
return mpData[coord.GetOffset()];
}
__host__ __device__ TData& operator()(coordinate_type coord) const
{
return mpData[coord.GetOffset()];
}
template <class IDim, class DataPerVector>
__host__ __device__ static constexpr auto IsVectorizationAllowed(IDim, DataPerVector)
{
return TensorDesc::IsVectorizationAllowed(IDim{}, DataPerVector{});
}
template <class IDim, class DataPerVector>
__host__ __device__ auto Vectorize(IDim idim, DataPerVector data_per_vector) const
{
static_assert(IsVectorizationAllowed(idim, data_per_vector), "wrong!");
using vector_t = typename vector_type<TData, data_per_vector>::MemoryType;
return NormalTensorView<decltype(TensorDesc::Vectorize(idim, data_per_vector)), vector_t>(
reinterpret_cast<vector_t*>(mpData));
}
template <index_t... Is>
__host__ __device__ auto Slice(coordinate_type slice_origin, Sequence<Is...> slice_lengths)
{
static_assert(slice_lengths.GetSize() == nDim, "wrong!");
return NormalTensorView<decltype(TensorDesc::Slice(slice_lengths)), TData>(
mpData + slice_origin.GetOffset());
}
template <class IDim, class SliceLen>
__host__ __device__ auto
Slice(coordinate_type slice_origin, IDim idim, SliceLen slice_len) const
{
return NormalTensorView<decltype(TensorDesc::Slice(idim, slice_len)), TData>(
mpData + slice_origin.GetOffset());
}
// slice_window is a slicing window on "*this"
template <class SliceWindow, class T, bool PositiveDirection>
__device__ void MoveSliceWindow(SliceWindow& slice_window,
T step_sizes,
integral_constant<bool, PositiveDirection>)
{
if(PositiveDirection)
{
slice_window.mpData += coordinate_type{step_sizes}.GetOffset();
}
else
{
slice_window.mpData -= coordinate_type{step_sizes}.GetOffset();
}
}
// private:
data_type* mpData;
};
template <class... Xs, class TData>
__host__ __device__ constexpr auto make_TensorView(ConstantTensorDescriptor<Xs...>, TData* p_data)
{
return NormalTensorView<ConstantTensorDescriptor<Xs...>, TData>{p_data};
}
} // namespace ck
#endif
#ifndef CK_TENSOR_VISIT_HPP
#define CK_TENSOR_VISIT_HPP
#include "common_header.hpp"
#include "dimension.hpp"
#include "dimension_transform.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_coordinate.hpp"
namespace ck {
template <class TensorDescriptor>
struct TensorVisit
{
using Index = typename TensorDescriptor::Index;
using Coordinate = typename TensorCoordinate<TensorDescriptor>::type;
__host__ __device__ static void Run_v1(Index idx_begin)
{
const auto coord_begin = Coordinate(idx_begin);
ford<TensorDescriptor::GetLengths()>{}(
[&](auto idx_diff) { index_t offset = (coord_begin + idx_diff).GetOffset(); });
}
__host__ __device__ static void Run_v2(Index idx_begin)
{
const auto coord_begin = Coordinate(idx_begin);
ford<TensorDescriptor::GetLengths()>{}([&](auto idx_diff) {
index_t offset_diff = coord_begin.GetOffsetDiff(idx_diff);
index_t offset = coord_begin.GetOffset() + offset_diff;
});
}
__host__ __device__ static void Run_v3(Index idx_begin)
{
const auto coord_begin = Coordinate(idx_begin);
constexpr auto linear_dimensions = TensorDescriptor::GetLinearDimensions();
constexpr auto nonlinear_dimensions = TensorDescriptor::GetNonLinearDimensions();
constexpr auto lengths = TensorDescriptor::GetLengths();
constexpr auto linear_dimension_lengths_hack =
lambda_HackLengths{}(lengths, linear_dimensions);
constexpr auto nonlinear_dimension_lengths_hack =
lambda_HackLengths{}(lengths, nonlinear_dimensions);
ford<nonlinear_dimension_lengths_hack>{}([&](auto idx_diff_nonlinear_hack) {
// run-time component
index_t offset_diff_nonlinear = coord_begin.GetOffsetDiff(idx_diff_nonlinear_hack);
ford<linear_dimension_lengths_hack>{}([&](auto idx_diff_linear_hack) {
// compile-time component
index_t offset_diff_linear = coord_begin.GetOffsetDiff(idx_diff_linear_hack);
index_t offset =
coord_begin.GetOffset() + offset_diff_nonlinear + offset_diff_linear;
});
});
}
__host__ __device__ static void Run_v4(Index idx_begin)
{
const auto coord_begin = Coordinate(idx_begin);
constexpr auto linear_dimensions = TensorDescriptor::GetLinearDimensions();
constexpr auto nonlinear_independent_dimension_groups =
TensorDescriptor::GetNonLinearIndependentDimensionGroups();
constexpr auto lengths = TensorDescriptor::GetLengths();
constexpr auto linear_dimension_lengths = lambda_HackLengths{}(lengths, linear_dimensions);
// run-time component
index_t offset_diff_nonlinear = 0;
template <index_t NGroup>
struct f_recursion
{
template <index_t IGroup>
__host__ __device__ void Run(Number<IGroup>)
{
constexpr auto nonlinear_independent_dimensions_igroup =
nonlinear_independent_dimension_groups.Get(Number<IGroup>{});
constexpr auto nonlinear_independent_lengths_igroup =
lambda_HackLengths{}(lengths, nonlinear_independent_dimensions_igroup);
ford<nonlinear_independent_lengths_igroup>{}(
[&](auto idx_diff_nonlinear_igroup_hack) {
// run-time component
offset_diff_nonlinear +=
coord_begin.GetOffsetDiff(idx_diff_nonlinear_igroup_hack);
Run(Number<IGroup + 1>{});
});
};
// inner-most work
template <>
__host__ __device__ void Run(Number<NGroup>)
{
ford<linear_dimension_lengths>{}([&](auto idx_diff_linear_hack) {
// compile-time component
index_t offset_diff_linear = coord_begin.GetOffsetDiff(idx_diff_linear_hack);
index_t offset =
coord_begin.GetOffset() + offset_diff_nonlinear + offset_diff_linear;
});
}
};
f_recursion<nonlinear_independent_dimension_groups.GetSize()>{}.Run(Number<0>{});
}
};
} // namespace ck
#endif
@@ -563,7 +563,7 @@ struct Blockwise2dTensorCopy3
}
}
__device__ constexpr index_t GetRegisterClipboardSize() const
__device__ constexpr index_t GetRegisterBufferSize() const
{
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
@@ -579,8 +579,8 @@ struct Blockwise2dTensorCopy3
return DataPerRead * (L0 + thread_per_d0 - 1) / thread_per_d0;
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -630,8 +630,8 @@ struct Blockwise2dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -681,8 +681,8 @@ struct Blockwise2dTensorCopy3
}
#if CK_USE_AMD_INLINE_ASM
__device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
Float* p_clipboard) const
__device__ void RunLoadRegisterBuffer_asm(const Float* __restrict__ p_src,
Float* p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -741,8 +741,8 @@ struct Blockwise2dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard_asm(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer_asm(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -162,7 +162,7 @@ struct Blockwise3dTensorCopy3
"wrrong! BlockSize is not big enough for ThreadPerDims!");
constexpr index_t num_active_thread =
accumulate_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{});
reduce_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{});
if(BlockSize > num_active_thread)
{
@@ -237,7 +237,7 @@ struct Blockwise3dTensorCopy3
}
}
__device__ static constexpr index_t GetRegisterClipboardSize()
__device__ static constexpr index_t GetRegisterBufferSize()
{
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
@@ -260,8 +260,8 @@ struct Blockwise3dTensorCopy3
return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2;
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -316,8 +316,8 @@ struct Blockwise3dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};