Commit 51884fc2 authored by Chao Liu

WIP: explicitly separate offset component into compile-time, block-invariant and per-thread components
parent 740da00a
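
The gist of the change: for the strided descriptors involved, the index-to-offset map is linear, so an offset can be separated into a compile-time constant, a block-invariant component, and a per-thread component that only needs to be added on top. A minimal standalone sketch of the block-invariant/per-thread split (plain C++ and a hypothetical 4x8 row-major tensor, not the ck descriptor types):

#include <array>
#include <cstdio>

// strides of a hypothetical row-major 4x8 tensor
constexpr std::array<int, 2> strides{8, 1};

constexpr int offset(const std::array<int, 2>& idx)
{
    return idx[0] * strides[0] + idx[1] * strides[1];
}

int main()
{
    constexpr std::array<int, 2> block_origin{2, 0}; // block-invariant component
    constexpr std::array<int, 2> thread_step{1, 3};  // per-thread component

    // the offset map is linear, so each component can be turned into an offset
    // independently and the results summed, instead of recomputing the full
    // offset for every thread
    static_assert(offset({block_origin[0] + thread_step[0], block_origin[1] + thread_step[1]}) ==
                      offset(block_origin) + offset(thread_step),
                  "linear offset decomposes into block-invariant + per-thread parts");

    std::printf("offset = %d\n", offset(block_origin) + offset(thread_step));
    return 0;
}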
......@@ -3,7 +3,8 @@
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
......@@ -172,6 +173,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
BlockwiseGenericTensorSliceCopy_v4<BlockSize,
decltype(in_e_n1_b_n2_global_desc),
decltype(in_e_n1_b_n2_block_desc),
Sequence<0, 1, 0, 1>,
Sequence<1, 0, 1, 0>,
Sequence<1, 1, 1, 1>,
Sequence<0, 0, 0, 0>,
decltype(in_e_n1_b_n2_block_desc.GetLengths()),
InBlockCopySubLengths_E_N1_B_N2,
InBlockCopyClusterLengths_E_N1_B_N2,
......@@ -213,6 +218,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
BlockwiseGenericTensorSliceCopy_v4<BlockSize,
decltype(wei_e_k_global_desc),
decltype(wei_e_k_block_desc),
Sequence<1, 1>,
Sequence<0, 0>,
Sequence<1, 1>,
Sequence<0, 0>,
decltype(wei_e_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
......@@ -414,6 +423,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
ThreadwiseGenericTensorSliceCopy_v4r2<decltype(out_k0_k1_n1_b_n2_thread_desc),
decltype(out_k0_k1_n1_b_n2_global_desc),
Sequence<1, 1, 1, 1, 1>,
Sequence<0, 0, 0, 0, 0>,
Sequence<1, 1, 1, 0, 1>,
Sequence<0, 0, 0, 1, 0>,
decltype(
out_k0_k1_n1_b_n2_thread_desc.GetLengths()),
arithmetic_sequence_gen<0, 5, 1>::type,
......
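
The four new Sequence<> arguments line up with the Src/Dst linear and non-linear dimension mask parameters added to BlockwiseGenericTensorSliceCopy_v4 below (1 marks a dimension as belonging to the mask). Reading the first argument as the source linear-dimension mask, the in_e_n1_b_n2 copy treats N1 and N2 as linear and E and B as non-linear; this reading is inferred from the parameter order and should be taken as an assumption. A small standalone sketch of interpreting such a 0/1 mask:

#include <array>
#include <cstdio>

template <std::size_t N>
void print_masked_dims(const std::array<int, N>& mask, const char* name)
{
    std::printf("%s dims:", name);
    for (std::size_t d = 0; d < N; ++d)
        if (mask[d])
            std::printf(" %zu", d);
    std::printf("\n");
}

int main()
{
    // hypothetical reading of the in_e_n1_b_n2 masks: dims are (E, N1, B, N2)
    print_masked_dims<4>({0, 1, 0, 1}, "linear");     // N1, N2
    print_masked_dims<4>({1, 0, 1, 0}, "non-linear"); // E, B
    return 0;
}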
......@@ -368,5 +368,49 @@ struct Embed
}
};
template <index_t LowerLength, index_t VectorSize>
struct Vectorize
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
__host__ __device__ constexpr Vectorize()
{
static_assert(VectorSize > 0 && LowerLength % VectorSize == 0,
"wrong! cannot evenly divide");
}
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetUpperLengths()
{
return Sequence<LowerLength / VectorSize>{};
}
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return VectorSize * idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return VectorSize * idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
// TODO: should this function be here? should it be specific for padding check?
__host__ __device__ static constexpr bool
IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */)
{
return false;
}
};
} // namespace ck
#endif
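
Vectorize folds VectorSize consecutive elements of a lower dimension into a single upper element, so both the index map and the index-diff map are just a multiplication by VectorSize (hence IsLinearTransform() returns true). A standalone sketch of the same arithmetic with plain integers instead of ck::MultiIndex:

template <int LowerLength, int VectorSize>
struct VectorizeSketch
{
    static_assert(VectorSize > 0 && LowerLength % VectorSize == 0, "cannot evenly divide");

    static constexpr int upper_length = LowerLength / VectorSize;

    // upper index -> lower index of the first element of the vector
    static constexpr int lower_index(int idx_up) { return VectorSize * idx_up; }

    // a step in the upper index becomes a VectorSize-scaled step in the lower index,
    // independent of the old indices: the transform is linear
    static constexpr int lower_index_diff(int idx_up_diff) { return VectorSize * idx_up_diff; }
};

int main()
{
    using V = VectorizeSketch<8, 4>; // lower length 8, vectors of 4 -> upper length 2
    static_assert(V::upper_length == 2, "");
    static_assert(V::lower_index(1) == 4, "");
    static_assert(V::lower_index_diff(1) == 4, "");
    return 0;
}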
......@@ -101,17 +101,24 @@ struct NativeTensorDescriptor
return true;
}
__host__ __device__ static constexpr auto GetLinearDimensions()
__host__ __device__ static constexpr auto GetMaskOfLinearDimensions()
{
return typename arithmetic_sequence_gen<0, nDim, 1>::type{};
return typename uniform_sequence_gen<nDim, 1>::type{};
}
__host__ __device__ static constexpr auto GetMaskOfNonLinearDimensions()
{
return typename uniform_sequence_gen<nDim, 0>::type{};
}
__host__ __device__ static constexpr auto GetNonLinearDimensions() { return Sequence<>{}; }
#if 0
__host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
{
return Tuple<>{};
}
#endif
// TODO: should this function be here? should it be specific for padding check?
__host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */)
......@@ -233,7 +240,7 @@ struct TransformedTensorDescriptor
__host__ __device__ static constexpr auto GetUpperLengths()
{
constexpr auto tuple_of_up_lengths =
transform_tuple(lambda_GetUpperLengths{}, Transforms{});
transform_tuples(lambda_GetUpperLengths{}, Transforms{});
constexpr auto mingled_up_lengths = unpack(lambda_merge_sequences{}, tuple_of_up_lengths);
......@@ -346,67 +353,92 @@ struct TransformedTensorDescriptor
return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
}
#if 1
#if 0
struct lambda_sequence_logic_or
{
template <typename... Seqs>
__host__ __device__ constexpr auto operator()(Seqs... seqs) const
{
// TODO: should use math::logic_or<bool>, after Sequence can take bool
return typename sequence_reduce<math::logic_or<index_t>, Seqs...>::type{};
return typename sequence_reduce<math::logic_or<bool>, Seqs...>::type{};
}
};
struct lambda_1
{
template <typename Transform>
__host__ __device__ constexpr auto operator()(const Transform& tran) const
// check only one transform at a time
template <typename Transform, typename LowDimensionId, typename UpDimensionId>
__host__ __device__ constexpr auto
operator()(const Transform& tran, LowDimensionId, UpDimensionId) const
{
return tran.GetUpperLengths();
}
};
// judge if transformation is linear
constexpr bool is_linear_transform = tran.IsLinearTransform();
template <index_t IDim>
__host__ __device__ static constexpr bool GetMaskOfLinearDimensions()
{
// create tuple of linear dimension masks, for all transformations
constexpr auto tuple_of_linear_dimension_mask =
transform_tuple(lambda_1, Transforms{});
// judge if all lower dimension are linear
constexpr bool is_all_low_dim_linear = math::accumulate_on_sequence(
pick_sequence_elements_by_mask(
GetLowerTensorDescriptor().GetMaskOfLinearDimensions(), LowDimensionId{}),
math::logic_and<bool>{},
integral_constant<bool, true>{});
// reduce tuple of masks into one mask
constexpr auto linear_dimension_mask =
unpack(lambda_sequence_logic_or{}, tuple_of_linear_dimension_mask);
// judge if upper dimensions are linear
constexpr bool is_up_dim_nonlinear = !(is_linear_transform && is_all_low_dim_linear);
return linear_dimension_mask;
}
constexpr auto value_sequence =
typename uniform_sequence_gen<tran.GetNumOfUpperDimension(),
is_up_dim_nonlinear>::type{};
template <index_t IDim>
__host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
{
return GetMaskOfLinearDimensions().At(Number<IDim>{});
}
constexpr auto mask_of_up_nonlinear_dims = modifiy_sequence(
typename uniform_sequence_gen<nDimUp, 0>::type{}, value_sequence, UpDimensionId{});
__host__ __device__ static constexpr auto GetLinearDimensions()
{
constexpr auto linear_dimension_mask = GetMaskOfLienarDimensions();
return mask_of_up_nonlinear_dims;
};
return pick_sequence_elements_by_mask(
typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask);
}
__host__ __device__ static constexpr bool GetMaskOfNonLinearDimensions()
{
// create tuple of linear dimension masks, for all transformations
constexpr auto tuple_of_nonlinear_dimension_mask =
transform_tuples(lambda_1{}, Transforms{}, LowDimensionIds{}, UpDimensionIds{});
__host__ __device__ static constexpr auto GetNonLinearDimensions()
{
constexpr auto nonlinear_dimension_mask =
GetMaskOfLienarDimensions().Transform(math::logic_not<index_t>{});
// reduce tuple of masks into one mask
constexpr auto nonlinear_dimension_mask =
unpack(lambda_sequence_logic_or{}, tuple_of_nonlinear_dimension_mask);
return pick_sequence_elements_by_mask(
typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask);
}
return nonlinear_dimension_mask;
}
__host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
{
// not implemented
}
__host__ __device__ static constexpr bool GetMaskOfLinearDimensions()
{
return GetMaskOfNonLinearDimensions().Transform(math::logic_not<bool>{});
}
template <index_t IDim>
__host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
{
return GetMaskOfLinearDimensions().At(Number<IDim>{});
}
__host__ __device__ static constexpr auto GetLinearDimensions()
{
constexpr auto linear_dimension_mask = GetMaskOfLienarDimensions();
return pick_sequence_elements_by_mask(
typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask);
}
__host__ __device__ static constexpr auto GetNonLinearDimensions()
{
constexpr auto nonlinear_dimension_mask =
GetMaskOfLienarDimensions().Transform(math::logic_not<index_t>{});
return pick_sequence_elements_by_mask(
typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask);
}
__host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
{
// not implemented
}
#endif
// TODO: should this function be here? should it be specific for padding check?
......
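
The disabled block above computes, per transform, whether its upper dimensions stay linear (the transform itself is linear and all of its lower dimensions are linear), then ORs the per-transform non-linear masks into a single descriptor-wide mask; the linear mask is its element-wise negation. A standalone sketch of that reduction, with std::array standing in for Sequence and with the per-transform masks (Merge non-linear, PassThrough linear) stated as assumptions:

#include <array>
#include <cstdio>

// one entry per upper dimension; 1 = the transform makes that dimension non-linear
using Mask4 = std::array<int, 4>;

// logical OR of the per-transform masks gives the descriptor-wide non-linear mask
constexpr Mask4 logic_or(const Mask4& a, const Mask4& b)
{
    return {a[0] || b[0], a[1] || b[1], a[2] || b[2], a[3] || b[3]};
}

int main()
{
    constexpr Mask4 from_merge_a = {1, 0, 0, 0}; // assumed: a Merge makes its upper dim non-linear
    constexpr Mask4 from_pass    = {0, 0, 0, 0}; // assumed: a PassThrough keeps its dims linear
    constexpr Mask4 from_merge_b = {0, 0, 1, 0}; // a second Merge on a different upper dim

    constexpr Mask4 nonlinear = logic_or(logic_or(from_merge_a, from_pass), from_merge_b);
    constexpr Mask4 linear    = {!nonlinear[0], !nonlinear[1], !nonlinear[2], !nonlinear[3]};

    for (int d = 0; d < 4; ++d)
        std::printf("dim %d: %s\n", d, linear[d] ? "linear" : "non-linear");
    return 0;
}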
......@@ -96,6 +96,19 @@ __host__ __device__ constexpr auto
LowerTensorDescriptor{}, typename sequence_map_inverse<MapUpper2Lower>::type{});
}
template <typename LowerTensorDescriptor, index_t VectorDim, index_t VectorSize>
__host__ __device__ constexpr auto
vectorize_tensor_descriptor(LowerTensorDescriptor, Number<VectorDim> vector_dim, Number<VectorSize>)
{
constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension();
return transform_tensor_descriptor(
LowerTensorDescriptor{},
Vectorize<LowerTensorDescriptor::GetLength(vector_dim), VectorSize>{},
typename arithmetic_sequence_gen<0, nDim, 1>::type{},
typename arithmetic_sequence_gen<0, nDim, 1>::type{});
}
template <typename... NativeDimensions>
__host__ __device__ void
print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
......
......@@ -680,6 +680,10 @@ struct BlockwiseGenericTensorSliceCopy_v3
template <index_t BlockSize,
typename SrcDesc,
typename DstDesc,
typename SrcLinearDimensionMask,
typename SrcNonLinearDimensionMask,
typename DstLinearDimensionMask,
typename DstNonLinearDimensionMask,
typename SliceLengths,
typename SubLengths,
typename ThreadClusterLengths,
......@@ -739,7 +743,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
{
#if 0
mThreadwiseLoad.Run(p_src, p_buffer);
#else
#elif 1
mThreadwiseLoad.Run_access_order_optimized_for_source_index_calculation(p_src, p_buffer);
#elif 0
// hardcoded: global to register
mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
#endif
......@@ -750,7 +756,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
{
#if 0
mThreadwiseStore.Run(p_buffer, p_dst);
#else
#elif 1
// hardcoded: register to LDS
mThreadwiseStore.template Run_amd_experiment<TData, 0, 1>(p_buffer, p_dst);
#endif
......@@ -784,21 +790,31 @@ struct BlockwiseGenericTensorSliceCopy_v4
private:
using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{}));
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<SrcDesc,
RegisterBufferDesc,
SubLengths,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcDataPerAccess,
1>;
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<RegisterBufferDesc,
DstDesc,
SubLengths,
DstDimAccessOrder,
DstVectorAccessDim,
1,
DstDataPerAccess>;
using ThreadwiseLoad =
ThreadwiseGenericTensorSliceCopy_v4r2<SrcDesc,
RegisterBufferDesc,
SrcLinearDimensionMask,
SrcNonLinearDimensionMask,
typename uniform_sequence_gen<nDim, 1>::type,
typename uniform_sequence_gen<nDim, 0>::type,
SubLengths,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcDataPerAccess,
1>;
using ThreadwiseStore =
ThreadwiseGenericTensorSliceCopy_v4r2<RegisterBufferDesc,
DstDesc,
typename uniform_sequence_gen<nDim, 1>::type,
typename uniform_sequence_gen<nDim, 0>::type,
DstLinearDimensionMask,
DstNonLinearDimensionMask,
SubLengths,
DstDimAccessOrder,
DstVectorAccessDim,
1,
DstDataPerAccess>;
ThreadwiseLoad mThreadwiseLoad;
ThreadwiseStore mThreadwiseStore;
......
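
On the register-buffer side the descriptor is a packed native one, so every dimension is linear; the uniform_sequence_gen<nDim, 1> and uniform_sequence_gen<nDim, 0> arguments supply the corresponding all-ones linear mask and all-zeros non-linear mask. A standalone sketch of generating such uniform masks (std::index_sequence here, not the ck implementation):

#include <array>
#include <utility>

template <int Value, std::size_t... Is>
constexpr std::array<int, sizeof...(Is)> uniform_impl(std::index_sequence<Is...>)
{
    return {((void)Is, Value)...}; // Value repeated once per index
}

template <std::size_t N, int Value>
constexpr std::array<int, N> uniform_mask()
{
    return uniform_impl<Value>(std::make_index_sequence<N>{});
}

constexpr auto all_linear    = uniform_mask<4, 1>(); // analogue of Sequence<1, 1, 1, 1>
constexpr auto all_nonlinear = uniform_mask<4, 0>(); // analogue of Sequence<0, 0, 0, 0>
static_assert(all_linear[0] == 1 && all_linear[3] == 1, "");
static_assert(all_nonlinear[0] == 0 && all_nonlinear[3] == 0, "");

int main() { return 0; }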
......@@ -1136,6 +1136,10 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
// the other is device memory or LDS
template <typename SrcDesc,
typename DstDesc,
typename SrcLinearDimensionMask,
typename SrcNonLinearDimensionMask,
typename DstLinearDimensionMask,
typename DstNonLinearDimensionMask,
typename SliceLengths,
typename DimAccessOrder,
index_t VectorAccessDim,
......@@ -1254,6 +1258,117 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
});
}
template <index_t... Lengths, index_t... Mask>
__device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
{
return Sequence<(Mask ? Lengths : 1)...>{};
}
template <class TData>
__device__ void Run_access_order_optimized_for_source_index_calculation(const TData* p_src,
TData* p_dst) const
{
using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
constexpr auto vector_access_dim = Number<VectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
// TODO: don't use this hack
constexpr auto src_linear_dim_mask = SrcLinearDimensionMask{};
constexpr auto src_nonlinear_dim_mask = SrcNonLinearDimensionMask{};
// separate steps into linear and non-linear components
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_linear_dim_mask);
constexpr auto nonlinear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);
// loop over src's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}(
[&](auto nonlinear_dim_long_vector_access_id) {
// step-sizes along src's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
// move src coordinate along nonlinear dimensions
const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;
// loop over src's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}(
[&](auto linear_dim_long_vector_access_id) {
// step-sizes along src's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
// buffer to hold a long-vector
TData p_long_vector[long_vector_size];
// set 0
for(index_t i = 0; i < long_vector_size; ++i)
{
p_long_vector[i] = 0;
}
// load data from src to the long-vector buffer
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
// move src coordinate along linear dimensions
const auto src_coord =
src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
// TODO: good implementation?
const index_t src_linear_offset_diff =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
// check for padding
// TODO: still kind of messy
if(!src_coord.IsAnyLevelIndexInPaddingArea())
{
const index_t src_offset = src_coord.GetOffset();
const index_t buffer_offset = i * src_data_per_access;
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
}
}
// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
const index_t buffer_offset = i * dst_data_per_access;
const index_t dst_offset =
(mDstSliceOrigin +
(nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id))
.GetOffset();
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
}
});
});
}
// memory-space
// 0: VGPR
// 1: LDS
......
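
The new Run variant splits the traversal by the source masks: the outer ford<> walks the slice lengths masked to the non-linear dimensions, paying for a full coordinate/offset computation per step, while the inner ford<> walks the lengths masked to the linear dimensions, where the intent appears to be that the offset change reduces to a plain addition (the src_linear_offset_diff computed above). A standalone sketch of that loop split with plain nested loops and hypothetical 3-D lengths:

#include <array>
#include <cstdio>

using Len3 = std::array<int, 3>;

// keep a length where the mask is set, collapse it to 1 otherwise (cf. mask_lengths above)
constexpr Len3 mask_lengths(const Len3& lengths, const Len3& mask)
{
    return {mask[0] ? lengths[0] : 1, mask[1] ? lengths[1] : 1, mask[2] ? lengths[2] : 1};
}

int main()
{
    constexpr Len3 lengths        = {2, 3, 4};
    constexpr Len3 linear_mask    = {0, 1, 1}; // dims 1, 2: cheap additive offset updates
    constexpr Len3 nonlinear_mask = {1, 0, 0}; // dim 0: offset recomputed via the coordinate

    constexpr Len3 linear_lengths    = mask_lengths(lengths, linear_mask);    // {1, 3, 4}
    constexpr Len3 nonlinear_lengths = mask_lengths(lengths, nonlinear_mask); // {2, 1, 1}

    int expensive_offset_computations = 0;
    int cheap_offset_updates          = 0;

    // outer loops over the non-linear lengths: one full offset computation per step
    for (int n0 = 0; n0 < nonlinear_lengths[0]; ++n0)
        for (int n1 = 0; n1 < nonlinear_lengths[1]; ++n1)
            for (int n2 = 0; n2 < nonlinear_lengths[2]; ++n2)
            {
                ++expensive_offset_computations;
                // inner loops over the linear lengths: offset diffs are just added on top
                for (int l0 = 0; l0 < linear_lengths[0]; ++l0)
                    for (int l1 = 0; l1 < linear_lengths[1]; ++l1)
                        for (int l2 = 0; l2 < linear_lengths[2]; ++l2)
                            ++cheap_offset_updates;
            }

    std::printf("expensive: %d, cheap: %d (vs %d full computations without the split)\n",
                expensive_offset_computations, cheap_offset_updates,
                lengths[0] * lengths[1] * lengths[2]);
    return 0;
}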
......@@ -389,6 +389,20 @@ __host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSi
return result;
}
// Array = Array * TData
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
{
Array<TData, NSize> result;
for(index_t i = 0; i < NSize; ++i)
{
result(i) = a[i] * v;
}
return result;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr TData
accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
......
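
The new operator* scales every element of an Array by a scalar; it is what makes expressions such as VectorSize * idx_up in the Vectorize transform and long_vector_size * ..._access_id in the copy above well-formed. A tiny standalone analogue with std::array:

#include <array>
#include <cstdio>

// illustrative analogue of the new "TData * Array" overload
template <typename T, std::size_t N>
std::array<T, N> scale(T v, const std::array<T, N>& a)
{
    std::array<T, N> result{};
    for (std::size_t i = 0; i < N; ++i)
        result[i] = a[i] * v; // same element-wise product as the ck operator*
    return result;
}

int main()
{
    const auto scaled = scale(4, std::array<int, 2>{1, 3});
    std::printf("{%d, %d}\n", scaled[0], scaled[1]); // prints {4, 12}
    return 0;
}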
......@@ -706,18 +706,18 @@ __host__ __device__ constexpr auto sequence_pop_back(Seq)
return sequence_pop_front(Seq::Reverse()).Reverse();
}
template <typename F, index_t... Xs>
__host__ __device__ constexpr auto transform_sequences(F f, Sequence<Xs...>)
{
return Sequence<f(Xs)...>{};
}
template <typename... Seqs>
__host__ __device__ constexpr auto merge_sequences(Seqs...)
{
return typename sequence_merge<Seqs...>::type{};
}
template <typename F, index_t... Xs>
__host__ __device__ constexpr auto transform_sequences(F f, Sequence<Xs...>)
{
return Sequence<f(Xs)...>{};
}
template <typename F, index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto transform_sequences(F f, Sequence<Xs...>, Sequence<Ys...>)
{
......
......@@ -113,19 +113,33 @@ __host__ __device__ constexpr auto make_tuple(Xs&&... xs)
namespace detail {
template <typename F, typename X, index_t... Is>
__host__ __device__ constexpr auto transform_tuple_impl(F f, const X& x, Sequence<Is...>)
__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}))...);
}
template <typename F, typename X, typename Y, index_t... Is>
__host__ __device__ constexpr auto
transform_tuples_impl(F f, const X& x, const Y& y, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}))...);
}
} // namespace detail
template <typename F, typename X>
__host__ __device__ constexpr auto transform_tuple(F f, const X& x)
__host__ __device__ constexpr auto transform_tuples(F f, const X& x)
{
return detail::transform_tuple_impl(
return detail::transform_tuples_impl(
f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
template <typename F, typename X, typename Y>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y)
{
return detail::transform_tuples_impl(
f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
} // namespace ck
#endif
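
transform_tuple is renamed to transform_tuples and gains a two-tuple overload that applies f element-wise across both tuples; the (disabled) descriptor code above even calls a three-tuple version, so further overloads presumably follow the same pattern. A standalone sketch of the two-tuple form with std::tuple:

#include <functional>
#include <tuple>
#include <utility>

template <typename F, typename X, typename Y, std::size_t... Is>
constexpr auto transform_tuples_sketch_impl(F f, const X& x, const Y& y, std::index_sequence<Is...>)
{
    return std::make_tuple(f(std::get<Is>(x), std::get<Is>(y))...);
}

template <typename F, typename X, typename Y>
constexpr auto transform_tuples_sketch(F f, const X& x, const Y& y)
{
    return transform_tuples_sketch_impl(
        f, x, y, std::make_index_sequence<std::tuple_size<X>::value>{});
}

int main()
{
    constexpr auto sums = transform_tuples_sketch(
        std::plus<>{}, std::make_tuple(1, 2), std::make_tuple(3, 4));
    static_assert(std::get<0>(sums) == 4 && std::get<1>(sums) == 6,
                  "f applied element-wise across both tuples");
    return 0;
}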
......@@ -3,7 +3,7 @@
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
//#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp"
template <typename T,
......
......@@ -9,17 +9,17 @@
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp"
//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp"
//#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"
//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"
struct GeneratorTensor_1
{
......