"test/srt/git@developer.sourcefind.cn:change/sglang.git" did not exist on "28f80b12440df9d04fa9701942ac7900ed50b58a"
Commit 51884fc2 authored by Chao Liu


WIP: explicitly separate offset component into compile-time, block-invariant and per-thread components
parent 740da00a
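
Rough illustration of the commit's intent (a hypothetical sketch in plain C++, not code from this commit; all names below are made up): the offset used by the copy kernels is treated as the sum of a component fixed at compile time, a component that is identical for every thread in a block, and a per-thread component, so that only the last one has to be recomputed per access.

#include <cstdint>

// Hypothetical sketch (not this repository's API): the total offset is the sum of a
// compile-time component, a block-invariant component, and a per-thread component.
template <std::int32_t CompileTimeOffset>
constexpr std::int32_t total_offset(std::int32_t block_invariant, std::int32_t per_thread)
{
    return CompileTimeOffset + block_invariant + per_thread;
}

// Only the per-thread term varies inside the copy loop; the other two can be folded or hoisted.
static_assert(total_offset<16>(8, 3) == 27, "compile-time + block-invariant + per-thread");

int main() { return 0; }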
@@ -3,7 +3,8 @@
 #include "common_header.hpp"
 #include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
 #include "blockwise_gemm.hpp"
@@ -172,6 +173,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
     BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                        decltype(in_e_n1_b_n2_global_desc),
                                        decltype(in_e_n1_b_n2_block_desc),
+                                       Sequence<0, 1, 0, 1>,
+                                       Sequence<1, 0, 1, 0>,
+                                       Sequence<1, 1, 1, 1>,
+                                       Sequence<0, 0, 0, 0>,
                                        decltype(in_e_n1_b_n2_block_desc.GetLengths()),
                                        InBlockCopySubLengths_E_N1_B_N2,
                                        InBlockCopyClusterLengths_E_N1_B_N2,
@@ -213,6 +218,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
     BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                        decltype(wei_e_k_global_desc),
                                        decltype(wei_e_k_block_desc),
+                                       Sequence<1, 1>,
+                                       Sequence<0, 0>,
+                                       Sequence<1, 1>,
+                                       Sequence<0, 0>,
                                        decltype(wei_e_k_block_desc.GetLengths()),
                                        WeiBlockCopySubLengths_E_K,
                                        WeiBlockCopyClusterLengths_E_K,
@@ -414,6 +423,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
     ThreadwiseGenericTensorSliceCopy_v4r2<decltype(out_k0_k1_n1_b_n2_thread_desc),
                                           decltype(out_k0_k1_n1_b_n2_global_desc),
+                                          Sequence<1, 1, 1, 1, 1>,
+                                          Sequence<0, 0, 0, 0, 0>,
+                                          Sequence<1, 1, 1, 0, 1>,
+                                          Sequence<0, 0, 0, 1, 0>,
                                           decltype(
                                               out_k0_k1_n1_b_n2_thread_desc.GetLengths()),
                                           arithmetic_sequence_gen<0, 5, 1>::type,
......
@@ -368,5 +368,49 @@ struct Embed
     }
 };

+template <index_t LowerLength, index_t VectorSize>
+struct Vectorize
+{
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<1>;
+
+    __host__ __device__ constexpr Vectorize()
+    {
+        static_assert(VectorSize > 0 && LowerLength % VectorSize == 0,
+                      "wrong! cannot evenly divide");
+    }
+
+    __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
+
+    __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
+
+    __host__ __device__ static constexpr auto GetUpperLengths()
+    {
+        return Sequence<LowerLength / VectorSize>{};
+    }
+
+    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
+    {
+        return VectorSize * idx_up;
+    }
+
+    __host__ __device__ static constexpr auto
+    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
+                            const UpperIndex& /* idx_up_old */,
+                            const LowerIndex& /* idx_low_old */)
+    {
+        return VectorSize * idx_up_diff;
+    }
+
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+
+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool
+    IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */)
+    {
+        return false;
+    }
+};
+
 } // namespace ck
 #endif
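
To make the new transform concrete: it groups VectorSize consecutive elements of the lower dimension into one upper element, so an upper index is simply scaled by VectorSize to recover the lower index. A standalone mimic in plain C++ with illustrative values (not the ck::Vectorize template itself):

// Assumed example values, chosen only for illustration.
constexpr int lower_length = 8; // length of the lower dimension
constexpr int vector_size  = 4; // elements grouped into one vector
static_assert(lower_length % vector_size == 0, "must divide evenly, as the transform asserts");

// GetUpperLengths() analogue: the vectorized dimension shrinks by vector_size.
constexpr int upper_length = lower_length / vector_size;

// CalculateLowerIndex() analogue: scale the upper index back to scalar units.
constexpr int calculate_lower_index(int idx_up) { return vector_size * idx_up; }

static_assert(upper_length == 2, "8 scalars viewed as 2 vectors of 4");
static_assert(calculate_lower_index(1) == 4, "upper index 1 starts at scalar index 4");

int main() { return 0; }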
@@ -101,17 +101,24 @@ struct NativeTensorDescriptor
         return true;
     }

-    __host__ __device__ static constexpr auto GetLinearDimensions()
+    __host__ __device__ static constexpr auto GetMaskOfLinearDimensions()
     {
-        return typename arithmetic_sequence_gen<0, nDim, 1>::type{};
+        return typename uniform_sequence_gen<nDim, 1>::type{};
+    }
+
+    __host__ __device__ static constexpr auto GetMaskOfNonLinearDimensions()
+    {
+        return typename uniform_sequence_gen<nDim, 0>::type{};
     }

     __host__ __device__ static constexpr auto GetNonLinearDimensions() { return Sequence<>{}; }

+#if 0
     __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
     {
         return Tuple<>{};
     }
+#endif

     // TODO: should this function be here? should it be specific for padding check?
     __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */)
@@ -233,7 +240,7 @@ struct TransformedTensorDescriptor
     __host__ __device__ static constexpr auto GetUpperLengths()
    {
         constexpr auto tuple_of_up_lengths =
-            transform_tuple(lambda_GetUpperLengths{}, Transforms{});
+            transform_tuples(lambda_GetUpperLengths{}, Transforms{});

         constexpr auto mingled_up_lengths = unpack(lambda_merge_sequences{}, tuple_of_up_lengths);
@@ -346,67 +353,92 @@ struct TransformedTensorDescriptor
         return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
     }

-#if 1
+#if 0
     struct lambda_sequence_logic_or
     {
         template <typename... Seqs>
         __host__ __device__ constexpr auto operator()(Seqs... seqs) const
         {
             // TODO: should use math::logic_or<bool>, after Sequence can take bool
-            return typename sequence_reduce<math::logic_or<index_t>, Seqs...>::type{};
+            return typename sequence_reduce<math::logic_or<bool>, Seqs...>::type{};
         }
     };

     struct lambda_1
     {
-        template <typename Transform>
-        __host__ __device__ constexpr auto operator()(const Transform& tran) const
-        {
-            return tran.GetUpperLengths();
-        }
-    };
-
-    template <index_t IDim>
-    __host__ __device__ static constexpr bool GetMaskOfLinearDimensions()
-    {
-        // create tuple of linear dimension masks, for all transformations
-        constexpr auto tuple_of_linear_dimension_mask =
-            transform_tuple(lambda_1, Transforms{});
-
-        // reduce tuple of masks into one mask
-        constexpr auto linear_dimension_mask =
-            unpack(lambda_sequence_logic_or{}, tuple_of_linear_dimension_mask);
-
-        return linear_dimension_mask;
-    }
-
-    template <index_t IDim>
-    __host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
-    {
-        return GetMaskOfLinearDimensions().At(Number<IDim>{});
-    }
-
-    __host__ __device__ static constexpr auto GetLinearDimensions()
-    {
-        constexpr auto linear_dimension_mask = GetMaskOfLienarDimensions();
-
-        return pick_sequence_elements_by_mask(
-            typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask);
-    }
-
-    __host__ __device__ static constexpr auto GetNonLinearDimensions()
-    {
-        constexpr auto nonlinear_dimension_mask =
-            GetMaskOfLienarDimensions().Transform(math::logic_not<index_t>{});
-
-        return pick_sequence_elements_by_mask(
-            typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask);
-    }
-
-    __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
-    {
-        // not implemented
-    }
+        // check only one transform at a time
+        template <typename Transform, typename LowDimensionId, typename UpDimensionId>
+        __host__ __device__ constexpr auto
+        operator()(const Transform& tran, LowDimensionId, UpDimensionId) const
+        {
+            // judge if transformation is linear
+            constexpr bool is_linear_transform = tran.IsLinearTransform();
+
+            // judge if all lower dimension are linear
+            constexpr bool is_all_low_dim_linear = math::accumulate_on_sequence(
+                pick_sequence_elements_by_mask(
+                    GetLowerTensorDescriptor().GetMaskOfLinearDimensions(), LowDimensionId{}),
+                math::logic_and<bool>{},
+                integral_constant<bool, true>{});
+
+            // judge if upper dimenisons are linear
+            constexpr bool is_up_dim_nonlinear = !(is_linear_transform && is_all_low_dim_linear);
+
+            constexpr auto value_sequence =
+                typename uniform_sequence_gen<tran.GetNumOfUpperDimension(),
+                                              is_up_dim_nonlinear>::type{};
+
+            constexpr auto mask_of_up_nonlinear_dims = modifiy_sequence(
+                typename uniform_sequence_gen<nDimUp, 0>::type{}, value_sequence, UpDimensionId{});
+
+            return mask_of_up_nonlinear_dims;
+        }
+    };
+
+    __host__ __device__ static constexpr bool GetMaskOfNonLinearDimensions()
+    {
+        // create tuple of linear dimension masks, for all transformations
+        constexpr auto tuple_of_nonlinear_dimension_mask =
+            transform_tuples(lambda_1{}, Transforms{}, LowDimensionIds{}, UpDimensionIds{});
+
+        // reduce tuple of masks into one mask
+        constexpr auto nonlinear_dimension_mask =
+            unpack(lambda_sequence_logic_or{}, tuple_of_nonlinear_dimension_mask);
+
+        return nonlinear_dimension_mask;
+    }
+
+    __host__ __device__ static constexpr bool GetMaskOfLinearDimensions()
+    {
+        return GetMaskOfNonLinearDimensions().Transform(math::logic_not<bool>{});
+    }
+
+    template <index_t IDim>
+    __host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
+    {
+        return GetMaskOfLinearDimensions().At(Number<IDim>{});
+    }
+
+    __host__ __device__ static constexpr auto GetLinearDimensions()
+    {
+        constexpr auto linear_dimension_mask = GetMaskOfLienarDimensions();
+
+        return pick_sequence_elements_by_mask(
+            typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask);
+    }
+
+    __host__ __device__ static constexpr auto GetNonLinearDimensions()
+    {
+        constexpr auto nonlinear_dimension_mask =
+            GetMaskOfLienarDimensions().Transform(math::logic_not<index_t>{});
+
+        return pick_sequence_elements_by_mask(
+            typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask);
+    }
+
+    __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
+    {
+        // not implemented
+    }
 #endif

     // TODO: should this function be here? should it be specific for padding check?
......
@@ -96,6 +96,19 @@ __host__ __device__ constexpr auto
         LowerTensorDescriptor{}, typename sequence_map_inverse<MapUpper2Lower>::type{});
 }

+template <typename LowerTensorDescriptor, index_t VectorDim, index_t VectorSize>
+__host__ __device__ constexpr auto
+vectorize_tensor_descriptor(LowerTensorDescriptor, Number<VectorDim> vector_dim, Number<VectorSize>)
+{
+    constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension();
+
+    return transform_tensor_descriptor(
+        LowerTensorDescriptor{},
+        Vectorize<LowerTensorDescriptor::GetLength(vector_dim), VectorSize>{},
+        typename arithmetic_sequence_gen<0, nDim, 1>::type{},
+        typename arithmetic_sequence_gen<0, nDim, 1>::type{});
+}
+
 template <typename... NativeDimensions>
 __host__ __device__ void
 print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
......
@@ -680,6 +680,10 @@ struct BlockwiseGenericTensorSliceCopy_v3
 template <index_t BlockSize,
           typename SrcDesc,
           typename DstDesc,
+          typename SrcLinearDimensionMask,
+          typename SrcNonLinearDimensionMask,
+          typename DstLinearDimensionMask,
+          typename DstNonLinearDimensionMask,
           typename SliceLengths,
           typename SubLengths,
           typename ThreadClusterLengths,
@@ -739,7 +743,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
     {
 #if 0
         mThreadwiseLoad.Run(p_src, p_buffer);
-#else
+#elif 1
+        mThreadwiseLoad.Run_access_order_optimized_for_source_index_calculation(p_src, p_buffer);
+#elif 0
         // hardcoded: global to register
         mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
 #endif
@@ -750,7 +756,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
     {
 #if 0
         mThreadwiseStore.Run(p_buffer, p_dst);
-#else
+#elif 1
         // hardcoded: register to LDS
         mThreadwiseStore.template Run_amd_experiment<TData, 0, 1>(p_buffer, p_dst);
 #endif
@@ -784,21 +790,31 @@ struct BlockwiseGenericTensorSliceCopy_v4
     private:
     using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{}));

-    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<SrcDesc,
-                                                                 RegisterBufferDesc,
-                                                                 SubLengths,
-                                                                 SrcDimAccessOrder,
-                                                                 SrcVectorAccessDim,
-                                                                 SrcDataPerAccess,
-                                                                 1>;
+    using ThreadwiseLoad =
+        ThreadwiseGenericTensorSliceCopy_v4r2<SrcDesc,
+                                              RegisterBufferDesc,
+                                              SrcLinearDimensionMask,
+                                              SrcNonLinearDimensionMask,
+                                              typename uniform_sequence_gen<nDim, 1>::type,
+                                              typename uniform_sequence_gen<nDim, 0>::type,
+                                              SubLengths,
+                                              SrcDimAccessOrder,
+                                              SrcVectorAccessDim,
+                                              SrcDataPerAccess,
+                                              1>;

-    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<RegisterBufferDesc,
-                                                                  DstDesc,
-                                                                  SubLengths,
-                                                                  DstDimAccessOrder,
-                                                                  DstVectorAccessDim,
-                                                                  1,
-                                                                  DstDataPerAccess>;
+    using ThreadwiseStore =
+        ThreadwiseGenericTensorSliceCopy_v4r2<RegisterBufferDesc,
+                                              DstDesc,
+                                              typename uniform_sequence_gen<nDim, 1>::type,
+                                              typename uniform_sequence_gen<nDim, 0>::type,
+                                              DstLinearDimensionMask,
+                                              DstNonLinearDimensionMask,
+                                              SubLengths,
+                                              DstDimAccessOrder,
+                                              DstVectorAccessDim,
+                                              1,
+                                              DstDataPerAccess>;

     ThreadwiseLoad mThreadwiseLoad;
     ThreadwiseStore mThreadwiseStore;
......
@@ -1136,6 +1136,10 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
 // the other is device memory or LDS
 template <typename SrcDesc,
           typename DstDesc,
+          typename SrcLinearDimensionMask,
+          typename SrcNonLinearDimensionMask,
+          typename DstLinearDimensionMask,
+          typename DstNonLinearDimensionMask,
           typename SliceLengths,
           typename DimAccessOrder,
           index_t VectorAccessDim,
@@ -1254,6 +1258,117 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
         });
     }

+    template <index_t... Lengths, index_t... Mask>
+    __device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
+    {
+        return Sequence<(Mask ? Lengths : 1)...>{};
+    }
+
+    template <class TData>
+    __device__ void Run_access_order_optimized_for_source_index_calculation(const TData* p_src,
+                                                                            TData* p_dst) const
+    {
+        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
+
+        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
+        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
+
+        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
+
+        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
+            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
+
+        // TODO:: don't use hack
+        constexpr auto src_linear_dim_mask    = SrcLinearDimensionMask{};
+        constexpr auto src_nonlinear_dim_mask = SrcNonLinearDimensionMask{};
+
+        // separate steps into linear and non-linear components
+        constexpr auto linear_long_vector_access_lengths =
+            mask_lengths(long_vector_access_lengths, src_linear_dim_mask);
+
+        constexpr auto nonlinear_long_vector_access_lengths =
+            mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);
+
+        // loop over src's non-linear dimensions
+        ford<decltype(nonlinear_long_vector_access_lengths)>{}(
+            [&](auto nonlinear_dim_long_vector_access_id) {
+
+                // step-sizes along src's nonlinear dimensions
+                auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
+                nonlinear_dim_data_steps(vector_access_dim) =
+                    long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
+
+                // move src cooridnate along nonlinear dimensions
+                const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;
+
+                // loop over src's linear dimensions
+                ford<decltype(linear_long_vector_access_lengths)>{}(
+                    [&](auto linear_dim_long_vector_access_id) {
+
+                        // step-sizes along src's linear dimensions
+                        auto linear_dim_data_steps = linear_dim_long_vector_access_id;
+                        linear_dim_data_steps(vector_access_dim) =
+                            long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
+
+                        // buffer to hold a long-vector
+                        TData p_long_vector[long_vector_size];
+
+                        // set 0
+                        for(index_t i = 0; i < long_vector_size; ++i)
+                        {
+                            p_long_vector[i] = 0;
+                        }
+
+                        // load data from src to the long-vector buffer
+                        for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
+                        {
+                            auto scalar_id               = make_zero_array<index_t, nDim>();
+                            scalar_id(vector_access_dim) = i * src_data_per_access;
+
+                            // move src cooridnate along linear dimensions
+                            const auto src_coord =
+                                src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
+
+                            // TODO: good implementation?
+                            const index_t src_linear_offset_diff =
+                                src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
+
+                            // check for padding
+                            // TODO: still kind of messy
+                            if(!src_coord.IsAnyLevelIndexInPaddingArea())
+                            {
+                                const index_t src_offset    = src_coord.GetOffset();
+                                const index_t buffer_offset = i * src_data_per_access;
+
+                                *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                                    *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
+                            }
+                        }
+
+                        // store data from the long-vector buffer to dst
+                        for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
+                        {
+                            auto scalar_id               = make_zero_array<index_t, nDim>();
+                            scalar_id(vector_access_dim) = i * dst_data_per_access;
+
+                            const index_t buffer_offset = i * dst_data_per_access;
+
+                            const index_t dst_offset =
+                                (mDstSliceOrigin +
+                                 (nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id))
+                                    .GetOffset();
+
+                            *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
+                                *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                        }
+                    });
+            });
+    }
+
     // memory-space
     // 0: VGPR
     // 1: LDS
......
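
The structure of the new Run_access_order_optimized_for_source_index_calculation is easier to see on a small example: the access lengths are split by the two masks, the outer loop walks only the non-linear dimensions (full offset recomputed per step), and the inner loop walks the linear ones, whose offsets can be formed as cheap differences. A minimal standalone mimic of that masking and loop nesting in plain C++, with illustrative lengths and masks rather than the repository's ford/Sequence machinery:

#include <array>
#include <cstdio>

// mask_lengths analogue: keep a length where the mask bit is 1, collapse it to 1 otherwise.
template <std::size_t N>
std::array<int, N> mask_lengths(std::array<int, N> lengths, const std::array<int, N>& mask)
{
    for(std::size_t i = 0; i < N; ++i)
    {
        lengths[i] = mask[i] ? lengths[i] : 1;
    }
    return lengths;
}

int main()
{
    const std::array<int, 2> lengths        = {4, 8}; // assumed access lengths
    const std::array<int, 2> linear_mask    = {0, 1}; // assume dimension 1 is linear
    const std::array<int, 2> nonlinear_mask = {1, 0};

    const auto linear_lengths    = mask_lengths(lengths, linear_mask);    // {1, 8}
    const auto nonlinear_lengths = mask_lengths(lengths, nonlinear_mask); // {4, 1}

    // Outer loop over the non-linear part, inner loop over the linear part.
    for(int i = 0; i < nonlinear_lengths[0] * nonlinear_lengths[1]; ++i)
    {
        for(int j = 0; j < linear_lengths[0] * linear_lengths[1]; ++j)
        {
            std::printf("nonlinear step %d, linear step %d\n", i, j);
        }
    }

    return 0;
}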
@@ -389,6 +389,20 @@ __host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSi
     return result;
 }

+// Array = Array * TData
+template <typename TData, index_t NSize>
+__host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
+{
+    Array<TData, NSize> result;
+
+    for(index_t i = 0; i < NSize; ++i)
+    {
+        result(i) = a[i] * v;
+    }
+
+    return result;
+}
+
 template <typename TData, index_t NSize, typename Reduce>
 __host__ __device__ constexpr TData
 accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
......
@@ -706,18 +706,18 @@ __host__ __device__ constexpr auto sequence_pop_back(Seq)
     return sequence_pop_front(Seq::Reverse()).Reverse();
 }

-template <typename F, index_t... Xs>
-__host__ __device__ constexpr auto transform_sequences(F f, Sequence<Xs...>)
-{
-    return Sequence<f(Xs)...>{};
-}
-
 template <typename... Seqs>
 __host__ __device__ constexpr auto merge_sequences(Seqs...)
 {
     return typename sequence_merge<Seqs...>::type{};
 }

+template <typename F, index_t... Xs>
+__host__ __device__ constexpr auto transform_sequences(F f, Sequence<Xs...>)
+{
+    return Sequence<f(Xs)...>{};
+}
+
 template <typename F, index_t... Xs, index_t... Ys>
 __host__ __device__ constexpr auto transform_sequences(F f, Sequence<Xs...>, Sequence<Ys...>)
 {
......
@@ -113,19 +113,33 @@ __host__ __device__ constexpr auto make_tuple(Xs&&... xs)
 namespace detail {

 template <typename F, typename X, index_t... Is>
-__host__ __device__ constexpr auto transform_tuple_impl(F f, const X& x, Sequence<Is...>)
+__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence<Is...>)
 {
     return make_tuple(f(x.At(Number<Is>{}))...);
 }

+template <typename F, typename X, typename Y, index_t... Is>
+__host__ __device__ constexpr auto
+transform_tuples_impl(F f, const X& x, const Y& y, Sequence<Is...>)
+{
+    return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}))...);
+}
+
 } // namespace detail

 template <typename F, typename X>
-__host__ __device__ constexpr auto transform_tuple(F f, const X& x)
+__host__ __device__ constexpr auto transform_tuples(F f, const X& x)
 {
-    return detail::transform_tuple_impl(
+    return detail::transform_tuples_impl(
         f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
 }

+template <typename F, typename X, typename Y>
+__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y)
+{
+    return detail::transform_tuples_impl(
+        f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
+}
+
 } // namespace ck
 #endif
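
The new two-argument transform_tuples overload is the elementwise zip-with pattern. A standalone mimic using std::tuple and a hypothetical name (zip_transform), not the ck Tuple type, shows the shape of the implementation:

#include <tuple>
#include <utility>

// Apply f elementwise to two tuples of equal size (mimic of the added binary overload).
template <typename F, typename X, typename Y, std::size_t... Is>
constexpr auto zip_transform_impl(F f, const X& x, const Y& y, std::index_sequence<Is...>)
{
    return std::make_tuple(f(std::get<Is>(x), std::get<Is>(y))...);
}

template <typename F, typename X, typename Y>
constexpr auto zip_transform(F f, const X& x, const Y& y)
{
    return zip_transform_impl(f, x, y, std::make_index_sequence<std::tuple_size<X>::value>{});
}

int main()
{
    // Requires C++17 for the constexpr lambda evaluation.
    constexpr auto sums = zip_transform([](int a, int b) { return a + b; },
                                        std::make_tuple(1, 2, 3),
                                        std::make_tuple(10, 20, 30));
    static_assert(std::get<1>(sums) == 22, "elementwise application of the functor");
    return 0;
}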
@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp"

 template <typename T,
......
@@ -9,17 +9,17 @@
 #include "conv_common.hpp"
 #include "host_conv.hpp"
 #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
-#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp"
+//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
+//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp"
 //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+//#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
 //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"
+//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"

 struct GeneratorTensor_1
 {
......