Commit 34cbbb48 authored by Chao Liu

refactor dynamically indexed array; it works now

parent 4d70c71b
@@ -97,10 +97,10 @@ struct DummyDynamicTransform_v1
                       const WeiDesc wei_k_c_y_x_global_desc,
                       const InDesc in_n_c_hi_wi_global_desc,
                       const OutDesc out_n_k_ho_wo_global_desc,
-                      const MultiIndex<2>& conv_strides,
-                      const MultiIndex<2>& conv_dilations,
-                      const MultiIndex<2>& in_left_pads,
-                      const MultiIndex<2>& in_right_pads) const
+                      const MultiIndex<2> conv_strides,
+                      const MultiIndex<2> conv_dilations,
+                      const MultiIndex<2> in_left_pads,
+                      const MultiIndex<2> in_right_pads) const
     {
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
@@ -542,10 +542,10 @@ struct DummyDynamicTransform_v1
                const WeiDesc wei_k_c_y_x_global_desc,
                const InDesc in_n_c_hi_wi_global_desc,
                const OutDesc out_n_k_ho_wo_global_desc,
-               const MultiIndex<2>& conv_strides,
-               const MultiIndex<2>& conv_dilations,
-               const MultiIndex<2>& in_left_pads,
-               const MultiIndex<2>& in_right_pads) const
+               const MultiIndex<2> conv_strides,
+               const MultiIndex<2> conv_dilations,
+               const MultiIndex<2> in_left_pads,
+               const MultiIndex<2> in_right_pads) const
     {
         const auto transformed_tensor_descs =
             map_convolution_into_gemm_v1(wei_k_c_y_x_global_desc,
@@ -564,7 +564,8 @@ struct DummyDynamicTransform_v1
         const index_t niter = p_wei_global[10];

-        auto in_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx);
+        auto in_gemmk_gemmn_coord =
+            make_dynamic_tensor_coordinate<2>(in_gemmk_gemmn_global_desc, idx);

         for(index_t iter = 0; iter < niter; ++iter)
         {
@@ -587,7 +588,7 @@ struct DummyDynamicTransform_v1
                              1,
                              p_out_global,
                              in_gemmk_gemmn_coord.GetOffset(),
-#if 0
+#if 1
                              in_gemmk_gemmn_coord.IsOffsetValidAssumingUpperIndexIsValid(),
 #else
                              true,
@@ -603,10 +604,10 @@ struct DummyDynamicTransform_v1
                const WeiDesc wei_k_c_y_x_global_desc,
                const InDesc in_n_c_hi_wi_global_desc,
                const OutDesc out_n_k_ho_wo_global_desc,
-               const MultiIndex<2>& conv_strides,
-               const MultiIndex<2>& conv_dilations,
-               const MultiIndex<2>& in_left_pads,
-               const MultiIndex<2>& in_right_pads) const
+               const MultiIndex<2> conv_strides,
+               const MultiIndex<2> conv_dilations,
+               const MultiIndex<2> in_left_pads,
+               const MultiIndex<2> in_right_pads) const
     {
         Run_2(p_wei_global,
               p_in_global,
...
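Note on the MultiIndex<2>& to MultiIndex<2> change above: these arguments eventually cross a kernel launch, and kernel arguments are copied to the device, so a reference parameter would point back at host memory. A minimal HIP/CUDA-style sketch of the pitfall (hypothetical kernel, not code from this commit):

struct Index2
{
    int v[2];
};

// Kernel arguments are copied per launch, so a small index container is
// safe to pass by value on both sides of the host/device boundary.
__global__ void kernel_by_value(float* out, const Index2 strides)
{
    out[0] = static_cast<float>(strides.v[0] + strides.v[1]);
}

// A reference parameter would bind to a host-side object that the device
// cannot dereference:
// __global__ void kernel_by_ref(float* out, const Index2& strides); // broken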
@@ -117,19 +117,41 @@ struct DummyDynamicTransform_v2_1
         // initialize idx
         static_for<0, 2, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });

+        const index_t niter = p_wei_global[10];
+
         auto in_gemmk_gemmn_coord =
             make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, idx);

         const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
             in_gemmk_gemmn_global_desc, make_multi_index(1, 0));

-#pragma unroll 1
-        for(index_t i = 0; i < 10; ++i)
+        for(index_t iter = 0; iter < niter; ++iter)
         {
             move_dynamic_tensor_coordinate_v2(
                 in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);

-            p_out_global[in_gemmk_gemmn_coord.GetOffset()] = 1;
+            // write
+            float value = 1;
+
+            transfer_data<float,
+                          1,
+                          AddressSpace::Vgpr,
+                          AddressSpace::Global,
+                          InMemoryDataOperation::Set,
+                          1,
+                          1>(&value,
+                             0,
+                             true,
+                             1,
+                             p_out_global,
+                             in_gemmk_gemmn_coord.GetOffset(),
+#if 1
+                             coordinate_has_valid_offset_assuming_visible_index_is_valid(
+                                 in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
+#else
+                             true,
+#endif
+                             in_gemmk_gemmn_global_desc.GetElementSpaceSize());
         }
     }
@@ -215,13 +237,13 @@ struct DummyDynamicTransform_v2_1
         // initialize idx
         static_for<0, 4, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });

-#if 0
+#if 1
         const index_t niter = p_wei_global[10];

         auto in_coord = make_dynamic_tensor_coordinate_v2(in_n_c_hip_wip_global_desc, idx);

         const auto in_coord_step = make_dynamic_tensor_coordinate_step_v2(
-            in_n_c_hip_wip_global_desc, MultiIndex<4>{{1, 0, 0, 0}});
+            in_n_c_hip_wip_global_desc, make_multi_index(1, 0, 0, 0));

         for(index_t iter = 0; iter < niter; ++iter)
         {
@@ -266,7 +288,7 @@ struct DummyDynamicTransform_v2_1
                const MultiIndex<2> in_left_pads,
                const MultiIndex<2> in_right_pads) const
     {
-        Run_1(p_wei_global,
+        Run_2(p_wei_global,
               p_in_global,
               p_out_global,
               wei_k_c_y_x_global_desc,
@@ -293,7 +315,6 @@ struct DummyDynamicTransform_v2_2
         // initialize idx
         static_for<0, 2, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });

-#if 1
         const index_t niter = p_wei_global[10];

         auto in_gemmk_gemmn_coord =
@@ -322,13 +343,14 @@ struct DummyDynamicTransform_v2_2
                              1,
                              p_out_global,
                              in_gemmk_gemmn_coord.GetOffset(),
+#if 0
                              coordinate_has_valid_offset_assuming_visible_index_is_valid(
                                  in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
-                             in_gemmk_gemmn_global_desc.GetElementSpaceSize());
-        }
 #else
-        p_out_global[in_gemmk_gemmn_global_desc.CalculateOffset(idx)] = 1;
+                             true,
 #endif
+                             in_gemmk_gemmn_global_desc.GetElementSpaceSize());
+        }
     }
 };
...
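Note: the #if toggles above switch between computing coordinate validity and hardcoding true, presumably to compare overhead. Once padding transforms are applied, a moved coordinate can land on a padding element with no backing memory, so the store must be predicated. A standalone sketch of the guarded-store pattern that transfer_data receives its is_valid flag for (plain C++, not the ck implementation):

#include <cstdio>

void store_if_valid(float* dst, int offset, bool is_valid, float value)
{
    // offset is computed unconditionally; only the store is predicated
    if(is_valid)
    {
        dst[offset] = value;
    }
}

int main()
{
    float buf[4] = {};
    store_if_valid(buf, 2, true, 1.0f);  // real element: lands in buf[2]
    store_if_valid(buf, 9, false, 1.0f); // padding coordinate: skipped
    printf("%f\n", buf[2]);
}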
@@ -371,10 +371,10 @@ struct DynamicMerge
     __host__ __device__ constexpr DynamicMerge(const LowerIndex& low_lengths)
         : low_lengths_{low_lengths},
-          low_lengths_scan_{reverse_exclusive_scan_on_array<index_t, NDimLow>(
+          low_lengths_scan_{container_reverse_exclusive_scan<index_t, NDimLow>(
               low_lengths, math::multiplies<index_t>{}, index_t{1})},
           up_lengths_{make_multi_index(
-              reduce_on_array(low_lengths, math::multiplies<index_t>(), index_t{1}))}
+              container_reduce(low_lengths, math::multiplies<index_t>(), index_t{1}))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }
@@ -516,7 +516,7 @@ struct DynamicUnMerge
     __host__ __device__ constexpr DynamicUnMerge(const UpperIndex& up_lengths)
         : up_lengths_{up_lengths},
           up_lengths_scan_{
-              reverse_exclusive_scan_on_array(up_lengths, math::multiplies<index_t>(), index_t{1})}
+              container_reverse_exclusive_scan(up_lengths, math::multiplies<index_t>(), index_t{1})}
     {
     }
...
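Note: for DynamicMerge, the reverse exclusive scan over the lower lengths (with multiplication, init 1) is what turns lengths into packed strides, and the plain reduce gives the merged upper length. A standalone check of the arithmetic with made-up lengths (plain C++, mirroring container_reverse_exclusive_scan):

#include <array>
#include <cstdio>

int main()
{
    std::array<int, 3> low_lengths = {2, 3, 4};
    std::array<int, 3> scan{};

    int r = 1; // init
    for(int i = 2; i >= 0; --i)
    {
        scan[i] = r; // exclusive: write before accumulating
        r = r * low_lengths[i];
    }

    // scan == {12, 4, 1}: stride of each lower dimension in the merged index;
    // r == 24 is the merged upper length that container_reduce computes
    printf("%d %d %d | %d\n", scan[0], scan[1], scan[2], r);
}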
@@ -263,7 +263,7 @@ struct DynamicTensorCoordinate
 {
     static constexpr index_t NDim = TensorDesc::GetNumOfDimension();

-    using type = decltype(make_dynamic_tensor_coordinate(TensorDesc{}, MultiIndex<NDim>{}));
+    using type = decltype(make_dynamic_tensor_coordinate<NDim>(TensorDesc{}, MultiIndex<NDim>{}));
 };

 } // namespace ck
...
@@ -45,7 +45,7 @@ struct DynamicNativeTensorDescriptor
     __host__ __device__ constexpr index_t GetElementSize() const
     {
-        return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
+        return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
     }

     __host__ __device__ constexpr index_t GetElementSpace() const
@@ -136,7 +136,7 @@ struct DynamicTransformedTensorDescriptor
         template <typename... Xs>
         __host__ __device__ constexpr auto operator()(Xs... xs) const
         {
-            return array_cat(xs...);
+            return container_cat(xs...);
         }
     };
@@ -211,21 +211,22 @@ struct DynamicTransformedTensorDescriptor
         const auto unsorted_up_lengths = unpack(lambda_merge_arrays{}, tuple_of_up_lengths);

         const auto sorted_up_lengths =
-            reorder_array_given_new2old(unsorted_up_lengths, sorted2unsorted_map);
+            container_reorder_given_new2old(unsorted_up_lengths, sorted2unsorted_map);

         return sorted_up_lengths;
     }

     __host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }

-    __host__ __device__ constexpr index_t GetLength(index_t idim) const
+    template <index_t IDim>
+    __host__ __device__ constexpr index_t GetLength(Number<IDim>) const
     {
-        return GetLengths()[idim];
+        return GetLengths()[Number<IDim>{}];
     }

     __host__ __device__ constexpr index_t GetElementSize() const
     {
-        return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
+        return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
     }

     __host__ __device__ constexpr index_t GetElementSpace() const
...
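Note: GetLength moves from a runtime index_t to a compile-time Number<IDim> above. A likely reason: once lengths can live in a Tuple with heterogeneous element types, subscripting needs a compile-time index, exactly like std::get on std::tuple. Plain-C++ analogue (not the ck types):

#include <tuple>
#include <cstdio>

template <std::size_t I, typename... Ts>
constexpr auto get_length(const std::tuple<Ts...>& lengths)
{
    return std::get<I>(lengths); // I must be known at compile time
}

int main()
{
    auto lengths = std::make_tuple(8, 16, 3, 3);
    printf("%d\n", get_length<1>(lengths)); // 16
}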
@@ -18,7 +18,7 @@ make_dynamic_native_tensor_descriptor_packed_v2(const MultiIndex<N>& lengths)
     constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};

     const index_t element_space_size =
-        reduce_on_array(lengths, math::multiplies<index_t>{}, index_t{1});
+        container_reduce(lengths, math::multiplies<index_t>{}, index_t{1});

     return DynamicTensorDescriptor_v2<decltype(transforms),
                                       decltype(low_dim_hidden_idss),
...
@@ -116,7 +116,7 @@ struct DynamicTensorDescriptor_v2
     // maybe this result should be saved as a member variable
     __host__ __device__ constexpr index_t GetElementSize() const
     {
-        return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
+        return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
     }

     __host__ __device__ constexpr index_t GetElementSpaceSize() const
@@ -411,13 +411,13 @@ transform_dynamic_tensor_descriptor_v2(const OldTensorDescriptor& old_tensor_des
         unordered_new_visible_dim_hidden_ids.ReorderGivenOld2New(new_visible_dim_unordered2ordered);

     // put everything together
-    const auto all_transforms = tuple_cat(old_tensor_desc.GetTransforms(), new_transforms);
+    const auto all_transforms = container_cat(old_tensor_desc.GetTransforms(), new_transforms);

     constexpr auto all_low_dim_hidden_idss =
-        tuple_cat(OldTensorDescriptor::GetLowerDimensionIdss(), low_dim_hidden_idss);
+        container_cat(OldTensorDescriptor::GetLowerDimensionIdss(), low_dim_hidden_idss);

     constexpr auto all_up_dim_hidden_idss =
-        tuple_cat(OldTensorDescriptor::GetUpperDimensionIdss(), up_dim_hidden_idss);
+        container_cat(OldTensorDescriptor::GetUpperDimensionIdss(), up_dim_hidden_idss);

     return DynamicTensorDescriptor_v2<decltype(all_transforms),
                                       decltype(all_low_dim_hidden_idss),
@@ -494,7 +494,7 @@ make_dynamic_tensor_coordinate_step_v2(const TensorDesc&, const VisibleIndex& id
         // 2) all components of lower index diff will assume to be non-zero and need to be
         //    computed
         const bool idx_diff_up_has_non_zero =
-            reduce_on_array(non_zero_diff_pick_up, [](auto a, auto b) { return a or b; }, false);
+            container_reduce(non_zero_diff_pick_up, [](auto a, auto b) { return a or b; }, false);

         do_transforms(itran) = idx_diff_up_has_non_zero;
...
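Note: container_reduce is a plain left fold; the call above uses it with "or" to ask whether any component of the upper-index diff moves. Standalone analogue (plain C++, hypothetical helper name):

#include <array>
#include <cstdio>

template <typename C, typename F, typename T>
T container_reduce_analogue(const C& a, F f, T init)
{
    T result = init;
    for(std::size_t i = 0; i < a.size(); ++i)
    {
        result = f(result, a[i]);
    }
    return result;
}

int main()
{
    std::array<bool, 3> non_zero_diff = {false, true, false};
    bool any = container_reduce_analogue(
        non_zero_diff, [](bool a, bool b) { return a or b; }, false);
    printf("%d\n", any); // 1: at least one component moves
}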
@@ -9,30 +9,11 @@ namespace ck {
 template <index_t N>
 using MultiIndex = Array<index_t, N>;

-#if 1 // works
 template <typename... Xs>
 __host__ __device__ constexpr auto make_multi_index(const Xs&... xs)
 {
     return make_array<const index_t>(std::forward<const Xs>(xs)...);
 }
-#else // doesn't work: don't know how to get the lvalue/rvalue reference correct
-template <typename... Xs>
-__host__ __device__ constexpr auto make_multi_index(Xs&&... xs)
-{
-    return make_array<const index_t>(std::forward<const Xs>(xs)...);
-}
-#endif
-#else
-template <index_t N>
-using MultiIndex = StaticallyIndexedArray<index_t, N>;
-
-template <typename... Xs>
-__host__ __device__ constexpr auto make_multi_index(const Xs&... xs)
-{
-    return make_statically_indexed_array<const index_t>(std::forward<const Xs>(xs)...);
-}
-#endif

 template <index_t NSize>
 __host__ __device__ constexpr auto make_zero_multi_index()
@@ -93,5 +74,84 @@ __host__ __device__ constexpr auto operator*(const MultiIndex<NSize>& a, const T
     return r;
 }

+#else
+template <index_t N>
+using MultiIndex = StaticallyIndexedArray<index_t, N>;
+
+template <typename... Xs>
+__host__ __device__ constexpr auto make_multi_index(const Xs&... xs)
+{
+    return make_statically_indexed_array<const index_t>(std::forward<const Xs>(xs)...);
+}
+
+template <index_t NSize>
+__host__ __device__ constexpr auto make_zero_multi_index()
+{
+    return unpack([](auto... xs) { return make_multi_index(xs...); },
+                  typename uniform_sequence_gen<NSize, 0>::type{});
+}
+
+template <typename T>
+__host__ __device__ constexpr auto to_multi_index(const T& x)
+{
+    return unpack([](auto... ys) { return make_multi_index(ys...); }, x);
+}
+
+// Here should use MultiIndex<NSize>, instead of Tuple<Ys...>, although the former
+// is the alias of the latter. This is because compiler cannot infer the NSize if
+// using MultiIndex<NSize>
+// TODO: how to fix this?
+template <typename... Ys, typename X>
+__host__ __device__ constexpr auto operator+=(Tuple<Ys...>& y, const X& x)
+{
+    static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
+    constexpr index_t NSize = sizeof...(Ys);
+    static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; });
+    return y;
+}
+
+template <typename... Ys, typename X>
+__host__ __device__ constexpr auto operator-=(Tuple<Ys...>& y, const X& x)
+{
+    static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
+    constexpr index_t NSize = sizeof...(Ys);
+    static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; });
+    return y;
+}
+
+template <typename... Xs, typename Y>
+__host__ __device__ constexpr auto operator+(const Tuple<Xs...>& x, const Y& y)
+{
+    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
+    constexpr index_t NSize = sizeof...(Xs);
+
+    Tuple<Xs...> r;
+    static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] + y[i]; });
+    return r;
+}
+
+template <typename... Xs, typename Y>
+__host__ __device__ constexpr auto operator-(const Tuple<Xs...>& x, const Y& y)
+{
+    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
+    constexpr index_t NSize = sizeof...(Xs);
+
+    Tuple<Xs...> r;
+    static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] - y[i]; });
+    return r;
+}
+
+template <typename... Xs, typename Y>
+__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
+{
+    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
+    constexpr index_t NSize = sizeof...(Xs);
+
+    Tuple<Xs...> r;
+    static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] * y[i]; });
+    return r;
+}
+#endif

 } // namespace ck
 #endif
...
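Note: the TODO comment in the new operators is about template argument deduction: MultiIndex<NSize> is an alias whose NSize cannot be deduced from a Tuple argument, so the operators are written against Tuple<Ys...> directly. A plain-C++ analogue of the deduction failure (hypothetical names):

#include <tuple>

template <std::size_t N>
using FixedTuple = std::tuple<int, int>; // stand-in for MultiIndex<NSize>

// template <std::size_t N>
// void f(const FixedTuple<N>&); // N is not deducible through the alias

template <typename... Ys>
void g(const std::tuple<Ys...>&) {} // deducible: this is the workaround

int main() { g(std::make_tuple(1, 2)); }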
@@ -3,17 +3,17 @@
 #include "sequence.hpp"
 #include "sequence_helper.hpp"
-#include "tuple.hpp"
-#include "tuple_helper.hpp"
 #include "array.hpp"
 #include "array_helper.hpp"
+#include "tuple.hpp"
+#include "tuple_helper.hpp"
 #include "statically_indexed_array.hpp"
 #include "array_element_picker.hpp"

 namespace ck {

 template <typename TData, index_t NSize>
-__host__ __device__ constexpr auto push_back(const Array<TData, NSize>& a, const TData& x)
+__host__ __device__ constexpr auto container_push_back(const Array<TData, NSize>& a, const TData& x)
 {
     Array<TData, NSize + 1> r;
@@ -25,137 +25,64 @@ __host__ __device__ constexpr auto push_back(const Array<TData, NSize>& a, const
 }

 template <typename TData, index_t NSize, index_t... IRs>
-__host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
-                                                               Sequence<IRs...> /*new2old*/)
+__host__ __device__ constexpr auto
+container_reorder_given_new2old(const Array<TData, NSize>& old_array, Sequence<IRs...> /*new2old*/)
 {
-    static_assert(NSize == sizeof...(IRs), "NSize not consistent");
+    static_assert(NSize == sizeof...(IRs), "wrong! size not consistent");

     static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");

-    return Array<TData, NSize>{old_array[IRs]...};
+    return make_array(old_array[Number<IRs>{}]...);
 }
-template <typename TData, index_t NSize, typename MapOld2New>
-struct lambda_reorder_array_given_old2new
-{
-    const Array<TData, NSize>& old_array;
-    Array<TData, NSize>& new_array;
-
-    __host__ __device__ constexpr lambda_reorder_array_given_old2new(
-        const Array<TData, NSize>& old_array_, Array<TData, NSize>& new_array_)
-        : old_array(old_array_), new_array(new_array_)
-    {
-    }
-
-    template <index_t IOldDim>
-    __host__ __device__ constexpr void operator()(Number<IOldDim>) const
-    {
-        TData old_data = old_array[IOldDim];
-
-        constexpr index_t INewDim = MapOld2New::At(Number<IOldDim>{});
-
-        new_array(Number<INewDim>{}) = old_data;
-    }
-};
-
 template <typename TData, index_t NSize, index_t... IRs>
-__host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
-                                                               Sequence<IRs...> /*old2new*/)
+__host__ __device__ constexpr auto
+container_reorder_given_old2new(const Array<TData, NSize>& old_array, Sequence<IRs...> old2new)
 {
-    Array<TData, NSize> new_array;
-
-    static_assert(NSize == sizeof...(IRs), "NSize not consistent");
-    static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");
-
-    static_for<0, NSize, 1>{}(
-        lambda_reorder_array_given_old2new<TData, NSize, Sequence<IRs...>>(old_array, new_array));
-
-    return new_array;
+    return container_reorder_given_new2old(
+        old_array, typename sequence_map_inverse<decltype(old2new)>::type{});
 }
-// emulate constepxr lambda for array
-template <typename F, typename X, typename Y, typename Z>
-struct lambda_array_math
-{
-    const F& f;
-    const X& x;
-    const Y& y;
-    Z& z;
-
-    __host__ __device__ constexpr lambda_array_math(const F& f_, const X& x_, const Y& y_, Z& z_)
-        : f(f_), x(x_), y(y_), z(z_)
-    {
-    }
-
-    template <index_t IDim_>
-    __host__ __device__ constexpr void operator()(Number<IDim_>) const
-    {
-        constexpr auto IDim = Number<IDim_>{};
-
-        z(IDim) = f(x[IDim], y[IDim]);
-    }
-};
-
-// Array = Sequence - Array
-template <typename TData, index_t NSize, index_t... Is>
-__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
-{
-    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
-
-    Array<TData, NSize> result;
-
-    auto f = math::minus<index_t>{};
-
-    static_for<0, NSize, 1>{}(
-        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
-            f, a, b, result));
-
-    return result;
-}
-
-// Array = Array * TData
-template <typename TData, index_t NSize>
-__host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
-{
-    Array<TData, NSize> result;
-
-    for(index_t i = 0; i < NSize; ++i)
-    {
-        result(i) = a[i] * v;
-    }
-
-    return result;
-}
+template <typename... Ts, index_t... IRs>
+__host__ __device__ constexpr auto container_reorder_given_new2old(const Tuple<Ts...>& old_tuple,
+                                                                   Sequence<IRs...> /*new2old*/)
+{
+    static_assert(sizeof...(Ts) == sizeof...(IRs), "wrong! size not consistent");
+
+    static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
+
+    return make_tuple(old_tuple[Number<IRs>{}]...);
+}
+
+template <typename... Ts, index_t... IRs>
+__host__ __device__ constexpr auto container_reorder_given_old2new(const Tuple<Ts...>& old_tuple,
+                                                                   Sequence<IRs...> old2new)
+{
+    return container_reorder_given_new2old(
+        old_tuple, typename sequence_map_inverse<decltype(old2new)>::type{});
+}
-template <typename TData, typename Arr, typename Reduce>
-__host__ __device__ constexpr TData reduce_on_array(const Arr& a, Reduce f, TData init)
+template <typename TData, typename Container, typename Reduce>
+__host__ __device__ constexpr TData container_reduce(const Container& a, Reduce f, TData init)
 {
     // static_assert(is_same<typename Arr::data_type, TData>::value, "wrong! different data type");

-    static_assert(Arr::Size() > 0, "wrong");
+    static_assert(Container::Size() > 0, "wrong");

     TData result = init;

-    static_for<0, Arr::Size(), 1>{}([&](auto I) { result = f(result, a[I]); });
+    static_for<0, Container::Size(), 1>{}([&](auto I) { result = f(result, a[I]); });

     return result;
 }

 template <typename TData, index_t NSize, typename Reduce>
 __host__ __device__ constexpr auto
-reverse_inclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData init)
+container_reverse_inclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
 {
     Array<TData, NSize> y;

     TData r = init;

-#if 0
-#pragma unroll
-    for(index_t i = NSize - 1; i >= 0; --i)
-    {
-        r = f(r, x[i]);
-        y(i) = r;
-    }
-#else
     static_for<NSize - 1, 0, -1>{}([&](auto i) {
         r = f(r, x[i]);
         y(i) = r;
@@ -163,36 +90,61 @@ reverse_inclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData in
     r = f(r, x[Number<0>{}]);
     y(Number<0>{}) = r;
-#endif

     return y;
 }
 template <typename TData, index_t NSize, typename Reduce>
 __host__ __device__ constexpr auto
-reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData init)
+container_reverse_exclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
 {
     Array<TData, NSize> y;

     TData r = init;

-#if 0
-#pragma unroll
-    for(index_t i = NSize - 1; i > 0; --i)
-    {
-        y(i) = r;
-        r = f(r, x[i]);
-    }
-
-    y(0) = r;
-#else
     static_for<NSize - 1, 0, -1>{}([&](auto i) {
         y(i) = r;
         r = f(r, x[i]);
     });

     y(Number<0>{}) = r;
-#endif

     return y;
 }

+template <typename TData, index_t NSize, typename Reduce>
+__host__ __device__ constexpr auto container_reverse_exclusive_scan(
+    const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
+{
+    StaticallyIndexedArray<TData, NSize> y;
+
+    TData r = init;
+
+    static_for<NSize - 1, 0, -1>{}([&](auto i) {
+        y(i) = r;
+        r = f(r, x[i]);
+    });
+
+    y(Number<0>{}) = r;
+
+    return y;
+}
+
+template <typename TData, index_t NSize, typename Reduce>
+__host__ __device__ constexpr auto container_reverse_inclusive_scan(
+    const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
+{
+    StaticallyIndexedArray<TData, NSize> y;
+
+    TData r = init;
+
+    static_for<NSize - 1, 0, -1>{}([&](auto i) {
+        r = f(r, x[i]);
+        y(i) = r;
+    });
+
+    r = f(r, x[Number<0>{}]);
+    y(Number<0>{}) = r;
+
+    return y;
+}
@@ -204,19 +156,21 @@ __host__ __device__ constexpr auto container_cat(const X& x, const Ys&... ys)
 }

 template <typename T, index_t NX, index_t NY>
-__host__ __device__ constexpr auto container_cat(const Array<T, NX>& x, const Array<T, NY>& y)
+__host__ __device__ constexpr auto container_cat(const Array<T, NX>& ax, const Array<T, NY>& ay)
 {
-    Array<T, NX + NY> z;
-
-    static_for<0, NX, 1>{}([&](auto i) { z(i) = x[i]; });
-    static_for<0, NY, 1>{}([&](auto i) { z(i + Number<NX>{}) = y[i]; });
-
-    return z;
+    return unpack2(
+        [&](auto&&... zs) { return make_array(std::forward<decltype(zs)>(zs)...); }, ax, ay);
+}
+
+template <typename... X, typename... Y>
+__host__ __device__ constexpr auto container_cat(const Tuple<X...>& tx, const Tuple<Y...>& ty)
+{
+    return unpack2(
+        [&](auto&&... zs) { return make_tuple(std::forward<decltype(zs)>(zs)...); }, tx, ty);
 }

-template <typename T, index_t N>
-__host__ __device__ constexpr auto container_cat(const Array<T, N>& x)
+template <typename Container>
+__host__ __device__ constexpr auto container_cat(const Container& x)
 {
     return x;
 }
...
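Note on the two reorder conventions now shared by Array and Tuple: with new2old, new[i] = old[new2old[i]]; the old2new variant is routed through sequence_map_inverse, which the overloads above reuse. Worked standalone example (plain C++):

#include <array>
#include <cstdio>

int main()
{
    std::array<char, 3> old_arr = {'a', 'b', 'c'};
    std::array<int, 3> new2old = {2, 0, 1};

    std::array<char, 3> new_arr{};
    for(int i = 0; i < 3; ++i)
    {
        new_arr[i] = old_arr[new2old[i]]; // "cab"
    }

    // the inverse map, as sequence_map_inverse computes it: {1, 2, 0}
    std::array<int, 3> old2new{};
    for(int i = 0; i < 3; ++i)
    {
        old2new[new2old[i]] = i;
    }

    printf("%c%c%c\n", new_arr[0], new_arr[1], new_arr[2]);
}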
@@ -9,7 +9,7 @@
 #define CK_UNSIGNED_INDEX_TYPE 0

 // multi index
-#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
+#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 1

 // device backend
 #define CK_DEVICE_BACKEND_AMD 1
...
@@ -63,7 +63,7 @@ struct ford_impl
         for(index_t i = 0; i < RemainLengths::Front(); ++i)
         {
             ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
-                f, push_back(current_ordered_id, i));
+                f, container_push_back(current_ordered_id, i));
         }
     }
 };
@@ -77,7 +77,7 @@ struct ford_impl<Sequence<>, Orders>
     __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
     {
         // retrive unordered Id
-        f(reorder_array_given_old2new(current_ordered_id, Orders{}));
+        f(container_reorder_given_old2new(current_ordered_id, Orders{}));
     }
 };
...
@@ -49,7 +49,7 @@ __host__ __device__ constexpr auto unpack(F&& f, X&& x)
 // TODO: properly implement unpack that takes any number of containers
 template <typename F, typename X, typename Y>
-__host__ __device__ constexpr auto unpack(F&& f, X&& x, Y&& y)
+__host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y)
 {
     using X_ = remove_reference_t<X>;
     using Y_ = remove_reference_t<Y>;
...
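Note: the rename to unpack2 frees unpack for the single-container case; unpack2(f, x, y) calls f with the elements of both containers flattened into one argument list, which is how the new container_cat overloads build the result in one shot. Standalone analogue (plain C++14):

#include <array>
#include <utility>
#include <cstdio>

template <typename F, typename X, typename Y, std::size_t... I, std::size_t... J>
constexpr auto unpack2_impl(F f, const X& x, const Y& y,
                            std::index_sequence<I...>, std::index_sequence<J...>)
{
    return f(std::get<I>(x)..., std::get<J>(y)...);
}

template <typename F, typename X, typename Y>
constexpr auto unpack2(F f, const X& x, const Y& y)
{
    return unpack2_impl(f, x, y,
                        std::make_index_sequence<std::tuple_size<X>::value>{},
                        std::make_index_sequence<std::tuple_size<Y>::value>{});
}

int main()
{
    std::array<int, 2> a = {1, 2};
    std::array<int, 3> b = {3, 4, 5};
    auto cat = unpack2([](auto... zs) { return std::array<int, 5>{zs...}; }, a, b);
    printf("%d\n", cat[4]); // 5
}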
@@ -36,42 +36,5 @@ __host__ __device__ constexpr auto make_statically_indexed_array()
     return StaticallyIndexedArray<X, 0>();
 }

-template <typename TData, index_t NSize, typename Reduce>
-__host__ __device__ constexpr auto
-reverse_exclusive_scan_on_array(const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
-{
-    StaticallyIndexedArray<TData, NSize> y;
-
-    TData r = init;
-
-    static_for<NSize - 1, 0, -1>{}([&](auto i) {
-        y(i) = r;
-        r = f(r, x[i]);
-    });
-
-    y(Number<0>{}) = r;
-
-    return y;
-}
-
-template <typename TData, index_t NSize, typename Reduce>
-__host__ __device__ constexpr auto
-reverse_inclusive_scan_on_array(const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
-{
-    StaticallyIndexedArray<TData, NSize> y;
-
-    TData r = init;
-
-    static_for<NSize - 1, 0, -1>{}([&](auto i) {
-        r = f(r, x[i]);
-        y(i) = r;
-    });
-
-    r = f(r, x[Number<0>{}]);
-    y(Number<0>{}) = r;
-
-    return y;
-}
-
 } // namespace ck
 #endif
...
@@ -128,12 +128,8 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
     __host__ __device__ constexpr Tuple(Tuple&&) = default;

-#if 0
     template <typename... Ys,
               typename std::enable_if<sizeof...(Ys) == sizeof...(Xs), bool>::type = false>
-#else
-    template <typename... Ys>
-#endif
     __host__ __device__ explicit constexpr Tuple(const Tuple<Ys...>& y)
         : base(static_cast<
               const detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(Ys), 1>::type,
@@ -141,12 +137,8 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
     {
     }

-#if 0
     template <typename... Ys,
               typename std::enable_if<sizeof...(Ys) == sizeof...(Xs), bool>::type = false>
-#else
-    template <typename... Ys>
-#endif
     __host__ __device__ explicit constexpr Tuple(Tuple<Ys...>&& y)
         : base(static_cast<
               detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(Ys), 1>::type,
@@ -154,7 +146,9 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
     {
     }

-    template <typename... Ys, typename std::enable_if<sizeof...(Ys) >= 1, bool>::type = false>
+    template <typename... Ys,
+              typename std::enable_if<sizeof...(Ys) == sizeof...(Xs) && sizeof...(Ys) >= 1,
+                                      bool>::type = false>
     __host__ __device__ explicit constexpr Tuple(Ys&&... ys) : base(std::forward<Ys>(ys)...)
     {
     }
...
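Note: restoring the sizeof...(Ys) == sizeof...(Xs) constraint matters beyond conversion correctness: an unconstrained variadic constructor beats the copy constructor in overload resolution for a non-const lvalue. Minimal standalone reproduction of the pitfall (hypothetical type, not the ck Tuple):

#include <cstdio>

struct Greedy
{
    Greedy() = default;
    Greedy(const Greedy&) { printf("copy\n"); }

    template <typename... Ys>
    Greedy(Ys&&...) { printf("variadic\n"); } // exact match for Greedy&: wins
};

int main()
{
    Greedy a;
    Greedy b{a}; // prints "variadic", not "copy"
}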
@@ -12,13 +12,6 @@ __host__ __device__ constexpr auto generate_tuple(F&& f, Number<N>)
                   typename arithmetic_sequence_gen<0, N, 1>::type{});
 }

-template <typename... Tuples>
-__host__ __device__ constexpr auto tuple_cat(Tuples&&... tuples)
-{
-    return unpack([&](auto&&... xs) { return make_tuple(std::forward<decltype(xs)>(xs)...); },
-                  std::forward<Tuples>(tuples)...);
-}
-
 namespace detail {

 template <typename F, typename X, index_t... Is>
...
@@ -52,7 +52,7 @@ void device_dummy_dynamic_transform_v1(InDesc,
     const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});

     auto in_gemmk_gemmn_coord =
-        make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, make_multi_index(0, 0));
+        make_dynamic_tensor_coordinate<2>(in_gemmk_gemmn_global_desc, make_multi_index(0, 0));

     for(index_t iter = 0; iter < 10; ++iter)
     {
@@ -115,10 +115,10 @@ void device_dummy_dynamic_transform_v1(InDesc,
                                            const DynamicNativeTensorDescriptor<4>,
                                            const DynamicNativeTensorDescriptor<4>,
                                            const DynamicNativeTensorDescriptor<4>,
-                                           const Array<index_t, 2>,
-                                           const Array<index_t, 2>,
-                                           const Array<index_t, 2>,
-                                           const Array<index_t, 2>>,
+                                           const MultiIndex<2>,
+                                           const MultiIndex<2>,
+                                           const MultiIndex<2>,
+                                           const MultiIndex<2>>,
                   dim3(GridSize),
                   dim3(BlockSize),
                   0,
...
@@ -96,7 +96,7 @@ void device_dummy_dynamic_transform_v2(InDesc,
     for(index_t j = 0; j < nrepeat; ++j)
     {
-#if 0
+#if 1
         launch_kernel(run_gridwise_operation<DummyDynamicTransform_v2_1<BlockSize>,
                                              index_t* const,
                                              float* const,
...
@@ -549,7 +549,7 @@ int main(int argc, char* argv[])
 #endif
     }

-#if 0
+#if 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,
...