Commit 34cbbb48 authored by Chao Liu

refactored dynamically indexed array; works now

parent 4d70c71b
......@@ -97,10 +97,10 @@ struct DummyDynamicTransform_v1
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2>& conv_strides,
const MultiIndex<2>& conv_dilations,
const MultiIndex<2>& in_left_pads,
const MultiIndex<2>& in_right_pads) const
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -542,10 +542,10 @@ struct DummyDynamicTransform_v1
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2>& conv_strides,
const MultiIndex<2>& conv_dilations,
const MultiIndex<2>& in_left_pads,
const MultiIndex<2>& in_right_pads) const
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
const auto transformed_tensor_descs =
map_convolution_into_gemm_v1(wei_k_c_y_x_global_desc,
......@@ -564,7 +564,8 @@ struct DummyDynamicTransform_v1
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx);
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate<2>(in_gemmk_gemmn_global_desc, idx);
for(index_t iter = 0; iter < niter; ++iter)
{
......@@ -587,7 +588,7 @@ struct DummyDynamicTransform_v1
1,
p_out_global,
in_gemmk_gemmn_coord.GetOffset(),
#if 0
#if 1
in_gemmk_gemmn_coord.IsOffsetValidAssumingUpperIndexIsValid(),
#else
true,
......@@ -603,10 +604,10 @@ struct DummyDynamicTransform_v1
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2>& conv_strides,
const MultiIndex<2>& conv_dilations,
const MultiIndex<2>& in_left_pads,
const MultiIndex<2>& in_right_pads) const
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
Run_2(p_wei_global,
p_in_global,
......
......@@ -117,19 +117,41 @@ struct DummyDynamicTransform_v2_1
// initialize idx
static_for<0, 2, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, idx);
const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
in_gemmk_gemmn_global_desc, make_multi_index(1, 0));
#pragma unroll 1
for(index_t i = 0; i < 10; ++i)
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate_v2(
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);
p_out_global[in_gemmk_gemmn_coord.GetOffset()] = 1;
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_coord.GetOffset(),
#if 1
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
#else
true,
#endif
in_gemmk_gemmn_global_desc.GetElementSpaceSize());
}
}
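For reference, a minimal standalone sketch of the access pattern in the loop above, written against plain pointers; guarded_store, offset, and is_valid are hypothetical stand-ins for transfer_data, GetOffset(), and coordinate_has_valid_offset_assuming_visible_index_is_valid(). This is an illustration only, not part of the commit.

#include <cstddef>

void guarded_store(float* out, std::size_t offset, bool is_valid, float value)
{
    // the validity flag gates the global store, so coordinates that map into
    // padding (or otherwise outside real data) write nothing
    if(is_valid)
    {
        out[offset] = value;
    }
}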
......@@ -215,13 +237,13 @@ struct DummyDynamicTransform_v2_1
// initialize idx
static_for<0, 4, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
#if 0
#if 1
const index_t niter = p_wei_global[10];
auto in_coord = make_dynamic_tensor_coordinate_v2(in_n_c_hip_wip_global_desc, idx);
const auto in_coord_step = make_dynamic_tensor_coordinate_step_v2(
in_n_c_hip_wip_global_desc, MultiIndex<4>{{1, 0, 0, 0}});
in_n_c_hip_wip_global_desc, make_multi_index(1, 0, 0, 0));
for(index_t iter = 0; iter < niter; ++iter)
{
......@@ -266,7 +288,7 @@ struct DummyDynamicTransform_v2_1
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
Run_1(p_wei_global,
Run_2(p_wei_global,
p_in_global,
p_out_global,
wei_k_c_y_x_global_desc,
......@@ -293,7 +315,6 @@ struct DummyDynamicTransform_v2_2
// initialize idx
static_for<0, 2, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
#if 1
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_coord =
......@@ -322,13 +343,14 @@ struct DummyDynamicTransform_v2_2
1,
p_out_global,
in_gemmk_gemmn_coord.GetOffset(),
#if 0
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
in_gemmk_gemmn_global_desc.GetElementSpaceSize());
}
#else
p_out_global[in_gemmk_gemmn_global_desc.CalculateOffset(idx)] = 1;
true,
#endif
in_gemmk_gemmn_global_desc.GetElementSpaceSize());
}
}
};
......
......@@ -371,10 +371,10 @@ struct DynamicMerge
__host__ __device__ constexpr DynamicMerge(const LowerIndex& low_lengths)
: low_lengths_{low_lengths},
low_lengths_scan_{reverse_exclusive_scan_on_array<index_t, NDimLow>(
low_lengths_scan_{container_reverse_exclusive_scan<index_t, NDimLow>(
low_lengths, math::multiplies<index_t>{}, index_t{1})},
up_lengths_{make_multi_index(
reduce_on_array(low_lengths, math::multiplies<index_t>(), index_t{1}))}
container_reduce(low_lengths, math::multiplies<index_t>(), index_t{1}))}
{
static_assert(LowerIndex::Size() == NDimLow, "wrong!");
}
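For reference, a minimal standalone sketch of what the renamed helpers compute in this constructor, using std::array in place of ck::Array (illustration only, not part of the commit): the reverse exclusive scan yields the per-dimension strides of the merged index, and the reduce yields the merged length.

#include <array>
#include <cstddef>

// reverse exclusive scan with multiplication, init = 1
template <std::size_t N>
constexpr std::array<int, N> reverse_exclusive_scan_mul(std::array<int, N> x)
{
    std::array<int, N> y{};
    int r = 1;
    for(std::size_t i = N; i-- > 0;)
    {
        y[i] = r; // exclusive: x[i] not folded in yet
        r *= x[i];
    }
    return y;
}

// reverse_exclusive_scan_mul({2, 3, 4}) == {12, 4, 1}, i.e. low_lengths_scan_,
// and the full product 24 is what container_reduce(..., multiplies, 1) returns
// for up_lengths_.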
......@@ -516,7 +516,7 @@ struct DynamicUnMerge
__host__ __device__ constexpr DynamicUnMerge(const UpperIndex& up_lengths)
: up_lengths_{up_lengths},
up_lengths_scan_{
reverse_exclusive_scan_on_array(up_lengths, math::multiplies<index_t>(), index_t{1})}
container_reverse_exclusive_scan(up_lengths, math::multiplies<index_t>(), index_t{1})}
{
}
......
......@@ -263,7 +263,7 @@ struct DynamicTensorCoordinate
{
static constexpr index_t NDim = TensorDesc::GetNumOfDimension();
using type = decltype(make_dynamic_tensor_coordinate(TensorDesc{}, MultiIndex<NDim>{}));
using type = decltype(make_dynamic_tensor_coordinate<NDim>(TensorDesc{}, MultiIndex<NDim>{}));
};
} // namespace ck
......
......@@ -45,7 +45,7 @@ struct DynamicNativeTensorDescriptor
__host__ __device__ constexpr index_t GetElementSize() const
{
return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpace() const
......@@ -136,7 +136,7 @@ struct DynamicTransformedTensorDescriptor
template <typename... Xs>
__host__ __device__ constexpr auto operator()(Xs... xs) const
{
return array_cat(xs...);
return container_cat(xs...);
}
};
......@@ -211,21 +211,22 @@ struct DynamicTransformedTensorDescriptor
const auto unsorted_up_lengths = unpack(lambda_merge_arrays{}, tuple_of_up_lengths);
const auto sorted_up_lengths =
reorder_array_given_new2old(unsorted_up_lengths, sorted2unsorted_map);
container_reorder_given_new2old(unsorted_up_lengths, sorted2unsorted_map);
return sorted_up_lengths;
}
__host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }
__host__ __device__ constexpr index_t GetLength(index_t idim) const
template <index_t IDim>
__host__ __device__ constexpr index_t GetLength(Number<IDim>) const
{
return GetLengths()[idim];
return GetLengths()[Number<IDim>{}];
}
__host__ __device__ constexpr index_t GetElementSize() const
{
return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpace() const
......
......@@ -18,7 +18,7 @@ make_dynamic_native_tensor_descriptor_packed_v2(const MultiIndex<N>& lengths)
constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
const index_t element_space_size =
reduce_on_array(lengths, math::multiplies<index_t>{}, index_t{1});
container_reduce(lengths, math::multiplies<index_t>{}, index_t{1});
return DynamicTensorDescriptor_v2<decltype(transforms),
decltype(low_dim_hidden_idss),
......
......@@ -116,7 +116,7 @@ struct DynamicTensorDescriptor_v2
// maybe this result should be saved as a member variable
__host__ __device__ constexpr index_t GetElementSize() const
{
return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpaceSize() const
......@@ -411,13 +411,13 @@ transform_dynamic_tensor_descriptor_v2(const OldTensorDescriptor& old_tensor_des
unordered_new_visible_dim_hidden_ids.ReorderGivenOld2New(new_visible_dim_unordered2ordered);
// put everything together
const auto all_transforms = tuple_cat(old_tensor_desc.GetTransforms(), new_transforms);
const auto all_transforms = container_cat(old_tensor_desc.GetTransforms(), new_transforms);
constexpr auto all_low_dim_hidden_idss =
tuple_cat(OldTensorDescriptor::GetLowerDimensionIdss(), low_dim_hidden_idss);
container_cat(OldTensorDescriptor::GetLowerDimensionIdss(), low_dim_hidden_idss);
constexpr auto all_up_dim_hidden_idss =
tuple_cat(OldTensorDescriptor::GetUpperDimensionIdss(), up_dim_hidden_idss);
container_cat(OldTensorDescriptor::GetUpperDimensionIdss(), up_dim_hidden_idss);
return DynamicTensorDescriptor_v2<decltype(all_transforms),
decltype(all_low_dim_hidden_idss),
......@@ -494,7 +494,7 @@ make_dynamic_tensor_coordinate_step_v2(const TensorDesc&, const VisibleIndex& id
// 2) all components of lower index diff will assume to be non-zero and need to be
// computed
const bool idx_diff_up_has_non_zero =
reduce_on_array(non_zero_diff_pick_up, [](auto a, auto b) { return a or b; }, false);
container_reduce(non_zero_diff_pick_up, [](auto a, auto b) { return a or b; }, false);
do_transforms(itran) = idx_diff_up_has_non_zero;
......
......@@ -9,30 +9,11 @@ namespace ck {
template <index_t N>
using MultiIndex = Array<index_t, N>;
#if 1 // works
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(const Xs&... xs)
{
return make_array<const index_t>(std::forward<const Xs>(xs)...);
}
#else // doesn't work: don't know how to get the lvalue/rvalue reference correct
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs&&... xs)
{
return make_array<const index_t>(std::forward<const Xs>(xs)...);
}
#endif
#else
template <index_t N>
using MultiIndex = StaticallyIndexedArray<index_t, N>;
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(const Xs&... xs)
{
return make_statically_indexed_array<const index_t>(std::forward<const Xs>(xs)...);
}
#endif
template <index_t NSize>
__host__ __device__ constexpr auto make_zero_multi_index()
......@@ -93,5 +74,84 @@ __host__ __device__ constexpr auto operator*(const MultiIndex<NSize>& a, const T
return r;
}
#else
template <index_t N>
using MultiIndex = StaticallyIndexedArray<index_t, N>;
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(const Xs&... xs)
{
return make_statically_indexed_array<const index_t>(std::forward<const Xs>(xs)...);
}
template <index_t NSize>
__host__ __device__ constexpr auto make_zero_multi_index()
{
return unpack([](auto... xs) { return make_multi_index(xs...); },
typename uniform_sequence_gen<NSize, 0>::type{});
}
template <typename T>
__host__ __device__ constexpr auto to_multi_index(const T& x)
{
return unpack([](auto... ys) { return make_multi_index(ys...); }, x);
}
// Ideally MultiIndex<NSize> should be used here instead of Tuple<Ys...>, even though
// the former is an alias of the latter, but the compiler cannot deduce NSize when
// the parameter is written as MultiIndex<NSize>.
// TODO: how to fix this?
template <typename... Ys, typename X>
__host__ __device__ constexpr auto operator+=(Tuple<Ys...>& y, const X& x)
{
static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Ys);
static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; });
return y;
}
template <typename... Ys, typename X>
__host__ __device__ constexpr auto operator-=(Tuple<Ys...>& y, const X& x)
{
static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Ys);
static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; });
return y;
}
template <typename... Xs, typename Y>
__host__ __device__ constexpr auto operator+(const Tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] + y[i]; });
return r;
}
template <typename... Xs, typename Y>
__host__ __device__ constexpr auto operator-(const Tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] - y[i]; });
return r;
}
template <typename... Xs, typename Y>
__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] * y[i]; });
return r;
}
#endif
} // namespace ck
#endif
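As a quick reference for the operators above, a standalone sketch of the elementwise behavior they give MultiIndex, written with std::array instead of Tuple/static_for (illustration only, not part of the commit):

#include <array>
#include <cstddef>

template <std::size_t N>
constexpr std::array<int, N> add(std::array<int, N> a, std::array<int, N> b)
{
    std::array<int, N> r{};
    for(std::size_t i = 0; i < N; ++i)
    {
        r[i] = a[i] + b[i]; // same element-by-element rule as operator+ above
    }
    return r;
}

// add({1, 2}, {3, 4}) == {4, 6}; operator+=, operator-, operator-= and
// operator* follow the same pattern with the corresponding scalar operation.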
......@@ -3,17 +3,17 @@
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "array.hpp"
#include "array_helper.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "statically_indexed_array.hpp"
#include "array_element_picker.hpp"
namespace ck {
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto push_back(const Array<TData, NSize>& a, const TData& x)
__host__ __device__ constexpr auto container_push_back(const Array<TData, NSize>& a, const TData& x)
{
Array<TData, NSize + 1> r;
......@@ -25,137 +25,64 @@ __host__ __device__ constexpr auto push_back(const Array<TData, NSize>& a, const
}
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
Sequence<IRs...> /*new2old*/)
__host__ __device__ constexpr auto
container_reorder_given_new2old(const Array<TData, NSize>& old_array, Sequence<IRs...> /*new2old*/)
{
static_assert(NSize == sizeof...(IRs), "NSize not consistent");
static_assert(NSize == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return Array<TData, NSize>{old_array[IRs]...};
return make_array(old_array[Number<IRs>{}]...);
}
template <typename TData, index_t NSize, typename MapOld2New>
struct lambda_reorder_array_given_old2new
{
const Array<TData, NSize>& old_array;
Array<TData, NSize>& new_array;
__host__ __device__ constexpr lambda_reorder_array_given_old2new(
const Array<TData, NSize>& old_array_, Array<TData, NSize>& new_array_)
: old_array(old_array_), new_array(new_array_)
{
}
template <index_t IOldDim>
__host__ __device__ constexpr void operator()(Number<IOldDim>) const
{
TData old_data = old_array[IOldDim];
constexpr index_t INewDim = MapOld2New::At(Number<IOldDim>{});
new_array(Number<INewDim>{}) = old_data;
}
};
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
Sequence<IRs...> /*old2new*/)
__host__ __device__ constexpr auto
container_reorder_given_old2new(const Array<TData, NSize>& old_array, Sequence<IRs...> old2new)
{
Array<TData, NSize> new_array;
static_assert(NSize == sizeof...(IRs), "NSize not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");
static_for<0, NSize, 1>{}(
lambda_reorder_array_given_old2new<TData, NSize, Sequence<IRs...>>(old_array, new_array));
return new_array;
return container_reorder_given_new2old(
old_array, typename sequence_map_inverse<decltype(old2new)>::type{});
}
// emulate constexpr lambda for array
template <typename F, typename X, typename Y, typename Z>
struct lambda_array_math
template <typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_new2old(const Tuple<Ts...>& old_tuple,
Sequence<IRs...> /*new2old*/)
{
const F& f;
const X& x;
const Y& y;
Z& z;
__host__ __device__ constexpr lambda_array_math(const F& f_, const X& x_, const Y& y_, Z& z_)
: f(f_), x(x_), y(y_), z(z_)
{
}
template <index_t IDim_>
__host__ __device__ constexpr void operator()(Number<IDim_>) const
{
constexpr auto IDim = Number<IDim_>{};
z(IDim) = f(x[IDim], y[IDim]);
}
};
// Array = Sequence - Array
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
static_assert(sizeof...(Ts) == sizeof...(IRs), "wrong! size not consistent");
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return result;
return make_tuple(old_tuple[Number<IRs>{}]...);
}
// Array = Array * TData
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
template <typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_old2new(const Tuple<Ts...>& old_tuple,
Sequence<IRs...> old2new)
{
Array<TData, NSize> result;
for(index_t i = 0; i < NSize; ++i)
{
result(i) = a[i] * v;
}
return result;
return container_reorder_given_new2old(
old_tuple, typename sequence_map_inverse<decltype(old2new)>::type{});
}
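A small standalone sketch of the reorder semantics, using std::array (illustration only, not part of the commit): new2old[i] names which old element lands at new position i, and the old2new overloads simply invert that map before forwarding, as done above.

#include <array>
#include <cstddef>

template <std::size_t N>
constexpr std::array<int, N> reorder_new2old(std::array<int, N> old_arr,
                                             std::array<std::size_t, N> new2old)
{
    std::array<int, N> new_arr{};
    for(std::size_t i = 0; i < N; ++i)
        new_arr[i] = old_arr[new2old[i]];
    return new_arr;
}

// reorder_new2old({10, 20, 30}, {2, 0, 1}) == {30, 10, 20}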
template <typename TData, typename Arr, typename Reduce>
__host__ __device__ constexpr TData reduce_on_array(const Arr& a, Reduce f, TData init)
template <typename TData, typename Container, typename Reduce>
__host__ __device__ constexpr TData container_reduce(const Container& a, Reduce f, TData init)
{
// static_assert(is_same<typename Arr::data_type, TData>::value, "wrong! different data type");
static_assert(Arr::Size() > 0, "wrong");
static_assert(Container::Size() > 0, "wrong");
TData result = init;
static_for<0, Arr::Size(), 1>{}([&](auto I) { result = f(result, a[I]); });
static_for<0, Container::Size(), 1>{}([&](auto I) { result = f(result, a[I]); });
return result;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_inclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData init)
container_reverse_inclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
#if 0
#pragma unroll
for(index_t i = NSize - 1; i >= 0; --i)
{
r = f(r, x[i]);
y(i) = r;
}
#else
static_for<NSize - 1, 0, -1>{}([&](auto i) {
r = f(r, x[i]);
y(i) = r;
......@@ -163,36 +90,61 @@ reverse_inclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData in
r = f(r, x[Number<0>{}]);
y(Number<0>{}) = r;
#endif
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData init)
container_reverse_exclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
#if 0
#pragma unroll
for(index_t i = NSize - 1; i > 0; --i)
{
static_for<NSize - 1, 0, -1>{}([&](auto i) {
y(i) = r;
r = f(r, x[i]);
}
});
y(Number<0>{}) = r;
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto container_reverse_exclusive_scan(
const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
{
StaticallyIndexedArray<TData, NSize> y;
TData r = init;
y(0) = r;
#else
static_for<NSize - 1, 0, -1>{}([&](auto i) {
y(i) = r;
r = f(r, x[i]);
});
y(Number<0>{}) = r;
#endif
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto container_reverse_inclusive_scan(
const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
{
StaticallyIndexedArray<TData, NSize> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
r = f(r, x[i]);
y(i) = r;
});
r = f(r, x[Number<0>{}]);
y(Number<0>{}) = r;
return y;
}
......@@ -204,19 +156,21 @@ __host__ __device__ constexpr auto container_cat(const X& x, const Ys&... ys)
}
template <typename T, index_t NX, index_t NY>
__host__ __device__ constexpr auto container_cat(const Array<T, NX>& x, const Array<T, NY>& y)
__host__ __device__ constexpr auto container_cat(const Array<T, NX>& ax, const Array<T, NY>& ay)
{
Array<T, NX + NY> z;
static_for<0, NX, 1>{}([&](auto i) { z(i) = x[i]; });
static_for<0, NY, 1>{}([&](auto i) { z(i + Number<NX>{}) = y[i]; });
return unpack2(
[&](auto&&... zs) { return make_array(std::forward<decltype(zs)>(zs)...); }, ax, ay);
}
return z;
template <typename... X, typename... Y>
__host__ __device__ constexpr auto container_cat(const Tuple<X...>& tx, const Tuple<Y...>& ty)
{
return unpack2(
[&](auto&&... zs) { return make_tuple(std::forward<decltype(zs)>(zs)...); }, tx, ty);
}
template <typename T, index_t N>
__host__ __device__ constexpr auto container_cat(const Array<T, N>& x)
template <typename Container>
__host__ __device__ constexpr auto container_cat(const Container& x)
{
return x;
}
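For reference, a standalone sketch of the concatenation the unpack2-based container_cat performs, using std::array (illustration only, not part of the commit):

#include <array>
#include <cstddef>

template <typename T, std::size_t NX, std::size_t NY>
constexpr std::array<T, NX + NY> cat(std::array<T, NX> x, std::array<T, NY> y)
{
    std::array<T, NX + NY> z{};
    for(std::size_t i = 0; i < NX; ++i)
        z[i] = x[i];
    for(std::size_t i = 0; i < NY; ++i)
        z[NX + i] = y[i];
    return z;
}

// cat(std::array<int, 2>{1, 2}, std::array<int, 3>{3, 4, 5}) == {1, 2, 3, 4, 5}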
......
......@@ -9,7 +9,7 @@
#define CK_UNSIGNED_INDEX_TYPE 0
// multi index
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 1
// device backend
#define CK_DEVICE_BACKEND_AMD 1
......
......@@ -63,7 +63,7 @@ struct ford_impl
for(index_t i = 0; i < RemainLengths::Front(); ++i)
{
ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
f, push_back(current_ordered_id, i));
f, container_push_back(current_ordered_id, i));
}
}
};
......@@ -77,7 +77,7 @@ struct ford_impl<Sequence<>, Orders>
__host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
{
// retrieve unordered Id
f(reorder_array_given_old2new(current_ordered_id, Orders{}));
f(container_reorder_given_old2new(current_ordered_id, Orders{}));
}
};
......
......@@ -49,7 +49,7 @@ __host__ __device__ constexpr auto unpack(F&& f, X&& x)
// TODO: properly implement unpack that takes any number of containers
template <typename F, typename X, typename Y>
__host__ __device__ constexpr auto unpack(F&& f, X&& x, Y&& y)
__host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y)
{
using X_ = remove_reference_t<X>;
using Y_ = remove_reference_t<Y>;
......
......@@ -36,42 +36,5 @@ __host__ __device__ constexpr auto make_statically_indexed_array()
return StaticallyIndexedArray<X, 0>();
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_exclusive_scan_on_array(const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
{
StaticallyIndexedArray<TData, NSize> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
y(i) = r;
r = f(r, x[i]);
});
y(Number<0>{}) = r;
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_inclusive_scan_on_array(const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
{
StaticallyIndexedArray<TData, NSize> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
r = f(r, x[i]);
y(i) = r;
});
r = f(r, x[Number<0>{}]);
y(Number<0>{}) = r;
return y;
}
} // namespace ck
#endif
......@@ -128,12 +128,8 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
__host__ __device__ constexpr Tuple(Tuple&&) = default;
#if 0
template <typename... Ys,
typename std::enable_if<sizeof...(Ys) == sizeof...(Xs), bool>::type = false>
#else
template <typename... Ys>
#endif
__host__ __device__ explicit constexpr Tuple(const Tuple<Ys...>& y)
: base(static_cast<
const detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(Ys), 1>::type,
......@@ -141,12 +137,8 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
{
}
#if 0
template <typename... Ys,
typename std::enable_if<sizeof...(Ys) == sizeof...(Xs), bool>::type = false>
#else
template <typename... Ys>
#endif
__host__ __device__ explicit constexpr Tuple(Tuple<Ys...>&& y)
: base(static_cast<
detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(Ys), 1>::type,
......@@ -154,7 +146,9 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
{
}
template <typename... Ys, typename std::enable_if<sizeof...(Ys) >= 1, bool>::type = false>
template <typename... Ys,
typename std::enable_if<sizeof...(Ys) == sizeof...(Xs) && sizeof...(Ys) >= 1,
bool>::type = false>
__host__ __device__ explicit constexpr Tuple(Ys&&... ys) : base(std::forward<Ys>(ys)...)
{
}
......
......@@ -12,13 +12,6 @@ __host__ __device__ constexpr auto generate_tuple(F&& f, Number<N>)
typename arithmetic_sequence_gen<0, N, 1>::type{});
}
template <typename... Tuples>
__host__ __device__ constexpr auto tuple_cat(Tuples&&... tuples)
{
return unpack([&](auto&&... xs) { return make_tuple(std::forward<decltype(xs)>(xs)...); },
std::forward<Tuples>(tuples)...);
}
namespace detail {
template <typename F, typename X, index_t... Is>
......
......@@ -52,7 +52,7 @@ void device_dummy_dynamic_transform_v1(InDesc,
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, make_multi_index(0, 0));
make_dynamic_tensor_coordinate<2>(in_gemmk_gemmn_global_desc, make_multi_index(0, 0));
for(index_t iter = 0; iter < 10; ++iter)
{
......@@ -115,10 +115,10 @@ void device_dummy_dynamic_transform_v1(InDesc,
const DynamicNativeTensorDescriptor<4>,
const DynamicNativeTensorDescriptor<4>,
const DynamicNativeTensorDescriptor<4>,
const Array<index_t, 2>,
const Array<index_t, 2>,
const Array<index_t, 2>,
const Array<index_t, 2>>,
const MultiIndex<2>,
const MultiIndex<2>,
const MultiIndex<2>,
const MultiIndex<2>>,
dim3(GridSize),
dim3(BlockSize),
0,
......
......@@ -96,7 +96,7 @@ void device_dummy_dynamic_transform_v2(InDesc,
for(index_t j = 0; j < nrepeat; ++j)
{
#if 0
#if 1
launch_kernel(run_gridwise_operation<DummyDynamicTransform_v2_1<BlockSize>,
index_t* const,
float* const,
......
......@@ -549,7 +549,7 @@ int main(int argc, char* argv[])
#endif
}
#if 0
#if 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......