Commit b23e7f8e authored by Chao Liu's avatar Chao Liu
Browse files

dynamic tensor descriptor v2 can produce correct results, but spills too many registers

parent 0a944e8f
...@@ -15,7 +15,7 @@ make_dynamic_native_tensor_descriptor_packed_v2(const MultiIndex<N>& lengths) ...@@ -15,7 +15,7 @@ make_dynamic_native_tensor_descriptor_packed_v2(const MultiIndex<N>& lengths)
constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{}); constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{});
constexpr auto up_dim_hidden_idss = constexpr auto up_dim_hidden_idss =
make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{}); make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{});
constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<0, N, 1>::type{}; constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
const index_t element_space_size = const index_t element_space_size =
reduce_on_array(lengths, math::multiplies<index_t>{}, index_t{1}); reduce_on_array(lengths, math::multiplies<index_t>{}, index_t{1});
...@@ -37,7 +37,7 @@ make_dynamic_native_tensor_descriptor_v2(const MultiIndex<N>& lengths, const Mul ...@@ -37,7 +37,7 @@ make_dynamic_native_tensor_descriptor_v2(const MultiIndex<N>& lengths, const Mul
constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{}); constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{});
constexpr auto up_dim_hidden_idss = constexpr auto up_dim_hidden_idss =
make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{}); make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{});
constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<0, N, 1>::type{}; constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
index_t element_space_size = 1; index_t element_space_size = 1;
......
...@@ -282,6 +282,8 @@ struct DynamicTensorCoordinateStep_v2 ...@@ -282,6 +282,8 @@ struct DynamicTensorCoordinateStep_v2
{ {
} }
// Shorter-named accessor: forwards to GetVisibleIndexDiff() and hands back the
// same const reference it returns.
__host__ __device__ constexpr const auto& GetIndexDiff() const
{
    return this->GetVisibleIndexDiff();
}
// private: // private:
__host__ __device__ constexpr const auto& GetVisibleIndexDiff() const __host__ __device__ constexpr const auto& GetVisibleIndexDiff() const
{ {
...@@ -510,7 +512,12 @@ __host__ __device__ void move_dynamic_tensor_coordinate_v2(const TensorDesc& ten ...@@ -510,7 +512,12 @@ __host__ __device__ void move_dynamic_tensor_coordinate_v2(const TensorDesc& ten
// this is what needs to be updated // this is what needs to be updated
auto& idx_hidden = coord.GetHiddenIndex(); auto& idx_hidden = coord.GetHiddenIndex();
// update hidden index // update visible index
auto idx_hidden_pick_visible =
pick_array_element(idx_hidden, TensorDesc::GetVisibleDimensionIds());
idx_hidden_pick_visible += coord_step.GetIndexDiff();
// update rest of hidden index
static_for<ntransform - 1, -1, -1>{}([&](auto itran) { static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
const auto& tran = tensor_desc.GetTransforms().At(itran); const auto& tran = tensor_desc.GetTransforms().At(itran);
constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran); constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
......
...@@ -147,6 +147,18 @@ struct Array ...@@ -147,6 +147,18 @@ struct Array
return new_array; return new_array;
} }
// Builds and returns a new Array of NSize + NAppend elements: the first NSize
// slots are copies of this array's elements, the remaining NAppend slots are
// copies of the elements of `xs`, in order. Neither input is modified.
template <index_t NAppend>
__host__ __device__ constexpr auto Append(const Array<TData, NAppend>& xs) const
{
    Array<TData, NSize + NAppend> joined;

    // front part: element i of the result comes from this array
    const auto copy_front = [&joined, this](auto i) constexpr { joined(i) = (*this)[i]; };

    // back part: element NSize + i of the result comes from xs
    const auto copy_back = [&joined, &xs](auto i) constexpr { joined(NSize + i) = xs[i]; };

    static_for<0, NSize, 1>{}(copy_front);
    static_for<0, NAppend, 1>{}(copy_back);

    return joined;
}
}; };
// Arr: Array // Arr: Array
......
...@@ -5,6 +5,12 @@ ...@@ -5,6 +5,12 @@
namespace ck { namespace ck {
// Packs the given values into an Array. The element type is the type of the
// first argument and the size is the total number of arguments; the remaining
// arguments are used to brace-initialize the following elements.
template <typename X, typename... Xs>
__host__ __device__ constexpr auto make_array(const X& x, const Xs&... xs)
{
    // one slot for x plus one per trailing argument
    using result_type = Array<X, sizeof...(Xs) + 1>;

    return result_type{{x, xs...}};
}
template <typename Arr, typename Picks> template <typename Arr, typename Picks>
__host__ __device__ constexpr auto pick_array_element(Arr& a, Picks) __host__ __device__ constexpr auto pick_array_element(Arr& a, Picks)
{ {
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#define CK_PRINT_HPP #define CK_PRINT_HPP
#include "array.hpp" #include "array.hpp"
#include "array_helper.hpp"
#include "sequence.hpp" #include "sequence.hpp"
namespace ck { namespace ck {
...@@ -12,8 +13,6 @@ __host__ __device__ void print_array(const char* s, T a) ...@@ -12,8 +13,6 @@ __host__ __device__ void print_array(const char* s, T a)
using data_type = typename decltype(a)::data_type; using data_type = typename decltype(a)::data_type;
constexpr index_t nsize = a.Size(); constexpr index_t nsize = a.Size();
static_assert(nsize >= 0 && nsize <= 10, "wrong!");
if constexpr(is_same<data_type, uint32_t>{}) if constexpr(is_same<data_type, uint32_t>{})
{ {
if constexpr(nsize == 0) if constexpr(nsize == 0)
...@@ -103,6 +102,12 @@ __host__ __device__ void print_array(const char* s, T a) ...@@ -103,6 +102,12 @@ __host__ __device__ void print_array(const char* s, T a)
a[8], a[8],
a[9]); a[9]);
} }
else
{
printf("%s size %u, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%u, ", a[i]); });
printf("}\n");
}
} }
else if constexpr(is_same<data_type, int32_t>{}) else if constexpr(is_same<data_type, int32_t>{})
{ {
...@@ -193,6 +198,32 @@ __host__ __device__ void print_array(const char* s, T a) ...@@ -193,6 +198,32 @@ __host__ __device__ void print_array(const char* s, T a)
a[8], a[8],
a[9]); a[9]);
} }
else
{
printf("%s size %d, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", a[i]); });
printf("}\n");
}
}
}
// Debug helper: prints the size of an array-like object, then every element
// prefixed by its index.
//
// s: label printed first (consumed by printf through "%s").
// a: object exposing a nested `data_type`, a compile-time `Size()`, and
//    operator[] taking the compile-time index supplied by static_for.
//
// Output: "<s> size <n>, {" followed by "[i] v, " for each element, then "}"
// and a newline. Only uint32_t and int32_t element types are handled; for any
// other data_type the function prints nothing.
template <typename T>
__host__ __device__ void print_array_v2(const char* s, T a)
{
    using data_type = typename decltype(a)::data_type;

    constexpr index_t nsize = a.Size();

    if constexpr(is_same<data_type, uint32_t>{})
    {
        // The size and the loop index are printed with the same specifier as the
        // elements. Cast them explicitly so the argument type matches "%u"
        // however index_t happens to be defined.
        printf("%s size %u, {", s, static_cast<unsigned int>(nsize));
        static_for<0, nsize, 1>{}([&a](auto i) constexpr {
            printf("[%u] %u, ", static_cast<unsigned int>(i.value), a[i]);
        });
        printf("}\n");
    }
    else if constexpr(is_same<data_type, int32_t>{})
    {
        // Same reasoning as above, for the signed specifier "%d".
        printf("%s size %d, {", s, static_cast<int>(nsize));
        static_for<0, nsize, 1>{}([&a](auto i) constexpr {
            printf("[%d] %d, ", static_cast<int>(i.value), a[i]);
        });
        printf("}\n");
    }
}
} }
......
...@@ -54,7 +54,7 @@ void device_dummy_dynamic_transform(InDesc, ...@@ -54,7 +54,7 @@ void device_dummy_dynamic_transform(InDesc,
auto in_gemmk_gemmn_coord = auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0}); make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0});
for(index_t iter = 0; iter < 100; ++iter) for(index_t iter = 0; iter < 10; ++iter)
{ {
constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0}; constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
...@@ -190,17 +190,14 @@ void device_dummy_dynamic_transform_v2(InDesc, ...@@ -190,17 +190,14 @@ void device_dummy_dynamic_transform_v2(InDesc,
make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0}); make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0});
const auto in_gemmk_gemmn_coord_step = const auto in_gemmk_gemmn_coord_step =
make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{1, 0}); make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 1});
for(index_t iter = 0; iter < 100; ++iter) for(index_t iter = 0; iter < 100; ++iter)
{ {
constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
printf("iter %d\n", iter); printf("iter %d\n", iter);
print_array_v2("visible idx: ", in_gemmk_gemmn_coord.GetIndex());
print_array("idx: ", in_gemmk_gemmn_coord.GetIndex()); print_array_v2("hidden idx: ", in_gemmk_gemmn_coord.GetHiddenIndex());
printf("offset: %d\n", in_gemmk_gemmn_coord.GetOffset()); printf("offset: %d\n", in_gemmk_gemmn_coord.GetOffset());
printf("\n"); printf("\n");
move_dynamic_tensor_coordinate_v2( move_dynamic_tensor_coordinate_v2(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment