Dynamic tensor descriptor v2 now produces correct results, but spills too many registers

b23e7f8e · Chao Liu · 0a944e8f · b23e7f8e · b23e7f8e · b23e7f8e
Commit b23e7f8e authored Sep 11, 2020 by Chao Liu
6 changed files
--- a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper_v2.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper_v2.hpp
@@ -15,7 +15,7 @@ make_dynamic_native_tensor_descriptor_packed_v2(const MultiIndex<N>& lengths)
    constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{});
    constexpr auto up_dim_hidden_idss =
        make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{});
-    constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<0, N, 1>::type{};
+    constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
    const index_t element_space_size =
        reduce_on_array(lengths, math::multiplies<index_t>{}, index_t{1});
@@ -37,7 +37,7 @@ make_dynamic_native_tensor_descriptor_v2(const MultiIndex<N>& lengths, const Mul
    constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{});
    constexpr auto up_dim_hidden_idss =
        make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{});
-    constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<0, N, 1>::type{};
+    constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
    index_t element_space_size = 1;

--- a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
@@ -282,6 +282,8 @@ struct DynamicTensorCoordinateStep_v2
    {
    }
+    __host__ __device__ constexpr const auto& GetIndexDiff() const { return GetVisibleIndexDiff(); }
    // private:
    __host__ __device__ constexpr const auto& GetVisibleIndexDiff() const
    {
@@ -510,7 +512,12 @@ __host__ __device__ void move_dynamic_tensor_coordinate_v2(const TensorDesc& ten
    // this is what needs to be updated
    auto& idx_hidden = coord.GetHiddenIndex();
-    // update hidden index
+    // update visible index
+    auto idx_hidden_pick_visible =
+        pick_array_element(idx_hidden, TensorDesc::GetVisibleDimensionIds());
+    idx_hidden_pick_visible += coord_step.GetIndexDiff();
+    // update rest of hidden index
    static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
        const auto& tran        = tensor_desc.GetTransforms().At(itran);
        constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);

--- a/composable_kernel/include/utility/array.hpp
+++ b/composable_kernel/include/utility/array.hpp
@@ -147,6 +147,18 @@ struct Array
        return new_array;
    }
+    // Concatenation: returns a new Array of NSize + NAppend elements holding this
+    // array's elements followed by the elements of xs. Neither input is modified.
+    template <index_t NAppend>
+    __host__ __device__ constexpr auto Append(const Array<TData, NAppend>& xs) const
+    {
+        Array<TData, NSize + NAppend> joined;
+        // front part: positions [0, NSize) come from this array
+        static_for<0, NSize, 1>{}([&](auto isrc) constexpr { joined(isrc) = (*this)[isrc]; });
+        // back part: positions [NSize, NSize + NAppend) come from xs
+        static_for<0, NAppend, 1>{}(
+            [&](auto iapp) constexpr { joined(NSize + iapp) = xs[iapp]; });
+        return joined;
+    }
 };
 // Arr: Array

--- a/composable_kernel/include/utility/array_helper.hpp
+++ b/composable_kernel/include/utility/array_helper.hpp
@@ -5,6 +5,12 @@
 namespace ck {
+// Builds an Array from a list of values. The element type is the type of the first
+// argument (X); the size is the total number of arguments passed.
+template <typename X, typename... Xs>
+__host__ __device__ constexpr auto make_array(const X& x, const Xs&... xs)
+{
+    // one leading element plus however many are in the pack
+    using result_type = Array<X, 1 + sizeof...(Xs)>;
+    return result_type{{x, xs...}};
+}
 template <typename Arr, typename Picks>
 __host__ __device__ constexpr auto pick_array_element(Arr& a, Picks)
 {

--- a/composable_kernel/include/utility/print.hpp
+++ b/composable_kernel/include/utility/print.hpp
@@ -2,6 +2,7 @@
 #define CK_PRINT_HPP
 #include "array.hpp"
+#include "array_helper.hpp"
 #include "sequence.hpp"
 namespace ck {
@@ -12,8 +13,6 @@ __host__ __device__ void print_array(const char* s, T a)
    using data_type         = typename decltype(a)::data_type;
    constexpr index_t nsize = a.Size();
-    static_assert(nsize >= 0 && nsize <= 10, "wrong!");
    if constexpr(is_same<data_type, uint32_t>{})
    {
        if constexpr(nsize == 0)
@@ -103,6 +102,12 @@ __host__ __device__ void print_array(const char* s, T a)
                   a[8],
                   a[9]);
        }
+        else
+        {
+            printf("%s size %u, {", s, nsize);
+            static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%u, ", a[i]); });
+            printf("}\n");
+        }
    }
    else if constexpr(is_same<data_type, int32_t>{})
    {
@@ -193,6 +198,32 @@ __host__ __device__ void print_array(const char* s, T a)
                   a[8],
                   a[9]);
        }
+        else
+        {
+            printf("%s size %d, {", s, nsize);
+            static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", a[i]); });
+            printf("}\n");
+        }
+    }
+}
+// Debug helper: prints the label s, the array size, and then every element of a
+// prefixed by its index, e.g. "label size 2, {[0] 5, [1] 7, }".
+// Only arrays whose data_type is uint32_t or int32_t are printed; any other element
+// type produces no output.
+template <typename T>
+__host__ __device__ void print_array_v2(const char* s, T a)
+{
+    using data_type         = typename decltype(a)::data_type;
+    constexpr index_t nsize = a.Size();
+    if constexpr(is_same<data_type, uint32_t>{})
+    {
+        // The size and the element index are index_t, not data_type, so they are
+        // printed with %d exactly as in the int32_t branch below; only the element
+        // itself is unsigned. NOTE(review): assumes index_t is a signed 32-bit
+        // integer and i.value has that type - confirm against the index_t typedef.
+        printf("%s size %d, {", s, nsize);
+        static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%d] %u, ", i.value, a[i]); });
+        printf("}\n");
+    }
+    else if constexpr(is_same<data_type, int32_t>{})
+    {
+        printf("%s size %d, {", s, nsize);
+        static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%d] %d, ", i.value, a[i]); });
+        printf("}\n");
    }
 }

--- a/driver/include/device_dummy_dynamic_transform.hpp
+++ b/driver/include/device_dummy_dynamic_transform.hpp
@@ -54,7 +54,7 @@ void device_dummy_dynamic_transform(InDesc,
        auto in_gemmk_gemmn_coord =
            make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0});
-        for(index_t iter = 0; iter < 100; ++iter)
+        for(index_t iter = 0; iter < 10; ++iter)
        {
            constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
@@ -190,17 +190,14 @@ void device_dummy_dynamic_transform_v2(InDesc,
            make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0});
        const auto in_gemmk_gemmn_coord_step =
-            make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{1, 0});
+            make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 1});
        for(index_t iter = 0; iter < 100; ++iter)
        {
-            constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
            printf("iter %d\n", iter);
+            print_array_v2("visible idx: ", in_gemmk_gemmn_coord.GetIndex());
-            print_array("idx: ", in_gemmk_gemmn_coord.GetIndex());
+            print_array_v2("hidden idx: ", in_gemmk_gemmn_coord.GetHiddenIndex());
            printf("offset: %d\n", in_gemmk_gemmn_coord.GetOffset());
            printf("\n");
            move_dynamic_tensor_coordinate_v2(