Commit c1eaba27 authored by Chao Liu

prototype dynamic tensor descriptor

parent f1541994
@@ -2,10 +2,9 @@
#define CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "dynamic_tensor_coordinate.hpp"
namespace ck {
__host__ __device__ constexpr auto
@@ -83,7 +82,7 @@ template <index_t BlockSize>
struct DummyDynamicTransform
{
__device__ void Run_v1(index_t* const __restrict__ p_wei_global,
- index_t* const __restrict__ p_in_global,
+ float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
@@ -522,7 +521,7 @@ struct DummyDynamicTransform
}
__device__ void Run_v2(index_t* const __restrict__ p_wei_global,
- index_t* const __restrict__ p_in_global,
+ float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
@@ -542,24 +541,23 @@ struct DummyDynamicTransform
const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});
#pragma unroll 1
- for(index_t iter = 0; iter < 100; ++iter)
+ MultiIndex<2> idx;
+ // initialize idx
+ for(index_t i = 0; i < 2; ++i)
{
- //
- MultiIndex<2> idx;
+ idx(i) = p_wei_global[get_thread_local_1d_id() + i];
}
- // initialize idx
- for(index_t i = 0; i < 2; ++i)
- {
- idx(i) = p_wei_global[10 * iter + get_thread_local_1d_id() + i];
- }
+ const index_t niter = p_wei_global[10];
+ auto in_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx);
- // offset
- index_t offset = in_gemmk_gemmn_global_desc.CalculateOffset(idx);
+ for(index_t iter = 0; iter < 100; ++iter)
{
+ constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
- // is_in_bound
- bool is_in_bound =
-     in_gemmk_gemmn_global_desc.IsValidUpperIndexMappedToValidLowerIndex(idx);
+ in_gemmk_gemmn_coord += gemmk1_gemmn0;
// write
float value = 1;
@@ -575,14 +573,14 @@ struct DummyDynamicTransform
true,
1,
p_out_global,
- offset,
- is_in_bound,
- out_n_k_ho_wo_global_desc.GetElementSpace());
+ in_gemmk_gemmn_coord.GetOffset(),
+ in_gemmk_gemmn_coord.IsOffsetValidAssumingUpperIndexIsValid(),
+ in_gemmk_gemmn_global_desc.GetElementSpace());
}
}
__device__ void Run(index_t* const __restrict__ p_wei_global,
- index_t* const __restrict__ p_in_global,
+ float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
......
@@ -344,7 +344,8 @@ struct DynamicMerge
// If neither 1) nor 2) is satisfied, then the calculation will be done at
// run-time each time this function is called, and can be very expensive.
- LowerIndex idx_low_diff_const = CalculateLowerIndex(idx_up_diff);
+ LowerIndex idx_low_diff_const;
+ CalculateLowerIndex(idx_low_diff_const, idx_up_diff);
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
@@ -361,15 +362,15 @@
low_lengths_[i] + idx_low_diff_const[i];
#endif
- index_t idx_low_tmp[i] = idx_low_old[i] + carry;
+ index_t idx_low_tmp = idx_low_old[i] + carry;
- bool do_carry = idx_low_tmp[i] >= idx_low_length_minus_idx_low_diff_const;
+ bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_low_diff_const;
#if 0
- bool do_borrow = idx_low_tmp[i] < -idx_low_diff_const[i];
+ bool do_borrow = idx_low_tmp < -idx_low_diff_const[i];
#endif
idx_low_diff(i) =
- do_carry ? -idx_low_length_minus_idx_low_diff_const : idx_low_diff_const;
+ do_carry ? -idx_low_length_minus_idx_low_diff_const : idx_low_diff_const[i];
#if 0
idx_low_diff(i) =
do_borrow ? idx_low_length_plus_idx_low_diff_const : idx_low_diff[i];
......
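The hunk above only shows the changed lines of the carry logic. As an editorial aid, here is a minimal standalone sketch of the carry propagation that DynamicMerge::CalculateLowerIndexDiff appears to perform; the free-function form, the names, and the fixed 3-d rank are illustrative assumptions, not the commit's API.

#include <cstdint>
using index_t = int32_t;

// Sketch: propagate an upper-index step through a 3-d merge transform.
// idx_low_diff_const is the raw per-dimension diff; carries are resolved
// from the last dimension toward the first, which absorbs the final carry.
inline void update_lower_index_diff(const index_t low_lengths[3],
                                    const index_t idx_low_old[3],
                                    const index_t idx_low_diff_const[3],
                                    index_t idx_low_diff[3])
{
    index_t carry = 0;
    for(index_t i = 2; i > 0; --i)
    {
        // wrap threshold: old index plus carry reaching it means the raw
        // diff would step past this dimension's length
        const index_t wrap = low_lengths[i] - idx_low_diff_const[i];
        const bool do_carry = idx_low_old[i] + carry >= wrap;
        // on carry, the effective diff wraps back by the dimension length
        idx_low_diff[i] = do_carry ? -wrap : idx_low_diff_const[i];
        carry = do_carry ? 1 : 0;
    }
    // the first dimension absorbs the final carry without a check
    idx_low_diff[0] = idx_low_diff_const[0] + carry;
}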
#ifndef CK_DYNAMIC_TENSOR_COORDINATE_HPP
#define CK_DYNAMIC_TENSOR_COORDINATE_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
namespace ck {
// A "tensor cooridnate" is an opaque object that represents a "point of location" inside a tensor
// At the bare minimun, user should be able to query the following information from a tensor
// coordinate:
// 1. Tensor descriptor
// 2. Location, represented in the form of multi-index
// 3. Location, represented in the form of the offset to the origin of the tensor
// 4. If the location is inside invalid area or not, e.g. the padding area of an implicitly padded
// tensor is considered invalid, because the padding area doesn't have any physical memory
// allocation
// A tensor cooridnate also provides following functionality:
// 1. Given step size in each dimension, update itself, or return a new tensor cooridnate, so user
// can freely move the "point of location" inside the tensor
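//
// Editorial usage sketch (not part of this commit; "desc", "p_data" and "use"
// are hypothetical, assuming a 2-d descriptor and a data pointer exist):
//
//   auto coord = make_dynamic_tensor_coordinate(desc, MultiIndex<2>{0, 0});
//   coord += MultiIndex<2>{1, 0};       // move the point of location one step
//   index_t offset = coord.GetOffset(); // offset is updated incrementally
//   if(coord.IsOffsetValid())           // e.g. false inside a padding area
//       use(p_data[offset]);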
// wrapper class for DynamicNativeTensorCoordinate and DynamicTransformedTensorCoordinate
template <typename TensorDesc>
struct DynamicTensorCoordinate;
// tensor coordinate for native tensor
template <typename TensorDesc>
struct DynamicNativeTensorCoordinate
{
using type = DynamicNativeTensorCoordinate;
using tensor_desc_type = TensorDesc;
static constexpr index_t NDim = tensor_desc_type::GetNumOfDimension();
using Index = MultiIndex<NDim>;
__host__ __device__ explicit constexpr DynamicNativeTensorCoordinate(
const tensor_desc_type& tensor_desc, const Index& idx)
: tensor_desc_{tensor_desc}, idx_{idx}, offset_{tensor_desc.CalculateOffset(idx)}
{
}
__host__ __device__ constexpr auto GetTensorDescriptor() const { return tensor_desc_; }
__host__ __device__ constexpr const auto& GetUpperIndex() const { return idx_; }
__host__ __device__ constexpr const auto& GetIndex() const { return idx_; }
__host__ __device__ constexpr const index_t& GetOffset() const { return offset_; }
__host__ __device__ constexpr type operator+=(const Index& idx_diff)
{
// idx_ is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
idx_ += idx_diff;
offset_ += tensor_desc_.CalculateOffsetDiff(idx_diff);
return *this;
}
__host__ __device__ constexpr type operator-=(const Index& idx_diff)
{
// idx_ is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
idx_ -= idx_diff;
offset_ -= tensor_desc_.CalculateOffsetDiff(idx_diff);
return *this;
}
__host__ __device__ constexpr type operator+(const Index& idx_diff) const
{
type coord = *this;
coord += idx_diff;
return coord;
}
__host__ __device__ constexpr type operator-(const Index& idx_diff) const
{
type coord = *this;
coord -= idx_diff;
return coord;
}
__host__ __device__ constexpr index_t CalculateOffsetDiff(const Index& idx_diff) const
{
return tensor_desc_.CalculateOffsetDiff(idx_diff);
}
// evaluated at run-time
__host__ __device__ constexpr bool IsUpperIndexValid() const
{
return tensor_desc_.IsUpperIndexValid(idx_);
}
// evaluated at run-time
__host__ __device__ constexpr bool IsOffsetValid() const
{
// For a native tensor, the offset is valid if the upper-index is valid
return IsUpperIndexValid();
}
// evaluated at compile-time
__host__ __device__ static constexpr bool IsOffsetValidAssumingUpperIndexIsValid()
{
return true;
}
private:
const tensor_desc_type tensor_desc_;
// idx_ may be saved and updated; however, the value of some (or all) of its entries may
// never be used. The compiler should be able to remove these entries, as well as their
// calculation, as dead code.
// TODO: make sure the compiler indeed removes this dead code
Index idx_;
index_t offset_;
};
// tensor coordinate for transformed tensor
template <typename TensorDesc>
struct DynamicTransformedTensorCoordinate
{
static constexpr index_t NDimUp = TensorDesc::GetNumOfDimension();
using UpperDesc = TensorDesc;
using UpperCoord = DynamicTransformedTensorCoordinate;
using UpperIndex = MultiIndex<NDimUp>;
using LowerDesc = typename UpperDesc::LowerDesc;
using LowerCoord = typename DynamicTensorCoordinate<LowerDesc>::type;
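    // editorial note: LowerCoord is resolved through DynamicTensorCoordinate, so a
    // stack of transformed descriptors recursively bottoms out in a
    // DynamicNativeTensorCoordinate, which owns the final memory offset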
__host__ __device__ explicit constexpr DynamicTransformedTensorCoordinate(
const UpperDesc& tensor_desc_up, const UpperIndex& idx_up)
: tensor_desc_up_{tensor_desc_up},
idx_up_{idx_up},
coord_low_{tensor_desc_up.GetLowerTensorDescriptor(),
tensor_desc_up.CalculateLowerIndex(idx_up)}
{
}
__host__ __device__ constexpr auto GetTensorDescriptor() const { return tensor_desc_up_; }
__host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const
{
return coord_low_;
}
__host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return idx_up_; }
__host__ __device__ constexpr const UpperIndex& GetIndex() const { return idx_up_; }
__host__ __device__ constexpr const index_t& GetOffset() const
{
return GetLowerCoordinate().GetOffset();
}
__host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
{
// For transformation of a multi-index difference, not all transformation functions need to
// know the old lower-index or the old upper-index. We pass both of them to the
// transformation function, which itself decides whether or not to use them.
coord_low_ += tensor_desc_up_.CalculateLowerIndexDiff(
idx_up_diff, GetLowerCoordinate().GetIndex(), GetIndex());
// idx_up_ is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
idx_up_ += idx_up_diff;
return *this;
}
__host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
{
coord_low_ -= tensor_desc_up_.CalculateLowerIndexDiff(
idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
// idx_up_ is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
idx_up_ -= idx_up_diff;
return *this;
}
__host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
{
UpperCoord coord_up = *this;
coord_up += idx_up_diff;
return coord_up;
}
__host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
{
UpperCoord coord_up = *this;
coord_up -= idx_up_diff;
return coord_up;
}
// Calculate offset diff without updating the tensor coordinate.
// If idx_up_diff is known at compile time and has only non-zero entries on linear dimensions,
// then all calculation can be done at compile-time.
// TODO: this function is not compiled to the expected ISA
__host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const
{
// For transformation of a multi-index difference, not all transformation functions need to
// know the old lower-index or the old upper-index. We pass both of them to the
// transformation function, which itself decides whether or not to use them.
const auto idx_low_diff =
tensor_desc_up_.CalculateLowerIndexDiff(idx_up_diff, coord_low_.GetIndex(), idx_up_);
return coord_low_.CalculateOffsetDiff(idx_low_diff);
}
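    // editorial example (hypothetical usage): when the step is a compile-time
    // constant with non-zero entries only on linear dimensions, the offset step
    // can be precomputed once and reused without mutating the coordinate:
    //   const index_t step = coord.CalculateOffsetDiff(MultiIndex<2>{1, 0});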
// evaluated at run-time
__host__ __device__ constexpr bool IsUpperIndexValid() const
{
return tensor_desc_up_.IsUpperIndexValid(idx_up_);
}
// evaluated at run-time
__host__ __device__ constexpr bool IsOffsetValid() const
{
return IsUpperIndexValid() && coord_low_.IsOffsetValidAssumingUpperIndexIsValid();
}
// most evaluation is done at compile-time
__host__ __device__ constexpr bool IsOffsetValidAssumingUpperIndexIsValid() const
{
return tensor_desc_up_.IsValidUpperIndexMappedToValidLowerIndex(idx_up_) &&
coord_low_.IsOffsetValidAssumingUpperIndexIsValid();
}
private:
const UpperDesc tensor_desc_up_;
// idx_up_ may be calculated and updated; however, the value of some (or all) of its entries
// may never be used. The compiler should be able to remove these entries, as well as their
// calculation, as dead code.
// TODO: make sure the compiler indeed removes this dead code
UpperIndex idx_up_;
LowerCoord coord_low_;
};
template <index_t NDim>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate(const DynamicNativeTensorDescriptor<NDim>& tensor_desc,
const MultiIndex<NDim>& idx)
{
return DynamicNativeTensorCoordinate<DynamicNativeTensorDescriptor<NDim>>{tensor_desc, idx};
}
template <index_t NDim, typename... Ts>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate(const DynamicTransformedTensorDescriptor<Ts...>& tensor_desc,
const MultiIndex<NDim>& idx)
{
static_assert(DynamicTransformedTensorDescriptor<Ts...>::GetNumOfDimension() == NDim,
"wrong! inconsistent # of dimensions");
return DynamicTransformedTensorCoordinate<DynamicTransformedTensorDescriptor<Ts...>>{
tensor_desc, idx};
}
template <typename TensorDesc>
struct DynamicTensorCoordinate
{
static constexpr index_t NDim = TensorDesc::GetNumOfDimension();
using type = decltype(make_dynamic_tensor_coordinate(TensorDesc{}, MultiIndex<NDim>{}));
};
} // namespace ck
#endif
@@ -20,6 +20,11 @@ struct DynamicNativeTensorDescriptor
{
}
+ __host__ __device__ explicit constexpr DynamicNativeTensorDescriptor()
+     : lengths_{make_zero_array<index_t, NDim>()}, strides_{make_zero_array<index_t, NDim>()}
+ {
+ }
__host__ __device__ static constexpr index_t GetNumOfDimension() { return NDim; }
__host__ __device__ constexpr auto GetLengths() const { return lengths_; }
@@ -62,12 +67,10 @@ struct DynamicNativeTensorDescriptor
return offset;
}
- template <typename UpIdxDiff, typename UpIdx, typename LowIdx>
- __host__ __device__ constexpr index_t CalculateOffsetDiff(const UpIdxDiff& idx_up_diff,
-                                                           const LowIdx& /* idx_low_old */,
-                                                           const UpIdx& /* idx_up_old */) const
+ template <typename IdxDiff>
+ __host__ __device__ constexpr index_t CalculateOffsetDiff(const IdxDiff& idx_diff) const
{
- return CalculateOffset(idx_up_diff);
+ return CalculateOffset(idx_diff);
}
template <typename Idx>
@@ -92,13 +95,16 @@ template <typename LowTensorDescriptor, // DynamicNativeTensorDescriptor or
typename UpDimensionIds> // Tuple<Sequence<...>>
struct DynamicTransformedTensorDescriptor
{
- const LowTensorDescriptor low_tensor_desc_;
- const Transforms transforms_;
+ using LowerDesc = LowTensorDescriptor;
+ using UpperDesc = DynamicTransformedTensorDescriptor;
+ static constexpr index_t NTransform = Transforms::Size();
+ const LowerDesc low_tensor_desc_;
+ const Transforms transforms_;
__host__ __device__ static constexpr index_t GetNumOfLowerDimension()
{
- return LowTensorDescriptor::GetNumOfDimension();
+ return LowerDesc::GetNumOfDimension();
}
__host__ __device__ static constexpr index_t GetNumOfUpperDimension()
@@ -138,7 +144,7 @@ struct DynamicTransformedTensorDescriptor
};
__host__ __device__ explicit constexpr DynamicTransformedTensorDescriptor(
- const LowTensorDescriptor& low_tensor_desc, const Transforms& transforms)
+ const LowerDesc& low_tensor_desc, const Transforms& transforms)
: low_tensor_desc_{low_tensor_desc}, transforms_{transforms}
{
static_assert(NTransform == Transforms::Size() && NTransform == LowDimensionIds::Size() &&
@@ -178,7 +184,12 @@ struct DynamicTransformedTensorDescriptor
// of lower-tensor-descriptor
}
- __host__ __device__ static constexpr auto GetNumOfDimension()
+ __host__ __device__ explicit constexpr DynamicTransformedTensorDescriptor()
+     : low_tensor_desc_{}, transforms_{}
+ {
+ }
+ __host__ __device__ static constexpr index_t GetNumOfDimension()
{
return GetNumOfUpperDimension();
}
@@ -281,7 +292,7 @@ struct DynamicTransformedTensorDescriptor
{
LowerIndex idx_low_diff;
- CalculateLowerIndex(idx_low_diff, idx_up_diff, idx_low_old, idx_up_old);
+ CalculateLowerIndexDiff(idx_low_diff, idx_up_diff, idx_low_old, idx_up_old);
return idx_low_diff;
}
@@ -321,7 +332,7 @@ struct DynamicTransformedTensorDescriptor
const auto up_dims_part = UpDimensionIds{}.At(itran);
const auto idx_up_part = pick_array_element(idx_up, up_dims_part);
- flag = flag && IsValidUpperIndexMappedToValidLowerIndex(idx_up_part);
+ flag = flag && tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up_part);
}
});
......
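The simplified CalculateOffsetDiff above relies on the offset of a native tensor being linear in its index. A minimal editorial sketch of that fact follows; the free-standing names are illustrative, not the commit's API.

#include <cstdint>
using index_t = int32_t;

// offset(idx) = sum_i idx[i] * strides[i]; because this is linear, applying
// the same dot product to an index *difference* yields the offset difference,
// which is why CalculateOffsetDiff(idx_diff) can simply call CalculateOffset.
inline index_t calculate_offset(const index_t* idx, const index_t* strides, index_t ndim)
{
    index_t offset = 0;
    for(index_t i = 0; i < ndim; ++i)
        offset += idx[i] * strides[i];
    return offset;
}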
@@ -50,19 +50,37 @@ void device_dummy_dynamic_transform(InDesc,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
print_array("cpu: in_gemmk_gemmn_global_desc:", in_gemmk_gemmn_global_desc.GetLengths());
- const auto idx0 = MultiIndex<2>({2591, 36991});
- const auto idx1 = in_gemmk_gemmn_global_desc.CalculateLowerIndex(idx0);
- const auto idx2 =
-     in_gemmk_gemmn_global_desc.GetLowerTensorDescriptor().CalculateLowerIndex(idx1);
+ auto in_gemmk_gemmn_coord =
+     make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0});
- const index_t offset = in_gemmk_gemmn_global_desc.CalculateOffset(idx0);
- print_array("idx0:", idx0);
- print_array("idx1:", idx1);
- print_array("idx2:", idx2);
- printf("offset %d\n", offset);
+ for(index_t iter = 0; iter < 100; ++iter)
+ {
+ constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
+ printf("iter %d\n", iter);
+ print_array("idx0: ", in_gemmk_gemmn_coord.GetIndex());
+ print_array("idx1: ", in_gemmk_gemmn_coord.GetLowerCoordinate().GetIndex());
+ print_array("idx2: ",
+             in_gemmk_gemmn_coord.GetLowerCoordinate().GetLowerCoordinate().GetIndex());
+ print_array("idx3: ",
+             in_gemmk_gemmn_coord.GetLowerCoordinate()
+                 .GetLowerCoordinate()
+                 .GetLowerCoordinate()
+                 .GetIndex());
+ print_array("idx4: ",
+             in_gemmk_gemmn_coord.GetLowerCoordinate()
+                 .GetLowerCoordinate()
+                 .GetLowerCoordinate()
+                 .GetLowerCoordinate()
+                 .GetIndex());
+ printf("offset: %d\n", in_gemmk_gemmn_coord.GetOffset());
+ printf("\n");
+ in_gemmk_gemmn_coord += gemmk1_gemmn0;
+ }
}
std::size_t data_sz = sizeof(T);
@@ -79,7 +97,6 @@ void device_dummy_dynamic_transform(InDesc,
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
#if 0
using dummy_transform = DummyDynamicTransform<BlockSize>;
for(index_t i = 0; i < 5; ++i)
@@ -93,7 +110,7 @@ void device_dummy_dynamic_transform(InDesc,
{
launch_kernel(run_gridwise_operation<dummy_transform,
index_t* const,
- index_t* const,
+ float* const,
float* const,
const DynamicNativeTensorDescriptor<4>,
const DynamicNativeTensorDescriptor<4>,
@@ -106,8 +123,8 @@
dim3(BlockSize),
0,
0,
- static_cast<index_t*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
+ static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
wei_kcyx_desc,
in_nchw_desc,
@@ -118,7 +135,6 @@
in_right_pads);
}
}
#endif
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}