Commit c9a8e558 authored by Chao Liu

adding tensor_view

parent 8669e242
#ifndef CK_CONSTANT_TENSOR_COORDINATE_HPP
#define CK_CONSTANT_TENSOR_COORDINATE_HPP
#include "constant_tensor_descriptor.hpp"
#include "constant_merged_tensor_descriptor.hpp"
namespace ck {
template <class TDesc>
struct TensorCoordinate;
template <class... Ts>
struct TensorCoordinate<ConstantTensorDescriptor<Ts...>>
{
using TensorDescriptor = ConstantTensorDescriptor<Ts...>;
static constexpr index_t nDim = TensorDescriptor::GetNumOfDimension();

// coordinate type exposed to tensor views as TensorCoordinate<...>::Coordinate
using Coordinate = TensorCoordinate;

__host__ __device__ constexpr TensorCoordinate(Array<index_t, nDim> multi_id) {}
template <class IDim>
__host__ __device__ void March(IDim, index_t step_size, bool positive_direction)
{
}
private:
// multi-index of the coordinate
// linear offset computed from the multi-index and the descriptor's strides
};
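// Hedged sketch (illustrative only, not the final implementation): for a plain
// ConstantTensorDescriptor every dimension has a single fixed stride, so marching a
// coordinate by step_size elements along dimension IDim should reduce to a stride-scaled
// offset update. The helper assumes the descriptor exposes GetStride(), analogous to the
// GetLength() used elsewhere in this commit.
template <class TDesc, class IDim>
__host__ __device__ constexpr index_t march_offset_delta_sketch(IDim, index_t step_size)
{
    // offset change for moving step_size elements along dimension IDim
    return step_size * TDesc::GetStride(IDim{});
}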
template <class... Ts>
struct TensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>
{
using TensorDescriptor = ConstantMergedTensorDescriptor<Ts...>;
template <class IDim>
__host__ __device__ void March(IDim, index_t step_size, bool positive_direction)
{
}
private:
// multi-index in the merged (visible) dimensions
// linear offset into memory
// original (unmerged) multi-index, needed to detect carries across merged dimensions
// partial offset contributed by each group of merged dimensions
};
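// Why the merged case needs more state than a single offset (explanatory note): a merged
// dimension glues several original dimensions together, so it has no single stride. For
// example, merging original dimensions of lengths 4 (outer) and 3 (inner) gives one visible
// dimension of length 12; stepping from merged index 1 to 2 moves the offset by the inner
// stride, but stepping from 2 to 3 crosses an inner boundary and jumps by
// (outer stride - 2 * inner stride). Detecting such carries and borrows requires keeping
// the original (unmerged) multi-index and per-group partial offsets alongside the total
// offset, which is what the private members above are placeholders for.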
} // namespace ck
#endif
#ifndef CK_TENSOR_VIEW_HPP
#define CK_TENSOR_VIEW_HPP
#include "common_header.hpp"
#include "constant_tensor_coordinate.hpp"
namespace ck {
template <class TData, class Desc>
struct TensorViewForNormalTensor
{
using DataType = TData;
using TensorDesc = Desc;
using Coordinate = typename TensorCoordinate<TensorDesc>::Coordinate;

static constexpr index_t nDim = TensorDesc::GetNumOfDimension();

__host__ __device__ constexpr TensorViewForNormalTensor(
TData* p_data, Coordinate origin = Coordinate(make_zero_array<index_t, nDim>()))
: mpData{p_data}, mOrigin{origin}
{
}
// data access methods (still stubs: they need the coordinate's linear offset, which
// TensorCoordinate does not expose yet)
__host__ __device__ const TData& operator[](Coordinate coord) const {}
__host__ __device__ TData& operator()(Coordinate coord) {}
template <class IDim, class DataPerAccess>
__host__ __device__ static constexpr bool IsVectorAccessAllowed(IDim, DataPerAccess)
{
constexpr index_t length = TensorDesc::GetLength(IDim{});
constexpr index_t stride = TensorDesc::GetStride(IDim{});
constexpr index_t data_per_access = DataPerAccess::value;

return (length % data_per_access == 0) && (stride == 1 || data_per_access == 1);
}
private:
DataType* mpData; // pointer to raw data
Coordinate mOrigin; // coordinate of the view's origin (carries its offset from p_data)
};
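// Hedged example of the vector-access rule above (the packed 2x8 shape is made up for
// illustration, not something this commit uses): 4-wide access along the innermost
// dimension would be allowed because its length (8) is divisible by 4 and its stride is 1,
// while 4-wide access along the outer dimension would not, since that dimension's stride
// is 8 and its length (2) is not divisible by 4.
//
//   constexpr auto desc = make_ConstantTensorDescriptor_packed(Sequence<2, 8>{});
//   static_assert(TensorViewForNormalTensor<float, decltype(desc)>::IsVectorAccessAllowed(
//                     Number<1>{}, Number<4>{}),
//                 "length 8 % 4 == 0 and stride == 1");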
template <class TData, class MergedDesc, class Lengths>
struct TensorViewForMergedTensor
{
};
} // namespace ck
#endif
......@@ -2,7 +2,7 @@
#define CK_BLOCKWISE_2D_TENSOR_OP_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_BLOCKWISE_3D_TENSOR_OP_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_BLOCKWISE_4D_TENSOR_OP_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
#include "threadwise_tensor_slice_copy.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_BLOCKWISE_BATCHED_GEMM_HPP
#include "common_header.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "constant_matrix_descriptor.hpp"
#include "threadwise_gemm.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_BLOCKWISE_GEMM_HPP
#include "common_header.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "constant_matrix_descriptor.hpp"
#include "threadwise_gemm.hpp"
#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
......
......@@ -2,8 +2,8 @@
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
#include "constant_merged_tensor_descriptor.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
......@@ -214,7 +214,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
// By setting SubLengths = 1 at the merged dimension, this is always true;
// If in the future, you want to enable SubLengths > 1 at the merged dimension,
// special care in implementation is needed
threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
threadwise_generic_tensor_slice_copy_v2(SrcDesc{},
p_src + src_offset + mThreadSrcOffset,
make_zero_array<index_t, nDim>(),
thread_tensor_desc,
......@@ -269,7 +269,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
// By setting SubLengths = 1 at the merged dimension, this is always true;
// If in the future, you want to enable SubLengths > 1 at the merged dimension,
// special care in implementation is needed
threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
threadwise_generic_tensor_slice_copy_v2(thread_tensor_desc,
p_clipboard + clipboard_offset,
make_zero_array<index_t, nDim>(),
DstDesc{},
......@@ -373,6 +373,37 @@ struct BlockwiseGenericTensorSliceCopy_v1
}
};
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SubLengths,
class DataClusterLengths,
class ThreadClusterArrangeOrder,
class SrcAccessOrder,
class DstAccessOrder,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseTensorCopy_v2
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

// TODO: select the concrete threadwise copy operator once it exists, e.g.
// using ThreadwiseSliceCopy = decltype(GetThreadwiseTensorCopyOperator());
// src_tensor / dst_tensor types are deduced here, since the tensor-view classes this copy
// will operate on are still work in progress
template <class SrcTensor, class DstTensor>
__device__ BlockwiseTensorCopy_v2(SrcTensor src_tensor, DstTensor dst_tensor)
{
// initialize threadwise-copy
}
// Float is a template parameter of the run methods because the element type is not yet
// carried by the class itself
template <class Float>
__device__ void RunRead(const Float* __restrict__ p_src) {}

template <class Float>
__device__ void RunWrite(Float* __restrict__ p_dst) const {}

template <class Float>
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst)
{
RunRead(p_src);
RunWrite(p_dst);
}
};
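// Hedged usage sketch (the class above is still a skeleton, so the exact constructor
// arguments are an assumption): the intended pattern mirrors the existing blockwise copies,
// where a block-level copy object is built once and then driven per iteration, staging data
// through registers:
//
//   auto blockwise_copy = BlockwiseTensorCopy_v2<BlockSize, ...>(src_tensor, dst_tensor);
//   blockwise_copy.Run(p_src, p_dst); // RunRead stages src into registers, RunWrite drains them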
} // namespace ck
#endif
......@@ -2,7 +2,7 @@
#define CK_BLOCKWISE_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
#include "threadwise_tensor_slice_copy.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_THREADWISE_4D_TENSOR_OP_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_THREADWISE_DIRECT_CONVOLUTION_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
#include "threadwise_tensor_slice_copy.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_THREADWISE_GEMM_HPP
#include "common_header.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "constant_matrix_descriptor.hpp"
namespace ck {
......
......@@ -2,8 +2,8 @@
#define CK_THREADWISE_GENERIC_TENSOR_OP_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
#include "constant_merged_tensor_descriptor.hpp"
namespace ck {
template <class Float, class TDesc>
......
......@@ -2,8 +2,8 @@
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
#include "constant_merged_tensor_descriptor.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
......@@ -64,8 +64,46 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
constexpr auto access_lengths = slice_lengths_in_access_order.Modify(
Number<nDim - 1>{}, Number<num_access_on_lowest_access_dimension>{});
#if 1
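// temporary debug output: the first thread of the first block prints the computed access lengths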
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
print_Sequence("access_lengths: ", access_lengths);
}
#endif
using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;
#if 1
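// temporary debug output: dump the whole src and dst slices from one thread before the copy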
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
printf("src:");
for(index_t i = 0; i < SliceLengths{}[0]; ++i)
{
for(index_t j = 0; j < SliceLengths{}[1]; ++j)
{
index_t offset = SrcDesc::GetOffsetFromMultiIndex(i, j);
printf("%d %d %d %f, ", i, j, offset, p_src[offset]);
}
}
printf("\n");
printf("dst:");
for(index_t i = 0; i < SliceLengths{}[0]; ++i)
{
for(index_t j = 0; j < SliceLengths{}[1]; ++j)
{
index_t offset = DstDesc::GetOffsetFromMultiIndex(i, j);
printf("%d %d %d %f, ", i, j, offset, p_dst[offset]);
}
}
printf("\n");
printf("\n");
}
#endif
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
static_ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
constexpr index_t itmp = access_multi_id.Back() * DataPerAccess;
......@@ -101,9 +139,152 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
#if 1
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
printf("src_index %d, dst_index %d\n", src_index, dst_index);
printf("src:");
for(index_t i = 0; i < SliceLengths{}[0]; ++i)
{
for(index_t j = 0; j < SliceLengths{}[1]; ++j)
{
index_t offset = SrcDesc::GetOffsetFromMultiIndex(i, j);
printf("%d %d %d %f, ", i, j, offset, p_src[offset]);
}
}
printf("\n");
printf("dst:");
for(index_t i = 0; i < SliceLengths{}[0]; ++i)
{
for(index_t j = 0; j < SliceLengths{}[1]; ++j)
{
index_t offset = DstDesc::GetOffsetFromMultiIndex(i, j);
printf("%d %d %d %f, ", i, j, offset, p_dst[offset]);
}
}
printf("\n");
printf("\n");
}
#endif
});
#endif
#if 1
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
printf("src:");
for(index_t i = 0; i < SliceLengths{}[0]; ++i)
{
for(index_t j = 0; j < SliceLengths{}[1]; ++j)
{
index_t offset = SrcDesc::GetOffsetFromMultiIndex(i, j);
printf("%d %d %d %f, ", i, j, offset, p_src[offset]);
}
}
printf("\n");
printf("dst:");
for(index_t i = 0; i < SliceLengths{}[0]; ++i)
{
for(index_t j = 0; j < SliceLengths{}[1]; ++j)
{
index_t offset = DstDesc::GetOffsetFromMultiIndex(i, j);
printf("%d %d %d %f, ", i, j, offset, p_dst[offset]);
}
}
printf("\n");
printf("\n");
}
#endif
}
// user needs to make sure the alignment requirement is satisfied when setting
// SrcDataPerAccess > 1 or DstDataPerAccess > 1
template <class SrcTensor, // src tensor view
class DstTensor, // dst tensor view
class SrcDimAccessOrder, // Sequence
class DstDimAccessOrder, // Sequence
index_t SrcDimVectorAccess,
index_t DstDimVectorAccess,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseTensorCopy_v2
{
static constexpr index_t nDim = SrcTensor::GetNumOfDimension();
using DataType = typename SrcTensor::DataType;
__device__ ThreadwiseTensorCopy_v2(SrcTensor src_tensor, DstTensor dst_tensor)
: mBufferTensor(mpBufferData)
{
static_assert(is_same<typename SrcTensor::DataType, typename DstTensor::DataType>::value,
"wrong! src and dst should have the same data type");
static_assert(
nDim == SrcTensor::GetNumOfDimension() && nDim == DstTensor::GetNumOfDimension() &&
nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
"wrong! # of dimensions should be the same");
static_assert(
is_same<decltype(SrcTensor::GetLengths()), decltype(DstTensor::GetLengths())>::value,
"wrong! src and dst should have the same lengths in every dimension");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! src or dst dimension-access-order is not valid map");
static_assert(SrcTensor::IsVectorAccessAllowed(Number<SrcDimVectorAccess>{},
Number<SrcDataPerAccess>{}) &&
DstTensor::IsVectorAccessAllowed(Number<DstDimVectorAccess>{},
Number<DstDataPerAccess>{}),
"wrong! src or dst vector access is not allowed");
}
__device__ static constexpr auto GetSrcAccessLengths()
{
// the slice lengths are taken from the src tensor view; the vector-access dimension is
// shrunk by SrcDataPerAccess, then the lengths are reordered into src access order
using SliceLengths = decltype(SrcTensor::GetLengths());

return SliceLengths::Modify(Number<SrcDimVectorAccess>{},
SliceLengths{}[SrcDimVectorAccess] / SrcDataPerAccess)
.ReorderGivenNew2Old(SrcDimAccessOrder{});
}
__device__ static constexpr auto GetDstAccessLengths() {} // TODO: symmetric to GetSrcAccessLengths(), using the dst access order
// read data from src into buffer
__device__ void RunRead(const DataType* __restrict__ p_src)
{
using vector_t = typename vector_type<DataType, SrcDataPerAccess>::MemoryType;

static_ford<decltype(GetSrcAccessLengths())>{}([&](auto access_id) {
// TODO: map access_id back to a data multi-index (undo the access-order reorder and
// re-scale the vector-access dimension), then load one vector_t from the src tensor
// view into mBufferTensor
});
}
}
// write data from buffer into dst
__device__ void RunWrite(DataType* __restrict__ p_dst) const {}
__device__ void Run(const DataType* __restrict__ p_src, DataType* __restrict__ p_dst)
{
RunRead(p_src);
RunWrite(p_dst);
}
private:
__device__ static constexpr auto GetBufferTensorDescriptor()
{
return make_ConstantTensorDescriptor_packed(SrcTensor::GetLengths());
}
DataType mpBufferData[SrcTensor::GetElementSize()];
// the register buffer viewed as a packed tensor
TensorViewForNormalTensor<DataType, decltype(GetBufferTensorDescriptor())> mBufferTensor;
};
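// Hedged note expanding on the alignment requirement above: SrcDataPerAccess / DstDataPerAccess
// select a vector type via vector_type<DataType, N>::MemoryType, and the copy loads and stores
// through reinterpret_cast to that type, so every accessed address must be aligned to the
// vector's size (e.g. 16 bytes for 4 floats). The assertion below only restates the size
// assumption; whether a particular pointer/offset combination is aligned remains the caller's
// responsibility.
static_assert(sizeof(typename vector_type<float, 4>::MemoryType) == 4 * sizeof(float),
              "a 4-wide float access moves 16 bytes at a time and needs 16-byte alignment");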
} // namespace ck
#endif
......@@ -2,7 +2,7 @@
#define CK_THREADWISE_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
namespace ck {
......
#ifndef CK_ARRAY_HPP
#define CK_ARRAY_HPP
#include "Sequence.hpp"
#include "sequence.hpp"
#include "functional2.hpp"
namespace ck {
......
......@@ -6,8 +6,8 @@
#include "integral_constant.hpp"
#include "math.hpp"
#include "vector_type.hpp"
#include "Sequence.hpp"
#include "Array.hpp"
#include "sequence.hpp"
#include "array.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
......
......@@ -2,7 +2,7 @@
#define CK_FUNCTIONAL_HPP
#include "integral_constant.hpp"
#include "Sequence.hpp"
#include "sequence.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_FUNCTIONAL2_HPP
#include "functional.hpp"
#include "Sequence.hpp"
#include "sequence.hpp"
namespace ck {
......
......@@ -3,8 +3,8 @@
#include "functional.hpp"
#include "functional2.hpp"
#include "Sequence.hpp"
#include "Array.hpp"
#include "sequence.hpp"
#include "array.hpp"
namespace ck {
......