Commit c9a8e558 authored by Chao Liu

adding tensor_view

parent 8669e242
#ifndef CK_CONSTANT_TENSOR_COORDINATE_HPP
#define CK_CONSTANT_TENSOR_COORDINATE_HPP
#include "constant_tensor_descriptor.hpp"
#include "constant_merged_tensor_descriptor.hpp"
namespace ck {
template <class TDesc>
struct TensorCoordinate;
template <class... Ts>
struct TensorCoordinate<ConstantTensorDescriptor<Ts...>>
{
using TensorDescriptor = ConstantTensorDescriptor<Ts...>;
static constexpr index_t nDim = TensorDescriptor::GetNumOfDimension();

// coordinate type exposed to tensor views as TensorCoordinate<...>::Coordinate
using Coordinate = TensorCoordinate;

__host__ __device__ constexpr TensorCoordinate(Array<index_t, nDim> multi_id) {}
template <class IDim>
__host__ __device__ void March(IDim, index_t step_size, bool positive_direction)
{
}
private:
// multi-index of the coordinate
// linear offset computed from the multi-index and the descriptor's strides
};
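// Hedged sketch (illustrative only, not the final implementation): for a plain
// ConstantTensorDescriptor every dimension has a single fixed stride, so marching a
// coordinate by step_size elements along dimension IDim should reduce to a stride-scaled
// offset update. The helper assumes the descriptor exposes GetStride(), analogous to the
// GetLength() used elsewhere in this commit.
template <class TDesc, class IDim>
__host__ __device__ constexpr index_t march_offset_delta_sketch(IDim, index_t step_size)
{
    // offset change for moving step_size elements along dimension IDim
    return step_size * TDesc::GetStride(IDim{});
}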
template <class... Ts>
struct TensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>
{
using TensorDescriptor = ConstantMergedTensorDescriptor<Ts...>;
template <class IDim>
__host__ __device__ void March(IDim, index_t step_size, bool positive_direction)
{
}
private:
// multi-index in the merged (visible) dimensions
// linear offset into memory
// original (unmerged) multi-index, needed to detect carries across merged dimensions
// partial offset contributed by each group of merged dimensions
};
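// Why the merged case needs more state than a single offset (explanatory note): a merged
// dimension glues several original dimensions together, so it has no single stride. For
// example, merging original dimensions of lengths 4 (outer) and 3 (inner) gives one visible
// dimension of length 12; stepping from merged index 1 to 2 moves the offset by the inner
// stride, but stepping from 2 to 3 crosses an inner boundary and jumps by
// (outer stride - 2 * inner stride). Detecting such carries and borrows requires keeping
// the original (unmerged) multi-index and per-group partial offsets alongside the total
// offset, which is what the private members above are placeholders for.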
} // namespace ck
#endif
#ifndef CK_TENSOR_VIEW_HPP
#define CK_TENSOR_VIEW_HPP
#include "common_header.hpp"
#include "constant_tensor_coordinate.hpp"
namespace ck {
template <class TData, class Desc>
struct TensorViewForNormalTensor
{
using DataType = TData;
using TensorDesc = Desc;
using Coordinate = typename TensorCoordinate<TensorDesc>::Coordinate;

static constexpr index_t nDim = TensorDesc::GetNumOfDimension();

__host__ __device__ constexpr TensorViewForNormalTensor(
TData* p_data, Coordinate origin = Coordinate(make_zero_array<index_t, nDim>()))
: mpData{p_data}, mOrigin{origin}
{
}
// data access methods (still stubs: they need the coordinate's linear offset, which
// TensorCoordinate does not expose yet)
__host__ __device__ const TData& operator[](Coordinate coord) const {}
__host__ __device__ TData& operator()(Coordinate coord) {}
template <class IDim, class DataPerAccess>
__host__ __device__ static constexpr bool IsVectorAccessAllowed(IDim, DataPerAccess)
{
constexpr index_t length = TensorDesc::GetLength(IDim{});
constexpr index_t stride = TensorDesc::GetStride(IDim{});
constexpr index_t data_per_access = DataPerAccess::value;

return (length % data_per_access == 0) && (stride == 1 || data_per_access == 1);
}
private:
DataType* mpData; // pointer to raw data
Coordinate mOrigin; // coordinate of the view's origin (carries its offset from p_data)
};
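// Hedged example of the vector-access rule above (the packed 2x8 shape is made up for
// illustration, not something this commit uses): 4-wide access along the innermost
// dimension would be allowed because its length (8) is divisible by 4 and its stride is 1,
// while 4-wide access along the outer dimension would not, since that dimension's stride
// is 8 and its length (2) is not divisible by 4.
//
//   constexpr auto desc = make_ConstantTensorDescriptor_packed(Sequence<2, 8>{});
//   static_assert(TensorViewForNormalTensor<float, decltype(desc)>::IsVectorAccessAllowed(
//                     Number<1>{}, Number<4>{}),
//                 "length 8 % 4 == 0 and stride == 1");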
template <class TData, class MergedDesc, class Lengths>
struct TensorViewForMergedTensor
{
};
} // namespace ck
#endif
......@@ -2,7 +2,7 @@
#define CK_BLOCKWISE_2D_TENSOR_OP_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_BLOCKWISE_3D_TENSOR_OP_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_BLOCKWISE_4D_TENSOR_OP_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
#include "threadwise_tensor_slice_copy.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_BLOCKWISE_BATCHED_GEMM_HPP
#include "common_header.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "constant_matrix_descriptor.hpp"
#include "threadwise_gemm.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_BLOCKWISE_GEMM_HPP
#include "common_header.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "constant_matrix_descriptor.hpp"
#include "threadwise_gemm.hpp"
#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
......
......@@ -2,8 +2,8 @@
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
#include "constant_merged_tensor_descriptor.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
......@@ -214,7 +214,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
// By setting SubLengths = 1 at the merged dimension, this is always true;
// If in the future, you want to enable SubLengths > 1 at the merged dimension,
// special care in implementation is needed
threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
threadwise_generic_tensor_slice_copy_v2(SrcDesc{},
p_src + src_offset + mThreadSrcOffset,
make_zero_array<index_t, nDim>(),
thread_tensor_desc,
......@@ -269,7 +269,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
// By setting SubLengths = 1 at the merged dimension, this is always true;
// If in the future, you want to enable SubLengths > 1 at the merged dimension,
// special care in implementation is needed
threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
threadwise_generic_tensor_slice_copy_v2(thread_tensor_desc,
p_clipboard + clipboard_offset,
make_zero_array<index_t, nDim>(),
DstDesc{},
......@@ -373,6 +373,37 @@ struct BlockwiseGenericTensorSliceCopy_v1
}
};
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SubLengths,
class DataClusterLengths,
class ThreadClusterArrangeOrder,
class SrcAccessOrder,
class DstAccessOrder,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseTensorCopy_v2
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

// TODO: select the concrete threadwise copy operator once it exists, e.g.
// using ThreadwiseSliceCopy = decltype(GetThreadwiseTensorCopyOperator());
// src_tensor / dst_tensor types are deduced here, since the tensor-view classes this copy
// will operate on are still work in progress
template <class SrcTensor, class DstTensor>
__device__ BlockwiseTensorCopy_v2(SrcTensor src_tensor, DstTensor dst_tensor)
{
// initialize threadwise-copy
}
// Float is a template parameter of the run methods because the element type is not yet
// carried by the class itself
template <class Float>
__device__ void RunRead(const Float* __restrict__ p_src) {}

template <class Float>
__device__ void RunWrite(Float* __restrict__ p_dst) const {}

template <class Float>
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst)
{
RunRead(p_src);
RunWrite(p_dst);
}
};
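// Hedged usage sketch (the class above is still a skeleton, so the exact constructor
// arguments are an assumption): the intended pattern mirrors the existing blockwise copies,
// where a block-level copy object is built once and then driven per iteration, staging data
// through registers:
//
//   auto blockwise_copy = BlockwiseTensorCopy_v2<BlockSize, ...>(src_tensor, dst_tensor);
//   blockwise_copy.Run(p_src, p_dst); // RunRead stages src into registers, RunWrite drains them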
} // namespace ck
#endif
......@@ -2,7 +2,7 @@
#define CK_BLOCKWISE_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
#include "threadwise_tensor_slice_copy.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_THREADWISE_4D_TENSOR_OP_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_THREADWISE_DIRECT_CONVOLUTION_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
#include "threadwise_tensor_slice_copy.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_THREADWISE_GEMM_HPP
#include "common_header.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "constant_matrix_descriptor.hpp"
namespace ck {
......
......@@ -2,8 +2,8 @@
#define CK_THREADWISE_GENERIC_TENSOR_OP_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
#include "constant_merged_tensor_descriptor.hpp"
namespace ck {
template <class Float, class TDesc>
......
......@@ -2,8 +2,8 @@
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
#include "constant_merged_tensor_descriptor.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
......@@ -64,8 +64,46 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
constexpr auto access_lengths = slice_lengths_in_access_order.Modify(
Number<nDim - 1>{}, Number<num_access_on_lowest_access_dimension>{});
#if 1
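// temporary debug output: the first thread of the first block prints the computed access lengths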
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
print_Sequence("access_lengths: ", access_lengths);
}
#endif
using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;
#if 1
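// temporary debug output: dump the whole src and dst slices from one thread before the copy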
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
printf("src:");
for(index_t i = 0; i < SliceLengths{}[0]; ++i)
{
for(index_t j = 0; j < SliceLengths{}[1]; ++j)
{
index_t offset = SrcDesc::GetOffsetFromMultiIndex(i, j);
printf("%d %d %d %f, ", i, j, offset, p_src[offset]);
}
}
printf("\n");
printf("dst:");
for(index_t i = 0; i < SliceLengths{}[0]; ++i)
{
for(index_t j = 0; j < SliceLengths{}[1]; ++j)
{
index_t offset = DstDesc::GetOffsetFromMultiIndex(i, j);
printf("%d %d %d %f, ", i, j, offset, p_dst[offset]);
}
}
printf("\n");
printf("\n");
}
#endif
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
static_ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
constexpr index_t itmp = access_multi_id.Back() * DataPerAccess;
......@@ -101,9 +139,152 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
#if 1
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
printf("src_index %d, dst_index %d\n", src_index, dst_index);
printf("src:");
for(index_t i = 0; i < SliceLengths{}[0]; ++i)
{
for(index_t j = 0; j < SliceLengths{}[1]; ++j)
{
index_t offset = SrcDesc::GetOffsetFromMultiIndex(i, j);
printf("%d %d %d %f, ", i, j, offset, p_src[offset]);
}
}
printf("\n");
printf("dst:");
for(index_t i = 0; i < SliceLengths{}[0]; ++i)
{
for(index_t j = 0; j < SliceLengths{}[1]; ++j)
{
index_t offset = DstDesc::GetOffsetFromMultiIndex(i, j);
printf("%d %d %d %f, ", i, j, offset, p_dst[offset]);
}
}
printf("\n");
printf("\n");
}
#endif
});
#endif
#if 1
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
printf("src:");
for(index_t i = 0; i < SliceLengths{}[0]; ++i)
{
for(index_t j = 0; j < SliceLengths{}[1]; ++j)
{
index_t offset = SrcDesc::GetOffsetFromMultiIndex(i, j);
printf("%d %d %d %f, ", i, j, offset, p_src[offset]);
}
}
printf("\n");
printf("dst:");
for(index_t i = 0; i < SliceLengths{}[0]; ++i)
{
for(index_t j = 0; j < SliceLengths{}[1]; ++j)
{
index_t offset = DstDesc::GetOffsetFromMultiIndex(i, j);
printf("%d %d %d %f, ", i, j, offset, p_dst[offset]);
}
}
printf("\n");
printf("\n");
}
#endif
}
// user needs to make sure the alignment requirement is satisfied when setting
// SrcDataPerAccess > 1 or DstDataPerAccess > 1
template <class SrcTensor, // src tensor view
class DstTensor, // dst tensor view
class SrcDimAccessOrder, // Sequence
class DstDimAccessOrder, // Sequence
index_t SrcDimVectorAccess,
index_t DstDimVectorAccess,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseTensorCopy_v2
{
static constexpr index_t nDim = SrcTensor::GetNumOfDimension();
using DataType = typename SrcTensor::DataType;
__device__ ThreadwiseTensorCopy_v2(SrcTensor src_tensor, DstTensor dst_tensor)
: mBufferTensor(mpBufferData)
{
static_assert(is_same<typename SrcTensor::DataType, typename DstTensor::DataType>::value,
"wrong! src and dst should have the same data type");
static_assert(
nDim == SrcTensor::GetNumOfDimension() && nDim == DstTensor::GetNumOfDimension() &&
nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
"wrong! # of dimensions should be the same");
static_assert(
is_same<decltype(SrcTensor::GetLengths()), decltype(DstTensor::GetLengths())>::value,
"wrong! src and dst should have the same lengths in every dimension");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! src or dst dimension-access-order is not valid map");
static_assert(SrcTensor::IsVectorAccessAllowed(Number<SrcDimVectorAccess>{},
Number<SrcDataPerAccess>{}) &&
DstTensor::IsVectorAccessAllowed(Number<DstDimVectorAccess>{},
Number<DstDataPerAccess>{}),
"wrong! src or dst vector access is not allowed");
}
__device__ static constexpr auto GetSrcAccessLengths()
{
// the slice lengths are taken from the src tensor view; the vector-access dimension is
// shrunk by SrcDataPerAccess, then the lengths are reordered into src access order
using SliceLengths = decltype(SrcTensor::GetLengths());

return SliceLengths::Modify(Number<SrcDimVectorAccess>{},
SliceLengths{}[SrcDimVectorAccess] / SrcDataPerAccess)
.ReorderGivenNew2Old(SrcDimAccessOrder{});
}
__device__ static constexpr auto GetDstAccessLengths() {} // TODO: symmetric to GetSrcAccessLengths(), using the dst access order
// read data from src into buffer
__device__ void RunRead(const DataType* __restrict__ p_src)
{
using vector_t = typename vector_type<DataType, SrcDataPerAccess>::MemoryType;

static_ford<decltype(GetSrcAccessLengths())>{}([&](auto access_id) {
// TODO: map access_id back to a data multi-index (undo the access-order reorder and
// re-scale the vector-access dimension), then load one vector_t from the src tensor
// view into mBufferTensor
});
}
}
// write data from buffer into dst
__device__ void RunWrite(DataType* __restrict__ p_dst) const {}
__device__ void Run(const DataType* __restrict__ p_src, DataType* __restrict__ p_dst)
{
RunRead(p_src);
RunWrite(p_dst);
}
private:
__device__ static constexpr auto GetBufferTensorDescriptor()
{
return make_ConstantTensorDescriptor_packed(SrcTensor::GetLengths());
}
DataType mpBufferData[SrcTensor::GetElementSize()];
// the register buffer viewed as a packed tensor
TensorViewForNormalTensor<DataType, decltype(GetBufferTensorDescriptor())> mBufferTensor;
};
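// Hedged note expanding on the alignment requirement above: SrcDataPerAccess / DstDataPerAccess
// select a vector type via vector_type<DataType, N>::MemoryType, and the copy loads and stores
// through reinterpret_cast to that type, so every accessed address must be aligned to the
// vector's size (e.g. 16 bytes for 4 floats). The assertion below only restates the size
// assumption; whether a particular pointer/offset combination is aligned remains the caller's
// responsibility.
static_assert(sizeof(typename vector_type<float, 4>::MemoryType) == 4 * sizeof(float),
              "a 4-wide float access moves 16 bytes at a time and needs 16-byte alignment");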
} // namespace ck
#endif
......@@ -2,7 +2,7 @@
#define CK_THREADWISE_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "constant_tensor_descriptor.hpp"
namespace ck {
......
#ifndef CK_ARRAY_HPP
#define CK_ARRAY_HPP
#include "Sequence.hpp"
#include "sequence.hpp"
#include "functional2.hpp"
namespace ck {
......
......@@ -6,8 +6,8 @@
#include "integral_constant.hpp"
#include "math.hpp"
#include "vector_type.hpp"
#include "Sequence.hpp"
#include "Array.hpp"
#include "sequence.hpp"
#include "array.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
......
......@@ -2,7 +2,7 @@
#define CK_FUNCTIONAL_HPP
#include "integral_constant.hpp"
#include "Sequence.hpp"
#include "sequence.hpp"
namespace ck {
......
......@@ -2,7 +2,7 @@
#define CK_FUNCTIONAL2_HPP
#include "functional.hpp"
#include "Sequence.hpp"
#include "sequence.hpp"
namespace ck {
......
......@@ -3,8 +3,8 @@
#include "functional.hpp"
#include "functional2.hpp"
#include "Sequence.hpp"
#include "Array.hpp"
#include "sequence.hpp"
#include "array.hpp"
namespace ck {
......