Commit c1eaba27 authored by Chao Liu

prototype dynamic tensor descriptor

parent f1541994
@@ -2,10 +2,9 @@
#define CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "dynamic_tensor_coordinate.hpp"
namespace ck {
__host__ __device__ constexpr auto
@@ -83,7 +82,7 @@ template <index_t BlockSize>
struct DummyDynamicTransform
{
__device__ void Run_v1(index_t* const __restrict__ p_wei_global,
- index_t* const __restrict__ p_in_global,
+ float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
@@ -522,7 +521,7 @@ struct DummyDynamicTransform
}
__device__ void Run_v2(index_t* const __restrict__ p_wei_global,
- index_t* const __restrict__ p_in_global,
+ float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
@@ -542,24 +541,23 @@ struct DummyDynamicTransform
const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});
#pragma unroll 1
- for(index_t iter = 0; iter < 100; ++iter)
+ MultiIndex<2> idx;
+ // initialize idx
+ for(index_t i = 0; i < 2; ++i)
{
- //
- MultiIndex<2> idx;
+ idx(i) = p_wei_global[get_thread_local_1d_id() + i];
}
- // initialize idx
- for(index_t i = 0; i < 2; ++i)
- {
- idx(i) = p_wei_global[10 * iter + get_thread_local_1d_id() + i];
- }
+ const index_t niter = p_wei_global[10];
+ auto in_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx);
- // offset
- index_t offset = in_gemmk_gemmn_global_desc.CalculateOffset(idx);
+ for(index_t iter = 0; iter < 100; ++iter)
{
+ constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
- // is_in_bound
- bool is_in_bound =
-     in_gemmk_gemmn_global_desc.IsValidUpperIndexMappedToValidLowerIndex(idx);
+ in_gemmk_gemmn_coord += gemmk1_gemmn0;
// write
float value = 1;
@@ -575,14 +573,14 @@ struct DummyDynamicTransform
true,
1,
p_out_global,
- offset,
- is_in_bound,
- out_n_k_ho_wo_global_desc.GetElementSpace());
+ in_gemmk_gemmn_coord.GetOffset(),
+ in_gemmk_gemmn_coord.IsOffsetValidAssumingUpperIndexIsValid(),
+ in_gemmk_gemmn_global_desc.GetElementSpace());
}
}
__device__ void Run(index_t* const __restrict__ p_wei_global,
- index_t* const __restrict__ p_in_global,
+ float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
......
@@ -344,7 +344,8 @@ struct DynamicMerge
// If neither 1) nor 2) is satisfied, then the calculation will be done at
// run-time each time this function is called, and can be very expensive.
- LowerIndex idx_low_diff_const = CalculateLowerIndex(idx_up_diff);
+ LowerIndex idx_low_diff_const;
+ CalculateLowerIndex(idx_low_diff_const, idx_up_diff);
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
@@ -361,15 +362,15 @@
low_lengths_[i] + idx_low_diff_const[i];
#endif
- index_t idx_low_tmp[i] = idx_low_old[i] + carry;
+ index_t idx_low_tmp = idx_low_old[i] + carry;
- bool do_carry = idx_low_tmp[i] >= idx_low_length_minus_idx_low_diff_const;
+ bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_low_diff_const;
#if 0
- bool do_borrow = idx_low_tmp[i] < -idx_low_diff_const[i];
+ bool do_borrow = idx_low_tmp < -idx_low_diff_const[i];
#endif
idx_low_diff(i) =
- do_carry ? -idx_low_length_minus_idx_low_diff_const : idx_low_diff_const;
+ do_carry ? -idx_low_length_minus_idx_low_diff_const : idx_low_diff_const[i];
#if 0
idx_low_diff(i) =
do_borrow ? idx_low_length_plus_idx_low_diff_const : idx_low_diff[i];
......
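The hunk above only shows the changed lines of the carry logic. As an editorial aid, here is a minimal standalone sketch of the carry propagation that DynamicMerge::CalculateLowerIndexDiff appears to perform; the free-function form, the names, and the fixed 3-d rank are illustrative assumptions, not the commit's API.

#include <cstdint>
using index_t = int32_t;

// Sketch: propagate an upper-index step through a 3-d merge transform.
// idx_low_diff_const is the raw per-dimension diff; carries are resolved
// from the last dimension toward the first, which absorbs the final carry.
inline void update_lower_index_diff(const index_t low_lengths[3],
                                    const index_t idx_low_old[3],
                                    const index_t idx_low_diff_const[3],
                                    index_t idx_low_diff[3])
{
    index_t carry = 0;
    for(index_t i = 2; i > 0; --i)
    {
        // wrap threshold: old index plus carry reaching it means the raw
        // diff would step past this dimension's length
        const index_t wrap = low_lengths[i] - idx_low_diff_const[i];
        const bool do_carry = idx_low_old[i] + carry >= wrap;
        // on carry, the effective diff wraps back by the dimension length
        idx_low_diff[i] = do_carry ? -wrap : idx_low_diff_const[i];
        carry = do_carry ? 1 : 0;
    }
    // the first dimension absorbs the final carry without a check
    idx_low_diff[0] = idx_low_diff_const[0] + carry;
}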
#ifndef CK_DYNAMIC_TENSOR_COORDINATE_HPP
#define CK_DYNAMIC_TENSOR_COORDINATE_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
namespace ck {
// A "tensor cooridnate" is an opaque object that represents a "point of location" inside a tensor
// At the bare minimun, user should be able to query the following information from a tensor
// coordinate:
// 1. Tensor descriptor
// 2. Location, represented in the form of multi-index
// 3. Location, represented in the form of the offset to the origin of the tensor
// 4. If the location is inside invalid area or not, e.g. the padding area of an implicitly padded
// tensor is considered invalid, because the padding area doesn't have any physical memory
// allocation
// A tensor cooridnate also provides following functionality:
// 1. Given step size in each dimension, update itself, or return a new tensor cooridnate, so user
// can freely move the "point of location" inside the tensor
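//
// Editorial usage sketch (not part of this commit; "desc", "p_data" and "use"
// are hypothetical, assuming a 2-d descriptor and a data pointer exist):
//
//   auto coord = make_dynamic_tensor_coordinate(desc, MultiIndex<2>{0, 0});
//   coord += MultiIndex<2>{1, 0};       // move the point of location one step
//   index_t offset = coord.GetOffset(); // offset is updated incrementally
//   if(coord.IsOffsetValid())           // e.g. false inside a padding area
//       use(p_data[offset]);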
// wrapper class for DynamicNativeTensorCoordinate and DynamicTransformedTensorCoordinate
template <typename TensorDesc>
struct DynamicTensorCoordinate;
// tensor coordinate for native tensor
template <typename TensorDesc>
struct DynamicNativeTensorCoordinate
{
using type = DynamicNativeTensorCoordinate;
using tensor_desc_type = TensorDesc;
static constexpr index_t NDim = tensor_desc_type::GetNumOfDimension();
using Index = MultiIndex<NDim>;
__host__ __device__ explicit constexpr DynamicNativeTensorCoordinate(
const tensor_desc_type& tensor_desc, const Index& idx)
: tensor_desc_{tensor_desc}, idx_{idx}, offset_{tensor_desc.CalculateOffset(idx)}
{
}
__host__ __device__ constexpr auto GetTensorDescriptor() const { return tensor_desc_; }
__host__ __device__ constexpr const auto& GetUpperIndex() const { return idx_; }
__host__ __device__ constexpr const auto& GetIndex() const { return idx_; }
__host__ __device__ constexpr const index_t& GetOffset() const { return offset_; }
__host__ __device__ constexpr type operator+=(const Index& idx_diff)
{
// idx_ is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
idx_ += idx_diff;
offset_ += tensor_desc_.CalculateOffsetDiff(idx_diff);
return *this;
}
__host__ __device__ constexpr type operator-=(const Index& idx_diff)
{
// idx_ is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
idx_ -= idx_diff;
offset_ -= tensor_desc_.CalculateOffsetDiff(idx_diff);
return *this;
}
__host__ __device__ constexpr type operator+(const Index& idx_diff) const
{
type coord = *this;
coord += idx_diff;
return coord;
}
__host__ __device__ constexpr type operator-(const Index& idx_diff) const
{
type coord = *this;
coord -= idx_diff;
return coord;
}
__host__ __device__ constexpr index_t CalculateOffsetDiff(const Index& idx_diff) const
{
return tensor_desc_.CalculateOffsetDiff(idx_diff);
}
// evaluated at run-time
__host__ __device__ constexpr bool IsUpperIndexValid() const
{
return tensor_desc_.IsUpperIndexValid(idx_);
}
// evaluated at run-time
__host__ __device__ constexpr bool IsOffsetValid() const
{
// For a native tensor, the offset is valid if the upper-index is valid
return IsUpperIndexValid();
}
// evaluated at compile-time
__host__ __device__ static constexpr bool IsOffsetValidAssumingUpperIndexIsValid()
{
return true;
}
private:
const tensor_desc_type tensor_desc_;
// idx_ may be saved and updated; however, the value of some (or all) of its entries may
// never be used. The compiler should be able to remove these entries, as well as their
// calculation, as dead code.
// TODO: make sure the compiler indeed removes this dead code
Index idx_;
index_t offset_;
};
// tensor coordinate for transformed tensor
template <typename TensorDesc>
struct DynamicTransformedTensorCoordinate
{
static constexpr index_t NDimUp = TensorDesc::GetNumOfDimension();
using UpperDesc = TensorDesc;
using UpperCoord = DynamicTransformedTensorCoordinate;
using UpperIndex = MultiIndex<NDimUp>;
using LowerDesc = typename UpperDesc::LowerDesc;
using LowerCoord = typename DynamicTensorCoordinate<LowerDesc>::type;
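    // editorial note: LowerCoord is resolved through DynamicTensorCoordinate, so a
    // stack of transformed descriptors recursively bottoms out in a
    // DynamicNativeTensorCoordinate, which owns the final memory offset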
__host__ __device__ explicit constexpr DynamicTransformedTensorCoordinate(
const UpperDesc& tensor_desc_up, const UpperIndex& idx_up)
: tensor_desc_up_{tensor_desc_up},
idx_up_{idx_up},
coord_low_{tensor_desc_up.GetLowerTensorDescriptor(),
tensor_desc_up.CalculateLowerIndex(idx_up)}
{
}
__host__ __device__ constexpr auto GetTensorDescriptor() const { return tensor_desc_up_; }
__host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const
{
return coord_low_;
}
__host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return idx_up_; }
__host__ __device__ constexpr const UpperIndex& GetIndex() const { return idx_up_; }
__host__ __device__ constexpr const index_t& GetOffset() const
{
return GetLowerCoordinate().GetOffset();
}
__host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
{
// For transformation of a multi-index difference, not all transformation functions need to
// know the old lower-index or the old upper-index. We pass both of them to the
// transformation function, which itself decides whether or not to use them.
coord_low_ += tensor_desc_up_.CalculateLowerIndexDiff(
idx_up_diff, GetLowerCoordinate().GetIndex(), GetIndex());
// idx_up_ is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
idx_up_ += idx_up_diff;
return *this;
}
__host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
{
coord_low_ -= tensor_desc_up_.CalculateLowerIndexDiff(
idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
// idx_up_ is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
idx_up_ -= idx_up_diff;
return *this;
}
__host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
{
UpperCoord coord_up = *this;
coord_up += idx_up_diff;
return coord_up;
}
__host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
{
UpperCoord coord_up = *this;
coord_up -= idx_up_diff;
return coord_up;
}
// Calculate offset diff without updating the tensor coordinate.
// If idx_up_diff is known at compile time and has only non-zero entries on linear dimensions,
// then all calculation can be done at compile-time.
// TODO: this function is not compiled to the expected ISA
__host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const
{
// For transformation of a multi-index difference, not all transformation functions need to
// know the old lower-index or the old upper-index. We pass both of them to the
// transformation function, which itself decides whether or not to use them.
const auto idx_low_diff =
tensor_desc_up_.CalculateLowerIndexDiff(idx_up_diff, coord_low_.GetIndex(), idx_up_);
return coord_low_.CalculateOffsetDiff(idx_low_diff);
}
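    // editorial example (hypothetical usage): when the step is a compile-time
    // constant with non-zero entries only on linear dimensions, the offset step
    // can be precomputed once and reused without mutating the coordinate:
    //   const index_t step = coord.CalculateOffsetDiff(MultiIndex<2>{1, 0});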
// evaluated at run-time
__host__ __device__ constexpr bool IsUpperIndexValid() const
{
return tensor_desc_up_.IsUpperIndexValid(idx_up_);
}
// evaluated at run-time
__host__ __device__ constexpr bool IsOffsetValid() const
{
return IsUpperIndexValid() && coord_low_.IsOffsetValidAssumingUpperIndexIsValid();
}
// most evaluation is done at compile-time
__host__ __device__ constexpr bool IsOffsetValidAssumingUpperIndexIsValid() const
{
return tensor_desc_up_.IsValidUpperIndexMappedToValidLowerIndex(idx_up_) &&
coord_low_.IsOffsetValidAssumingUpperIndexIsValid();
}
private:
const UpperDesc tensor_desc_up_;
// idx_up_ may be calculated and updated; however, the value of some (or all) of its entries
// may never be used. The compiler should be able to remove these entries, as well as their
// calculation, as dead code.
// TODO: make sure the compiler indeed removes this dead code
UpperIndex idx_up_;
LowerCoord coord_low_;
};
template <index_t NDim>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate(const DynamicNativeTensorDescriptor<NDim>& tensor_desc,
const MultiIndex<NDim>& idx)
{
return DynamicNativeTensorCoordinate<DynamicNativeTensorDescriptor<NDim>>{tensor_desc, idx};
}
template <index_t NDim, typename... Ts>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate(const DynamicTransformedTensorDescriptor<Ts...>& tensor_desc,
const MultiIndex<NDim>& idx)
{
static_assert(DynamicTransformedTensorDescriptor<Ts...>::GetNumOfDimension() == NDim,
"wrong! inconsistent # of dimensions");
return DynamicTransformedTensorCoordinate<DynamicTransformedTensorDescriptor<Ts...>>{
tensor_desc, idx};
}
template <typename TensorDesc>
struct DynamicTensorCoordinate
{
static constexpr index_t NDim = TensorDesc::GetNumOfDimension();
using type = decltype(make_dynamic_tensor_coordinate(TensorDesc{}, MultiIndex<NDim>{}));
};
} // namespace ck
#endif
@@ -20,6 +20,11 @@ struct DynamicNativeTensorDescriptor
{
}
+ __host__ __device__ explicit constexpr DynamicNativeTensorDescriptor()
+     : lengths_{make_zero_array<index_t, NDim>()}, strides_{make_zero_array<index_t, NDim>()}
+ {
+ }
__host__ __device__ static constexpr index_t GetNumOfDimension() { return NDim; }
__host__ __device__ constexpr auto GetLengths() const { return lengths_; }
@@ -62,12 +67,10 @@ struct DynamicNativeTensorDescriptor
return offset;
}
- template <typename UpIdxDiff, typename UpIdx, typename LowIdx>
- __host__ __device__ constexpr index_t CalculateOffsetDiff(const UpIdxDiff& idx_up_diff,
-                                                           const LowIdx& /* idx_low_old */,
-                                                           const UpIdx& /* idx_up_old */) const
+ template <typename IdxDiff>
+ __host__ __device__ constexpr index_t CalculateOffsetDiff(const IdxDiff& idx_diff) const
{
- return CalculateOffset(idx_up_diff);
+ return CalculateOffset(idx_diff);
}
template <typename Idx>
@@ -92,13 +95,16 @@ template <typename LowTensorDescriptor, // DynamicNativeTensorDescriptor or
typename UpDimensionIds> // Tuple<Sequence<...>>
struct DynamicTransformedTensorDescriptor
{
- const LowTensorDescriptor low_tensor_desc_;
- const Transforms transforms_;
+ using LowerDesc = LowTensorDescriptor;
+ using UpperDesc = DynamicTransformedTensorDescriptor;
+ static constexpr index_t NTransform = Transforms::Size();
+ const LowerDesc low_tensor_desc_;
+ const Transforms transforms_;
__host__ __device__ static constexpr index_t GetNumOfLowerDimension()
{
- return LowTensorDescriptor::GetNumOfDimension();
+ return LowerDesc::GetNumOfDimension();
}
__host__ __device__ static constexpr index_t GetNumOfUpperDimension()
@@ -138,7 +144,7 @@ struct DynamicTransformedTensorDescriptor
};
__host__ __device__ explicit constexpr DynamicTransformedTensorDescriptor(
- const LowTensorDescriptor& low_tensor_desc, const Transforms& transforms)
+ const LowerDesc& low_tensor_desc, const Transforms& transforms)
: low_tensor_desc_{low_tensor_desc}, transforms_{transforms}
{
static_assert(NTransform == Transforms::Size() && NTransform == LowDimensionIds::Size() &&
@@ -178,7 +184,12 @@ struct DynamicTransformedTensorDescriptor
// of lower-tensor-descriptor
}
- __host__ __device__ static constexpr auto GetNumOfDimension()
+ __host__ __device__ explicit constexpr DynamicTransformedTensorDescriptor()
+     : low_tensor_desc_{}, transforms_{}
+ {
+ }
+ __host__ __device__ static constexpr index_t GetNumOfDimension()
{
return GetNumOfUpperDimension();
}
@@ -281,7 +292,7 @@ struct DynamicTransformedTensorDescriptor
{
LowerIndex idx_low_diff;
- CalculateLowerIndex(idx_low_diff, idx_up_diff, idx_low_old, idx_up_old);
+ CalculateLowerIndexDiff(idx_low_diff, idx_up_diff, idx_low_old, idx_up_old);
return idx_low_diff;
}
@@ -321,7 +332,7 @@ struct DynamicTransformedTensorDescriptor
const auto up_dims_part = UpDimensionIds{}.At(itran);
const auto idx_up_part = pick_array_element(idx_up, up_dims_part);
- flag = flag && IsValidUpperIndexMappedToValidLowerIndex(idx_up_part);
+ flag = flag && tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up_part);
}
});
......
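The simplified CalculateOffsetDiff above relies on the offset of a native tensor being linear in its index. A minimal editorial sketch of that fact follows; the free-standing names are illustrative, not the commit's API.

#include <cstdint>
using index_t = int32_t;

// offset(idx) = sum_i idx[i] * strides[i]; because this is linear, applying
// the same dot product to an index *difference* yields the offset difference,
// which is why CalculateOffsetDiff(idx_diff) can simply call CalculateOffset.
inline index_t calculate_offset(const index_t* idx, const index_t* strides, index_t ndim)
{
    index_t offset = 0;
    for(index_t i = 0; i < ndim; ++i)
        offset += idx[i] * strides[i];
    return offset;
}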
@@ -50,19 +50,37 @@ void device_dummy_dynamic_transform(InDesc,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
print_array("cpu: in_gemmk_gemmn_global_desc:", in_gemmk_gemmn_global_desc.GetLengths());
- const auto idx0 = MultiIndex<2>({2591, 36991});
- const auto idx1 = in_gemmk_gemmn_global_desc.CalculateLowerIndex(idx0);
- const auto idx2 =
-     in_gemmk_gemmn_global_desc.GetLowerTensorDescriptor().CalculateLowerIndex(idx1);
+ auto in_gemmk_gemmn_coord =
+     make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0});
- const index_t offset = in_gemmk_gemmn_global_desc.CalculateOffset(idx0);
- print_array("idx0:", idx0);
- print_array("idx1:", idx1);
- print_array("idx2:", idx2);
- printf("offset %d\n", offset);
+ for(index_t iter = 0; iter < 100; ++iter)
+ {
+ constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
+ printf("iter %d\n", iter);
+ print_array("idx0: ", in_gemmk_gemmn_coord.GetIndex());
+ print_array("idx1: ", in_gemmk_gemmn_coord.GetLowerCoordinate().GetIndex());
+ print_array("idx2: ",
+             in_gemmk_gemmn_coord.GetLowerCoordinate().GetLowerCoordinate().GetIndex());
+ print_array("idx3: ",
+             in_gemmk_gemmn_coord.GetLowerCoordinate()
+                 .GetLowerCoordinate()
+                 .GetLowerCoordinate()
+                 .GetIndex());
+ print_array("idx4: ",
+             in_gemmk_gemmn_coord.GetLowerCoordinate()
+                 .GetLowerCoordinate()
+                 .GetLowerCoordinate()
+                 .GetLowerCoordinate()
+                 .GetIndex());
+ printf("offset: %d\n", in_gemmk_gemmn_coord.GetOffset());
+ printf("\n");
+ in_gemmk_gemmn_coord += gemmk1_gemmn0;
+ }
}
std::size_t data_sz = sizeof(T);
@@ -79,7 +97,6 @@ void device_dummy_dynamic_transform(InDesc,
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
#if 0
using dummy_transform = DummyDynamicTransform<BlockSize>;
for(index_t i = 0; i < 5; ++i)
@@ -93,7 +110,7 @@ void device_dummy_dynamic_transform(InDesc,
{
launch_kernel(run_gridwise_operation<dummy_transform,
index_t* const,
- index_t* const,
+ float* const,
float* const,
const DynamicNativeTensorDescriptor<4>,
const DynamicNativeTensorDescriptor<4>,
@@ -106,8 +123,8 @@
dim3(BlockSize),
0,
0,
- static_cast<index_t*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
+ static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
wei_kcyx_desc,
in_nchw_desc,
@@ -118,7 +135,6 @@
in_right_pads);
}
}
#endif
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}