Commit 0b7fcca6 authored by Chao Liu

prototype dynamic tensor descriptor

parent 4388f572
@@ -3,7 +3,7 @@ project(modular_convolution)
#c++
enable_language(CXX)
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
...
@@ -414,7 +414,7 @@ struct DummyDynamicTransform
idx[13] += idx_diff[13];

// padding check
bool is_in_bound = true;
#else // pad
// offset
idx[0] += idx_diff[0];
@@ -462,25 +462,102 @@ struct DummyDynamicTransform
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const
{
-Index idx_up;
-
-idx_up(0) = in_n_c_hi_wi_global_desc.GetLength(0);
-idx_up(1) = in_n_c_hi_wi_global_desc.GetLength(1);
-idx_up(2) = in_n_c_hi_wi_global_desc.GetLength(2);
-idx_up(3) = in_n_c_hi_wi_global_desc.GetLength(3);
-
-#if 0
-constexpr auto trans = GetTransforms();
-auto idx_low = trans[0]->CalculateLowerIndex(idx_up);
-#elif 1
-constexpr DynamicCoordinateTransform* tran = &embed;
-auto idx_low = tran->CalculateLowerIndex(idx_up);
-#endif
-
-p_out_global[get_thread_local_1d_id()] = idx_low[0];
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);

const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);

const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);

const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);

const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];

const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];

const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];

const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];

// input tensor
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
#pragma unroll 1
for(index_t iter = 0; iter < 100; ++iter)
{
//
MultiIndex<2> idx;
// initialize idx
for(index_t i = 0; i < 2; ++i)
{
idx(i) = p_wei_global[10 * iter + get_thread_local_1d_id() + i];
}
// offset
index_t offset = in_gemmk_gemmn_global_desc.CalculateOffset(idx);
// is_in_bound
bool is_in_bound =
in_gemmk_gemmn_global_desc.IsValidUpperIndexMappedToValidLowerIndex(idx);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
offset,
is_in_bound,
out_n_k_ho_wo_global_desc.GetElementSpace());
}
}

__device__ void Run(index_t* const __restrict__ p_wei_global,
...
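For reference, the pad -> embed -> merge chain constructed above is dynamic im2col indexing. A minimal standalone sketch of the same arithmetic (plain C++; the function and its explicit div/mod are illustrative, not part of this commit):

#include <cstdint>

using index_t = std::int32_t;

// Map a GEMM coordinate (gemmk = (c * Y + y) * X + x, gemmn = (n * Ho + ho) * Wo + wo)
// back to an input NCHW offset, replaying Merge -> Embed -> LeftPad/RightPad by hand.
index_t im2col_offset(index_t gemmk, index_t gemmn,
                      index_t C, index_t Y, index_t X,
                      index_t Ho, index_t Wo,
                      index_t Hi, index_t Wi,
                      index_t stride_h, index_t stride_w,
                      index_t dilation_h, index_t dilation_w,
                      index_t left_pad_h, index_t left_pad_w,
                      bool& is_in_bound)
{
    // DynamicMerge<3>{{C, Y, X}}: split gemmk by repeated division
    const index_t c = gemmk / (Y * X);
    const index_t y = (gemmk / X) % Y;
    const index_t x = gemmk % X;

    // DynamicMerge<3>{{N, Ho, Wo}}: split gemmn the same way
    const index_t n  = gemmn / (Ho * Wo);
    const index_t ho = (gemmn / Wo) % Ho;
    const index_t wo = gemmn % Wo;

    // DynamicEmbed<2>{{Y, Ho}, {dilation, stride, 0}}: coordinate in the padded input
    const index_t hip = y * dilation_h + ho * stride_h;
    const index_t wip = x * dilation_w + wo * stride_w;

    // DynamicLeftPad/DynamicRightPad: shift back into the unpadded tensor and
    // record whether the coordinate falls inside the real data
    const index_t hi = hip - left_pad_h;
    const index_t wi = wip - left_pad_w;
    is_in_bound = hi >= 0 && hi < Hi && wi >= 0 && wi < Wi;

    // offset in a packed NCHW native descriptor
    return ((n * C + c) * Hi + hi) * Wi + wi;
}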
@@ -10,30 +10,41 @@ struct DynamicPassThrough
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;

-const index_t low_length_;
+const index_t up_length_;

__host__ __device__ explicit constexpr DynamicPassThrough(const index_t& low_length)
-: low_length_(low_length)
+: up_length_{low_length}
{
}

+__host__ __device__ explicit constexpr DynamicPassThrough() : up_length_{0} {}

__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }

__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }

-__host__ __device__ constexpr auto GetUpperIndex() { return UpperIndex({low_length_}); }
+__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }

-__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)
{
-return idx_up;
+static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low(0) = idx_up[0];
}

-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& /* idx_low_old */,
+const UpIdx& /* idx_up_old */)
{
-return idx_up_diff;
+static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low_diff(0) = idx_up_diff[0];
}

__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -43,8 +54,9 @@ struct DynamicPassThrough
return true;
}

+template <typename UpIdx>
__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
@@ -56,34 +68,43 @@ struct DynamicLeftPad
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;

-const index_t low_length_;
+const index_t up_length_;
const index_t left_pad_;

-__host__ __device__ explicit constexpr Pad(const index_t& low_length, const index_t& left_pad)
-: low_length_{low_length}, left_pad_{left_pad}
+__host__ __device__ explicit constexpr DynamicLeftPad(const index_t& low_length,
+const index_t& left_pad)
+: up_length_{low_length + left_pad}, left_pad_{left_pad}
{
}

+__host__ __device__ explicit constexpr DynamicLeftPad() : up_length_{0}, left_pad_{0} {}

-__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }
-
-__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }
-
-__host__ __device__ constexpr auto GetUpperIndex()
-{
-return UpperIndex({low_length_ + left_pad_});
-}
+__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+
+__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+
+__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }

-__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
-return LowerIndex{idx_up - lef_pad_};
+static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low(0) = idx_up[0] - left_pad_;
}

-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& /* idx_low_old */,
+const UpIdx& /* idx_up_old */)
{
-return idx_up_diff;
+static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low_diff(0) = idx_up_diff[0];
}

__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -93,8 +114,9 @@ struct DynamicLeftPad
return SkipIsValidCheck;
}

-__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up)
+template <typename UpIdx>
+__host__ __device__ constexpr bool
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const
{
return SkipIsValidCheck || (idx_up[0] >= left_pad_);
}
@@ -106,34 +128,47 @@ struct DynamicRightPad
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;

+const index_t up_length_;
const index_t low_length_;
const index_t right_pad_;

-__host__ __device__ explicit constexpr Pad(const index_t& low_length, const index_t& right_pad)
-: low_length_{low_length}, right_pad_{right_pad}
+__host__ __device__ explicit constexpr DynamicRightPad(const index_t& low_length,
+const index_t& right_pad)
+: up_length_{low_length + right_pad}, low_length_{low_length}, right_pad_{right_pad}
{
}

+__host__ __device__ explicit constexpr DynamicRightPad()
+: up_length_{0}, low_length_{0}, right_pad_{0}
+{
+}

-__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }
-
-__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }
-
-__host__ __device__ constexpr auto GetUpperIndex()
-{
-return UpperIndex({low_length_ + right_pad_});
-}
+__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+
+__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+
+__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }

-__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)
{
-return idx_up;
+static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low(0) = idx_up[0];
}

-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& /* idx_low_old */,
+const UpIdx& /* idx_up_old */)
{
-return idx_up_diff;
+static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low_diff(0) = idx_up_diff[0];
}

__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -143,8 +178,9 @@ struct DynamicRightPad
return SkipIsValidCheck;
}

-__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up)
+template <typename UpIdx>
+__host__ __device__ constexpr bool
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const
{
return SkipIsValidCheck || (idx_up[0] < low_length_);
}
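Note how the two pads compose on one axis: DynamicLeftPad{Hi, PadL} reports upper length Hi + PadL and checks idx_up >= PadL, while the DynamicRightPad{Hi + PadL, PadR} stacked on top keeps low_length_ = Hi + PadL so it can check idx_up < Hi + PadL. A standalone sketch (plain C++; names illustrative):

using index_t = int;

// One axis of length Hi, padded by PadL on the left and PadR on the right.
// The upper length is Hi + PadL + PadR; an upper index hits real data only
// when both pad transforms report a valid mapping.
bool padded_axis_in_bound(index_t h_up, index_t Hi, index_t PadL, index_t PadR)
{
    const bool left_ok  = h_up >= PadL;     // DynamicLeftPad validity check
    const bool right_ok = h_up < Hi + PadL; // DynamicRightPad validity check
    (void)PadR; // PadR only extends the upper length; it never maps in-bounds
    return left_ok && right_ok;
}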
@@ -157,50 +193,61 @@ struct DynamicEmbed
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;

-const index_t low_length_;
const UpperIndex up_lengths_;
const Array<index_t, NDimUp + 1> coefficients_;

-__host__ __device__ explicit constexpr Embed(const index_t& low_length,
-const UpperIndex& up_lengths,
-const Array<index_t, NDimUp + 1>& coefficients)
-: low_length_(low_length), up_lengths_(up_lengths), coefficients_(coefficients)
-{
-static_assert(up_lengths.GetSize() == nDimUp && coefficients.GetSize() == nDimUp + 1,
-"wrong! # of dimensions not consistent");
-}
+__host__
+__device__ explicit constexpr DynamicEmbed(const UpperIndex& up_lengths,
+const Array<index_t, NDimUp + 1>& coefficients)
+: up_lengths_{up_lengths}, coefficients_{coefficients}
+{
+static_assert(UpperIndex::Size() == NDimUp, "wrong! # of dimensions not consistent");
+}

+__host__ __device__ explicit constexpr DynamicEmbed()
+: up_lengths_{make_zero_array<index_t, NDimUp>()},
+coefficients_{make_zero_array<index_t, NDimUp + 1>()}
+{
+}

__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }

__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }

-__host__ __device__ static constexpr auto GetUpperIndex() { return up_lengths_; }
+__host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }

-__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
-index_t idx_low = coefficients_[NDimUp];
+static_assert(LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
+"wrong! inconsistent # of dimension");
+
+idx_low(0) = coefficients_[NDimUp];

-for(index_t i = 0; i < nDimUp; ++i)
+#pragma unroll
+for(index_t i = 0; i < NDimUp; ++i)
{
-idx_low += idx_up[i] * coefficients_[i];
+idx_low(0) += idx_up[i] * coefficients_[i];
}
-
-return LowerIndex({idx_low});
}

-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& /* idx_low_old */,
+const UpIdx& /* idx_up_old */) const
{
-index_t idx_low_diff = 0;
+static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == NDimUp &&
+LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
+"wrong! inconsistent # of dimension");
+
+idx_low_diff(0) = 0;

-for(index_t i = 0; i < nDimUp; ++i)
+#pragma unroll
+for(index_t i = 0; i < NDimUp; ++i)
{
-idx_low_diff += idx_up_diff[i] * Coefficients{}[i];
+idx_low_diff(0) += idx_up_diff[i] * coefficients_[i];
}
-
-return LowerIndex({idx_low_diff});
}

__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -210,8 +257,9 @@ struct DynamicEmbed
return true;
}

+template <typename UpIdx>
__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
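A quick standalone check of the Embed arithmetic with the convolution coefficients used earlier (plain C++; names illustrative):

using index_t = int;

// DynamicEmbed<2>{{Y, Ho}, {dilation_h, stride_h, 0}} computes
// idx_low = 0 + y * dilation_h + ho * stride_h,
// i.e. the padded input row read by filter tap y at output row ho.
index_t embed_lower(index_t y, index_t ho, index_t dilation_h, index_t stride_h)
{
    const index_t coefficients[3] = {dilation_h, stride_h, 0};
    index_t idx_low = coefficients[2];
    idx_low += y * coefficients[0];
    idx_low += ho * coefficients[1];
    return idx_low;
}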
@@ -227,27 +275,33 @@ struct DynamicMerge
const LowerIndex low_lengths_scan_;
const index_t up_length_;

-__host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths_)
-: low_lengths_(low_lengths),
-low_lengths_scan_(reverse_inclusive_scan_on_array(low_lengths, multiplies<index_t>()),
-up_length(accumulate_on_array(low_lengths, multiplies<index_t>(), 1))
+__host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths)
+: low_lengths_{low_lengths},
+low_lengths_scan_{reverse_inclusive_scan_on_array(
+low_lengths, math::multiplies<index_t>{}, index_t{1})},
+up_length_{reduce_on_array(low_lengths, math::multiplies<index_t>(), 1)}
{
+static_assert(LowerIndex::Size() == NDimLow, "wrong!");
}

+__host__ __device__ explicit constexpr DynamicMerge()
+: low_lengths_{make_zero_array<index_t, NDimLow>()},
+low_lengths_scan_{make_zero_array<index_t, NDimLow>()},
+up_length_{0}
+{
+}

-__host__ __device__ static constexpr index_t GetNumOfLowerDimension() {
-return NDimLow; }
-
-__host__ __device__ static constexpr index_t GetNumOfUpperDimension() {
-return 1; }
-
-__host__ __device__ constexpr auto GetUpperIndex() const
-{
-return UpperIndex({up_length_});
-}
+__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; }
+
+__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+
+__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }

-__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
-LowerIndex idx_low;
+static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");

index_t itmp = idx_up[0];
@@ -259,29 +313,30 @@ struct DynamicMerge
}

idx_low(NDimLow - 1) = itmp;
+#pragma unroll
-
-return idx_low;
}

// idx_low_diff depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_up_diff is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& idx_low_old)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& idx_low_old,
+const UpIdx& /* idx_up_old */) const
{
-LowerIndex idx_low_diff;
+static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
+LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");

-// CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
+// CalculateLowerIndex(idx_low_diff_const) has multiple integer divisions.
// However,
// 1) If idx_up_diff is known at compile-time, then idx_low_diff_const
// can be calculated at compile-time.
// 2) If idx_up_diff is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then idx_low_diff_const also
// doesn't change during the whole kernel execution. Compiler-generated ISA should
-// only calculate idx_low_diff once and save it during the whole kernel execution
+// only calculate idx_low_diff_const once and save it during the whole kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_low_diff_const = CalculateLowerIndex(idx_up_diff);
@@ -303,19 +358,19 @@ struct DynamicMerge
index_t idx_low_tmp[i] = idx_low_old[i] + carry;

-bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_low_diff_const;
+bool do_carry = idx_low_tmp[i] >= idx_low_length_minus_idx_low_diff_const;
#if 0
-bool do_borrow = idx_low_tmp < -idx_low_diff_const[i];
+bool do_borrow = idx_low_tmp[i] < -idx_low_diff_const[i];
#endif

-idx_low_diff[i] =
+idx_low_diff(i) =
do_carry ? -idx_low_length_minus_idx_low_diff_const : idx_low_diff_const;
#if 0
-idx_low_diff[i] =
+idx_low_diff(i) =
do_borrow ? idx_low_length_plus_idx_low_diff_const : idx_low_diff[i];
#endif

-idx_low_diff[i] += carry;
+idx_low_diff(i) += carry;

carry = do_carry ? 1 : 0;
#if 0
@@ -323,21 +378,19 @@ struct DynamicMerge
#endif
}

-idx_low_diff[0] = idx_low_diff_const[0] + carry;
-
-return idx_low_diff;
+idx_low_diff(0) = idx_low_diff_const[0] + carry;
}

-__host__ __device__ static constexpr bool IsLinearTransform() {
-return false; }
+__host__ __device__ static constexpr bool IsLinearTransform() { return false; }

__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}

+template <typename UpIdx>
__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
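The division path of Merge written out for NDimLow = 3 (plain C++ sketch; the carry logic above exists precisely so that a small fixed idx_up step does not re-run these divisions):

using index_t = int;

// Split a linear index over lengths {L0, L1, L2}; the scan strides
// {L1 * L2, L2} come from the reverse scan of products in the constructor.
void merge_calculate_lower(index_t idx_up, const index_t len[3], index_t idx_low[3])
{
    const index_t scan[2] = {len[1] * len[2], len[2]};

    index_t itmp = idx_up;
    for(index_t i = 0; i < 2; ++i)
    {
        idx_low[i] = itmp / scan[i];
        itmp -= idx_low[i] * scan[i];
    }
    idx_low[2] = itmp;
    // e.g. len {4, 5, 6}, idx_up 100 -> idx_low {3, 1, 4}, since 3*30 + 1*6 + 4 == 100
}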
@@ -353,56 +406,55 @@ struct DynamicUnMerge
const UpperIndex up_lengths_scan_;

__host__ __device__ explicit constexpr DynamicUnMerge(const UpperIndex& up_lengths)
-: up_lengths_(up_lengths),
-up_lengths_scan_(reverse_exclusive_scan_on_array(up_lengths, multiplies<index_t>(), index_t(1))
+: up_lengths_{up_lengths},
+up_lengths_scan_{
+reverse_exclusive_scan_on_array(up_lengths, math::multiplies<index_t>(), index_t{1})}
{
}

+__host__ __device__ explicit constexpr DynamicUnMerge()
+: up_lengths_{make_zero_array<index_t, NDimUp>()},
+up_lengths_scan_{make_zero_array<index_t, NDimUp>()}
+{
+}

-__host__ __device__ static constexpr index_t GetNumOfLowerDimension() {
-return 1; }
-
-__host__ __device__ static constexpr index_t GetNumOfUpperDimension() {
-return NDimUp; }
-
-__host__ __device__ constexpr auto GetUpperIndex() const {
-return up_lengths_; }
+__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+
+__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
+
+__host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }

-__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
-index_t idx_low = idx_up[NDimUp];
+idx_low(0) = idx_up[NDimUp - 1];

#pragma unroll
for(index_t i = 0; i < NDimUp - 1; ++i)
{
-idx_low += idx_up[i] * up_lengths_scan_[i];
+idx_low(0) += idx_up[i] * up_lengths_scan_[i];
}
-
-return LowerIndex{idx_low};
}

-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& /* idx_low_old */,
+const UpIdx& /* idx_up_old */) const
{
-return CalculateLowerIndex(idx_up_diff);
+CalculateLowerIndex(idx_low_diff, idx_up_diff);
}

-__host__ __device__ static constexpr bool IsLinearTransform() {
-return true; }
-
-__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
-{
-return true;
-}
+__host__ __device__ static constexpr bool IsLinearTransform() { return true; }

__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}

+template <typename UpIdx>
__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
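UnMerge is the inverse bookkeeping: it linearizes an upper index against strides produced by a reverse exclusive scan of the upper lengths. A standalone sketch (plain C++ with std::array; illustrative):

#include <array>
#include <cstddef>

using index_t = int;

// Reverse exclusive scan of {L0, L1, L2} with multiplies and init 1 gives
// {L1 * L2, L2, 1}: exactly the packed strides of a {L0, L1, L2} shape.
template <std::size_t N>
std::array<index_t, N> reverse_exclusive_scan_mul(const std::array<index_t, N>& len)
{
    std::array<index_t, N> scan{};
    index_t r = 1;
    for(std::size_t i = N; i-- > 0;)
    {
        scan[i] = r; // exclusive: the running product before len[i] is folded in
        r *= len[i];
    }
    return scan;
}

// DynamicUnMerge::CalculateLowerIndex, spelled out
template <std::size_t N>
index_t unmerge_offset(const std::array<index_t, N>& idx_up, const std::array<index_t, N>& scan)
{
    index_t idx_low = 0;
    for(std::size_t i = 0; i < N; ++i)
        idx_low += idx_up[i] * scan[i];
    return idx_low;
}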
@@ -411,33 +463,38 @@ struct DynamicUnMerge
struct DynamicFreeze
{
using LowerIndex = MultiIndex<1>;
-using UpperIndex = MultiIndex<>;
+using UpperIndex = MultiIndex<0>;

const index_t low_idx_;
-const index_t low_length_;

-__host__ __device__ explicit constexpr Freeze(const index_t& low_idx, const index_t& low_length)
-: low_idx_(low_idx), low_length_(low_length)
+__host__ __device__ explicit constexpr DynamicFreeze(const index_t& low_idx) : low_idx_{low_idx}
{
}

+__host__ __device__ explicit constexpr DynamicFreeze() : low_idx_{0} {}

__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }

__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 0; }

-__host__ __device__ static constexpr auto GetUpperIndex() { return UpperIndex(); }
+__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{}; }

-__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& /*idx_up*/) const
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
-return LowerIndex({low_length_});
+static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low(0) = low_idx_;
}

-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& /* idx_up_diff */,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& /* idx_low_old */,
+const UpIdx& /* idx_up_old */)
{
-return LowerIndex({0});
+idx_low_diff(0) = index_t{0};
}

__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -447,8 +504,9 @@ struct DynamicFreeze
return true;
}

+template <typename UpIdx>
__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
...
@@ -6,8 +6,323 @@
namespace ck {

-struct TensorDescriptor
template <index_t NDim>
struct DynamicNativeTensorDescriptor
{
using Index = MultiIndex<NDim>;
const Index lengths_;
const Index strides_;
__host__ __device__ explicit constexpr DynamicNativeTensorDescriptor(const Index& lengths,
const Index& strides)
: lengths_{lengths}, strides_{strides}
{
}
__host__ __device__ static constexpr index_t GetNumOfDimension() { return NDim; }
__host__ __device__ constexpr auto GetLengths() const { return lengths_; }
__host__ __device__ constexpr auto GetStrides() const { return strides_; }
__host__ __device__ constexpr index_t GetLength(index_t idim) const { return lengths_[idim]; }
__host__ __device__ constexpr index_t GetStride(index_t idim) const { return strides_[idim]; }
__host__ __device__ constexpr index_t GetElementSize() const
{
return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpace() const
{
index_t space = 1;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
space += (GetLength(i) - 1) * GetStride(i);
}
return space;
}
template <typename Idx>
__host__ __device__ constexpr index_t CalculateOffset(const Idx& idx) const
{
index_t offset = 0;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
offset += idx[i] * GetStride(i);
}
return offset;
}
template <typename UpIdxDiff, typename UpIdx, typename LowIdx>
__host__ __device__ constexpr index_t CalculateOffsetDiff(const UpIdxDiff& idx_up_diff,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
{
return CalculateOffset(idx_up_diff);
}
template <typename Idx>
__host__ __device__ constexpr bool IsUpperIndexValid(const Idx& idx) const
{
bool flag = true;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
flag = flag && idx[i] >= 0 && idx[i] < GetLength(i);
}
return flag;
}
};
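A small standalone check of the offset and element-space formulas above (plain C++; the concrete numbers are illustrative):

#include <cassert>

using index_t = int;

int main()
{
    // a 2 x 3 x 4 tensor with packed strides {12, 4, 1}
    const index_t lengths[3] = {2, 3, 4};
    const index_t strides[3] = {12, 4, 1};

    // CalculateOffset: offset = sum_i idx[i] * stride[i]
    const index_t idx[3] = {1, 2, 3};
    index_t offset = 0;
    for(index_t i = 0; i < 3; ++i)
        offset += idx[i] * strides[i];
    assert(offset == 23);

    // GetElementSpace: 1 + sum_i (length[i] - 1) * stride[i]
    index_t space = 1;
    for(index_t i = 0; i < 3; ++i)
        space += (lengths[i] - 1) * strides[i];
    assert(space == 24); // for packed strides this equals the element count

    return 0;
}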
template <typename LowTensorDescriptor, // DynamicNativeTensorDescriptor or
// DynamicTransformedTensorDescriptor
typename Transforms, // Tuple<MultiIndexTransforms...>
typename LowDimensionIds, // Tuple<Sequence<...>>
typename UpDimensionIds> // Tuple<Sequence<...>>
struct DynamicTransformedTensorDescriptor
{
const LowTensorDescriptor low_tensor_desc_;
const Transforms transforms_;
static constexpr index_t NTransform = Transforms::Size();
__host__ __device__ static constexpr index_t GetNumOfLowerDimension()
{
return LowTensorDescriptor::GetNumOfDimension();
}
__host__ __device__ static constexpr index_t GetNumOfUpperDimension()
{
index_t ndim_up = 0;
static_for<0, NTransform, 1>{}([&](auto i) constexpr {
constexpr auto tmp = UpDimensionIds{}.At(i);
ndim_up += decltype(tmp)::Size();
});
return ndim_up;
}
static constexpr index_t NDimUp = GetNumOfUpperDimension();
static constexpr index_t NDimLow = GetNumOfLowerDimension();
using UpperIndex = MultiIndex<NDimUp>;
using LowerIndex = MultiIndex<NDimLow>;
struct lambda_merge_sequences
{
template <typename... Xs>
__host__ __device__ constexpr auto operator()(Xs... xs) const
{
return merge_sequences(xs...);
}
};
struct lambda_merge_arrays
{
template <typename... Xs>
__host__ __device__ constexpr auto operator()(Xs... xs) const
{
return merge_arrays(xs...);
}
};
__host__ __device__ explicit constexpr DynamicTransformedTensorDescriptor(
const LowTensorDescriptor& low_tensor_desc, const Transforms& transforms)
: low_tensor_desc_{low_tensor_desc}, transforms_{transforms}
{
static_assert(NTransform == Transforms::Size() && NTransform == LowDimensionIds::Size() &&
NTransform == UpDimensionIds::Size(),
"wrong! # of transformations not the same");
// sanity check:
// LowDimensionIds should include all low-dimensions,
// UpDimensionIds should include all up-dimensions
using unsorted_up_dimension_ids =
decltype(unpack(lambda_merge_sequences{}, UpDimensionIds{}));
using sorted_up_dimension_ids =
typename sequence_sort<unsorted_up_dimension_ids, math::less<index_t>>::type;
static_assert(sorted_up_dimension_ids::Size() == NDimUp &&
is_valid_sequence_map<sorted_up_dimension_ids>{},
"wrong! UpDimensionIds is not configured correctly");
using unsorted_low_dimension_ids =
decltype(unpack(lambda_merge_sequences{}, LowDimensionIds{}));
using sorted_low_dimension_ids =
typename sequence_sort<unsorted_low_dimension_ids, math::less<index_t>>::type;
static_assert(sorted_low_dimension_ids::Size() == NDimLow &&
is_valid_sequence_map<sorted_low_dimension_ids>{},
"wrong! LowDimensionIds is not configured correctly");
// TODO: sanity check: while an up-dimension could be associated with multiple
// transformations, a low-dimension should be associated with only one transformation
// TODO: sanity-check: GetLowerLengths of each transform should be consistent with lengths
// of lower-tensor-descriptor
}
__host__ __device__ static constexpr auto GetNumOfDimension()
{
return GetNumOfUpperDimension();
}
__host__ __device__ constexpr auto GetUpperLengths() const
{
// sort upper-dimension-ids
constexpr auto unsorted_up_dimension_ids =
unpack(lambda_merge_sequences{}, UpDimensionIds{});
using sort_up_dimension_ids = sequence_unique_sort<decltype(unsorted_up_dimension_ids),
math::less<index_t>,
math::equal<index_t>>;
constexpr auto sorted2unsorted_map = typename sort_up_dimension_ids::sorted2unsorted_map{};
// sort upper-lengths
const auto tuple_of_up_lengths =
transform_tuples([](const auto& tran) constexpr { return tran.GetUpperLengths(); },
transforms_);
const auto unsorted_up_lengths = unpack(lambda_merge_arrays{}, tuple_of_up_lengths);
const auto sorted_up_lengths =
reorder_array_given_new2old(unsorted_up_lengths, sorted2unsorted_map);
return sorted_up_lengths;
}
__host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }
__host__ __device__ constexpr index_t GetLength(index_t idim) const
{
return GetLengths()[idim];
}
__host__ __device__ constexpr index_t GetElementSize() const
{
return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpace() const
{
return low_tensor_desc_.GetElementSpace();
}
template <typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
static_for<0, NTransform, 1>{}([&](auto itran) constexpr {
auto tran = transforms_.At(itran);
auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
tran.CalculateLowerIndex(idx_low_part, idx_up_part);
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
const UpIdxDiff& idx_up_diff,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old) const
{
static_for<0, NTransform, 1>{}([&](auto itran) {
const auto tran = transforms_.At(itran);
const auto idx_up_diff_part =
pick_array_element(idx_up_diff, UpDimensionIds{}.At(itran));
const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensionIds{}.At(itran));
const auto idx_low_old_part =
pick_array_element(idx_low_old, LowDimensionIds{}.At(itran));
auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds{}.At(itran));
tran.CalculateLowerIndexDiff(
idx_low_diff_part, idx_up_diff_part, idx_low_old_part, idx_up_old_part);
});
}
template <typename UpIdx>
__host__ __device__ constexpr auto CalculateLowerIndex(const UpIdx& idx_up) const
{
LowerIndex idx_low;
CalculateLowerIndex(idx_low, idx_up);
return idx_low;
}
template <typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr auto CalculateLowerIndexDiff(const UpIdxDiff& idx_up_diff,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old) const
{
LowerIndex idx_low_diff;
CalculateLowerIndexDiff(idx_low_diff, idx_up_diff, idx_low_old, idx_up_old);
return idx_low_diff;
}
__host__ __device__ constexpr index_t CalculateOffset(const UpperIndex& idx_up) const
{
return low_tensor_desc_.CalculateOffset(CalculateLowerIndex(idx_up));
}
__host__ __device__ constexpr bool IsUpperIndexValid(const UpperIndex& idx_up) const
{
bool flag = true;
#pragma unroll
for(index_t i = 0; i < NDimUp; ++i)
{
flag = flag && idx_up[i] >= 0 && idx_up[i] < GetLength(i);
}
return flag;
}
__host__ __device__ constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
{
bool flag = true;
static_for<0, NTransform, 1>{}([&](auto itran) {
const auto tran = Transforms{}.At(itran);
// only check a transformation if it does not always have a valid mapping
constexpr bool is_valid_up_always_mapped_to_valid_low =
decltype(tran)::IsValidUpperIndexAlwaysMappedToValidLowerIndex();
if constexpr(!is_valid_up_always_mapped_to_valid_low)
{
const auto up_dims_part = UpDimensionIds{}.At(itran);
const auto idx_up_part = pick_array_element(idx_up, up_dims_part);

flag = flag && tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up_part);
}
});
return flag;
}
};

} // namespace ck
...
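How the per-transform dimension ids dispatch pieces of the index: a hand-written equivalent of one Merge plus one PassThrough layer over a 3D tensor (plain C++; not the CK template machinery):

#include <cassert>

using index_t = int;

int main()
{
    // lower tensor lengths {4, 5, 6}
    // transforms:  Merge{{4, 5}}   low dims {0, 1} -> up dim {0}
    //              PassThrough{6}  low dim  {2}    -> up dim {1}
    const index_t N1 = 5;

    const index_t idx_up[2] = {13, 2}; // upper shape is {20, 6}

    index_t idx_low[3];
    // Merge picks idx_up[0] (its UpDimensionIds) and writes idx_low[0..1]
    idx_low[0] = idx_up[0] / N1;
    idx_low[1] = idx_up[0] % N1;
    // PassThrough picks idx_up[1] and writes idx_low[2]
    idx_low[2] = idx_up[1];

    assert(idx_low[0] == 2 && idx_low[1] == 3 && idx_low[2] == 2);
    return 0;
}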
@@ -15,5 +15,21 @@ __host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const L
return DynamicNativeTensorDescriptor<Lengths::GetSize()>(lengths, strides);
}
template <typename LowTensorDescriptor,
typename Transforms,
typename LowDimensionIds,
typename UpDimensionIds>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor(const LowTensorDescriptor& low_tensor_desc,
const Transforms& transforms,
LowDimensionIds,
UpDimensionIds)
{
return DynamicTransformedTensorDescriptor<LowTensorDescriptor,
Transforms,
LowDimensionIds,
UpDimensionIds>{low_tensor_desc, transforms};
}
} // namespace ck

#endif
@@ -531,47 +531,5 @@ struct Freeze
}
};

-template <index_t LowerLength, index_t VectorSize>
-struct Vectorize
-{
-using LowerIndex = MultiIndex<1>;
-using UpperIndex = MultiIndex<1>;
-
-__host__ __device__ constexpr Vectorize()
-{
-static_assert(VectorSize > 0 && LowerLength % VectorSize == 0,
-"wrong! cannot evenly divide");
-}
-
-__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
-
-__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
-
-__host__ __device__ static constexpr auto GetUpperLengths()
-{
-return Sequence<LowerLength / VectorSize>{};
-}
-
-__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
-{
-return VectorSize * idx_up;
-}
-
-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
-{
-return VectorSize * idx_up_diff;
-}
-
-__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
-
-__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
-{
-return true;
-}
-};

} // namespace ck

#endif
@@ -12,8 +12,9 @@ struct Array
using type = Array<TData, NSize>;
using data_type = TData;

+// hack: add extra element to allow empty array
// TODO: implement empty Array
-TData mData[NSize] = {0};
+TData mData[NSize + 1] = {0};

__host__ __device__ explicit constexpr Array() {}
@@ -136,16 +137,16 @@ struct ArrayElementPicker
return mArray(IP);
}

-template <typename I>
-__host__ __device__ constexpr const data_type& operator[](I i) const
+__host__ __device__ constexpr const data_type& operator[](index_t i) const
{
-return At(i);
+index_t ip = Picks{}[i];
+return mArray[ip];
}

-template <typename I>
-__host__ __device__ constexpr data_type& operator()(I i)
+__host__ __device__ constexpr data_type& operator()(index_t i)
{
-return At(i);
+index_t ip = Picks{}[i];
+return mArray(ip);
}

template <typename T>
...
@@ -244,7 +244,7 @@ __host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr TData
-accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
+reduce_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
{
TData result = init;
@@ -288,10 +288,40 @@ reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData in
r = f(r, x[i]);
}

-y(i) = r;
+y(NSize - 1) = r;

return y;
}
template <typename X, typename... Ys>
__host__ __device__ constexpr auto merge_arrays(const X& x, const Ys&... ys)
{
return merge_arrays(x, merge_arrays(ys...));
}
template <typename T, index_t NX, index_t NY>
__host__ __device__ constexpr auto merge_arrays(const Array<T, NX>& x, const Array<T, NY>& y)
{
Array<T, NX + NY> z;
for(index_t i = 0; i < NX; ++i)
{
z(i) = x[i];
}
for(index_t i = 0; i < NY; ++i)
{
z(i + NX) = y[i];
}
return z;
}
template <typename X>
__host__ __device__ constexpr auto merge_arrays(const X& x)
{
return x;
}
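For reference, a standalone equivalent of this recursive merge written against std::array (illustrative; outside the ck types):

#include <array>
#include <cstddef>

// two-array base case: concatenation
template <typename T, std::size_t NX, std::size_t NY>
std::array<T, NX + NY> merge(const std::array<T, NX>& x, const std::array<T, NY>& y)
{
    std::array<T, NX + NY> z{};
    for(std::size_t i = 0; i < NX; ++i)
        z[i] = x[i];
    for(std::size_t i = 0; i < NY; ++i)
        z[i + NX] = y[i];
    return z;
}

// single-array base case: identity
template <typename T, std::size_t N>
std::array<T, N> merge(const std::array<T, N>& x)
{
    return x;
}

// variadic case: peel one argument per recursion step
template <typename T, std::size_t NX, typename... Ys>
auto merge(const std::array<T, NX>& x, const Ys&... ys)
{
    return merge(x, merge(ys...));
}

// e.g. merge(std::array<int, 2>{{1, 2}}, std::array<int, 1>{{3}}) yields {1, 2, 3}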
} // namespace ck

#endif
@@ -8,9 +8,11 @@
#include "float_type.hpp"
#include "type.hpp"
#include "tuple.hpp"
+#include "tuple_helper.hpp"
#include "math.hpp"
#include "sequence.hpp"
#include "array.hpp"
+#include "array_helper.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
...
@@ -104,56 +104,5 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
}
};

-template <typename... Xs>
-__host__ __device__ constexpr auto make_tuple(Xs&&... xs)
-{
-return Tuple<remove_cv_t<remove_reference_t<Xs>>...>(std::forward<Xs>(xs)...);
-}
-
-namespace detail {
-
-template <typename F, typename X, index_t... Is>
-__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence<Is...>)
-{
-return make_tuple(f(x.At(Number<Is>{}))...);
-}
-
-template <typename F, typename X, typename Y, index_t... Is>
-__host__ __device__ constexpr auto
-transform_tuples_impl(F f, const X& x, const Y& y, Sequence<Is...>)
-{
-return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}))...);
-}
-
-template <typename F, typename X, typename Y, typename Z, index_t... Is>
-__host__ __device__ constexpr auto
-transform_tuples_impl(F f, const X& x, const Y& y, const Z& z, Sequence<Is...>)
-{
-return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}), z.At(Number<Is>{}))...);
-}
-
-} // namespace detail
-
-template <typename F, typename X>
-__host__ __device__ constexpr auto transform_tuples(F f, const X& x)
-{
-return detail::transform_tuples_impl(
-f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
-}
-
-template <typename F, typename X, typename Y>
-__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y)
-{
-return detail::transform_tuples_impl(
-f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
-}
-
-template <typename F, typename X, typename Y, typename Z>
-__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y, const Z& z)
-{
-return detail::transform_tuples_impl(
-f, x, y, z, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
-}

} // namespace ck

#endif
#ifndef CK_TUPLE_HELPER_HPP
#define CK_TUPLE_HELPER_HPP
#include "tuple_helper.hpp"
namespace ck {
template <typename... Xs>
__host__ __device__ constexpr auto make_tuple(Xs&&... xs)
{
return Tuple<remove_cv_t<remove_reference_t<Xs>>...>(std::forward<Xs>(xs)...);
}
namespace detail {
template <typename F, typename X, index_t... Is>
__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}))...);
}
template <typename F, typename X, typename Y, index_t... Is>
__host__ __device__ constexpr auto
transform_tuples_impl(F f, const X& x, const Y& y, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}))...);
}
template <typename F, typename X, typename Y, typename Z, index_t... Is>
__host__ __device__ constexpr auto
transform_tuples_impl(F f, const X& x, const Y& y, const Z& z, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}), z.At(Number<Is>{}))...);
}
} // namespace detail
template <typename F, typename X>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x)
{
return detail::transform_tuples_impl(
f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
template <typename F, typename X, typename Y>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y)
{
return detail::transform_tuples_impl(
f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
template <typename F, typename X, typename Y, typename Z>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y, const Z& z)
{
return detail::transform_tuples_impl(
f, x, y, z, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
} // namespace ck
#endif
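Usage shape of transform_tuples, as exercised by GetUpperLengths() in the transformed descriptor above (illustrative):

// collect the upper lengths of every transform in a tuple:
//   const auto tuple_of_up_lengths = transform_tuples(
//       [](const auto& tran) { return tran.GetUpperLengths(); }, transforms_);
// unary, binary and ternary forms apply f element-wise across the tuples.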
@@ -17,5 +17,5 @@ cmake
${MY_PROJECT_SOURCE}
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps" \
+#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps" \
+#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \