Commit 0b7fcca6 authored by Chao Liu

prototype dynamic tensor descriptor

parent 4388f572
......@@ -3,7 +3,7 @@ project(modular_convolution)
#c++
enable_language(CXX)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
......
......@@ -414,7 +414,7 @@ struct DummyDynamicTransform
idx[13] += idx_diff[13];
// padding check
bool is_in_bound = true;
bool is_in_bound = true;
#else // pad
// offset
idx[0] += idx_diff[0];
......@@ -462,25 +462,102 @@ struct DummyDynamicTransform
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const
{
Index idx_up;
idx_up(0) = in_n_c_hi_wi_global_desc.GetLength(0);
idx_up(1) = in_n_c_hi_wi_global_desc.GetLength(1);
idx_up(2) = in_n_c_hi_wi_global_desc.GetLength(2);
idx_up(3) = in_n_c_hi_wi_global_desc.GetLength(3);
#if 0
constexpr auto trans = GetTransforms();
auto idx_low = trans[0]->CalculateLowerIndex(idx_up);
#elif 1
constexpr DynamicCoordinateTransform* tran = &embed;
auto idx_low = tran->CalculateLowerIndex(idx_up);
#endif
p_out_global[get_thread_local_1d_id()] = idx_low[0];
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
#pragma unroll 1
for(index_t iter = 0; iter < 100; ++iter)
{
//
MultiIndex<2> idx;
// initialize idx
for(index_t i = 0; i < 2; ++i)
{
idx(i) = p_wei_global[10 * iter + get_thread_local_1d_id() + i];
}
// offset
index_t offset = in_gemmk_gemmn_global_desc.CalculateOffset(idx);
// is_in_bound
bool is_in_bound =
in_gemmk_gemmn_global_desc.IsValidUpperIndexMappedToValidLowerIndex(idx);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
offset,
is_in_bound,
out_n_k_ho_wo_global_desc.GetElementSpace());
}
}
__device__ void Run(index_t* const __restrict__ p_wei_global,
......
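The pad / embed / merge chain built above is the usual implicit-GEMM (im2col) index map for the convolution input. As a worked illustration of what that composition computes, here is a plain-C++ restatement of the lowering from one (gemmk, gemmn) element of in_gemmk_gemmn_global_desc down to an offset into the NCHW input, together with the padding bounds check that IsValidUpperIndexMappedToValidLowerIndex is responsible for. This sketch is not part of the commit, it assumes a packed NCHW input (unit innermost stride), and the function name is made up for illustration.

#include <cstdint>

// gemmk = (c * Y + y) * X + x      (Merge over the C, Y, X dimensions)
// gemmn = (n * Ho + ho) * Wo + wo  (Merge over the N, Ho, Wo dimensions)
bool lower_gemm_index_to_input_offset(int64_t gemmk, int64_t gemmn,
                                      int64_t C, int64_t Y, int64_t X,
                                      int64_t Hi, int64_t Wi,
                                      int64_t Ho, int64_t Wo,
                                      int64_t ConvStrideH, int64_t ConvStrideW,
                                      int64_t ConvDilationH, int64_t ConvDilationW,
                                      int64_t InLeftPadH, int64_t InLeftPadW,
                                      int64_t& offset)
{
    // undo the two Merge transforms
    const int64_t x = gemmk % X;
    const int64_t y = (gemmk / X) % Y;
    const int64_t c = gemmk / (X * Y);

    const int64_t wo = gemmn % Wo;
    const int64_t ho = (gemmn / Wo) % Ho;
    const int64_t n  = gemmn / (Wo * Ho);

    // undo the Embed (hip = y * ConvDilationH + ho * ConvStrideH) and the left pad
    const int64_t hi = y * ConvDilationH + ho * ConvStrideH - InLeftPadH;
    const int64_t wi = x * ConvDilationW + wo * ConvStrideW - InLeftPadW;

    // indices that land in the padding region fall outside the physical input;
    // this is the check the descriptor's validity query performs before the store
    const bool is_in_bound = hi >= 0 && hi < Hi && wi >= 0 && wi < Wi;

    // offset into a packed NCHW input tensor
    offset = ((n * C + c) * Hi + hi) * Wi + wi;

    return is_in_bound;
}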
......@@ -6,8 +6,323 @@
namespace ck {
struct TensorDescriptor
template <index_t NDim>
struct DynamicNativeTensorDescriptor
{
using Index = MultiIndex<NDim>;
const Index lengths_;
const Index strides_;
__host__ __device__ explicit constexpr DynamicNativeTensorDescriptor(const Index& lengths,
const Index& strides)
: lengths_{lengths}, strides_{strides}
{
}
__host__ __device__ static constexpr index_t GetNumOfDimension() { return NDim; }
__host__ __device__ constexpr auto GetLengths() const { return lengths_; }
__host__ __device__ constexpr auto GetStrides() const { return strides_; }
__host__ __device__ constexpr index_t GetLength(index_t idim) const { return lengths_[idim]; }
__host__ __device__ constexpr index_t GetStride(index_t idim) const { return strides_[idim]; }
__host__ __device__ constexpr index_t GetElementSize() const
{
return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpace() const
{
index_t space = 1;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
space += (GetLength(i) - 1) * GetStride(i);
}
return space;
}
template <typename Idx>
__host__ __device__ constexpr index_t CalculateOffset(const Idx& idx) const
{
index_t offset = 0;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
offset += idx[i] * GetStride(i);
}
return offset;
}
template <typename UpIdxDiff, typename UpIdx, typename LowIdx>
__host__ __device__ constexpr index_t CalculateOffsetDiff(const UpIdxDiff& idx_up_diff,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
{
return CalculateOffset(idx_up_diff);
}
template <typename Idx>
__host__ __device__ constexpr bool IsUpperIndexValid(const Idx& idx) const
{
bool flag = true;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
flag = flag && idx[i] >= 0 && idx[i] < GetLength(i);
}
return flag;
}
};
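As a concrete check of the two formulas above, GetElementSpace() computes 1 + sum_i (length_i - 1) * stride_i and CalculateOffset() computes sum_i idx_i * stride_i. A small standalone sketch, using plain C arrays instead of the ck types purely to illustrate the arithmetic:

#include <cassert>

int main()
{
    // a 4 x 3 view stored with a row stride of 8, i.e. 5 padding elements per row
    const int lengths[2] = {4, 3};
    const int strides[2] = {8, 1};

    // element space = 1 + sum((length_i - 1) * stride_i), as in GetElementSpace()
    int space = 1;
    for(int i = 0; i < 2; ++i)
        space += (lengths[i] - 1) * strides[i];
    assert(space == 27); // last addressable element sits at offset 26

    // offset = sum(idx_i * stride_i), as in CalculateOffset()
    const int idx[2] = {2, 1};
    int offset = 0;
    for(int i = 0; i < 2; ++i)
        offset += idx[i] * strides[i];
    assert(offset == 17); // element (2, 1) lives at offset 2 * 8 + 1

    return 0;
}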
template <typename LowTensorDescriptor, // DynamicNativeTensorDescriptor or
// DynamicTransformedTensorDescriptor
typename Transforms, // Tuple<MultIndexTransforms...>
typename LowDimensionIds, // Tuple<Sequence<...>>
typename UpDimensionIds> // Tuple<Sequence<...>>
struct DynamicTransformedTensorDescriptor
{
const LowTensorDescriptor low_tensor_desc_;
const Transforms transforms_;
static constexpr index_t NTransform = Transforms::Size();
__host__ __device__ static constexpr index_t GetNumOfLowerDimension()
{
return LowTensorDescriptor::GetNumOfDimension();
}
__host__ __device__ static constexpr index_t GetNumOfUpperDimension()
{
index_t ndim_up = 0;
static_for<0, NTransform, 1>{}([&](auto i) constexpr {
constexpr auto tmp = UpDimensionIds{}.At(i);
ndim_up += decltype(tmp)::Size();
});
return ndim_up;
}
static constexpr index_t NDimUp = GetNumOfUpperDimension();
static constexpr index_t NDimLow = GetNumOfLowerDimension();
using UpperIndex = MultiIndex<NDimUp>;
using LowerIndex = MultiIndex<NDimLow>;
struct lambda_merge_sequences
{
template <typename... Xs>
__host__ __device__ constexpr auto operator()(Xs... xs) const
{
return merge_sequences(xs...);
}
};
struct lambda_merge_arrays
{
template <typename... Xs>
__host__ __device__ constexpr auto operator()(Xs... xs) const
{
return merge_arrays(xs...);
}
};
__host__ __device__ explicit constexpr DynamicTransformedTensorDescriptor(
const LowTensorDescriptor& low_tensor_desc, const Transforms& transforms)
: low_tensor_desc_{low_tensor_desc}, transforms_{transforms}
{
static_assert(NTransform == Transforms::Size() && NTransform == LowDimensionIds::Size() &&
NTransform == UpDimensionIds::Size(),
"wrong! # of transformations not the same");
// sanity check:
// LowDimensionIds should include all low-dimensions,
// UpDimensionIds should include all up-dimensions
using unsorted_up_dimension_ids =
decltype(unpack(lambda_merge_sequences{}, UpDimensionIds{}));
using sorted_up_dimension_ids =
typename sequence_sort<unsorted_up_dimension_ids, math::less<index_t>>::type;
static_assert(sorted_up_dimension_ids::Size() == NDimUp &&
is_valid_sequence_map<sorted_up_dimension_ids>{},
"wrong! UpDimensionIds is not configured correctly");
using unsorted_low_dimension_ids =
decltype(unpack(lambda_merge_sequences{}, LowDimensionIds{}));
using sorted_low_dimension_ids =
typename sequence_sort<unsorted_low_dimension_ids, math::less<index_t>>::type;
static_assert(sorted_low_dimension_ids::Size() == NDimLow &&
is_valid_sequence_map<sorted_low_dimension_ids>{},
"wrong! LowDimensionIds is not configured correctly");
// TODO: sanity check: while an up-dimension could be associated with multiple
// transformations, a low-dimension should be associated with only one transformation
// TODO: sanity-check: GetLowerLengths of each transform should be consistent with lengths
// of lower-tensor-descriptor
}
__host__ __device__ static constexpr auto GetNumOfDimension()
{
return GetNumOfUpperDimension();
}
__host__ __device__ constexpr auto GetUpperLengths() const
{
// sort upper-dimension-ids
constexpr auto unsorted_up_dimension_ids =
unpack(lambda_merge_sequences{}, UpDimensionIds{});
using sort_up_dimension_ids = sequence_unique_sort<decltype(unsorted_up_dimension_ids),
math::less<index_t>,
math::equal<index_t>>;
constexpr auto sorted2unsorted_map = typename sort_up_dimension_ids::sorted2unsorted_map{};
// sort upper-lengths
const auto tuple_of_up_lengths =
transform_tuples([](const auto& tran) constexpr { return tran.GetUpperLengths(); },
transforms_);
const auto unsorted_up_lengths = unpack(lambda_merge_arrays{}, tuple_of_up_lengths);
const auto sorted_up_lengths =
reorder_array_given_new2old(unsorted_up_lengths, sorted2unsorted_map);
return sorted_up_lengths;
}
__host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }
__host__ __device__ constexpr index_t GetLength(index_t idim) const
{
return GetLengths()[idim];
}
__host__ __device__ constexpr index_t GetElementSize() const
{
return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpace() const
{
return low_tensor_desc_.GetElementSpace();
}
template <typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
static_for<0, NTransform, 1>{}([&](auto itran) constexpr {
auto tran = transforms_.At(itran);
auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
tran.CalculateLowerIndex(idx_low_part, idx_up_part);
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
const UpIdxDiff& idx_up_diff,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old) const
{
static_for<0, NTransform, 1>{}([&](auto itran) {
const auto tran = transforms_.At(itran);
const auto idx_up_diff_part =
pick_array_element(idx_up_diff, UpDimensionIds{}.At(itran));
const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensionIds{}.At(itran));
const auto idx_low_old_part =
pick_array_element(idx_low_old, LowDimensionIds{}.At(itran));
auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds{}.At(itran));
tran.CalculateLowerIndexDiff(
idx_low_diff_part, idx_up_diff_part, idx_low_old_part, idx_up_old_part);
});
}
template <typename UpIdx>
__host__ __device__ constexpr auto CalculateLowerIndex(const UpIdx& idx_up) const
{
LowerIndex idx_low;
CalculateLowerIndex(idx_low, idx_up);
return idx_low;
}
template <typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr auto CalculateLowerIndexDiff(const UpIdxDiff& idx_up_diff,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old) const
{
LowerIndex idx_low_diff;
CalculateLowerIndexDiff(idx_low_diff, idx_up_diff, idx_low_old, idx_up_old);
return idx_low_diff;
}
__host__ __device__ constexpr index_t CalculateOffset(const UpperIndex& idx_up) const
{
return low_tensor_desc_.CalculateOffset(CalculateLowerIndex(idx_up));
}
__host__ __device__ constexpr bool IsUpperIndexValid(const UpperIndex& idx_up) const
{
bool flag = true;
#pragma unroll
for(index_t i = 0; i < NDimUp; ++i)
{
flag = flag && idx_up[i] >= 0 && idx_up[i] < GetLength(i);
}
return flag;
}
__host__ __device__ constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
{
bool flag = true;
static_for<0, NTransform, 1>{}([&](auto itran) {
const auto tran = Transforms{}.At(itran);
// only check a transformation if it does not always have a valid mapping
constexpr bool is_valid_up_always_mapped_to_valid_low =
decltype(tran)::IsValidUpperIndexAlwaysMappedToValidLowerIndex();
if constexpr(!is_valid_up_always_mapped_to_valid_low)
{
const auto up_dims_part = UpDimensionIds{}.At(itran);
const auto idx_up_part = pick_array_element(idx_up, up_dims_part);
flag = flag && tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up_part);
}
});
return flag;
}
};
} // namespace ck
......
......@@ -15,5 +15,21 @@ __host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const L
return DynamicNativeTensorDescriptor<Lengths::GetSize()>(lengths, strides);
}
template <typename LowTensorDescriptor,
typename Transforms,
typename LowDimensionIds,
typename UpDimensionIds>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor(const LowTensorDescriptor& low_tensor_desc,
const Transforms& transforms,
LowDimensionIds,
UpDimensionIds)
{
return DynamicTransformedTensorDescriptor<LowTensorDescriptor,
Transforms,
LowDimensionIds,
UpDimensionIds>{low_tensor_desc, transforms};
}
} // namespace ck
#endif
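A minimal host-side sketch of how the two factories compose, mirroring the padding step used in the kernel above. The include name is an assumption of this example (the factories live in the header changed in this hunk); the transform names and constructor argument orders are the ones that appear in this commit.

// hypothetical include path, for illustration only
#include "dynamic_tensor_descriptor_helper.hpp"

using namespace ck;

void dynamic_descriptor_example()
{
    // a packed 8 x 10 tensor whose lengths/strides are only known at runtime
    Array<index_t, 2> lengths;
    Array<index_t, 2> strides;
    lengths(0) = 8;
    lengths(1) = 10;
    strides(0) = 10;
    strides(1) = 1;

    const auto desc = make_dynamic_native_tensor_descriptor(lengths, strides);

    // keep dimension 0, left-pad dimension 1 by 2
    const auto padded_desc = transform_dynamic_tensor_descriptor(
        desc,
        make_tuple(DynamicPassThrough{8}, DynamicLeftPad{10, 2}),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    // upper index (3, 4) lowers to (3, 4 - 2) = (3, 2), i.e. offset 3 * 10 + 2 = 32
    MultiIndex<2> idx;
    idx(0) = 3;
    idx(1) = 4;
    const index_t offset = padded_desc.CalculateOffset(idx);
    (void)offset;
}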
......@@ -531,47 +531,5 @@ struct Freeze
}
};
template <index_t LowerLength, index_t VectorSize>
struct Vectorize
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
__host__ __device__ constexpr Vectorize()
{
static_assert(VectorSize > 0 && LowerLength % VectorSize == 0,
"wrong! cannot evenly divide");
}
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetUpperLengths()
{
return Sequence<LowerLength / VectorSize>{};
}
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return VectorSize * idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return VectorSize * idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
};
} // namespace ck
#endif
......@@ -12,8 +12,9 @@ struct Array
using type = Array<TData, NSize>;
using data_type = TData;
// hack: add extra element to allow empty array
// TODO: implement empty Array
TData mData[NSize] = {0};
TData mData[NSize + 1] = {0};
__host__ __device__ explicit constexpr Array() {}
......@@ -136,16 +137,16 @@ struct ArrayElementPicker
return mArray(IP);
}
template <typename I>
__host__ __device__ constexpr const data_type& operator[](I i) const
__host__ __device__ constexpr const data_type& operator[](index_t i) const
{
return At(i);
index_t ip = Picks{}[i];
return mArray[ip];
}
template <typename I>
__host__ __device__ constexpr data_type& operator()(I i)
__host__ __device__ constexpr data_type& operator()(index_t i)
{
return At(i);
index_t ip = Picks{}[i];
return mArray(ip);
}
template <typename T>
......
......@@ -244,7 +244,7 @@ __host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr TData
accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
reduce_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
{
TData result = init;
......@@ -288,10 +288,40 @@ reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData in
r = f(r, x[i]);
}
y(i) = r;
y(NSize - 1) = r;
return y;
}
template <typename X, typename... Ys>
__host__ __device__ constexpr auto merge_arrays(const X& x, const Ys&... ys)
{
return merge_arrays(x, merge_arrays(ys...));
}
template <typename T, index_t NX, index_t NY>
__host__ __device__ constexpr auto merge_arrays(const Array<T, NX>& x, const Array<T, NY>& y)
{
Array<T, NX + NY> z;
for(index_t i = 0; i < NX; ++i)
{
z(i) = x[i];
}
for(index_t i = 0; i < NY; ++i)
{
z(i + NX) = y[i];
}
return z;
}
template <typename X>
__host__ __device__ constexpr auto merge_arrays(const X& x)
{
return x;
}
} // namespace ck
#endif
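One subtlety in the merge_arrays overloads above: the variadic overload is defined before the binary and unary ones, yet its recursive call merge_arrays(x, merge_arrays(ys...)) still resolves, because the call is dependent and its arguments are ck::Array values, so argument-dependent lookup at the point of instantiation also finds the overloads declared later in namespace ck. A standalone illustration of the same peel-one-argument recursion, written against std::array purely for demonstration (not the ck code):

#include <array>
#include <cstddef>

// base case: a single array merges to itself
template <typename T, std::size_t NX>
constexpr std::array<T, NX> merge(const std::array<T, NX>& x)
{
    return x;
}

// binary case: concatenate two arrays
template <typename T, std::size_t NX, std::size_t NY>
constexpr std::array<T, NX + NY> merge(const std::array<T, NX>& x, const std::array<T, NY>& y)
{
    std::array<T, NX + NY> z{};
    for(std::size_t i = 0; i < NX; ++i)
        z[i] = x[i];
    for(std::size_t i = 0; i < NY; ++i)
        z[NX + i] = y[i];
    return z;
}

// variadic case: peel off one argument per step
template <typename T, std::size_t NX, typename... Ys>
constexpr auto merge(const std::array<T, NX>& x, const Ys&... ys)
{
    return merge(x, merge(ys...));
}

constexpr auto merged = merge(std::array<int, 2>{1, 2}, std::array<int, 1>{3}, std::array<int, 2>{4, 5});
static_assert(merged.size() == 5 && merged[0] == 1 && merged[2] == 3 && merged[4] == 5, "merge works");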
......@@ -8,9 +8,11 @@
#include "float_type.hpp"
#include "type.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "math.hpp"
#include "sequence.hpp"
#include "array.hpp"
#include "array_helper.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
......
......@@ -104,56 +104,5 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
}
};
template <typename... Xs>
__host__ __device__ constexpr auto make_tuple(Xs&&... xs)
{
return Tuple<remove_cv_t<remove_reference_t<Xs>>...>(std::forward<Xs>(xs)...);
}
namespace detail {
template <typename F, typename X, index_t... Is>
__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}))...);
}
template <typename F, typename X, typename Y, index_t... Is>
__host__ __device__ constexpr auto
transform_tuples_impl(F f, const X& x, const Y& y, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}))...);
}
template <typename F, typename X, typename Y, typename Z, index_t... Is>
__host__ __device__ constexpr auto
transform_tuples_impl(F f, const X& x, const Y& y, const Z& z, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}), z.At(Number<Is>{}))...);
}
} // namespace detail
template <typename F, typename X>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x)
{
return detail::transform_tuples_impl(
f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
template <typename F, typename X, typename Y>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y)
{
return detail::transform_tuples_impl(
f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
template <typename F, typename X, typename Y, typename Z>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y, const Z& z)
{
return detail::transform_tuples_impl(
f, x, y, z, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
} // namespace ck
#endif
#ifndef CK_TUPLE_HELPER_HPP
#define CK_TUPLE_HELPER_HPP
#include "tuple_helper.hpp"
namespace ck {
template <typename... Xs>
__host__ __device__ constexpr auto make_tuple(Xs&&... xs)
{
return Tuple<remove_cv_t<remove_reference_t<Xs>>...>(std::forward<Xs>(xs)...);
}
namespace detail {
template <typename F, typename X, index_t... Is>
__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}))...);
}
template <typename F, typename X, typename Y, index_t... Is>
__host__ __device__ constexpr auto
transform_tuples_impl(F f, const X& x, const Y& y, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}))...);
}
template <typename F, typename X, typename Y, typename Z, index_t... Is>
__host__ __device__ constexpr auto
transform_tuples_impl(F f, const X& x, const Y& y, const Z& z, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}), z.At(Number<Is>{}))...);
}
} // namespace detail
template <typename F, typename X>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x)
{
return detail::transform_tuples_impl(
f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
template <typename F, typename X, typename Y>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y)
{
return detail::transform_tuples_impl(
f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
template <typename F, typename X, typename Y, typename Z>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y, const Z& z)
{
return detail::transform_tuples_impl(
f, x, y, z, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
} // namespace ck
#endif
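A short usage sketch for the transform_tuples helper moved into tuple_helper.hpp. The call forms follow the definitions above; the host-side context and the plain lambda are assumptions of this example (in device code the callable would need the usual __host__ __device__ annotations), not something the commit shows.

#include "tuple_helper.hpp"

using namespace ck;

void tuple_helper_example()
{
    // element-wise combination of two equally sized tuples
    const auto lens   = make_tuple(8, 4);
    const auto pads   = make_tuple(1, 2);
    const auto padded = transform_tuples([](auto len, auto pad) { return len + 2 * pad; }, lens, pads);

    // padded holds {10, 8}
    (void)padded;
}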
......@@ -17,5 +17,5 @@ cmake
${MY_PROJECT_SOURCE}
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \