Unverified commit b62bf8c3, authored by Chao Liu, committed by GitHub

Merge pull request #14 from ROCmSoftwarePlatform/miopen_downstream_init_integration

MIOpen Downstream: Initial integration 2nd PR
parents ccc4a1d3 67ad47e7
@@ -8,7 +8,7 @@ namespace ck {
 template <typename Lengths,
           typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
-__host__ __device__ constexpr auto make_cluster_descriptor_v2(
+__host__ __device__ constexpr auto make_cluster_descriptor(
     const Lengths& lengths,
     ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
 {
...
@@ -481,11 +481,11 @@ struct Merge_v1_carry_check
     using LowerIndex = MultiIndex<NDimLow>;
     using UpperIndex = MultiIndex<1>;
-    using LowLengthsScan = decltype(
-        container_reverse_exclusive_scan(LowLengths{}, math::multiplies_v2{}, Number<1>{}));
+    using LowLengthsScan =
+        decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
     using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
+        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
     LowLengths low_lengths_;
     LowLengthsScan low_lengths_scan_;
@@ -496,8 +496,8 @@ struct Merge_v1_carry_check
     __host__ __device__ constexpr Merge_v1_carry_check(const LowLengths& low_lengths)
         : low_lengths_{low_lengths},
           low_lengths_scan_{
-              container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
+              container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})},
+          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }
@@ -1037,7 +1037,7 @@ struct Merge_v2_magic_division
     using UpperIndex = MultiIndex<1>;
     using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
+        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
     using LowLengthsMagicDivisorMultipiler = decltype(
         generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengths>{},
@@ -1062,7 +1062,7 @@ struct Merge_v2_magic_division
           low_lengths_magic_divisor_shift_{generate_tuple(
               [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths[i]); },
               Number<NDimLow>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
+          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }
@@ -1188,11 +1188,11 @@ struct Merge_v2r2_magic_division
     using LowerIndex = MultiIndex<NDimLow>;
     using UpperIndex = MultiIndex<1>;
-    using LowLengthsScan = decltype(
-        container_reverse_exclusive_scan(LowLengths{}, math::multiplies_v2{}, Number<1>{}));
+    using LowLengthsScan =
+        decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
     using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
+        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
     using LowLengthsScanMagicDivisorMultipiler = decltype(generate_tuple(
         lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengthsScan>{},
@@ -1213,14 +1213,14 @@ struct Merge_v2r2_magic_division
     __host__ __device__ constexpr Merge_v2r2_magic_division(const LowLengths& low_lengths)
         : low_lengths_{low_lengths},
           low_lengths_scan_{
-              container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})},
+              container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})},
           low_lengths_scan_magic_divisor_multiplier_{generate_tuple(
               [&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths_scan_[i]); },
               Number<NDimLow>{})},
           low_lengths_scan_magic_divisor_shift_{generate_tuple(
               [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths_scan_[i]); },
               Number<NDimLow>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
+          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }
@@ -1336,7 +1336,7 @@ struct UnMerge
     using UpperIndex = MultiIndex<NDimUp>;
     using UpLengthsScan =
-        decltype(container_reverse_exclusive_scan(UpLengths{}, math::multiplies_v2{}, Number<1>{}));
+        decltype(container_reverse_exclusive_scan(UpLengths{}, math::multiplies{}, Number<1>{}));
     UpLengths up_lengths_;
     UpLengthsScan up_lengths_scan_;
@@ -1346,7 +1346,7 @@ struct UnMerge
     __host__ __device__ constexpr UnMerge(const UpLengths& up_lengths)
         : up_lengths_{up_lengths},
           up_lengths_scan_{
-              container_reverse_exclusive_scan(up_lengths, math::multiplies_v2{}, Number<1>{})}
+              container_reverse_exclusive_scan(up_lengths, math::multiplies{}, Number<1>{})}
     {
     }
...
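Throughout the Merge/UnMerge hunks above, container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{}) plays the role of computing the stride of each low dimension inside the merged index, and container_reduce(low_lengths, math::multiplies{}, Number<1>{}) the merged length. Below is a minimal standalone sketch of that arithmetic, using std::vector in place of the library's compile-time Tuple/Number containers; reverse_exclusive_scan_mul is a hypothetical helper written only for this illustration, and the exclusive-from-the-right convention is an assumption about the library helper rather than something shown in this diff.

// Illustration only: what the multiplicative reverse exclusive scan and reduce compute.
#include <cassert>
#include <cstddef>
#include <vector>

// out[i] = lengths[i+1] * ... * lengths[n-1], out[n-1] = init (assumed convention)
std::vector<int> reverse_exclusive_scan_mul(const std::vector<int>& lengths, int init)
{
    std::vector<int> out(lengths.size());
    int running = init;
    for(std::size_t i = lengths.size(); i-- > 0;)
    {
        out[i] = running; // exclusive: the current length is not folded in yet
        running *= lengths[i];
    }
    return out;
}

int main()
{
    const std::vector<int> low_lengths{2, 3, 4};

    // the role of low_lengths_scan_: per-dimension strides inside the merged index
    const auto scan = reverse_exclusive_scan_mul(low_lengths, 1);
    assert((scan == std::vector<int>{12, 4, 1}));

    // the role of up_lengths_: the single merged length, 2 * 3 * 4
    int merged_length = 1;
    for(int l : low_lengths)
        merged_length *= l;
    assert(merged_length == 24);

    return 0;
}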
@@ -64,7 +64,7 @@ struct TensorAdaptor
                                           Number<ndim_top_>{});
        // TODO: make container_reduce support tuple of Number and index_t
-        return container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
+        return container_reduce(lengths, math::multiplies{}, Number<1>{});
    }
    template <index_t IDim>
...
@@ -69,7 +69,7 @@ struct TensorDescriptor
                                           Number<ndim_visible_>{});
        // TODO: make container_reduce support tuple of Number and index_t
-        return container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
+        return container_reduce(lengths, math::multiplies{}, Number<1>{});
    }
    template <index_t IDim>
...
@@ -38,8 +38,8 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt
template <typename... Lengths,
          typename... Strides,
          typename enable_if<sizeof...(Lengths) == sizeof...(Strides), bool>::type = false>
-__host__ __device__ constexpr auto make_naive_tensor_descriptor_v2(const Tuple<Lengths...>& lengths,
-                                                                   const Tuple<Strides...>& strides)
+__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Lengths...>& lengths,
+                                                                const Tuple<Strides...>& strides)
{
    constexpr index_t N = sizeof...(Lengths);
@@ -100,7 +100,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
    constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
-    const auto element_space_size = container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
+    const auto element_space_size = container_reduce(lengths, math::multiplies{}, Number<1>{});
    return TensorDescriptor<remove_cv_t<decltype(transforms)>,
                            remove_cv_t<decltype(low_dim_hidden_idss)>,
@@ -112,7 +112,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
template <typename... Lengths, typename Align>
__host__ __device__ constexpr auto
-make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align align)
+make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align align)
{
    constexpr auto I1 = Number<1>{};
@@ -133,7 +133,7 @@ make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align
        else
        {
            return container_reduce(lengths,
-                                    math::multiplies_v2{},
+                                    math::multiplies{},
                                     Number<stride_n_minus_2>{},
                                     i + I1,
                                     Number<N - 1>{},
@@ -142,7 +142,7 @@ make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align
        },
        Number<N>{});
-    return make_naive_tensor_descriptor_v2(lengths, strides);
+    return make_naive_tensor_descriptor(lengths, strides);
}
} // namespace ck
...
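make_naive_tensor_descriptor_aligned (renamed from the _aligned_v2 spelling in the hunks above) computes packed strides for an LDS block with the inner stride padded up to max_lds_align. The standalone sketch below only illustrates that stride pattern under an assumed rule: the last dimension stays unit-stride, the second-to-last stride is the innermost length rounded up to a multiple of align, and outer strides are running products on top of it. aligned_packed_strides is a hypothetical helper, not the library routine.

// Illustration only: an assumed "aligned packed" stride pattern.
#include <cassert>
#include <cstddef>
#include <vector>

std::vector<int> aligned_packed_strides(const std::vector<int>& lengths, int align)
{
    const std::size_t n = lengths.size(); // assumes n >= 2
    std::vector<int> strides(n);
    strides[n - 1] = 1;
    // pad the innermost length up to a multiple of align
    strides[n - 2] = ((lengths[n - 1] + align - 1) / align) * align;
    for(std::size_t i = n - 2; i-- > 0;)
        strides[i] = strides[i + 1] * lengths[i + 1];
    return strides;
}

int main()
{
    // e.g. a (KPerBlock=8, MPerBlock=128, K1=2) LDS block with a 4-element alignment
    const auto s = aligned_packed_strides({8, 128, 2}, 4);
    assert((s == std::vector<int>{128 * 4, 4, 1})); // K1 = 2 padded up to 4
    return 0;
}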
@@ -143,7 +143,7 @@ struct BlockwiseTensorSliceTransfer_v4
    private:
    static constexpr auto thread_cluster_desc_ =
-        make_cluster_descriptor_v2(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
    using ThreadwiseTransfer =
        ThreadwiseTensorSliceTransfer_v3<ThreadSliceLengths,
...
@@ -131,7 +131,7 @@ struct BlockwiseTensorSliceTransfer_v4r1
    private:
    static constexpr auto thread_cluster_desc_ =
-        make_cluster_descriptor_v2(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
    using ThreadwiseTransfer =
        ThreadwiseTensorSliceTransfer_v3r1<ThreadSliceLengths,
...
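Both blockwise transfers build thread_cluster_desc_ from ThreadClusterLengths and ThreadClusterArrangeOrder and use it to turn the flat thread id within a block into a position in the thread cluster. The sketch below only illustrates that decomposition; cluster_index is a hypothetical standalone helper, and the convention that the first entry of the order is the slowest-varying dimension is an assumption, not taken from the library.

// Illustration only: flat thread id -> multi-index over cluster lengths, in a given order.
#include <array>
#include <cassert>
#include <cstddef>

template <std::size_t N>
std::array<int, N> cluster_index(int thread_id,
                                 const std::array<int, N>& lengths,
                                 const std::array<int, N>& order) // order[0] = slowest dim (assumed)
{
    std::array<int, N> idx{};
    // peel dimensions from fastest (last in order) to slowest
    for(std::size_t k = N; k-- > 0;)
    {
        const int dim = order[k];
        idx[dim]      = thread_id % lengths[dim];
        thread_id /= lengths[dim];
    }
    return idx;
}

int main()
{
    // 8 threads arranged as a 2 x 4 cluster, with dimension 1 varying fastest
    const std::array<int, 2> lengths{2, 4};
    const std::array<int, 2> order{0, 1};
    assert((cluster_index(5, lengths, order) == std::array<int, 2>{1, 1})); // 5 = 1 * 4 + 1
    return 0;
}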
@@ -110,13 +110,13 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
        // A matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
            max_lds_align);
        // B matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
            max_lds_align);
@@ -248,10 +248,10 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
        constexpr auto BN = GN0 * GN11;
        constexpr auto BM1 =
-            Number<container_reduce(BM10BN10ThreadClusterBM10Xs{}, math::multiplies_v2{}, I1) *
+            Number<container_reduce(BM10BN10ThreadClusterBM10Xs{}, math::multiplies{}, I1) *
                   BM1PerThreadBM11>{};
        constexpr auto BN1 =
-            Number<container_reduce(BM10BN10ThreadClusterBN10Xs{}, math::multiplies_v2{}, I1) *
+            Number<container_reduce(BM10BN10ThreadClusterBN10Xs{}, math::multiplies{}, I1) *
                   BN1PerThreadBN11>{};
        constexpr auto BM0 = BM / BM1;
@@ -354,24 +354,24 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
        // A matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
            max_lds_align);
        // B matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
            max_lds_align);
        // A matrix in LDS memory for blockwise GEMM
        // be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<GK0PerBlock>{}, GM0 * Number<GM1PerBlockGM11>{}, GK1), max_lds_align);
        // B matrix in LDS memory for blockwise GEMM
        // be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<GK0PerBlock>{}, GN0 * Number<GN1PerBlockGN11>{}, GK1), max_lds_align);
        static_assert(a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize() ==
...
@@ -166,12 +166,12 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
        // A matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);
        // B matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);
        // LDS allocation for A and B: be careful of alignment
@@ -351,22 +351,22 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
        // A matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);
        // B matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);
        // A matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}), max_lds_align);
        // B matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}), max_lds_align);
        // A matrix blockwise copy
...
@@ -163,12 +163,12 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
        // TODO: check alignment
        // A matrix in LDS memory, dst of blockwise copy
-        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}, K1), max_lds_align);
        // TODO: check alignment
        // B matrix in LDS memory, dst of blockwise copy
-        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}, K1), max_lds_align);
        // TODO: check alignment
@@ -274,10 +274,10 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
        const auto N0 = N / N1;
        constexpr auto M11 =
-            Number<container_reduce(M11N11ThreadClusterM110Xs{}, math::multiplies_v2{}, I1) *
+            Number<container_reduce(M11N11ThreadClusterM110Xs{}, math::multiplies{}, I1) *
                   M1PerThreadM111>{};
        constexpr auto N11 =
-            Number<container_reduce(M11N11ThreadClusterN110Xs{}, math::multiplies_v2{}, I1) *
+            Number<container_reduce(M11N11ThreadClusterN110Xs{}, math::multiplies{}, I1) *
                   N1PerThreadN111>{};
        constexpr auto M10 = M1 / M11;
@@ -354,23 +354,23 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
        // TODO: check alignment
        // A matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto a_k0_m0_m1_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m0_m1_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}, K1), max_lds_align);
        // TODO: check alignment
        // B matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto b_k0_n0_n1_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n0_n1_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}, K1), max_lds_align);
        // TODO: check alignment
        // A matrix in LDS memory, for blockwise GEMM
-        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}, K1), max_lds_align);
        // TODO: check alignment
        // B matrix in LDS memory, for blockwise GEMM
-        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}, K1), max_lds_align);
        static_assert(a_k0_m0_m1_k1_block_desc.GetElementSpaceSize() ==
...
@@ -58,7 +58,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        // A matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
        // LDS allocation for A and B: be careful of alignment
@@ -132,10 +132,10 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        // A matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}), max_lds_align);
-        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
        // B matrix in LDS memory, dst of blockwise copy
...
@@ -148,12 +148,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        // A matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
        // B matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
        // LDS allocation for A and B: be careful of alignment
@@ -290,12 +290,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        // A matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
        // B matrix in LDS memory, dst of blockwise copy
        // be careful of LDS alignment
-        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
        // A matrix blockwise copy
...
@@ -91,13 +91,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
            container_reverse_exclusive_scan(
                container_reorder_given_new2old(src_vector_tensor_lengths,
                                                SrcVectorTensorContiguousDimOrder{}),
-                math::multiplies_v2{},
+                math::multiplies{},
                I1),
            SrcVectorTensorContiguousDimOrder{});
        constexpr auto src_vector_desc =
-            make_naive_tensor_descriptor_v2(sequence_to_tuple_of_number(src_vector_tensor_lengths),
-                                            sequence_to_tuple_of_number(src_vector_tensor_strides));
+            make_naive_tensor_descriptor(sequence_to_tuple_of_number(src_vector_tensor_lengths),
+                                         sequence_to_tuple_of_number(src_vector_tensor_strides));
        // access order and lengths
        constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths;
@@ -259,13 +259,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
            container_reverse_exclusive_scan(
                container_reorder_given_new2old(dst_vector_tensor_lengths,
                                                DstVectorTensorContiguousDimOrder{}),
-                math::multiplies_v2{},
+                math::multiplies{},
                I1),
            DstVectorTensorContiguousDimOrder{});
        constexpr auto dst_vector_desc =
-            make_naive_tensor_descriptor_v2(sequence_to_tuple_of_number(dst_vector_tensor_lengths),
-                                            sequence_to_tuple_of_number(dst_vector_tensor_strides));
+            make_naive_tensor_descriptor(sequence_to_tuple_of_number(dst_vector_tensor_lengths),
+                                         sequence_to_tuple_of_number(dst_vector_tensor_strides));
        // dst access order and lengths
        constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths;
@@ -699,13 +699,13 @@ struct ThreadwiseTensorSliceTransfer_v4r1
            container_reverse_exclusive_scan(
                container_reorder_given_new2old(src_vector_tensor_lengths,
                                                SrcVectorTensorContiguousDimOrder{}),
-                math::multiplies_v2{},
+                math::multiplies{},
                I1),
            SrcVectorTensorContiguousDimOrder{});
        constexpr auto src_vector_desc =
-            make_naive_tensor_descriptor_v2(sequence_to_tuple_of_number(src_vector_tensor_lengths),
-                                            sequence_to_tuple_of_number(src_vector_tensor_strides));
+            make_naive_tensor_descriptor(sequence_to_tuple_of_number(src_vector_tensor_lengths),
+                                         sequence_to_tuple_of_number(src_vector_tensor_strides));
        // access order and lengths
        constexpr auto access_lengths = SliceLengths{} / src_vector_tensor_lengths;
...
@@ -28,13 +28,7 @@ struct minus
    __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
};
-template <typename T>
struct multiplies
-{
-    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
-};
-struct multiplies_v2
{
    template <typename A, typename B>
    __host__ __device__ constexpr auto operator()(const A& a, const B& b) const
...
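The math.hpp hunk merges the two functors: the per-type class template multiplies<T> is removed, and multiplies keeps the templated operator() that previously lived in multiplies_v2, so a single functor can multiply operands of different types, such as the Number<> constants used throughout the reduces and scans above. Below is a self-contained sketch of that difference, with std::integral_constant standing in for ck::Number<>; the names multiplies_old and multiplies_new are made up for this illustration.

#include <type_traits>

// old style: one type T for both operands and the result
template <typename T>
struct multiplies_old
{
    constexpr T operator()(T a, T b) const { return a * b; }
};

// new style: operand types deduced per call, result type falls out of a * b
struct multiplies_new
{
    template <typename A, typename B>
    constexpr auto operator()(const A& a, const B& b) const
    {
        return a * b;
    }
};

int main()
{
    using Two   = std::integral_constant<int, 2>; // stand-in for ck::Number<2>
    using Three = std::integral_constant<int, 3>;

    static_assert(multiplies_old<int>{}(2, 3) == 6, "old functor: both operands share one type");

    // heterogeneous operands, still usable in a constant expression
    constexpr auto six = multiplies_new{}(Two{}, Three{});
    static_assert(six == 6, "generic functor folds integral constants");

    // a compile-time constant mixed with a runtime value also works
    int n = 7;
    return multiplies_new{}(Two{}, n) == 14 ? 0 : 1;
}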