Unverified commit 52c3fe05, authored by Chao Liu, committed by GitHub

Refactor for MIOpen integration (#4)

Refactor so that multi-index transformation and padding support can be brought into MIOpen
parent 9aaeacc8
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP #define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
#include "tensor_descriptor.hpp" #include "tensor_descriptor.hpp"
namespace ck { namespace ck {
...@@ -32,6 +32,11 @@ struct ConstantMatrixDescriptor ...@@ -32,6 +32,11 @@ struct ConstantMatrixDescriptor
return irow * RowStride_ + icol; return irow * RowStride_ + icol;
} }
__host__ __device__ static index_t CalculateOffset(index_t irow, index_t icol)
{
return GetOffsetFromMultiIndex(irow, icol);
}
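
A quick worked example of the row-major offset formula above (the 4x8 shape and stride are illustrative assumptions, not taken from this commit):

    // ConstantMatrixDescriptor with 4 rows, 8 columns, RowStride_ == 8:
    //   CalculateOffset(2, 3) == GetOffsetFromMultiIndex(2, 3) == 2 * 8 + 3 == 19
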
template <index_t SubNRow, index_t SubNCol> template <index_t SubNRow, index_t SubNCol>
__host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>, __host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
Number<SubNCol>) Number<SubNCol>)
...@@ -54,9 +59,10 @@ __host__ __device__ constexpr auto ...@@ -54,9 +59,10 @@ __host__ __device__ constexpr auto
} }
template <typename... Ts> template <typename... Ts>
__host__ __device__ constexpr auto make_ConstantMatrixDescriptor(ConstantTensorDescriptor<Ts...>) __host__ __device__ constexpr auto
make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated<Ts...>)
{ {
using TDesc = ConstantTensorDescriptor<Ts...>; using TDesc = ConstantTensorDescriptor_deprecated<Ts...>;
static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); static_assert(TDesc::GetNumOfDimension() == 2, "wrong");
static_assert(TDesc::GetStrides()[1] == 1, "wrong"); static_assert(TDesc::GetStrides()[1] == 1, "wrong");
return ConstantMatrixDescriptor<TDesc::GetLengths()[0], return ConstantMatrixDescriptor<TDesc::GetLengths()[0],
......
#ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP #ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_DEPRECATED_HPP
#define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP #define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_DEPRECATED_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
namespace ck { namespace ck {
// OriginalTensorDesc : ConstantTensorDescriptor<...> // OriginalTensorDesc : ConstantTensorDescriptor_deprecated<...>
// it's the tensor whose dimensions are to be merged // it's the tensor whose dimensions are to be merged
// OriginalDimMergeSeqs : Sequence<...>... // OriginalDimMergeSeqs : Sequence<...>...
// each is a sequence of original dimensions (of OriginalTensorDesc) to be merged // each is a sequence of original dimensions (of OriginalTensorDesc) to be merged
template <class OriginalTensorDesc, class... OriginalDimMergeSeqs> template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
struct ConstantMergedTensorDescriptor struct ConstantMergedTensorDescriptor_deprecated
{ {
using Type = ConstantMergedTensorDescriptor; using Type = ConstantMergedTensorDescriptor_deprecated;
static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{}; static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};
static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs); static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs);
static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension(); static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension();
__host__ __device__ constexpr ConstantMergedTensorDescriptor() __host__ __device__ constexpr ConstantMergedTensorDescriptor_deprecated()
{ {
static_assert(nDim <= nOriginalDim, "wrong!"); static_assert(nDim <= nOriginalDim, "wrong!");
...@@ -189,7 +189,7 @@ struct ConstantMergedTensorDescriptor ...@@ -189,7 +189,7 @@ struct ConstantMergedTensorDescriptor
{ {
constexpr auto lengths = GetLengths(); constexpr auto lengths = GetLengths();
constexpr auto strides = calculate_tensor_strides_packed(lengths); constexpr auto strides = calculate_tensor_strides_packed(lengths);
return ConstantTensorDescriptor<decltype(lengths), decltype(strides)>{}; return ConstantTensorDescriptor_deprecated<decltype(lengths), decltype(strides)>{};
} }
}; };
...@@ -197,7 +197,7 @@ template <class OriginalTensorDesc, class... OriginalDimMergeSeqs> ...@@ -197,7 +197,7 @@ template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
__host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, __host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc,
OriginalDimMergeSeqs...) OriginalDimMergeSeqs...)
{ {
return ConstantMergedTensorDescriptor<OriginalTensorDesc, OriginalDimMergeSeqs...>{}; return ConstantMergedTensorDescriptor_deprecated<OriginalTensorDesc, OriginalDimMergeSeqs...>{};
} }
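
To make the merge-sequence convention documented at the top of this file concrete, here is a hedged usage sketch (the 8x16x4 lengths are an illustrative assumption):

    // original 3-d tensor, packed layout
    constexpr auto desc_3d = make_ConstantTensorDescriptor_packed(Sequence<8, 16, 4>{});
    // 2-d merged view: dimension 0 is kept, dimensions 1 and 2 are merged into one length-64 dimension
    constexpr auto desc_2d =
        make_ConstantMergedTensorDescriptor(desc_3d, Sequence<0>{}, Sequence<1, 2>{});
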
template <class TDesc> template <class TDesc>
......
#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_HPP #ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_DEPRECATED_HPP
#define CK_CONSTANT_TENSOR_DESCRIPTOR_HPP #define CK_CONSTANT_TENSOR_DESCRIPTOR_DEPRECATED_HPP
#include "common_header.hpp" #include "common_header.hpp"
namespace ck { namespace ck {
template <class Lengths> template <class Lengths>
__host__ __device__ constexpr auto calculate_tensor_strides_packed_old(Lengths) __host__ __device__ constexpr auto calculate_tensor_strides_packed_deprecated(Lengths)
{ {
return reverse_inclusive_scan_sequence( return reverse_inclusive_scan_sequence(
Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{}) Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
...@@ -19,18 +19,18 @@ __host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, ...@@ -19,18 +19,18 @@ __host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths,
constexpr index_t L_back_align = constexpr index_t L_back_align =
Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align); Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
return calculate_tensor_strides_packed_old( return calculate_tensor_strides_packed_deprecated(
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{})); Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
} }
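
As a quick numeric check of the two stride helpers above (lengths and alignment chosen for illustration):

    // packed: strides are a reverse inclusive product scan of the trailing lengths
    //   Lengths {2, 3, 4}           -> Strides {12, 4, 1}
    // aligned: the last length is first rounded up to a multiple of Align
    //   Lengths {2, 3, 4}, Align 8  -> Strides {24, 8, 1}
    constexpr auto packed_strides  = calculate_tensor_strides_packed_deprecated(Sequence<2, 3, 4>{});
    constexpr auto aligned_strides = calculate_tensor_strides_aligned_old(Sequence<2, 3, 4>{}, Number<8>{});
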
template <class Lengths, class Strides> template <class Lengths, class Strides>
struct ConstantTensorDescriptor struct ConstantTensorDescriptor_deprecated
{ {
using Type = ConstantTensorDescriptor; using Type = ConstantTensorDescriptor_deprecated;
static constexpr index_t nDim = Lengths::GetSize(); static constexpr index_t nDim = Lengths::GetSize();
__host__ __device__ constexpr ConstantTensorDescriptor() __host__ __device__ constexpr ConstantTensorDescriptor_deprecated()
{ {
static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent"); static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent");
} }
...@@ -186,7 +186,7 @@ struct ConstantTensorDescriptor ...@@ -186,7 +186,7 @@ struct ConstantTensorDescriptor
{ {
Array<index_t, nDim> multi_id; Array<index_t, nDim> multi_id;
using PackedStrides = decltype(calculate_tensor_strides_packed_old(GetLengths())); using PackedStrides = decltype(calculate_tensor_strides_packed_deprecated(GetLengths()));
// calculate index in each of the dimensions in the order of their dimension // calculate index in each of the dimensions in the order of their dimension
static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id)); static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id));
...@@ -284,7 +284,7 @@ struct ConstantTensorDescriptor ...@@ -284,7 +284,7 @@ struct ConstantTensorDescriptor
using extract_lengths = decltype(Lengths::Extract(extract_dims...)); using extract_lengths = decltype(Lengths::Extract(extract_dims...));
using extract_strides = decltype(Strides::Extract(extract_dims...)); using extract_strides = decltype(Strides::Extract(extract_dims...));
return ConstantTensorDescriptor<extract_lengths, extract_strides>{}; return ConstantTensorDescriptor_deprecated<extract_lengths, extract_strides>{};
} }
template <index_t... IDims> template <index_t... IDims>
...@@ -294,13 +294,13 @@ struct ConstantTensorDescriptor ...@@ -294,13 +294,13 @@ struct ConstantTensorDescriptor
} }
template <class... Ts> template <class... Ts>
__host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor<Ts...>) __host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor_deprecated<Ts...>)
{ {
using leaf_tensor = ConstantTensorDescriptor<Ts...>; using leaf_tensor = ConstantTensorDescriptor_deprecated<Ts...>;
return ConstantTensorDescriptor<decltype(GetLengths().PushBack(leaf_tensor::GetLengths())), return ConstantTensorDescriptor_deprecated<
decltype( decltype(GetLengths().PushBack(leaf_tensor::GetLengths())),
GetStrides().PushBack(leaf_tensor::GetStrides()))>{}; decltype(GetStrides().PushBack(leaf_tensor::GetStrides()))>{};
} }
template <index_t IDimVector, index_t DataPerVector> template <index_t IDimVector, index_t DataPerVector>
...@@ -351,7 +351,7 @@ struct ConstantTensorDescriptor ...@@ -351,7 +351,7 @@ struct ConstantTensorDescriptor
using vectorized_strides = using vectorized_strides =
decltype((Strides{} / Number<DataPerVector>{}).Modify(Number<IDim>{}, Number<1>{})); decltype((Strides{} / Number<DataPerVector>{}).Modify(Number<IDim>{}, Number<1>{}));
return ConstantTensorDescriptor<vectorized_lengths, vectorized_strides>{}; return ConstantTensorDescriptor_deprecated<vectorized_lengths, vectorized_strides>{};
} }
template <index_t IDim, index_t SliceLen> template <index_t IDim, index_t SliceLen>
...@@ -359,7 +359,7 @@ struct ConstantTensorDescriptor ...@@ -359,7 +359,7 @@ struct ConstantTensorDescriptor
{ {
using slice_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLen>{})); using slice_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLen>{}));
return ConstantTensorDescriptor<slice_lengths, Strides>{}; return ConstantTensorDescriptor_deprecated<slice_lengths, Strides>{};
} }
template <index_t... Is> template <index_t... Is>
...@@ -367,7 +367,7 @@ struct ConstantTensorDescriptor ...@@ -367,7 +367,7 @@ struct ConstantTensorDescriptor
{ {
static_assert(slice_lengths.GetSize() == nDim, "wrong!"); static_assert(slice_lengths.GetSize() == nDim, "wrong!");
return ConstantTensorDescriptor<decltype(slice_lengths), Strides>{}; return ConstantTensorDescriptor_deprecated<decltype(slice_lengths), Strides>{};
} }
template <index_t IDim, index_t SliceLength, index_t SliceStride> template <index_t IDim, index_t SliceLength, index_t SliceStride>
...@@ -379,7 +379,7 @@ struct ConstantTensorDescriptor ...@@ -379,7 +379,7 @@ struct ConstantTensorDescriptor
using new_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLength>{})); using new_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLength>{}));
using new_strides = decltype(Strides::Modify(Number<IDim>{}, Number<new_stride>{})); using new_strides = decltype(Strides::Modify(Number<IDim>{}, Number<new_stride>{}));
return ConstantTensorDescriptor<new_lengths, new_strides>{}; return ConstantTensorDescriptor_deprecated<new_lengths, new_strides>{};
} }
template <index_t IDim, index_t... FoldIntervals> template <index_t IDim, index_t... FoldIntervals>
...@@ -418,7 +418,7 @@ struct ConstantTensorDescriptor ...@@ -418,7 +418,7 @@ struct ConstantTensorDescriptor
constexpr auto new_strides = constexpr auto new_strides =
GetStrides().Extract(left).PushBack(fold_strides).PushBack(GetStrides().Extract(right)); GetStrides().Extract(left).PushBack(fold_strides).PushBack(GetStrides().Extract(right));
return ConstantTensorDescriptor<decltype(new_lengths), decltype(new_strides)>{}; return ConstantTensorDescriptor_deprecated<decltype(new_lengths), decltype(new_strides)>{};
} }
template <index_t IDim, index_t... FoldIntervals> template <index_t IDim, index_t... FoldIntervals>
...@@ -462,54 +462,55 @@ struct ConstantTensorDescriptor ...@@ -462,54 +462,55 @@ struct ConstantTensorDescriptor
.PushBack(Number<unfold_stride>{}) .PushBack(Number<unfold_stride>{})
.PushBack(GetStrides().Extract(right)); .PushBack(GetStrides().Extract(right));
return ConstantTensorDescriptor<decltype(new_lengths), decltype(new_strides)>{}; return ConstantTensorDescriptor_deprecated<decltype(new_lengths), decltype(new_strides)>{};
} }
__host__ __device__ static constexpr auto Pack() __host__ __device__ static constexpr auto Pack()
{ {
using packed_strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); using packed_strides = decltype(calculate_tensor_strides_packed_deprecated(Lengths{}));
return ConstantTensorDescriptor<Lengths, packed_strides>{}; return ConstantTensorDescriptor_deprecated<Lengths, packed_strides>{};
} }
template <class MapNew2Old> template <class MapNew2Old>
__host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old) __host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old)
{ {
return ConstantTensorDescriptor<decltype(Lengths::ReorderGivenNew2Old(MapNew2Old{})), return ConstantTensorDescriptor_deprecated<
decltype(Strides::ReorderGivenNew2Old(MapNew2Old{}))>{}; decltype(Lengths::ReorderGivenNew2Old(MapNew2Old{})),
decltype(Strides::ReorderGivenNew2Old(MapNew2Old{}))>{};
} }
template <class MapOld2New> template <class MapOld2New>
__host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New) __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New)
{ {
return ConstantTensorDescriptor<decltype(Lengths::ReorderGivenOld2New(MapOld2New{})), return ConstantTensorDescriptor_deprecated<
decltype(Strides::ReorderGivenOld2New(MapOld2New{}))>{}; decltype(Lengths::ReorderGivenOld2New(MapOld2New{})),
decltype(Strides::ReorderGivenOld2New(MapOld2New{}))>{};
} }
}; };
template <class Lengths> template <class Lengths>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths) __host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths)
{ {
using Strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); using Strides = decltype(calculate_tensor_strides_packed_deprecated(Lengths{}));
return ConstantTensorDescriptor<Lengths, Strides>{}; return ConstantTensorDescriptor_deprecated<Lengths, Strides>{};
} }
template <class Lengths, class Strides> template <class Lengths, class Strides>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides) __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides)
{ {
return ConstantTensorDescriptor<Lengths, Strides>{}; return ConstantTensorDescriptor_deprecated<Lengths, Strides>{};
} }
template <class Lengths, index_t Align> template <class Lengths, index_t Align>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>) __host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>)
{ {
using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number<Align>{})); using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number<Align>{}));
return ConstantTensorDescriptor<Lengths, Strides>{}; return ConstantTensorDescriptor_deprecated<Lengths, Strides>{};
} }
template <index_t... Lengths, index_t... Strides> template <index_t... Lengths, index_t... Strides>
__host__ __device__ void __host__ __device__ void print_ConstantTensorDescriptor(
print_ConstantTensorDescriptor(const char* s, const char* s, ConstantTensorDescriptor_deprecated<Sequence<Lengths...>, Sequence<Strides...>>)
ConstantTensorDescriptor<Sequence<Lengths...>, Sequence<Strides...>>)
{ {
constexpr index_t ndim = sizeof...(Lengths); constexpr index_t ndim = sizeof...(Lengths);
......
#ifndef CK_PRINT_TENSOR_DESCRIPTOR_HPP
#define CK_PRINT_TENSOR_DESCRIPTOR_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
namespace ck {
template <typename... NativeDimensions>
__host__ __device__ void
print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
{
print_tensor_descriptor_impl(s, desc.GetLengths(), desc.GetStrides());
}
template <typename... Ts>
__host__ __device__ void print_tensor_descriptor(const char* s,
const TransformedTensorDescriptor<Ts...>& desc)
{
print_tensor_descriptor_impl(s, desc.GetLengths());
}
template <index_t... Lengths, index_t... Strides>
__host__ __device__ void
print_tensor_descriptor_impl(const char* s, Sequence<Lengths...>, Sequence<Strides...>)
{
constexpr index_t nDim = sizeof...(Lengths);
static_assert(nDim > 0 && nDim <= 12, "wrong!");
static_if<nDim == 1>{}([&](auto) {
printf("%s dim %u, lengths {%u}, strides {%u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 2>{}([&](auto) {
printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 3>{}([&](auto) {
printf(
"%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 4>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 5>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 6>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 7>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 8>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 9>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u "
"%u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 10>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 11>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u "
"%u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 12>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u "
"%u %u %u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
}
template <index_t... Lengths>
__host__ __device__ void print_tensor_descriptor_impl(const char* s, Sequence<Lengths...>)
{
constexpr index_t nDim = sizeof...(Lengths);
static_assert(nDim > 0 && nDim <= 12, "wrong!");
static_if<nDim == 1>{}([&](auto) { printf("%s dim %u, lengths {%u}\n", s, nDim, Lengths...); });
static_if<nDim == 2>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 3>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 4>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 5>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 6>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u}, \n", s, nDim, Lengths...); });
static_if<nDim == 7>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 8>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 9>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 10>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 11>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 12>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
}
} // namespace ck
#endif
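
A hedged usage sketch of the printing helpers in this new header (the 2x3 descriptor is an illustrative assumption):

    // prints: "A: dim 2, lengths {2 3}, strides {3 1}"
    constexpr auto desc = make_native_tensor_descriptor(Sequence<2, 3>{}, Sequence<3, 1>{});
    print_tensor_descriptor("A:", desc);
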
#ifndef CK_TENSOR_COORDINATE_V2_HPP #ifndef CK_TENSOR_COORDINATE_HPP
#define CK_TENSOR_COORDINATE_V2_HPP #define CK_TENSOR_COORDINATE_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "dimension.hpp" #include "dimension.hpp"
...@@ -8,9 +8,24 @@ ...@@ -8,9 +8,24 @@
namespace ck { namespace ck {
// A "tensor cooridnate" is an opaque object that represents a "point of location" inside a tensor
// At the bare minimun, user should be able to query the following information from a tensor
// coordinate:
// 1. Tensor descriptor
// 2. Location, represented in the form of multi-index
// 3. Location, represented in the form of the offset to the origin of the tensor
// 4. If the location is inside invalid area or not, i.e. the padding area of an implicitly padded
// tensor is considered invalid, because the padding area doesn't have any physical memory
// allocation
// A tensor cooridnate also provides following functionality:
// 1. Given step size in each dimension, update itself, or return a new tensor cooridnate, so user
// can freely move the "point of location" inside the tensor
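
A hedged sketch of the interface described above; make_tensor_coordinate and IsUpperIndexMappedToValidOffset appear elsewhere in this commit, while the GetOffset accessor and the MultiIndex initializer syntax are assumptions:

    // desc: a NativeTensorDescriptor or TransformedTensorDescriptor
    auto coord = make_tensor_coordinate(desc, MultiIndex<2>{{1, 2}}); // location as a multi-index
    const index_t offset = coord.GetOffset();                         // location as offset to the origin
    const bool valid = coord.IsUpperIndexMappedToValidOffset();       // false inside padding
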
// wrapper class for NativeTensorCoordinate and TransformedTensorCoordinate
template <typename TensorDesc> template <typename TensorDesc>
struct TensorCoordinate; struct TensorCoordinate;
// tensor coordinate for native tensor
template <typename NativeTensorDesc> template <typename NativeTensorDesc>
struct NativeTensorCoordinate struct NativeTensorCoordinate
{ {
...@@ -78,12 +93,10 @@ struct NativeTensorCoordinate ...@@ -78,12 +93,10 @@ struct NativeTensorCoordinate
return coord; return coord;
} }
#if 0 // tweaking
__host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff) __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
{ {
return tensor_desc_type::CalculateOffsetDiff(idx_diff); return tensor_desc_type::CalculateOffsetDiff(idx_diff);
} }
#endif
__host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; } __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }
...@@ -96,6 +109,7 @@ struct NativeTensorCoordinate ...@@ -96,6 +109,7 @@ struct NativeTensorCoordinate
index_t mOffset; index_t mOffset;
}; };
// tensor coordinate for transformed tensor
template <typename TransformedTensorDesc> template <typename TransformedTensorDesc>
struct TransformedTensorCoordinate struct TransformedTensorCoordinate
{ {
...@@ -177,10 +191,10 @@ struct TransformedTensorCoordinate ...@@ -177,10 +191,10 @@ struct TransformedTensorCoordinate
return coord_up; return coord_up;
} }
#if 0 // tweaking
// Calculate offset diff without updating tensor-coordinate // Calculate offset diff without updating tensor-coordinate
// If idx_up_diff is know at compile time, and has only non-zero entries on linear dimensions, // If idx_up_diff is know at compile time, and has only non-zero entries on linear dimensions,
// then all calculation can be done at compile-time. // then all calculation can be done at compile-time.
// TODO: this function is not compiled to expected ISA
__host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const __host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const
{ {
// For transformation of multi-index difference, not all transformation functions need to // For transformation of multi-index difference, not all transformation functions need to
...@@ -191,7 +205,6 @@ struct TransformedTensorCoordinate ...@@ -191,7 +205,6 @@ struct TransformedTensorCoordinate
return GetLowerCoordinate().CalculateOffsetDiff(idx_low_diff); return GetLowerCoordinate().CalculateOffsetDiff(idx_low_diff);
} }
#endif
__host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
{ {
......
...@@ -2,12 +2,12 @@ ...@@ -2,12 +2,12 @@
#define CK_TENSOR_COORDINATE_DEPRECATED_HPP #define CK_TENSOR_COORDINATE_DEPRECATED_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMergedTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor_deprecated.hpp"
namespace ck { namespace ck {
// TensorDesc is ConstantTensorDescriptor // TensorDesc is ConstantTensorDescriptor_deprecated
template <class TensorDesc> template <class TensorDesc>
struct NormalTensorCoordinate_deprecated struct NormalTensorCoordinate_deprecated
{ {
...@@ -95,18 +95,19 @@ struct NormalTensorCoordinate_deprecated ...@@ -95,18 +95,19 @@ struct NormalTensorCoordinate_deprecated
index_t mOffset; index_t mOffset;
}; };
// TensorDesc is ConstantMergedTensorDescriptor // TensorDesc is ConstantMergedTensorDescriptor_deprecated
template <class TensorDesc> template <class TensorDesc>
struct MergedTensorCoordinate struct MergedTensorCoordinate_deprecated
{ {
using type = MergedTensorCoordinate; using type = MergedTensorCoordinate_deprecated;
using tensor_desc_type = TensorDesc; using tensor_desc_type = TensorDesc;
static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
static constexpr index_t nOriginalDim = static constexpr index_t nOriginalDim =
tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension(); tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();
__host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index) __host__
__device__ constexpr MergedTensorCoordinate_deprecated(Array<index_t, nDim> tensor_index)
: mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)} : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
{ {
// partial offset on each dimension // partial offset on each dimension
...@@ -127,8 +128,8 @@ struct MergedTensorCoordinate ...@@ -127,8 +128,8 @@ struct MergedTensorCoordinate
} }
template <class... Xs> template <class... Xs>
__host__ __device__ constexpr MergedTensorCoordinate(Xs... xs) __host__ __device__ constexpr MergedTensorCoordinate_deprecated(Xs... xs)
: MergedTensorCoordinate(Array<index_t, nDim>{xs...}) : MergedTensorCoordinate_deprecated(Array<index_t, nDim>{xs...})
{ {
} }
...@@ -311,7 +312,7 @@ struct MergedTensorCoordinate ...@@ -311,7 +312,7 @@ struct MergedTensorCoordinate
// dimensions, and those merged dimensions, that would never be involved in index // dimensions, and those merged dimensions, that would never be involved in index
// arithmetic after construction of TensorCoordinate. // arithmetic after construction of TensorCoordinate.
// TODO: refactor TensorCoordinate, after introducing the concept of "dimensions" // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions"
// and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to // and simplify implementation of ConstantMergedTensorDescriptor_deprecated, so we don't need to
// count on compiler to optimize away those register memory for us // count on compiler to optimize away those register memory for us
Array<index_t, nOriginalDim> mOriginalIndex; Array<index_t, nOriginalDim> mOriginalIndex;
Array<index_t, nDim> mPartialOffsets; Array<index_t, nDim> mPartialOffsets;
...@@ -326,16 +327,17 @@ struct TensorCoordinate_deprecated ...@@ -326,16 +327,17 @@ struct TensorCoordinate_deprecated
private: private:
template <class... Ts> template <class... Ts>
__host__ __device__ static constexpr auto __host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>) MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated<Ts...>)
{ {
return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>(); return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor_deprecated<Ts...>>();
} }
template <class... Ts> template <class... Ts>
__host__ __device__ static constexpr auto __host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>) MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated<Ts...>)
{ {
return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>(); return MergedTensorCoordinate_deprecated<
ConstantMergedTensorDescriptor_deprecated<Ts...>>();
} }
public: public:
......
#ifndef CK_TENSOR_COORDINATE_HELPER_HPP #ifndef CK_TENSOR_COORDINATE_HELPER_HPP
#define CK_TENSOR_COORDINATE_HELPER_HPP #define CK_TENSOR_COORDINATE_HELPER_HPP
#include "tensor_coordiante_v2.hpp" #include "tensor_coordiante_hpp"
namespace ck { namespace ck {
template <typename TensorDesc> template <typename TensorDesc>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
make_tensor_coordinate_v2(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx) make_tensor_coordinate(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx)
{ {
return typename TensorCoordinate<TensorDesc>::type(idx); return typename TensorCoordinate<TensorDesc>::type(idx);
} }
......
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
namespace ck { namespace ck {
// tensor descriptor for "native tensor"
// A "native tensor" is a "true" tensor that can be represented by Lengths and Strides
template <typename... NativeDimensions> template <typename... NativeDimensions>
struct NativeTensorDescriptor struct NativeTensorDescriptor
{ {
...@@ -113,12 +115,10 @@ struct NativeTensorDescriptor ...@@ -113,12 +115,10 @@ struct NativeTensorDescriptor
__host__ __device__ static constexpr auto GetNonLinearDimensions() { return Sequence<>{}; } __host__ __device__ static constexpr auto GetNonLinearDimensions() { return Sequence<>{}; }
#if 0
__host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
{ {
return Tuple<>{}; return Tuple<>{};
} }
#endif
__host__ __device__ static constexpr bool __host__ __device__ static constexpr bool
IsUpperIndexMappedToValidOffset(const Index& /* idx */) IsUpperIndexMappedToValidOffset(const Index& /* idx */)
...@@ -127,14 +127,11 @@ struct NativeTensorDescriptor ...@@ -127,14 +127,11 @@ struct NativeTensorDescriptor
} }
}; };
// LowerTensorDescriptor
// Transforms: Tuple<DimensionTransforms...>
// LowerDimensionIds: Tuple<Sequence<...>>
// UpperDimensionIds: Tuple<Sequence<...>>
template <typename LowTensorDescriptor,
          typename Transforms,
          typename LowDimensionIds,
          typename UpDimensionIds>
// Tensor descriptor for "transformed tensor"
template <typename LowTensorDescriptor, // NativeTensorDescriptor or TransformedTensorDescriptor
          typename Transforms,          // Tuple<MultiIndexTransforms...>
          typename LowDimensionIds,     // Tuple<Sequence<...>>
          typename UpDimensionIds>      // Tuple<Sequence<...>>
struct TransformedTensorDescriptor struct TransformedTensorDescriptor
{ {
using type = TransformedTensorDescriptor; using type = TransformedTensorDescriptor;
...@@ -412,6 +409,7 @@ struct TransformedTensorDescriptor ...@@ -412,6 +409,7 @@ struct TransformedTensorDescriptor
{ {
#if 0 #if 0
// create tuple of linear dimension masks, for all transformations // create tuple of linear dimension masks, for all transformations
// TODO: this doesn't compile, because transform_tuples() complains about constexpr
constexpr auto tuple_of_linear_dimension_mask = constexpr auto tuple_of_linear_dimension_mask =
transform_tuples(lambda_get_linear_dimension_mask_of_single_tranform{}, transform_tuples(lambda_get_linear_dimension_mask_of_single_tranform{},
Transforms{}, Transforms{},
...@@ -419,7 +417,7 @@ struct TransformedTensorDescriptor ...@@ -419,7 +417,7 @@ struct TransformedTensorDescriptor
UpDimensionIds{}); UpDimensionIds{});
#else #else
// create tuple of linear dimension masks, for all transformations // create tuple of linear dimension masks, for all transformations
// TODO: this is a hack, transform_tuples() doesn't compile, complain about constexpr // TODO: this is a hack
constexpr auto tuple_of_linear_dimension_mask = dummy_transform_tuples_impl( constexpr auto tuple_of_linear_dimension_mask = dummy_transform_tuples_impl(
lambda_get_linear_dimension_mask_of_single_tranform{}, lambda_get_linear_dimension_mask_of_single_tranform{},
Transforms{}, Transforms{},
...@@ -465,7 +463,7 @@ struct TransformedTensorDescriptor ...@@ -465,7 +463,7 @@ struct TransformedTensorDescriptor
#if 0 #if 0
__host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
{ {
// not implemented // TODO: not implemented
} }
#endif #endif
......
...@@ -63,10 +63,11 @@ template <typename LowerTensorDescriptor, ...@@ -63,10 +63,11 @@ template <typename LowerTensorDescriptor,
index_t... LowerLengths, index_t... LowerLengths,
index_t... LowerDimensionIds, index_t... LowerDimensionIds,
index_t... UpperDimensionIds> index_t... UpperDimensionIds>
__host__ __device__ constexpr auto reorder_tensor_descriptor_impl(LowerTensorDescriptor, __host__ __device__ constexpr auto
Sequence<LowerLengths...>, reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
Sequence<LowerDimensionIds...>, Sequence<LowerLengths...>,
Sequence<UpperDimensionIds...>) Sequence<LowerDimensionIds...>,
Sequence<UpperDimensionIds...>)
{ {
return TransformedTensorDescriptor<LowerTensorDescriptor, return TransformedTensorDescriptor<LowerTensorDescriptor,
Tuple<PassThrough<LowerLengths>...>, Tuple<PassThrough<LowerLengths>...>,
...@@ -74,17 +75,40 @@ __host__ __device__ constexpr auto reorder_tensor_descriptor_impl(LowerTensorDes ...@@ -74,17 +75,40 @@ __host__ __device__ constexpr auto reorder_tensor_descriptor_impl(LowerTensorDes
Tuple<Sequence<UpperDimensionIds>...>>{}; Tuple<Sequence<UpperDimensionIds>...>>{};
} }
template <typename LowerTensorDescriptor, typename MapLower2Upper> // reorder a NativeTensorDescriptor
template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLower2Upper)
{
static_assert(is_valid_sequence_map<MapLower2Upper>{},
"wrong! MapLower2Upper is not a valid map");
constexpr auto old_desc = NativeTensorDescriptor<Ts...>{};
static_assert(old_desc.GetNumOfDimension() == MapLower2Upper::Size(), "wrong!");
constexpr auto new_lengths = old_desc.GetLengths().ReorderGivenOld2New(MapLower2Upper{});
constexpr auto new_strides = old_desc.GetStrides().ReorderGivenOld2New(MapLower2Upper{});
return make_native_tensor_descriptor(new_lengths, new_strides);
}
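
A hedged usage sketch of the NativeTensorDescriptor overload above (the NCHW-to-NHWC map and the lengths/strides are illustrative assumptions; MapLower2Upper[i] is read as the upper position of lower dimension i):

    // lower (old) dimension order: N C H W
    constexpr auto desc_nchw =
        make_native_tensor_descriptor(Sequence<1, 8, 4, 4>{}, Sequence<128, 16, 4, 1>{});
    // N->0, C->3, H->1, W->2 yields an N H W C view of the same memory
    constexpr auto desc_nhwc =
        reorder_tensor_descriptor_given_lower2upper(desc_nchw, Sequence<0, 3, 1, 2>{});
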
// reorder a TransformedTensorDescriptor
template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(LowerTensorDescriptor, MapLower2Upper) reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor<Ts...>, MapLower2Upper)
{ {
static_assert(is_valid_sequence_map<MapLower2Upper>{}, static_assert(is_valid_sequence_map<MapLower2Upper>{},
"wrong! MapLower2Upper is not a valid map"); "wrong! MapLower2Upper is not a valid map");
return reorder_tensor_descriptor_impl( constexpr auto low_desc = TransformedTensorDescriptor<Ts...>{};
LowerTensorDescriptor{},
LowerTensorDescriptor::GetLengths(), static_assert(low_desc.GetNumOfDimension() == MapLower2Upper::Size(), "wrong!");
typename arithmetic_sequence_gen<0, LowerTensorDescriptor::GetNumOfDimension(), 1>::type{},
return reorder_transformed_tensor_descriptor_impl(
low_desc,
low_desc.GetLengths(),
typename arithmetic_sequence_gen<0, low_desc.GetNumOfDimension(), 1>::type{},
MapLower2Upper{}); MapLower2Upper{});
} }
...@@ -97,7 +121,7 @@ __host__ __device__ constexpr auto ...@@ -97,7 +121,7 @@ __host__ __device__ constexpr auto
} }
template <typename Lengths, typename Strides> template <typename Lengths, typename Strides>
__host__ __device__ constexpr bool AreDimensionsUnfoldable(Lengths, Strides) __host__ __device__ constexpr bool are_dimensions_unfoldable(Lengths, Strides)
{ {
static_assert(Lengths::Size() == Strides::Size(), "wrong!"); static_assert(Lengths::Size() == Strides::Size(), "wrong!");
...@@ -129,7 +153,7 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript ...@@ -129,7 +153,7 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript
constexpr auto right = typename arithmetic_sequence_gen<LastUnfoldDim + 1, nDim, 1>::type{}; constexpr auto right = typename arithmetic_sequence_gen<LastUnfoldDim + 1, nDim, 1>::type{};
// sanity-check if unfoldable // sanity-check if unfoldable
static_assert(AreDimensionsUnfoldable(desc.GetLengths(middle), desc.GetStrides(middle)), static_assert(are_dimensions_unfoldable(desc.GetLengths(middle), desc.GetStrides(middle)),
"wrong! not unfoldable"); "wrong! not unfoldable");
// unfolded length, stride // unfolded length, stride
...@@ -148,30 +172,6 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript ...@@ -148,30 +172,6 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript
return make_native_tensor_descriptor(new_lengths, new_strides); return make_native_tensor_descriptor(new_lengths, new_strides);
} }
#if 0
// not implemented
template <typename LowerTensorDescriptor,
typename PadDimensionIds,
typename LeftPads,
typename RightPads>
__host__ __device__ constexpr auto
pad_tensor_descriptor(LowerTensorDescriptor, PadLowerDimensionIds, LeftPads, RightPads)
{
constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension();
constexpr auto non_pad_low_dim_ids = xxx;
return transform_tensor_descriptor(
LowerTensorDescriptor{},
make_tuple(Pad<decltype(LowerTensorDescriptor::GetLengths(PadLowerDimensionIds{})),
LeftPads,
RightPads>{})
.PushBack(PassThrough<xxxx>...),
make_tuple(PadLowerDimensionIds{}).PushBack(xxxx),
sequence_to_tuple(typename arithmetic_sequence_gen<0, nDim, 1> i::type{}));
}
#endif
// a cluster maps a 1-d index to an N-d index // a cluster maps a 1-d index to an N-d index
template <typename Lengths, typename ArrangeOrder> template <typename Lengths, typename ArrangeOrder>
struct ClusterDescriptor struct ClusterDescriptor
...@@ -205,169 +205,7 @@ template <typename Lengths, ...@@ -205,169 +205,7 @@ template <typename Lengths,
__host__ __device__ constexpr auto make_cluster_descriptor( __host__ __device__ constexpr auto make_cluster_descriptor(
Lengths, ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{}) Lengths, ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
{ {
return ClusterDescriptor<Lengths, ArrangeOrder>{}; return ClusterDescriptor<Lengths, decltype(order)>{};
}
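
A hedged usage sketch of make_cluster_descriptor (the 8x32 arrangement is an illustrative assumption); note that returning ClusterDescriptor<Lengths, decltype(order)> keeps the returned type in sync with the defaulted order argument:

    // map a 1-d thread id onto an 8x32 cluster, default arrange order
    constexpr auto cluster = make_cluster_descriptor(Sequence<8, 32>{});
    // same lengths with an explicitly reversed arrange order
    constexpr auto cluster_reordered = make_cluster_descriptor(Sequence<8, 32>{}, Sequence<1, 0>{});
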
template <typename... NativeDimensions>
__host__ __device__ void
print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
{
print_tensor_descriptor_impl(s, desc.GetLengths(), desc.GetStrides());
}
template <typename... Ts>
__host__ __device__ void print_tensor_descriptor(const char* s,
const TransformedTensorDescriptor<Ts...>& desc)
{
print_tensor_descriptor_impl(s, desc.GetLengths());
}
template <index_t... Lengths, index_t... Strides>
__host__ __device__ void
print_tensor_descriptor_impl(const char* s, Sequence<Lengths...>, Sequence<Strides...>)
{
constexpr index_t nDim = sizeof...(Lengths);
static_assert(nDim > 0 && nDim <= 12, "wrong!");
static_if<nDim == 1>{}([&](auto) {
printf("%s dim %u, lengths {%u}, strides {%u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 2>{}([&](auto) {
printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 3>{}([&](auto) {
printf(
"%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 4>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 5>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 6>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 7>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 8>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 9>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u "
"%u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 10>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 11>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u "
"%u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 12>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u "
"%u %u %u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
}
template <index_t... Lengths>
__host__ __device__ void print_tensor_descriptor_impl(const char* s, Sequence<Lengths...>)
{
constexpr index_t nDim = sizeof...(Lengths);
static_assert(nDim > 0 && nDim <= 12, "wrong!");
static_if<nDim == 1>{}([&](auto) { printf("%s dim %u, lengths {%u}\n", s, nDim, Lengths...); });
static_if<nDim == 2>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 3>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 4>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 5>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 6>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u}, \n", s, nDim, Lengths...); });
static_if<nDim == 7>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 8>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 9>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 10>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 11>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 12>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
} }
} // namespace ck } // namespace ck
......
...@@ -68,64 +68,118 @@ struct BlockwiseGenericTensorSliceCopy_v4 ...@@ -68,64 +68,118 @@ struct BlockwiseGenericTensorSliceCopy_v4
template <typename BlockSrcData, template <typename BlockSrcData,
typename ThreadBufferData, typename ThreadBufferData,
address_space_t BlockSrcAddressSpace = address_space_t::generic, AddressSpace BlockSrcAddressSpace,
address_space_t ThreadBufferAddressSpace = address_space_t::generic> AddressSpace ThreadBufferAddressSpace>
__device__ void
RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer,
integral_constant<AddressSpace, BlockSrcAddressSpace>,
integral_constant<AddressSpace, ThreadBufferAddressSpace>) const
{
constexpr auto block_src_address_space =
integral_constant<AddressSpace, BlockSrcAddressSpace>{};
constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseLoad.Run_optimized_src_address_calculation(
p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
}
else
{
mThreadwiseLoad.Run(
p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
}
}
template <typename BlockSrcData, typename ThreadBufferData>
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer) const ThreadBufferData* p_thread_buffer) const
{ {
#if 1 constexpr auto generic_address_space =
mThreadwiseLoad.template Run<BlockSrcData, integral_constant<AddressSpace, AddressSpace::generic>{};
ThreadBufferData,
BlockSrcAddressSpace, RunLoadThreadBuffer(
ThreadBufferAddressSpace>(p_block_src, p_thread_buffer); p_block_src, p_thread_buffer, generic_address_space, generic_address_space);
#else // tweaking
mThreadwiseLoad.template Run_optimized_src_address_calculation<BlockSrcData,
ThreadBufferData,
BlockSrcAddressSpace,
ThreadBufferAddressSpace>(
p_block_src, p_thread_buffer);
#endif
} }
template <typename ThreadBufferData, template <typename ThreadBufferData,
typename BlockDstData, typename BlockDstData,
address_space_t ThreadBufferAddressSpace = address_space_t::generic, AddressSpace ThreadBufferAddressSpace,
address_space_t BlockDstAddressSpace = address_space_t::generic> AddressSpace BlockDstAddressSpace>
__device__ void
RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, ThreadBufferAddressSpace>,
integral_constant<AddressSpace, BlockDstAddressSpace>) const
{
constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
constexpr auto block_dst_address_space =
integral_constant<AddressSpace, BlockDstAddressSpace>{};
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseStore.Run_optimized_dst_address_calculation(
p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
}
else
{
mThreadwiseStore.Run(
p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
}
}
template <typename ThreadBufferData, typename BlockDstData>
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst) const BlockDstData* p_block_dst) const
{ {
#if 1 constexpr auto generic_address_space =
mThreadwiseStore.template Run<ThreadBufferData, integral_constant<AddressSpace, AddressSpace::generic>{};
BlockDstData,
ThreadBufferAddressSpace, RunStoreThreadBuffer(
BlockDstAddressSpace>(p_thread_buffer, p_block_dst); p_thread_buffer, p_block_dst, generic_address_space, generic_address_space);
#else // tweaking
mThreadwiseStore.template Run_optimized_dst_address_calculation<ThreadBufferData,
BlockDstData,
ThreadBufferAddressSpace,
BlockDstAddressSpace>(
p_thread_buffer, p_block_dst);
#endif
} }
template <typename BlockSrcData, template <typename BlockSrcData,
typename BlockDstData, typename BlockDstData,
address_space_t BlockSrcAddressSpace = address_space_t::generic, AddressSpace BlockSrcAddressSpace,
address_space_t BlockDstAddressSpace = address_space_t::generic> AddressSpace BlockDstAddressSpace>
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const __device__ void
Run(const BlockSrcData* p_block_src,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, BlockSrcAddressSpace> block_src_address_space,
integral_constant<AddressSpace, BlockDstAddressSpace> block_dst_address_space) const
{ {
BlockSrcData p_thread_buffer[GetThreadBufferSize()]; BlockSrcData p_thread_buffer[GetThreadBufferSize()];
RunLoadThreadBuffer<BlockSrcData, constexpr auto generic_address_space =
BlockSrcData, integral_constant<AddressSpace, AddressSpace::generic>{};
BlockSrcAddressSpace,
address_space_t::generic>(p_block_src, p_thread_buffer); RunLoadThreadBuffer(
p_block_src, p_thread_buffer, block_src_address_space, generic_address_space);
// if there is type conversion, it's done during store // if there is type conversion, it's done during store
RunStoreThreadBuffer<BlockSrcData, RunStoreThreadBuffer(
BlockDstData, p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space);
address_space_t::generic, }
BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
template <typename BlockSrcData, typename BlockDstData>
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
Run(p_block_src, p_block_dst, generic_address_space, generic_address_space);
} }
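
A hedged sketch of how a kernel now selects address spaces at the call site; AddressSpace::global and AddressSpace::lds are assumed enumerators (only AddressSpace::generic appears in this diff), and blockwise_copy, p_global_src, p_lds_dst are hypothetical names:

    constexpr auto global_space = integral_constant<AddressSpace, AddressSpace::global>{};
    constexpr auto lds_space    = integral_constant<AddressSpace, AddressSpace::lds>{};
    // explicitly tagged copy: global memory -> LDS
    blockwise_copy.Run(p_global_src, p_lds_dst, global_space, lds_space);
    // the two-argument overload still works and defaults both sides to generic
    blockwise_copy.Run(p_global_src, p_lds_dst);
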
template <typename T, bool PositiveDirection> template <typename T, bool PositiveDirection>
......
...@@ -2,15 +2,11 @@ ...@@ -2,15 +2,11 @@
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP #define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMergedTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor_deprecated.hpp"
#include "tensor_coordinate_deprecated.hpp" #include "tensor_coordinate_deprecated.hpp"
#include "threadwise_generic_tensor_slice_copy_deprecated.hpp" #include "threadwise_generic_tensor_slice_copy_deprecated.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif
namespace ck { namespace ck {
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor // Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
...@@ -20,7 +16,7 @@ namespace ck { ...@@ -20,7 +16,7 @@ namespace ck {
// that, on a merged dimension that contains multiple original dimensions, the length of // that, on a merged dimension that contains multiple original dimensions, the length of
// the last original dimension needs to be evenly divisible by its sub-lengths. Also, the // the last original dimension needs to be evenly divisible by its sub-lengths. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are performed // repeat-length on the merged dimension needs to be 1. These sanity checks are performed
// in constructor of BlockwiseGenericTensorSliceCopy_v1 // in constructor of BlockwiseGenericTensorSliceCopy_v1_deprecated
template <index_t BlockSize, template <index_t BlockSize,
typename SrcDesc, typename SrcDesc,
typename DstDesc, typename DstDesc,
...@@ -34,7 +30,7 @@ template <index_t BlockSize, ...@@ -34,7 +30,7 @@ template <index_t BlockSize,
index_t DstVectorAccessDim, index_t DstVectorAccessDim,
index_t SrcDataPerAccess, index_t SrcDataPerAccess,
index_t DstDataPerAccess> index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v1 struct BlockwiseGenericTensorSliceCopy_v1_deprecated
{ {
static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
...@@ -62,7 +58,8 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -62,7 +58,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId; Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId; Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;
__device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_id_begin, __device__
BlockwiseGenericTensorSliceCopy_v1_deprecated(Array<index_t, nDim> src_block_data_id_begin,
Array<index_t, nDim> dst_block_data_id_begin) Array<index_t, nDim> dst_block_data_id_begin)
{ {
// check NDim consistency // check NDim consistency
...@@ -196,14 +193,14 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -196,14 +193,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths); return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
} }
__device__ static constexpr index_t GetRegisterBufferSize() __device__ static constexpr index_t GetThreadBufferSize()
{ {
return GetRegisterBufferDescriptor().GetElementSpace(); return GetRegisterBufferDescriptor().GetElementSpace();
} }
template <typename TData> template <typename TData>
__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src, __device__ void RunLoadThreadBuffer(const TData* __restrict__ p_src,
TData* __restrict__ p_buffer) const TData* __restrict__ p_buffer) const
{ {
constexpr auto thread_sub_tensor_lengths = SubLengths{}; constexpr auto thread_sub_tensor_lengths = SubLengths{};
...@@ -244,22 +241,22 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -244,22 +241,22 @@ struct BlockwiseGenericTensorSliceCopy_v1
// that contains multiple original dimensions, the length of the last original // that contains multiple original dimensions, the length of the last original
// dimension needs to be evenly divisible by its sub-lengths. Also, the repeat-length on // dimension needs to be evenly divisible by its sub-lengths. Also, the repeat-length on
// the merged dimension needs to be 1. These sanity checks are performed in constructor // the merged dimension needs to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1 // of BlockwiseGenericTensorSliceCopy_v1_deprecated
ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc, ThreadwiseGenericTensorSliceCopy_v1r2_deprecated<SrcDesc,
decltype(thread_buffer_desc), decltype(thread_buffer_desc),
SubLengths, SubLengths,
SrcDimAccessOrder, SrcDimAccessOrder,
SrcVectorAccessDim, SrcVectorAccessDim,
SrcDataPerAccess, SrcDataPerAccess,
1>(make_zero_array<index_t, nDim>(), 1>(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>()) make_zero_array<index_t, nDim>())
.Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
}); });
} }
template <typename TData> template <typename TData>
__device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer, __device__ void RunStoreThreadBuffer(const TData* __restrict__ p_buffer,
TData* __restrict__ p_dst) const TData* __restrict__ p_dst) const
{ {
constexpr auto thread_sub_tensor_lengths = SubLengths{}; constexpr auto thread_sub_tensor_lengths = SubLengths{};
...@@ -299,14 +296,14 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -299,14 +296,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
// that contains multiple original dimensions, the length of the last original // that contains multiple original dimensions, the length of the last original
// dimension needs to be evenly divisible by its sub-lengths. Also, the repeat-length on // dimension needs to be evenly divisible by its sub-lengths. Also, the repeat-length on
// the merged dimension needs to be 1. These sanity checks are performed in constructor // the merged dimension needs to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1 // of BlockwiseGenericTensorSliceCopy_v1_deprecated
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc), ThreadwiseGenericTensorSliceCopy_v1r2_deprecated<decltype(thread_buffer_desc),
DstDesc, DstDesc,
SubLengths, SubLengths,
DstDimAccessOrder, DstDimAccessOrder,
DstVectorAccessDim, DstVectorAccessDim,
1, 1,
DstDataPerAccess>( DstDataPerAccess>(
make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>()) make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
.Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
}); });
...@@ -315,10 +312,10 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -315,10 +312,10 @@ struct BlockwiseGenericTensorSliceCopy_v1
template <typename TData> template <typename TData>
__device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
{ {
TData p_buffer[GetRegisterBufferSize()]; TData p_buffer[GetThreadBufferSize()];
RunLoadRegisterBuffer(p_src, p_buffer); RunLoadThreadBuffer(p_src, p_buffer);
RunStoreRegisterBuffer(p_buffer, p_dst); RunStoreThreadBuffer(p_buffer, p_dst);
} }
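For illustration only, a minimal host-side sketch (names and loop structure assumed, not taken from this diff) of the two-stage pattern that Run expresses above: the slice is first staged into a per-thread buffer by RunLoadThreadBuffer, then written out by RunStoreThreadBuffer, so the two sides can use different access orders and vector widths.

#include <cstddef>
#include <vector>

template <typename TData>
void run_copy_sketch(const TData* p_src, TData* p_dst, std::size_t thread_buffer_size)
{
    // stands in for "TData p_buffer[GetThreadBufferSize()]"
    std::vector<TData> buffer(thread_buffer_size);

    // RunLoadThreadBuffer: source -> thread buffer
    for(std::size_t i = 0; i < thread_buffer_size; ++i)
        buffer[i] = p_src[i];

    // RunStoreThreadBuffer: thread buffer -> destination
    for(std::size_t i = 0; i < thread_buffer_size; ++i)
        p_dst[i] = buffer[i];
}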
// When moving the slicing windows along a merged dimension, if the strides of the // When moving the slicing windows along a merged dimension, if the strides of the
...@@ -432,14 +429,14 @@ template <index_t BlockSize, ...@@ -432,14 +429,14 @@ template <index_t BlockSize,
index_t DstVectorAccessDim, index_t DstVectorAccessDim,
index_t SrcDataPerAccess, index_t SrcDataPerAccess,
index_t DstDataPerAccess> index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v2 struct BlockwiseGenericTensorSliceCopy_v2_deprecated
{ {
static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
using Index = MultiIndex<nDim>; using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin, __device__ constexpr BlockwiseGenericTensorSliceCopy_v2_deprecated(
const Index& dst_block_slice_origin) const Index& src_block_slice_origin, const Index& dst_block_slice_origin)
{ {
static_assert( static_assert(
nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
...@@ -478,42 +475,96 @@ struct BlockwiseGenericTensorSliceCopy_v2 ...@@ -478,42 +475,96 @@ struct BlockwiseGenericTensorSliceCopy_v2
return ThreadBufferDesc::GetElementSpace(); return ThreadBufferDesc::GetElementSpace();
} }
template <typename SrcData, template <typename BlockSrcData,
typename DstData, typename ThreadBufferData,
address_space_t BlockSrcAddressSpace = address_space_t::generic, AddressSpace BlockSrcAddressSpace,
address_space_t ThreadBufferAddressSpace = address_space_t::generic> AddressSpace ThreadBufferAddressSpace>
__device__ void RunLoadThreadBuffer(const SrcData* p_block_src, DstData* p_thread_buffer) const __device__ void
RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer,
integral_constant<AddressSpace, BlockSrcAddressSpace>,
integral_constant<AddressSpace, ThreadBufferAddressSpace>) const
{ {
mThreadwiseLoad constexpr auto block_src_address_space =
.template Run<SrcData, DstData, BlockSrcAddressSpace, ThreadBufferAddressSpace>( integral_constant<AddressSpace, BlockSrcAddressSpace>{};
p_block_src, p_thread_buffer); constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
mThreadwiseLoad.Run(
p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
} }
template <typename SrcData, template <typename BlockSrcData, typename ThreadBufferData>
typename DstData, __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
address_space_t ThreadBufferAddressSpace = address_space_t::generic, ThreadBufferData* p_thread_buffer) const
address_space_t BlockDstAddressSpace = address_space_t::generic>
__device__ void RunStoreThreadBuffer(const SrcData* p_thread_buffer, DstData* p_block_dst) const
{ {
mThreadwiseStore constexpr auto generic_address_space =
.template Run<SrcData, DstData, ThreadBufferAddressSpace, BlockDstAddressSpace>( integral_constant<AddressSpace, AddressSpace::generic>{};
p_thread_buffer, p_block_dst);
RunLoadThreadBuffer(
p_block_src, p_thread_buffer, generic_address_space, generic_address_space);
} }
template <typename SrcData, template <typename ThreadBufferData,
typename DstData, typename BlockDstData,
address_space_t BlockSrcAddressSpace = address_space_t::generic, AddressSpace ThreadBufferAddressSpace,
address_space_t BlockDstAddressSpace = address_space_t::generic> AddressSpace BlockDstAddressSpace>
__device__ void Run(const SrcData* p_block_src, DstData* p_block_dst) const __device__ void
RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, ThreadBufferAddressSpace>,
integral_constant<AddressSpace, BlockDstAddressSpace>) const
{ {
SrcData p_thread_buffer[GetThreadBufferSize()]; constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
constexpr auto block_dst_address_space =
integral_constant<AddressSpace, BlockDstAddressSpace>{};
RunLoadThreadBuffer<SrcData, SrcData, BlockSrcAddressSpace, address_space_t::generic>( mThreadwiseStore.Run(
p_block_src, p_thread_buffer); p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
}
template <typename ThreadBufferData, typename BlockDstData>
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
RunStoreThreadBuffer(
p_thread_buffer, p_block_dst, generic_address_space, generic_address_space);
}
template <typename BlockSrcData,
typename BlockDstData,
AddressSpace BlockSrcAddressSpace,
AddressSpace BlockDstAddressSpace>
__device__ void
Run(const BlockSrcData* p_block_src,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, BlockSrcAddressSpace> block_src_address_space,
integral_constant<AddressSpace, BlockDstAddressSpace> block_dst_address_space) const
{
BlockSrcData p_thread_buffer[GetThreadBufferSize()];
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
RunLoadThreadBuffer(
p_block_src, p_thread_buffer, block_src_address_space, generic_address_space);
// if there is type conversion, it's done during store // if there is type conversion, it's done during store
RunStoreThreadBuffer<SrcData, DstData, address_space_t::generic, BlockDstAddressSpace>( RunStoreThreadBuffer(
p_thread_buffer, p_block_dst); p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space);
}
template <typename BlockSrcData, typename BlockDstData>
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
Run(p_block_src, p_block_dst, generic_address_space, generic_address_space);
} }
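The overload set above follows a tag-dispatch convention: the address space travels as a compile-time value wrapped in integral_constant, tagged overloads do the real work, and untagged overloads forward with generic tags on both sides. A small stand-alone sketch of the same idea, with all enum and function names assumed rather than taken from the codebase:

#include <type_traits>

enum class AddrSpaceDemo { generic, global, lds };

template <AddrSpaceDemo AS>
using addr_space_tag = std::integral_constant<AddrSpaceDemo, AS>;

template <AddrSpaceDemo SrcAS, AddrSpaceDemo DstAS>
void copy_sketch(const float* p_src, float* p_dst, addr_space_tag<SrcAS>, addr_space_tag<DstAS>)
{
    // the real copy would pick buffer_load/buffer_store here when an address space is global
    *p_dst = *p_src;
}

// untagged overload: defaults both sides to the generic address space
inline void copy_sketch(const float* p_src, float* p_dst)
{
    copy_sketch(p_src,
                p_dst,
                addr_space_tag<AddrSpaceDemo::generic>{},
                addr_space_tag<AddrSpaceDemo::generic>{});
}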
template <typename T, bool PositiveDirection> template <typename T, bool PositiveDirection>
...@@ -533,25 +584,25 @@ struct BlockwiseGenericTensorSliceCopy_v2 ...@@ -533,25 +584,25 @@ struct BlockwiseGenericTensorSliceCopy_v2
private: private:
using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc, using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated<SrcDesc,
ThreadBufferDesc, ThreadBufferDesc,
SubLengths, SubLengths,
SrcDimAccessOrder, SrcDimAccessOrder,
SrcDimAccessOrder, SrcDimAccessOrder,
SrcVectorAccessDim, SrcVectorAccessDim,
SrcVectorAccessDim, SrcVectorAccessDim,
SrcDataPerAccess, SrcDataPerAccess,
1>; 1>;
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1<ThreadBufferDesc, using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated<ThreadBufferDesc,
DstDesc, DstDesc,
SubLengths, SubLengths,
DstDimAccessOrder, DstDimAccessOrder,
DstDimAccessOrder, DstDimAccessOrder,
DstVectorAccessDim, DstVectorAccessDim,
DstVectorAccessDim, DstVectorAccessDim,
1, 1,
DstDataPerAccess>; DstDataPerAccess>;
ThreadwiseLoad mThreadwiseLoad; ThreadwiseLoad mThreadwiseLoad;
ThreadwiseStore mThreadwiseStore; ThreadwiseStore mThreadwiseStore;
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define CK_THREADWISE_DIRECT_CONVOLUTION_HPP #define CK_THREADWISE_DIRECT_CONVOLUTION_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
#include "threadwise_tensor_slice_copy.hpp" #include "threadwise_tensor_slice_copy.hpp"
namespace ck { namespace ck {
......
...@@ -3,102 +3,164 @@ ...@@ -3,102 +3,164 @@
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantMatrixDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp"
#include "math.hpp"
namespace ck { namespace ck {
template <class Float, class Matrix> template <typename Float, class Matrix>
__device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread) __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
{ {
for(index_t i = 0; i < Matrix::NRow(); ++i) for(index_t i = 0; i < Matrix::NRow(); ++i)
{ {
for(index_t j = 0; j < Matrix::NCol(); ++j) for(index_t j = 0; j < Matrix::NCol(); ++j)
{ {
const index_t id = Matrix::GetOffsetFromMultiIndex(i, j); const index_t id = Matrix::CalculateOffset(i, j);
p_thread[id] = Float(0); p_thread[id] = Float(0);
} }
} }
} }
template <class Float, template <typename SrcMatrix,
class SrcMatrix, typename DstMatrix,
class DstMatrix, index_t NSliceRow,
index_t NRow, index_t NSliceCol,
index_t NCol, index_t DataPerAccess>
index_t DataPerRead> struct ThreadwiseMatrixSliceCopy
__device__ void threadwise_matrix_copy(SrcMatrix,
const Float* __restrict__ p_src,
DstMatrix,
Float* __restrict__ p_dst,
Sequence<NRow, NCol>,
Number<DataPerRead>)
{ {
static_assert(NCol % DataPerRead == 0, "wrong! should be NCol % == DataPerRead == 0"); __device__ constexpr ThreadwiseMatrixSliceCopy()
{
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType; static_assert(SrcMatrix::RowStride() % DataPerAccess == 0 &&
DstMatrix::RowStride() % DataPerAccess == 0,
constexpr auto src_mtx = SrcMatrix{}; "wrong! wrong alignment");
constexpr auto dst_mtx = DstMatrix{}; static_assert(NSliceCol % DataPerAccess == 0,
"wrong! should be NSliceCol % DataPerAccess == 0");
}
for(index_t i = 0; i < NRow; ++i) template <typename Data>
__device__ static void Run(const Data* p_src, Data* p_dst)
{ {
for(index_t j = 0; j < NCol; j += DataPerRead) using vector_t = typename vector_type<Data, DataPerAccess>::MemoryType;
for(index_t i = 0; i < NSliceRow; ++i)
{ {
const index_t src_index = src_mtx.GetOffsetFromMultiIndex(i, j); for(index_t j = 0; j < NSliceCol; j += DataPerAccess)
const index_t dst_index = dst_mtx.GetOffsetFromMultiIndex(i, j); {
const index_t src_index = SrcMatrix::CalculateOffset(i, j);
const index_t dst_index = DstMatrix::CalculateOffset(i, j);
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) = *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]); *reinterpret_cast<const vector_t*>(&p_src[src_index]);
}
} }
} }
} };
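A hypothetical usage note for ThreadwiseMatrixSliceCopy (descriptor type names are placeholders, not from this diff): copying a 4 x 8 sub-tile between two row-major thread-local matrices with two elements per vector access would look roughly like this.

// SrcMtx and DstMtx stand for ConstantMatrixDescriptor specializations whose RowStride
// is a multiple of DataPerAccess; the names are illustrative only.
// ThreadwiseMatrixSliceCopy<SrcMtx, DstMtx, /*NSliceRow*/ 4, /*NSliceCol*/ 8, /*DataPerAccess*/ 2>
//     ::Run(p_src_thread, p_dst_thread);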
template <class MatrixA, // C += transpose(A) * B
class MatrixB, // Element of matrix can be vectorized data
class MatrixC, template <typename MatrixA, typename MatrixB, typename MatrixC>
bool TransA, struct ThreadwiseGemmTransANormalBNormalC
bool TransB,
bool TransC,
class FloatA,
class FloatB,
class FloatC>
__device__ void threadwise_gemm(MatrixA,
integral_constant<bool, TransA>,
const FloatA* __restrict__ p_a_thread,
MatrixB,
integral_constant<bool, TransB>,
const FloatB* __restrict__ p_b_thread,
MatrixC,
integral_constant<bool, TransC>,
FloatC* __restrict__ p_c_thread)
{ {
static_if<TransA && (!TransB) && (!TransC)>{}([&](auto) { __device__ constexpr ThreadwiseGemmTransANormalBNormalC()
constexpr auto a_mtx = MatrixA{}; {
constexpr auto b_mtx = MatrixB{}; static_assert(MatrixA::NRow() == MatrixB::NRow() && MatrixA::NCol() == MatrixC::NRow() &&
constexpr auto c_mtx = MatrixC{}; MatrixB::NCol() == MatrixC::NCol(),
"wrong!");
}
constexpr index_t M = c_mtx.NRow(); template <typename FloatA, typename FloatB, typename FloatC>
constexpr index_t N = c_mtx.NCol(); __device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
constexpr index_t K = a_mtx.NRow(); // A is transposed {
constexpr index_t M = MatrixC::NRow();
constexpr index_t N = MatrixC::NCol();
constexpr index_t K = MatrixA::NRow(); // A is transposed
for(index_t k = 0; k < K; ++k) for(index_t k = 0; k < K; ++k)
{ {
for(index_t i = 0; i < M; ++i) for(index_t m = 0; m < M; ++m)
{ {
for(index_t j = 0; j < N; ++j) for(index_t n = 0; n < N; ++n)
{ {
const index_t aindex = a_mtx.GetOffsetFromMultiIndex(k, i); // A is transposed const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed
const index_t bindex = b_mtx.GetOffsetFromMultiIndex(k, j); const index_t bindex = MatrixB::CalculateOffset(k, n);
const index_t cindex = c_mtx.GetOffsetFromMultiIndex(i, j); const index_t cindex = MatrixC::CalculateOffset(m, n);
p_c_thread[cindex] += p_a_thread[aindex] * p_b_thread[bindex]; p_c[cindex] +=
inner_product_with_conversion<FloatC>{}(p_a[aindex], p_b[bindex]);
} }
} }
} }
}).Else([&](auto fwd) { }
// not implemented
static_assert(fwd(false), "wrong! support for this config is not implemented"); #if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
}); template <typename FloatA, typename FloatB, typename FloatC>
} __device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
constexpr index_t M = MatrixC::NRow();
constexpr index_t N = MatrixC::NCol();
constexpr index_t K = MatrixA::NRow(); // A is transposed
static_assert(N == 4 || N == 2, "wrong! this config is not supported by asm yet");
for(index_t k = 0; k < K; ++k)
{
for(index_t m = 0; m < M; ++m)
{
const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed
static_if<N == 2>{}([&](auto) {
const index_t bindex_0 = MatrixB::CalculateOffset(k, 0);
const index_t bindex_1 = MatrixB::CalculateOffset(k, 1);
const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);
__outer_product_1x2(
p_a[aindex], p_b[bindex_0], p_b[bindex_1], p_c[cindex_0], p_c[cindex_1]);
});
static_if<N == 4>{}([&](auto) {
const index_t bindex_0 = MatrixB::CalculateOffset(k, 0);
const index_t bindex_1 = MatrixB::CalculateOffset(k, 1);
const index_t bindex_2 = MatrixB::CalculateOffset(k, 2);
const index_t bindex_3 = MatrixB::CalculateOffset(k, 3);
const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);
const index_t cindex_2 = MatrixC::CalculateOffset(m, 2);
const index_t cindex_3 = MatrixC::CalculateOffset(m, 3);
__outer_product_1x4(p_a[aindex],
p_b[bindex_0],
p_b[bindex_1],
p_b[bindex_2],
p_b[bindex_3],
p_c[cindex_0],
p_c[cindex_1],
p_c[cindex_2],
p_c[cindex_3]);
});
}
}
}
#endif
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
constexpr bool has_amd_asm = is_same<FloatC, float>{} &&
((is_same<FloatA, float>{} && is_same<FloatB, float>{}) ||
(is_same<FloatA, half2_t>{} && is_same<FloatB, half2_t>{}) ||
(is_same<FloatA, half4_t>{} && is_same<FloatB, half4_t>{}));
static_if<has_amd_asm>{}([&](auto fwd) {
Run_amd_asm(p_a, p_b, fwd(p_c));
}).Else([&](auto) { Run_source(p_a, p_b, p_c); });
#else
Run_source(p_a, p_b, p_c);
#endif
}
};
} // namespace ck } // namespace ck
#endif #endif
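As a plain host-side reference of what Run_source computes (strides assumed row-major and passed explicitly; this is not part of the kernel code): C += transpose(A) * B with A stored K x M, B stored K x N, and C stored M x N, using the same k-m-n loop order.

// a: K x M with row stride lda (read transposed), b: K x N with row stride ldb,
// c: M x N with row stride ldc, accumulated in place.
void gemm_At_B_reference(const float* a, const float* b, float* c,
                         int M, int N, int K, int lda, int ldb, int ldc)
{
    for(int k = 0; k < K; ++k)
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
                c[m * ldc + n] += a[k * lda + m] * b[k * ldb + n];
}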
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
#define CK_THREADWISE_GENERIC_TENSOR_OP_HPP #define CK_THREADWISE_GENERIC_TENSOR_OP_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMergedTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor_deprecated.hpp"
namespace ck { namespace ck {
template <class Float, class TDesc> template <class Float, class TDesc>
......
...@@ -6,14 +6,6 @@ ...@@ -6,14 +6,6 @@
#include "tensor_descriptor_helper.hpp" #include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp" #include "tensor_coordinate.hpp"
#ifndef CK_USE_AMD_INTRINSIC
#define CK_USE_AMD_INTRINSIC 1
#endif
#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#endif
namespace ck { namespace ck {
// This version uses multi-index transformation // This version uses multi-index transformation
...@@ -76,9 +68,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -76,9 +68,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Will do padding check on dst data: No write if dst data is in padding area. // Will do padding check on dst data: No write if dst data is in padding area.
template <typename SrcData, template <typename SrcData,
typename DstData, typename DstData,
address_space_t SrcAddressSpace = address_space_t::generic, AddressSpace SrcAddressSpace,
address_space_t DstAddressSpace = address_space_t::generic> AddressSpace DstAddressSpace>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const __device__ void Run(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{ {
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType; using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType; using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
...@@ -122,15 +117,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -122,15 +117,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Check src vector's padding situation, only check the first data in this src // Check src vector's padding situation, only check the first data in this src
// vector. It's the user's responsibility to make sure all data in the src vector // vector. It's the user's responsibility to make sure all data in the src vector
// has // has the same padding situation
// the same padding situation
if(src_coord.IsUpperIndexMappedToValidOffset()) if(src_coord.IsUpperIndexMappedToValidOffset())
{ {
static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) { static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE #if CK_USE_AMD_BUFFER_ADDRESSING
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) = *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>( __buffer_load<SrcData, SrcDataPerAccess>(
p_src, src_coord.GetOffset(), 0); fwd(p_src), src_coord.GetOffset(), 0);
#else #else
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) = *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]); *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
...@@ -163,15 +157,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -163,15 +157,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Check dst vector's padding situation, only check the first data in this dst // Check dst vector's padding situation, only check the first data in this dst
// vector. It's the user's responsibility to make sure all data in the dst vector // vector. It's the user's responsibility to make sure all data in the dst vector
// has // has the same padding situation
// the same padding situation
if(dst_coord.IsUpperIndexMappedToValidOffset()) if(dst_coord.IsUpperIndexMappedToValidOffset())
{ {
static_if<DstAddressSpace == address_space_t::global>{}([&](auto) { static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE #if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<DstData, DstDataPerAccess>( __buffer_store<DstData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]), *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
p_dst, fwd(p_dst),
dst_coord.GetOffset(), dst_coord.GetOffset(),
0); 0);
#else #else
...@@ -188,6 +181,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -188,6 +181,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
}); });
} }
template <typename SrcData, typename DstData>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
Run(p_src, p_dst, generic_address_space, generic_address_space);
}
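A hypothetical call site for the tagged Run above (object and pointer names assumed), copying from global memory into a register-backed destination while leaving the destination side generic:

// constexpr auto global_addr_space  = integral_constant<AddressSpace, AddressSpace::global>{};
// constexpr auto generic_addr_space = integral_constant<AddressSpace, AddressSpace::generic>{};
// threadwise_copy.Run(p_global_src, p_thread_dst, global_addr_space, generic_addr_space);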
// Modify Length to 1, if Mask is set to false // Modify Length to 1, if Mask is set to false
// Used for isolating linear dimension from non-linear dimensions // Used for isolating linear dimension from non-linear dimensions
template <index_t... Lengths, index_t... Mask> template <index_t... Lengths, index_t... Mask>
...@@ -202,12 +204,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -202,12 +204,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on src data: Read 0 if src data is in padding area.
// Will do padding check on dst data: No write if dst data is in padding area. // Will do padding check on dst data: No write if dst data is in padding area.
// This version is optimized for address calculation of src tensor // This version is optimized for address calculation of src tensor
// TODO: this function does not compile to the expected ISA
template <typename SrcData, template <typename SrcData,
typename DstData, typename DstData,
address_space_t SrcAddressSpace = address_space_t::generic, AddressSpace SrcAddressSpace,
address_space_t DstAddressSpace = address_space_t::generic> AddressSpace DstAddressSpace>
__device__ void Run_optimized_src_address_calculation(const SrcData* p_src, __device__ void
DstData* p_dst) const Run_optimized_src_address_calculation(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{ {
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType; using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType; using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
...@@ -287,14 +293,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -287,14 +293,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
const auto src_coord = const auto src_coord =
src_nonlinear_coord + (linear_dim_data_steps + scalar_id); src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
#if 1 // tweaking #if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is src compile-time offset // this is src compile-time offset
const index_t src_linear_offset = const index_t src_linear_offset =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset(); src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else #else
// this is src compile-time offset // this is src compile-time offset
const index_t src_linear_offset = const index_t src_linear_offset =
src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id); src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
#endif #endif
// Check src vector's padding situation, only check the first data in // Check src vector's padding situation, only check the first data in
...@@ -302,8 +308,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -302,8 +308,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// the src vector has the same padding situation // the src vector has the same padding situation
if(src_coord.IsUpperIndexMappedToValidOffset()) if(src_coord.IsUpperIndexMappedToValidOffset())
{ {
static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) { static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE #if CK_USE_AMD_BUFFER_ADDRESSING
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) = *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>( __buffer_load<SrcData, SrcDataPerAccess>(
p_src, src_nonlinear_coord.GetOffset(), src_linear_offset); p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
...@@ -360,12 +366,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -360,12 +366,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on src data: Read 0 if src data is in padding area.
// Will do padding check on dst data: No write if dst data is in padding area. // Will do padding check on dst data: No write if dst data is in padding area.
// This version is optimized for address calculation of dst tensor // This version is optimized for address calculation of dst tensor
// TODO: this function does not compile to the expected ISA
template <typename SrcData, template <typename SrcData,
typename DstData, typename DstData,
address_space_t SrcAddressSpace = address_space_t::generic, AddressSpace SrcAddressSpace,
address_space_t DstAddressSpace = address_space_t::generic> AddressSpace DstAddressSpace>
__device__ void Run_optimized_dst_address_calculation(const SrcData* p_src, __device__ void
DstData* p_dst) const Run_optimized_dst_address_calculation(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{ {
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType; using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType; using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
...@@ -476,14 +486,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -476,14 +486,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
const auto dst_coord = const auto dst_coord =
dst_nonlinear_coord + (linear_dim_data_steps + scalar_id); dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
#if 1 // tweaking #if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is dst compile-time offset // this is dst compile-time offset
const index_t dst_linear_offset = const index_t dst_linear_offset =
dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset(); dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else #else
// this is dst compile-time offset // this is dst compile-time offset
const index_t dst_linear_offset = const index_t dst_linear_offset =
dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id); dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
#endif #endif
// Check dst vector's padding situation, only check the first data in // Check dst vector's padding situation, only check the first data in
...@@ -491,8 +501,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -491,8 +501,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// the dst vector has the same padding situation // the dst vector has the same padding situation
if(dst_coord.IsUpperIndexMappedToValidOffset()) if(dst_coord.IsUpperIndexMappedToValidOffset())
{ {
static_if<DstAddressSpace == address_space_t::global>{}([&](auto) { static_if<DstAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE #if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<DstData, DstDataPerAccess>( __buffer_store<DstData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]), *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
p_dst, p_dst,
...@@ -514,6 +524,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 ...@@ -514,6 +524,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
}); });
} }
__device__ static constexpr bool HasWorkingOptimizedAddressCalculation()
{
#if CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION // tweaking
return true;
#else
return false;
#endif
}
template <typename T, bool PositiveDirection> template <typename T, bool PositiveDirection>
__device__ void MoveSrcSliceWindow(const T& step_sizes_, __device__ void MoveSrcSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>) integral_constant<bool, PositiveDirection>)
......
#ifndef CK_AMD_BUFFER_ADDRESSING_HPP
#define CK_AMD_BUFFER_ADDRESSING_HPP
#include "float_type.hpp"
namespace ck {
// For 128-bit SGPRs in buffer_load and buffer_store instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template <typename T>
union BufferLoadStoreDwordConfig
{
int32x4_t data;
T* address[2];
int32_t range[4];
};
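A minimal sanity-check sketch for the union above (assuming 64-bit pointers, as on the targeted GPUs): all three members alias the same 16 bytes, i.e. the four 32-bit SGPRs that make up the buffer resource descriptor consumed by the buffer instructions.

static_assert(sizeof(BufferLoadStoreDwordConfig<float>) == 16,
              "descriptor must occupy four 32-bit SGPRs");
static_assert(sizeof(int32x4_t) == 16 && sizeof(float*[2]) == 16 && sizeof(int32_t[4]) == 16,
              "all union members must alias the same 128 bits");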
__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load");
__device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");
__device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");
__device__ void __llvm_amdgcn_buffer_store(float vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store");
__device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");
__device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
__buffer_load(const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset);
template <typename T, index_t VectorSize>
__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
T* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset);
template <>
__device__ float __buffer_load<float, 1>(const float* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
float dst;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> src_block_config;
    // fill in dword 0 - 1
    src_block_config.address[0] = const_cast<float*>(p_src_block);
    // fill in dword 2
    src_block_config.range[2] = -1;
    // fill in dword 3
    src_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
dst = __llvm_amdgcn_buffer_load(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
asm volatile(
"\n \
buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset));
#endif
return dst;
}
template <>
__device__ float2_t __buffer_load<float, 2>(const float* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
float2_t dst;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> src_block_config;
    // fill in dword 0 - 1
    src_block_config.address[0] = const_cast<float*>(p_src_block);
    // fill in dword 2
    src_block_config.range[2] = -1;
    // fill in dword 3
    src_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
dst = __llvm_amdgcn_buffer_loadx2(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
asm volatile(
"\n \
buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset));
#endif
return dst;
}
template <>
__device__ float4_t __buffer_load<float, 4>(const float* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
float4_t dst;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> src_block_config;
    // fill in dword 0 - 1
    src_block_config.address[0] = const_cast<float*>(p_src_block);
    // fill in dword 2
    src_block_config.range[2] = -1;
    // fill in dword 3
    src_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
dst = __llvm_amdgcn_buffer_loadx4(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
asm volatile(
"\n \
buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset));
#endif
return dst;
}
template <>
__device__ void __buffer_store<float, 1>(const float& src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> dst_block_config;
    // fill in dword 0 - 1
    dst_block_config.address[0] = p_dst_block;
    // fill in dword 2
    dst_block_config.range[2] = -1;
    // fill in dword 3
    dst_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
__llvm_amdgcn_buffer_store(src,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#else
asm volatile("\n \
buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_config.data),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#endif
}
template <>
__device__ void __buffer_store<float, 2>(const float2_t& src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> dst_block_config;
    // fill in dword 0 - 1
    dst_block_config.address[0] = p_dst_block;
    // fill in dword 2
    dst_block_config.range[2] = -1;
    // fill in dword 3
    dst_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
__llvm_amdgcn_buffer_storex2(src,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#else
asm volatile("\n \
buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_config.data),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#endif
}
template <>
__device__ void __buffer_store<float, 4>(const float4_t& src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> dst_block_config;
    // fill in dword 0 - 1
    dst_block_config.address[0] = p_dst_block;
    // fill in dword 2
    dst_block_config.range[2] = -1;
    // fill in dword 3
    dst_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
__llvm_amdgcn_buffer_storex4(src,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#else
asm volatile("\n \
buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_config.data),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#endif
}
} // namespace ck
#endif
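A hypothetical device-side usage of the specializations above (pointer names assumed; the data offsets are counted in elements and converted to byte offsets inside the helpers):

// float4_t v = __buffer_load<float, 4>(p_src_block, src_thread_data_offset, 0);
// __buffer_store<float, 4>(v, p_dst_block, dst_thread_data_offset, 0);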