Commit 05fd7ff8 authored by Jakub Piasecki's avatar Jakub Piasecki

Merge remote-tracking branch 'origin/develop' into gemm_f16_int8

parents 2784b516 84832fc4
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -14,22 +14,28 @@ namespace wrapper { ...@@ -14,22 +14,28 @@ namespace wrapper {
* \tparam Shape Tuple of Number<> (for compile-time layout) or index_t * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t
* (dynamic layout). It is possible to pass nested shapes * (dynamic layout). It is possible to pass nested shapes
* (e.g. ((4, 2), 2)), nested dimensions are merged. * (e.g. ((4, 2), 2)), nested dimensions are merged.
* \tparam UnnestedDescriptorType Tensor descriptor for unnested shape dims. * \tparam UnrolledDescriptorType Tensor descriptor for unnested shape dims.
*/ */
template <typename Shape, typename UnnestedDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
struct Layout struct Layout
{ {
private: private:
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{}; static constexpr auto I1 = Number<1>{};
// Generate default idxs tuple (idx with all merged nested shapes) /**
* \brief Generate default indices tuple (idx with all merged nested shapes)
*
 * \param shape Shape used to determine the number of indices.
* \return Multi idx tuple with zeros.
*/
template <typename... Ts> template <typename... Ts>
__host__ __device__ constexpr static auto GenerateDefaultIdxsTuple(const Tuple<Ts...>&) __host__ __device__ constexpr static auto
GenerateDefaultIdxsTuple([[maybe_unused]] const Tuple<Ts...>& shape)
{ {
return generate_tuple( return generate_tuple(
[&](auto) { [&](auto) {
if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime()) if constexpr(!remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime())
{ {
// runtime layout // runtime layout
return index_t(0); return index_t(0);
...@@ -43,11 +49,18 @@ struct Layout ...@@ -43,11 +49,18 @@ struct Layout
Number<Tuple<Ts...>::Size()>{}); Number<Tuple<Ts...>::Size()>{});
} }
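// Example (illustrative): for a nested shape such as ((4, 2), 2) the generated
// default index tuple is (0, 0) - one zero per top-level dimension; the zeros
// are index_t for runtime layouts and presumably Number<0> in the elided
// compile-time branch.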
// Generate LowerDims in Compile-time for MergeTrasform using passed Type /**
// If element of Tuple<Ts...> is also tuple, then merge (generate sequence for merge) * \brief Generate lower dims in compile-time for the Merge transform using
// If tuple is element, then pass through (sequence with one element) * provided type. If element of nested Tuple<Ts...> is also a tuple, then
* merge (generate sequence for merge). If tuple is element, then pass
* through (sequence with one element).
*
* \param shape Shape to align.
 * \return LowerDims for MergeTransform.
*/
template <typename Idx, typename... Ts> template <typename Idx, typename... Ts>
__host__ __device__ constexpr static auto GenerateLowerDim(const Tuple<Ts...>&) __host__ __device__ constexpr static auto
GenerateLowerDim([[maybe_unused]] const Tuple<Ts...>& shape)
{ {
if constexpr(Idx::value == 0) if constexpr(Idx::value == 0)
{ {
...@@ -87,11 +100,17 @@ struct Layout ...@@ -87,11 +100,17 @@ struct Layout
} }
} }
// Iterate over nested tuples in shape /**
// Unroll nested tuples to align Tuple<ShapeDims...> to Tuple<IdxDims...> * \brief Iterate over the nested tuples in the shape.
// Example idx: (1, 1), 1, 1 * Unroll nested tuples to align Tuple<ShapeDims...> to Tuple<IdxDims...>
// Example shape: (2, (2, 2)), 2, (2, 2) * Example idx: (1, 1), 1, 1
// Unrolled shape: 2, (2, 2), 2, (2, 2) * Example shape: (2, (2, 2)), 2, (2, 2)
* Unrolled shape: 2, (2, 2), 2, (2, 2)
*
* \param shape Layout shape.
* \param idx Idx to align.
 * \return Aligned shape.
*/
template <typename... ShapeDims, typename... IdxDims> template <typename... ShapeDims, typename... IdxDims>
__host__ __device__ constexpr static auto AlignShapeToIdx(const Tuple<ShapeDims...>& shape, __host__ __device__ constexpr static auto AlignShapeToIdx(const Tuple<ShapeDims...>& shape,
const Tuple<IdxDims...>& idx) const Tuple<IdxDims...>& idx)
...@@ -126,6 +145,13 @@ struct Layout ...@@ -126,6 +145,13 @@ struct Layout
} }
} }
/**
* \brief Merge descriptor to 1D.
*
* \param shape Layout shape.
* \param desc Descriptor to merge.
* \return 1D descriptor.
*/
template <typename... ShapeDims, typename DescriptorToMerge> template <typename... ShapeDims, typename DescriptorToMerge>
__host__ __device__ constexpr static auto MakeMerge1d(const Tuple<ShapeDims...>& shape, __host__ __device__ constexpr static auto MakeMerge1d(const Tuple<ShapeDims...>& shape,
const DescriptorToMerge& desc) const DescriptorToMerge& desc)
...@@ -137,18 +163,41 @@ struct Layout ...@@ -137,18 +163,41 @@ struct Layout
const auto lower_dims = make_tuple(MergeElemsSequence::Reverse()); const auto lower_dims = make_tuple(MergeElemsSequence::Reverse());
const auto upper_dims = make_tuple(Sequence<0>{}); const auto upper_dims = make_tuple(Sequence<0>{});
// Merge to 1d // Merge to 1d
return transform_tensor_descriptor( if constexpr(!remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime())
desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims); {
return transform_tensor_descriptor(
desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims);
}
else
{
// If the descriptor is known at the compilation time,
// use `make_merge_transform_v1_carry_check` because it doesn't use
// memcpy.
return transform_tensor_descriptor(
desc,
make_tuple(make_merge_transform_v1_carry_check(merge_elems)),
lower_dims,
upper_dims);
}
} }
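// Example (illustrative): for shape ((2, 2), 2) the unrolled dims 2, 2, 2 are
// reversed (column-major traversal) and merged into a single dimension, so:
//   using Desc1d = decltype(MakeMerge1d(Shape{}, UnrolledDescriptorType{}));
//   // Desc1d has a single length equal to the whole shape size, here 8.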
// Merge nested shape dims when corresponding index is also nested. /**
// Input desc shape: 2, 2, 2, 2, 2, 2 * \brief Merge nested shape dims when corresponding index is also merged.
// Example idx: 1, 1, 1, 1 * Input desc shape: 2, 2, 2, 2, 2, 2
// Example shape: 2, (2, 2), 2, (2, 2) * Example idx: 1, 1, 1, (1, 1)
// Merged shape: 2, 4, 2, 4 * Example shape: 2, (2, 2), 2, (2, 2)
* Merged shape: 2, 4, 2, 2, 2
*
* \param shape Layout shape.
* \param idxs Indexes to align descriptor.
* \param desc Descriptor to merge.
 * \return Descriptor aligned to idx.
*/
template <typename... ShapeDims, typename... IdxDims, typename DescriptorToMerge> template <typename... ShapeDims, typename... IdxDims, typename DescriptorToMerge>
__host__ __device__ constexpr static auto CreateMergedDescriptor( __host__ __device__ constexpr static auto
const Tuple<ShapeDims...>& shape, const Tuple<IdxDims...>&, DescriptorToMerge& desc) CreateMergedDescriptor(const Tuple<ShapeDims...>& shape,
[[maybe_unused]] const Tuple<IdxDims...>& idxs,
DescriptorToMerge& desc)
{ {
const auto transforms = generate_tuple( const auto transforms = generate_tuple(
[&](auto i) { [&](auto i) {
...@@ -160,7 +209,17 @@ struct Layout ...@@ -160,7 +209,17 @@ struct Layout
// If shape element is tuple and idx element is Number, then merge // If shape element is tuple and idx element is Number, then merge
// Unroll and reverse tuple to traverse column-major // Unroll and reverse tuple to traverse column-major
const auto merge_elems = TupleReverse(UnrollNestedTuple(shape.At(i))); const auto merge_elems = TupleReverse(UnrollNestedTuple(shape.At(i)));
return make_merge_transform(merge_elems); if constexpr(!remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime())
{
return make_merge_transform(merge_elems);
}
else
{
// If the descriptor is known at the compilation time,
// use `make_merge_transform_v1_carry_check` because
// it doesn't use memcpy.
return make_merge_transform_v1_carry_check(merge_elems);
}
} }
else else
{ {
...@@ -185,14 +244,23 @@ struct Layout ...@@ -185,14 +244,23 @@ struct Layout
} }
using Descriptor1dType = using Descriptor1dType =
remove_cvref_t<decltype(MakeMerge1d(Shape{}, UnnestedDescriptorType{}))>; remove_cvref_t<decltype(MakeMerge1d(Shape{}, UnrolledDescriptorType{}))>;
using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>; using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>;
public:
/**
* \brief Transform descriptor to align to passed indexes.
*
* \param shape Layout shape.
* \param idxs Indexes to align descriptor.
* \param naive_descriptor Descriptor to merge.
 * \return Descriptor aligned to idx.
*/
template <typename... ShapeDims, typename... IdxDims> template <typename... ShapeDims, typename... IdxDims>
__host__ __device__ constexpr static auto __host__ __device__ constexpr static auto
TransformDesc(const Tuple<ShapeDims...>& shape, TransformDesc(const Tuple<ShapeDims...>& shape,
const Tuple<IdxDims...>& idx, const Tuple<IdxDims...>& idxs,
const UnnestedDescriptorType& naive_descriptor) const UnrolledDescriptorType& naive_descriptor)
{ {
if constexpr(Tuple<IdxDims...>::Size() == I1) if constexpr(Tuple<IdxDims...>::Size() == I1)
{ {
...@@ -208,19 +276,18 @@ struct Layout ...@@ -208,19 +276,18 @@ struct Layout
static_assert(Tuple<ShapeDims...>::Size() == Tuple<IdxDims...>::Size(), static_assert(Tuple<ShapeDims...>::Size() == Tuple<IdxDims...>::Size(),
"Idx rank and Shape rank must be the same (except 1d)."); "Idx rank and Shape rank must be the same (except 1d).");
// Unroll while IdxDims is nested // Unroll while IdxDims is nested
const auto aligned_shape = AlignShapeToIdx(shape, idx); const auto aligned_shape = AlignShapeToIdx(shape, idxs);
// Transform correct form of shape // Transform correct form of shape
return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), naive_descriptor); return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idxs), naive_descriptor);
} }
} }
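// Example (illustrative): for shape (2, (2, 2)) and idxs (1, 1) the nested
// group is merged because its index is a plain scalar, giving descriptor
// lengths (2, 4); for idxs (1, (1, 1)) the group stays unrolled and the
// lengths are (2, 2, 2).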
using MergedNestsDescriptorType = remove_cvref_t<decltype(TransformDesc( using MergedNestsDescriptorType = remove_cvref_t<decltype(TransformDesc(
Shape{}, DefaultIdxsTupleType{}, UnnestedDescriptorType{}))>; Shape{}, DefaultIdxsTupleType{}, UnrolledDescriptorType{}))>;
public:
__host__ __device__ constexpr auto GetElementSpaceSize() const __host__ __device__ constexpr auto GetElementSpaceSize() const
{ {
return unnested_descriptor_.GetElementSpaceSize(); return unrolled_descriptor_.GetElementSpaceSize();
} }
__host__ __device__ Layout() = delete; __host__ __device__ Layout() = delete;
...@@ -232,16 +299,15 @@ struct Layout ...@@ -232,16 +299,15 @@ struct Layout
* \param unnested_descriptor Descriptor * \param unnested_descriptor Descriptor
*/ */
__host__ __device__ constexpr Layout(const Shape& shape, __host__ __device__ constexpr Layout(const Shape& shape,
const UnnestedDescriptorType& unnested_descriptor) const UnrolledDescriptorType& unnested_descriptor)
: shape_(shape) : unrolled_descriptor_(unnested_descriptor), shape_(shape)
{ {
// Construct if runtime mode // Construct if runtime mode
if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime()) if constexpr(!remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime())
{ {
unnested_descriptor_ = unnested_descriptor; descriptor_1d_ = MakeMerge1d(shape_, unrolled_descriptor_);
descriptor_1d_ = MakeMerge1d(shape_, unnested_descriptor_);
merged_nests_descriptor_ = merged_nests_descriptor_ =
TransformDesc(shape_, DefaultIdxsTupleType{}, unnested_descriptor_); TransformDesc(shape_, DefaultIdxsTupleType{}, unrolled_descriptor_);
} }
} }
...@@ -254,9 +320,9 @@ struct Layout ...@@ -254,9 +320,9 @@ struct Layout
template <typename Idxs> template <typename Idxs>
__host__ __device__ constexpr index_t operator()() const __host__ __device__ constexpr index_t operator()() const
{ {
static_assert(UnnestedDescriptorType::IsKnownAtCompileTime(), static_assert(remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime(),
"Compiletime operator used on runtime layout."); "Compiletime operator used on runtime layout.");
using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, UnnestedDescriptorType{})); using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, UnrolledDescriptorType{}));
using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{})); using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{}));
return TransformedDesc{}.CalculateOffset(UnrolledIdx{}); return TransformedDesc{}.CalculateOffset(UnrolledIdx{});
} }
...@@ -283,7 +349,7 @@ struct Layout ...@@ -283,7 +349,7 @@ struct Layout
else else
{ {
// Custom index, need to transform descriptor // Custom index, need to transform descriptor
const auto transformed_desc = TransformDesc(shape_, Idx, unnested_descriptor_); const auto transformed_desc = TransformDesc(shape_, Idx, unrolled_descriptor_);
return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx)); return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx));
} }
} }
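// Usage sketch (illustrative, assuming packed column-major strides generated
// by make_layout):
//   const auto layout = make_layout(make_tuple(4, 2));
//   const index_t offset = layout(make_tuple(3, 1)); // 3 + 1 * 4 = 7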
...@@ -350,29 +416,55 @@ struct Layout ...@@ -350,29 +416,55 @@ struct Layout
} }
/** /**
* \brief Get default descriptor (with the same size as Shape) * \brief Get descriptor with all nested dimensions merged.
* Example, shape: ((2, 2), 2)
* Descriptor lengths: (4, 2)
* *
* \return Default descriptor. * \note The size of merged descriptor is the same as Layout's shape.
*
* \return Merged nests descriptor.
*/ */
__host__ __device__ constexpr const MergedNestsDescriptorType& GetDefaultDescriptor() const __host__ __device__ constexpr const MergedNestsDescriptorType&
GetMergedNestingDescriptor() const
{ {
return merged_nests_descriptor_; return merged_nests_descriptor_;
} }
/**
 * \brief Get descriptor with all dimensions merged (1D).
* Example, shape: ((2, 2), 2)
* Descriptor lengths: (8)
*
* \return 1D descriptor.
*/
__host__ __device__ constexpr const Descriptor1dType& Get1DDescriptor() const
{
return descriptor_1d_;
}
/** /**
* \brief Get unnested descriptor (with unrolled dims) * \brief Get unnested descriptor (with unrolled dims)
* Example, shape: ((2, 2), 2)
* Descriptor lengths: (2, 2, 2)
* *
* \return Flatten descriptor. * \return Flattened descriptor.
*/ */
__host__ __device__ constexpr const UnnestedDescriptorType& GetUnnestedDescriptor() const __host__ __device__ constexpr const UnrolledDescriptorType& GetUnrolledDescriptor() const
{ {
return unnested_descriptor_; return unrolled_descriptor_;
} }
private: private:
UnnestedDescriptorType unnested_descriptor_; // All dimensions are unrolled
UnrolledDescriptorType unrolled_descriptor_;
// 1D descriptor
Descriptor1dType descriptor_1d_; Descriptor1dType descriptor_1d_;
// All nesting are merged
MergedNestsDescriptorType merged_nests_descriptor_; MergedNestsDescriptorType merged_nests_descriptor_;
// Example, shape: ((2, 2), 2)
// UnrolledDescriptorType lengths: (2, 2, 2)
// Descriptor1dType lengths: (8)
// MergedNestsDescriptorType lengths: (4, 2)
const Shape shape_; const Shape shape_;
}; };
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include "../utils/tensor_utils.hpp" #include "../utils/tensor_utils.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
/** /**
 * \brief Perform generic copy between two tensors. Tensors must have the * \brief Perform generic copy between two tensor partitions (threadwise copy).
* same size. * Tensors must have the same size.
* *
* \param src_tensor Source tensor. * \param src_tensor Source tensor.
* \param dst_tensor Destination tensor. * \param dst_tensor Destination tensor.
...@@ -37,5 +42,134 @@ __host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& ds ...@@ -37,5 +42,134 @@ __host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& ds
} }
} }
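// Usage sketch (illustrative; assumes make_layout/make_tensor from this wrapper
// and global-memory pointers of matching element space size):
//   const auto layout = make_layout(make_tuple(size_m, size_n));
//   auto src = make_tensor<MemoryTypeEnum::Global>(p_src, layout);
//   auto dst = make_tensor<MemoryTypeEnum::Global>(p_dst, layout);
//   copy(src, dst); // generic element-wise copy, sizes must match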
/**
 * \brief Perform optimized copy between two tensor partitions (threadwise copy).
* Tensors must have the same size.
*
* \tparam DimAccessOrderTuple Tuple with dimension access order.
* \tparam VectorDim Dimension for vectorized read and write.
* \tparam ScalarPerVector Number of scalar per vectorized read and write.
* \param src_tensor Source tensor.
* \param dst_tensor Destination tensor.
*/
template <typename DimAccessOrderTuple,
index_t VectorDim,
index_t ScalarPerVector,
typename SrcTensorType,
typename DstTensorType>
__device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
{
static_assert(is_detected<is_tuple, DimAccessOrderTuple>::value);
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
const auto& in_grid_desc = layout(src_tensor).GetUnrolledDescriptor();
const auto& out_grid_desc = layout(dst_tensor).GetUnrolledDescriptor();
using SrcShapeType = remove_cvref_t<decltype(shape(src_tensor))>;
constexpr index_t num_dims = SrcShapeType::Size();
constexpr auto thread_slice_lengths =
generate_sequence_v2([](auto I) { return size(SrcShapeType{}.At(I)); }, Number<num_dims>{});
constexpr auto dim_access_order = generate_sequence_v2(
[](auto I) { return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});
if constexpr(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
{
// Perform a copy between DynamicBuffers
auto transfer = ThreadwiseTensorSliceTransfer_v7<
Tuple<typename SrcTensorType::TensorElementType>,
Tuple<typename DstTensorType::TensorElementType>,
decltype(tie(in_grid_desc)),
decltype(tie(out_grid_desc)),
tensor_operation::element_wise::PassThrough,
Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
decltype(thread_slice_lengths),
decltype(dim_access_order),
VectorDim,
ScalarPerVector,
Sequence<false>,
Sequence<false>>{in_grid_desc,
make_tuple(src_tensor.GetMultiIdxOffsets()),
out_grid_desc,
make_tuple(dst_tensor.GetMultiIdxOffsets()),
tensor_operation::element_wise::PassThrough{}};
transfer.Run(tie(in_grid_desc),
tie(src_tensor.GetBuffer()),
tie(out_grid_desc),
tie(dst_tensor.GetBuffer()));
}
else if constexpr(!SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
{
// Perform copy from StaticBuffer to DynamicBuffer
const auto src_slice_origin_idxs =
generate_tuple([&](auto) { return I0; }, Number<num_dims>{});
auto transfer =
ThreadwiseTensorSliceTransfer_v1r3<typename SrcTensorType::TensorElementType,
typename DstTensorType::TensorElementType,
remove_cvref_t<decltype(in_grid_desc)>,
remove_cvref_t<decltype(out_grid_desc)>,
tensor_operation::element_wise::PassThrough,
decltype(thread_slice_lengths),
decltype(dim_access_order),
VectorDim,
ScalarPerVector,
InMemoryDataOperationEnum::Set,
I1,
true>{out_grid_desc,
dst_tensor.GetMultiIdxOffsets(),
tensor_operation::element_wise::PassThrough{}};
transfer.Run(in_grid_desc,
src_slice_origin_idxs,
src_tensor.GetBuffer(),
out_grid_desc,
dst_tensor.GetBuffer());
}
else if constexpr(SrcTensorType::IsDynamicBuffer && !DstTensorType::IsDynamicBuffer)
{
// Perform copy from DynamicBuffer to StaticBuffer
const auto src_dst_slice_origin =
generate_tuple([&](auto) { return I0; }, Number<num_dims>{});
constexpr auto src_vector_tensor_lengths = generate_sequence_v2(
[&](auto I) {
if constexpr(I == VectorDim)
{
return Number<ScalarPerVector>{};
}
else
{
return I1;
}
},
Number<num_dims>{});
auto transfer =
ThreadwiseTensorSliceTransfer_v4r1<typename SrcTensorType::TensorElementType,
typename DstTensorType::TensorElementType,
remove_cvref_t<decltype(in_grid_desc)>,
remove_cvref_t<decltype(out_grid_desc)>,
decltype(thread_slice_lengths),
decltype(dim_access_order),
decltype(src_vector_tensor_lengths),
decltype(dim_access_order)>{
src_tensor.GetMultiIdxOffsets()};
transfer.Run(in_grid_desc,
src_dst_slice_origin,
src_tensor.GetBuffer(),
out_grid_desc,
src_dst_slice_origin,
dst_tensor.GetBuffer());
}
else
{
// Perform copy between StaticBuffers
copy(src_tensor, dst_tensor);
}
}
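// Usage sketch (illustrative template arguments): vectorized per-thread copy of
// two equally sized partitions with access order (0, 1), vectorized dimension 1
// and 4 scalars per vector access:
//   copy<Tuple<Number<0>, Number<1>>, /*VectorDim=*/1, /*ScalarPerVector=*/4>(
//       src_partition, dst_partition);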
} // namespace wrapper } // namespace wrapper
} // namespace ck } // namespace ck
...@@ -10,189 +10,205 @@ ...@@ -10,189 +10,205 @@
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
namespace detail {
namespace {
/** /**
* \brief Tensor wrapper that performs static and dynamic buffer logic. * \brief Check if Tuple contains Slice object
* *
* \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR). * \return True if tuple contains Slice object.
* \tparam ElementType Element data type.
* \tparam Shape Tensor shape (layout component).
* \tparam UnnestedDescriptorType Unnested descriptor (layout component).
* \tparam NumVectors Number of vectors (only for VGPR, SGPR).
* \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR).
*/ */
template <MemoryTypeEnum BufferAddressSpace, template <typename T>
typename ElementType, __host__ __device__ constexpr bool HasSlice(T&&)
typename Shape,
typename UnnestedDescriptorType,
index_t NumVectors, // param for Register memory
index_t ScalarPerVector // param for Register memory
>
struct Tensor
{ {
private: return is_detected<is_slice, T>::value;
// Check if Tuple contains Slice object }
template <typename T> template <typename... Ts>
__host__ __device__ constexpr static bool IsSlicing(T&&) __host__ __device__ constexpr bool HasSlice(Tuple<Ts...>&&)
{ {
return is_detected<is_slice, T>::value; return (HasSlice(Ts{}) || ...);
} }
template <typename... Ts>
__host__ __device__ constexpr static bool IsSlicing(Tuple<Ts...>&&)
{
return (IsSlicing(Ts{}) || ...);
}
// Calculate new tensor shape after slice /**
template <typename... Ts, typename ShapeTmpType> * \brief Calculate new shape after slice from parent shape.
__host__ __device__ constexpr auto GetShapeFromSlicedTensor(const Tuple<Ts...>& idx, *
const ShapeTmpType& shape) const * \param idxs Tuple of indexes defining slice ranges.
{ * \param shape Shape which will be sliced.
// Pack each value in tuple to remove empty tuples after generation * \return New tensor shape.
auto new_shape = generate_tuple( */
[&](auto i) { template <typename... Ts, typename SlicedShape>
constexpr auto num_i = Number<i>{}; __host__ __device__ constexpr auto GetSlicedShape(const Tuple<Ts...>& idxs,
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value) const SlicedShape& shape)
{ {
if constexpr(!IsSlicing(tuple_element_t<i.value, Tuple<Ts...>>{})) // Pack each value in tuple to remove empty tuples after generation
{ auto new_shape = generate_tuple(
// if tuple does not have any slice then we can remove dimension [&](auto i) {
return Tuple<>{}; constexpr auto num_i = Number<i>{};
} if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
else {
{ if constexpr(!detail::HasSlice(tuple_element_t<i.value, Tuple<Ts...>>{}))
// if tuple then recurrence
return make_tuple(GetShapeFromSlicedTensor(idx.At(num_i), shape.At(num_i)));
}
}
else if constexpr(is_detected<is_slice,
tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
// calculate new dimension
const auto& dim = size(shape.At(num_i));
const auto val = idx.At(num_i).range(dim);
return make_tuple(val);
}
else
{ {
// remove dimension for just value // if tuple does not have any slice then we can remove dimension
return Tuple<>{}; return Tuple<>{};
} }
},
Number<Tuple<Ts...>::Size()>{});
// Remove empty tuples (deleted elements) and return
return UnrollNestedTuple<0, 1>(new_shape);
}
// Generate Freeze for each of nested shape
template <typename T, typename ShapeTmpType>
__host__ __device__ constexpr auto GenerateMultipleFreeze(T idx,
const ShapeTmpType& shape) const
{
const auto unrolled_shape = UnrollNestedTuple(shape);
return generate_tuple(
[&](auto i) {
// dimension offset from idx
const auto dim = unrolled_shape.At(Number<i>{});
const auto dim_idx = idx % dim;
idx /= dim;
return make_freeze_transform(dim_idx);
},
Number<decltype(unrolled_shape)::Size()>{});
}
template <typename... Ts, typename ShapeTmpType>
__host__ __device__ constexpr auto
GetTransformsFromSlicedTensor(const Tuple<Ts...>& idx, const ShapeTmpType& shape) const
{
// Pack each value in tuple to remove empty tuples after generation
auto transforms = generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
return GetTransformsFromSlicedTensor(idx.At(num_i), shape.At(num_i));
}
else if constexpr(is_detected<is_slice,
tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
const auto from = idx.At(num_i).from_;
const auto dim = shape.At(num_i);
const auto range = idx.At(num_i).range(dim);
return make_slice_transform(range, from, from + range);
}
else else
{ {
// remove dimension for just value // if tuple then recurrence
return GenerateMultipleFreeze(idx.At(num_i), shape.At(num_i)); return make_tuple(GetSlicedShape(idxs.At(num_i), shape.At(num_i)));
} }
}, }
Number<Tuple<Ts...>::Size()>{}); else if constexpr(is_detected<is_slice, tuple_element_t<i.value, Tuple<Ts...>>>::value)
// Remove empty tuples (deleted elements) and return {
return UnrollNestedTuple(transforms); // calculate new dimension
} const auto& dim = size(shape.At(num_i));
const auto val = idxs.At(num_i).range(dim);
return make_tuple(val);
}
else
{
// remove dimension for just value
return Tuple<>{};
}
},
Number<Tuple<Ts...>::Size()>{});
// Remove empty tuples (deleted elements) and return
return UnrollNestedTuple<0, 1>(new_shape);
}
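// Example (illustrative): for shape (4, (8, 2)) and idxs (slice(0, 2), 3) the
// sliced dimension keeps its range 2 and the nested group indexed by a plain
// scalar is dropped, so the new shape is (2).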
/**
* \brief Generate Freeze for each of nested shape.
*
* \param idx Tuple of start indices for slice.
 * \param shape Shape which will be frozen.
* \return Generated freeze transforms.
*/
template <typename T, typename Shape>
__host__ __device__ constexpr auto GenerateMultipleFreeze(T idx, const Shape& shape)
{
const auto unrolled_shape = UnrollNestedTuple(shape);
return generate_tuple(
[&](auto i) {
// dimension offset from idx
const auto dim = unrolled_shape.At(Number<i>{});
const auto dim_idx = idx % dim;
idx /= dim;
return make_freeze_transform(dim_idx);
},
Number<decltype(unrolled_shape)::Size()>{});
}
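// Example (illustrative): for a scalar idx 5 and nested shape (2, (2, 2)) the
// unrolled dims are 2, 2, 2 and the generated transforms are Freeze(1),
// Freeze(0), Freeze(1) (5 = 1 + 0 * 2 + 1 * 4, column-major decomposition).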
/**
* \brief Generate transforms for slice tensor.
*
* \param idx Tuple of start indices for slice.
* \param shape Shape which will be sliced.
* \return Generated transforms.
*/
template <typename... Ts, typename Shape>
__host__ __device__ constexpr auto GenerateSliceTransforms(const Tuple<Ts...>& idx,
const Shape& shape)
{
// Pack each value in tuple to remove empty tuples after generation
auto transforms = generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
return GenerateSliceTransforms(idx.At(num_i), shape.At(num_i));
}
else if constexpr(is_detected<is_slice, tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
const auto from = idx.At(num_i).from_;
const auto dim = size<num_i>(shape);
const auto range = idx.At(num_i).range(dim);
return make_slice_transform(range, from, from + range);
}
else
{
// remove dimension for just value
return GenerateMultipleFreeze(idx.At(num_i), shape.At(num_i));
}
},
Number<Tuple<Ts...>::Size()>{});
// Remove empty tuples (deleted elements) and return
return UnrollNestedTuple(transforms);
}
template <index_t i, typename LowerIndex>
__host__ __device__ constexpr auto GetSequenceVal(const ck::Freeze<LowerIndex>&)
{
// There is no output for Freeze transform // There is no output for Freeze transform
template <index_t i, typename LowerIndex> return Sequence<>{};
__host__ __device__ constexpr auto GetSequenceVal(const ck::Freeze<LowerIndex>&) const }
{
return Sequence<>{};
}
template <index_t i, typename LowLength, typename SliceBegin, typename SliceEnd> template <index_t i, typename LowLength, typename SliceBegin, typename SliceEnd>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto GetSequenceVal(const ck::Slice<LowLength, SliceBegin, SliceEnd>&)
GetSequenceVal(const ck::Slice<LowLength, SliceBegin, SliceEnd>&) const {
{ return Sequence<i>{};
return Sequence<i>{}; }
}
template <index_t i> template <index_t i>
__host__ __device__ constexpr auto GenerateUpperDims(const Tuple<>&) const __host__ __device__ constexpr auto GenerateUpperDims(const Tuple<>&)
{
return Tuple<>{};
}
template <index_t i, typename... Transforms>
__host__ __device__ constexpr auto GenerateUpperDims(const Tuple<Transforms...>& transforms)
{
constexpr auto num_transforms = Tuple<Transforms...>::Size();
// Deduce Sequence element for specific transform
const auto current_elem = GetSequenceVal<i>(transforms.At(Number<0>{}));
if constexpr(is_same_v<decltype(current_elem), const Sequence<>>)
{ {
return Tuple<>{}; const auto next_tuple = GenerateUpperDims<i>(TupleSlice<1, num_transforms>(transforms));
return concat_tuple(make_tuple(current_elem), next_tuple);
} }
else
template <index_t i, typename... Transforms>
__host__ __device__ constexpr auto
GenerateUpperDims(const Tuple<Transforms...>& transforms) const
{ {
constexpr auto num_transforms = Tuple<Transforms...>::Size(); // Increase i if current_elem is Slice transform
// Deduce Sequence element for specific transform const auto next_tuple = GenerateUpperDims<i + 1>(TupleSlice<1, num_transforms>(transforms));
const auto currect_elem = GetSequenceVal<i>(transforms.At(Number<0>{})); return concat_tuple(make_tuple(current_elem), next_tuple);
if constexpr(is_same_v<decltype(currect_elem), const Sequence<>>)
{
const auto next_tuple = GenerateUpperDims<i>(TupleSlice<1, num_transforms>(transforms));
return concat_tuple(make_tuple(currect_elem), next_tuple);
}
else
{
// Increase i if current_elem is Slice transform
const auto next_tuple =
GenerateUpperDims<i + 1>(TupleSlice<1, num_transforms>(transforms));
return concat_tuple(make_tuple(currect_elem), next_tuple);
}
} }
}
template <typename... Ts, typename ShapeTmpType, typename FlattenDescriptor> template <typename... Ts, typename Shape, typename FlattenDescriptor>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto GenerateSlicedDescriptor(const Tuple<Ts...>& idx,
GetDescriptorFromSlicedTensor(const Tuple<Ts...>& idx, const Shape& shape,
const ShapeTmpType& shape, const FlattenDescriptor& flatten_desc)
const FlattenDescriptor& flatten_desc) const {
{ constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size();
constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size();
const auto transforms = GetTransformsFromSlicedTensor(idx, shape); const auto transforms = GenerateSliceTransforms(idx, shape);
using TransformsTupleType = decltype(transforms); using TransformsTupleType = decltype(transforms);
const auto lower_dims = const auto lower_dims =
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<old_shape_dims>{}); generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<old_shape_dims>{});
const auto upper_dims = decltype(GenerateUpperDims<0>(TransformsTupleType{})){}; const auto upper_dims = decltype(GenerateUpperDims<0>(TransformsTupleType{})){};
return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
} }
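// Example (illustrative): for shape (4, 8) and idxs (slice(0, 2), 3) the
// transforms are a Slice (range 2, from 0) on dim 0 and a Freeze(3) on dim 1;
// the Freeze produces no upper dimension, so the sliced descriptor is 1D with
// length 2.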
} // namespace
} // namespace detail
/**
* \brief Tensor wrapper that performs static and dynamic buffer logic.
* The tensor is based on a descriptor stored in the Layout. Additionally,
* tensor can be sliced or shifted using multi-index offset.
*
* \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR).
* \tparam ElementType Element data type.
* \tparam Shape Tensor shape (layout component).
* \tparam UnrolledDescriptorType Flatten descriptor (layout component).
*/
template <MemoryTypeEnum BufferAddressSpace,
typename ElementType,
typename Shape,
typename UnrolledDescriptorType>
struct Tensor
{
public: public:
using ElementSpaceSize = decltype(Layout<Shape, UnnestedDescriptorType>{ using ElementSpaceSize = decltype(Layout<Shape, UnrolledDescriptorType>{
Shape{}, UnnestedDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer Shape{}, UnrolledDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer
using TensorElementType = ElementType; // DataType using TensorElementType = ElementType; // DataType
static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace; static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace;
...@@ -200,134 +216,207 @@ struct Tensor ...@@ -200,134 +216,207 @@ struct Tensor
BufferAddressSpace == MemoryTypeEnum ::Vgpr); BufferAddressSpace == MemoryTypeEnum ::Vgpr);
__host__ __device__ Tensor() = delete; __host__ __device__ Tensor() = delete;
__host__ __device__ Tensor(ElementType* pointer, __host__ __device__ constexpr Tensor(ElementType* pointer,
const Layout<Shape, UnnestedDescriptorType>& layout) const Layout<Shape, UnrolledDescriptorType>& layout)
: layout_(layout), : layout_(layout),
buffer_(make_dynamic_buffer<BufferAddressSpace>(pointer, layout.GetElementSpaceSize())) buffer_(make_dynamic_buffer<BufferAddressSpace>(pointer, layout.GetElementSpaceSize())),
multi_idx_offset_(make_zero_multi_index<Shape::Size()>()),
base_offset_(0)
{ {
static_assert(IsDynamicBuffer, "Wrong BufferAddressSpace for register.");
} }
__host__ __device__ Tensor(const Layout<Shape, UnnestedDescriptorType>& layout) __host__ __device__ constexpr Tensor(const Layout<Shape, UnrolledDescriptorType>& layout)
: layout_(layout) : layout_(layout),
multi_idx_offset_(make_zero_multi_index<Shape::Size()>()),
base_offset_(0)
{ {
static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register."); static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register.");
} }
__host__ __device__ constexpr const Layout<Shape, UnnestedDescriptorType>& GetLayout() const __host__ __device__ constexpr const Layout<Shape, UnrolledDescriptorType>& GetLayout() const
{ {
return layout_; return layout_;
} }
// Getter for new sliced tensor /**
template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false> * \brief Get the new sliced tensor.
__host__ __device__ auto operator[](const Tuple<Ts...>& idx) const *
* \param idx Tuple of indices: slice(from,to) or scalar.
* \return Sliced tensor.
*/
template <typename... Ts, enable_if_t<detail::HasSlice(Tuple<Ts...>{}), bool> = false>
__host__ __device__ auto operator[](const Tuple<Ts...>& idx)
{ {
static_assert(IsDynamicBuffer, "Register slice is not supported"); static_assert(IsDynamicBuffer, "Register slice is not supported");
const auto& shape = layout_.GetShape(); const auto& shape = layout_.GetShape();
auto new_shape = GetShapeFromSlicedTensor(idx, shape); auto new_shape = detail::GetSlicedShape(idx, shape);
const auto& flatten_desc = layout_.GetUnnestedDescriptor(); const auto& flatten_desc = layout_.GetUnrolledDescriptor();
auto new_desc = GetDescriptorFromSlicedTensor(idx, shape, flatten_desc); auto new_desc = detail::GenerateSlicedDescriptor(idx, shape, flatten_desc);
const auto new_layout = const auto new_layout =
Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc); Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc);
// Update embed offset
base_offset_ -= new_layout(make_tuple(Number<0>{}));
return make_tensor<BufferAddressSpace>(buffer_.p_data_, new_layout); return make_tensor<BufferAddressSpace>(buffer_.p_data_, new_layout);
} }
template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false> template <typename... Ts, enable_if_t<detail::HasSlice(Tuple<Ts...>{}), bool> = false>
__host__ __device__ auto operator()(const Tuple<Ts...>& idx) const __host__ __device__ auto operator()(const Tuple<Ts...>& idx)
{ {
return this->operator[](idx); return this->operator[](idx);
} }
template <typename... Idxs, enable_if_t<IsSlicing(Tuple<Idxs...>{}), bool> = false> template <typename... Idxs, enable_if_t<detail::HasSlice(Tuple<Idxs...>{}), bool> = false>
__host__ __device__ auto operator()(Idxs... idxs) const __host__ __device__ auto operator()(Idxs... idxs)
{ {
return this->operator[](make_tuple(idxs...)); return this->operator[](make_tuple(idxs...));
} }
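// Usage sketch (illustrative): slicing an 8x8 tensor; scalar indices freeze a
// dimension, slice(from, to) keeps a range of it:
//   auto sub = tensor(slice(0, 2), slice(4, 8)); // 2x4 view
//   auto row = tensor(3, slice(0, 8));           // 1D view, dim 0 frozen at 3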
// Getter for the const value /**
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false> * \brief Getter of the tensor's const value reference.
*
* \param idx Tuple of indices.
* \return Requested value.
*/
template <typename... Ts, enable_if_t<!detail::HasSlice(Tuple<Ts...>{}), bool> = false>
__host__ __device__ const ElementType& operator[](const Tuple<Ts...>& idx) const __host__ __device__ const ElementType& operator[](const Tuple<Ts...>& idx) const
{ {
if constexpr(IsDynamicBuffer) if constexpr(IsDynamicBuffer)
{ {
const index_t offset = layout_(idx); const index_t offset = layout_(idx) + base_offset_;
return buffer_[offset]; return buffer_[offset];
} }
else else
{ {
constexpr index_t offset = Layout<Shape, UnnestedDescriptorType>{ constexpr index_t index_offset = Layout<Shape, UnrolledDescriptorType>{
Shape{}, Shape{},
UnnestedDescriptorType{}}.template operator()<Tuple<Ts...>>(); UnrolledDescriptorType{}}.template operator()<Tuple<Ts...>>();
return buffer_[Number<offset>{}]; // Calculate and apply base offset in compile-time
constexpr index_t base_offset = Layout<Shape, UnrolledDescriptorType>{
Shape{},
UnrolledDescriptorType{}}.template operator()<MultiIndex<Shape::Size()>>();
return buffer_[Number<index_offset + base_offset>{}];
} }
} }
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false> template <typename... Ts, enable_if_t<!detail::HasSlice(Tuple<Ts...>{}), bool> = false>
__host__ __device__ const ElementType& operator()(const Tuple<Ts...>& idx) const __host__ __device__ const ElementType& operator()(const Tuple<Ts...>& idx) const
{ {
return this->operator[](idx); return this->operator[](idx);
} }
template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false> template <typename... Idxs, enable_if_t<!detail::HasSlice(Tuple<Idxs...>{}), bool> = false>
__host__ __device__ const ElementType& operator()(Idxs... idxs) const __host__ __device__ const ElementType& operator()(Idxs... idxs) const
{ {
return this->operator[](make_tuple(idxs...)); return this->operator[](make_tuple(idxs...));
} }
// Getter for the value reference /**
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false> * \brief Getter of tensor value reference.
*
* \param idx Tuple of indices.
* \return Requested value.
*/
template <typename... Ts, enable_if_t<!detail::HasSlice(Tuple<Ts...>{}), bool> = false>
__host__ __device__ ElementType& operator[](const Tuple<Ts...>& idx) __host__ __device__ ElementType& operator[](const Tuple<Ts...>& idx)
{ {
if constexpr(IsDynamicBuffer) if constexpr(IsDynamicBuffer)
{ {
const index_t offset = layout_(idx); const index_t offset = layout_(idx) + base_offset_;
return buffer_(offset); return buffer_(offset);
} }
else else
{ {
constexpr index_t offset = Layout<Shape, UnnestedDescriptorType>{ constexpr index_t index_offset = Layout<Shape, UnrolledDescriptorType>{
Shape{},
UnrolledDescriptorType{}}.template operator()<Tuple<Ts...>>();
// Apply embed offset (calculate in compiletime)
constexpr index_t base_offset = Layout<Shape, UnrolledDescriptorType>{
Shape{}, Shape{},
UnnestedDescriptorType{}}.template operator()<Tuple<Ts...>>(); UnrolledDescriptorType{}}.template operator()<MultiIndex<Shape::Size()>>();
return buffer_(Number<offset>{}); return buffer_(Number<index_offset + base_offset>{});
} }
} }
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false> template <typename... Ts, enable_if_t<!detail::HasSlice(Tuple<Ts...>{}), bool> = false>
__host__ __device__ ElementType& operator()(const Tuple<Ts...>& idx) __host__ __device__ ElementType& operator()(const Tuple<Ts...>& idx)
{ {
return this->operator[](idx); return this->operator[](idx);
} }
template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false> template <typename... Idxs, enable_if_t<!detail::HasSlice(Tuple<Idxs...>{}), bool> = false>
__host__ __device__ ElementType& operator()(Idxs... idxs) __host__ __device__ ElementType& operator()(Idxs... idxs)
{ {
return this->operator[](make_tuple(idxs...)); return this->operator[](make_tuple(idxs...));
} }
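// Usage sketch (illustrative): element access through the multi-index
// accessors; for register (static buffer) tensors the offset is resolved at
// compile time.
//   dst_tensor(make_tuple(1, 2)) = src_tensor(1, 2);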
__host__ __device__ constexpr auto GetDefaultDescriptor() /**
* \brief Get descriptor with all nested dimensions merged.
*
* \return Merged nests descriptor.
*/
__host__ __device__ constexpr auto GetMergedNestingDescriptor()
{ {
return layout_.GetDefaultDescriptor(); return layout_.GetMergedNestingDescriptor();
} }
/**
* \brief Get pointer to the data.
*
* \return Pointer.
*/
__host__ __device__ ElementType* GetPointer() const { return buffer_.p_data_; } __host__ __device__ ElementType* GetPointer() const { return buffer_.p_data_; }
__host__ __device__ constexpr auto& GetBuffer() { return buffer_; }
__host__ __device__ constexpr auto& GetBuffer() const { return buffer_; }
/**
* \brief Get multi index offset to the data.
*
* \return Multi index offset.
*/
__host__ __device__ constexpr auto& GetMultiIdxOffsets() const { return multi_idx_offset_; }
/**
* \brief Apply multi index offset on the tensor.
*
* \param multi_idx_offset Multi index offset.
*/
template <typename MultiIdxOffsets>
__host__ __device__ constexpr void SetMultiIdxOffset(const MultiIdxOffsets multi_idx_offset)
{
multi_idx_offset_ = multi_idx_offset;
base_offset_ += layout_(multi_idx_offset);
}
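// Usage sketch (illustrative, assuming ck's make_multi_index helper): shifting
// the tensor view to a partition origin; the linear base_offset_ is updated
// once so subsequent element accesses only add the local index.
//   tensor.SetMultiIdxOffset(make_multi_index(block_m * tile_m, block_n * tile_n));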
private: private:
using DynamicBufferType = DynamicBuffer<BufferAddressSpace, using DynamicBufferType = DynamicBuffer<BufferAddressSpace,
ElementType, ElementType,
ElementSpaceSize, ElementSpaceSize,
true /*InvalidElementUseNumericalZeroValue*/>; true /*InvalidElementUseNumericalZeroValue*/>;
using StaticBufferType = using StaticBufferType = StaticBuffer<BufferAddressSpace,
StaticBufferTupleOfVector<BufferAddressSpace, ElementType,
ElementType, size(Shape{}),
NumVectors, true /*InvalidElementUseNumericalZeroValue*/>;
ScalarPerVector,
true /*InvalidElementUseNumericalZeroValue*/>;
// If register use static buffer, else use dynamic buffer // If register use static buffer, else use dynamic buffer
using Buffer = std::conditional_t<IsDynamicBuffer, DynamicBufferType, StaticBufferType>; using Buffer = std::conditional_t<IsDynamicBuffer, DynamicBufferType, StaticBufferType>;
const Layout<Shape, UnnestedDescriptorType> layout_; const Layout<Shape, UnrolledDescriptorType> layout_;
Buffer buffer_; Buffer buffer_;
// We use multi_idx_offset_ to enable the creation of a descriptor in
// compile time for partitions or tiles if tile shape and thread layout
// is known at compile time (We can use the same descriptor for each
// thread). Additionally, the copy between the static and dynamic buffer
// requires a descriptor known at compile time, so we can shift data using
// such multi_idx_offset_.
MultiIndex<Shape::Size()> multi_idx_offset_;
// Base offset and multi index offset are corresponding to exactly the
// same element in tensor ( and in physical memory ). Multi index offset
// is multi dimensional index. However base offset is calculated using
// tensor descriptor (thus all it's transforms) and is linear (1D).
// We store base_offset_ to avoid multiple recalculations.
index_t base_offset_;
}; };
} // namespace wrapper } // namespace wrapper
......
...@@ -22,14 +22,19 @@ namespace wrapper { ...@@ -22,14 +22,19 @@ namespace wrapper {
// Disable from doxygen docs generation // Disable from doxygen docs generation
/// @cond /// @cond
// forward declaration // forward declaration
template <typename Shape, typename UnnestedDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
struct Layout; struct Layout;
template <typename T> template <typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple()); using is_tuple = decltype(std::declval<T&>().IsTuple());
namespace { namespace {
// Generate packed (column-major) strides if not passed /**
* \brief Generate packed (column-major) strides if not passed
*
* \param shape Tensor shape.
* \return Generated column-major strides.
*/
template <typename... Ts> template <typename... Ts>
__host__ __device__ constexpr static auto __host__ __device__ constexpr static auto
GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape) GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
...@@ -50,9 +55,16 @@ GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape) ...@@ -50,9 +55,16 @@ GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
Number<decltype(unrolled_shape)::Size()>{}); Number<decltype(unrolled_shape)::Size()>{});
} }
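// Example (illustrative): for shape (2, 3, 4) the generated packed column-major
// strides are (1, 2, 6) - each stride is the product of the preceding lengths.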
/**
* \brief Create naive tensor descriptor from nested shape.
*
* \param shape Tensor shape.
* \param strides Tensor strides.
* \return Unrolled descriptor
*/
template <typename LayoutShape, typename LayoutStrides> template <typename LayoutShape, typename LayoutStrides>
__host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shape, __host__ __device__ constexpr auto MakeUnrolledDescriptor(const LayoutShape& shape,
const LayoutStrides& strides) const LayoutStrides& strides)
{ {
const auto unrolled_shape = UnrollNestedTuple(shape); const auto unrolled_shape = UnrollNestedTuple(shape);
if constexpr(is_same_v<LayoutStrides, Tuple<>>) if constexpr(is_same_v<LayoutStrides, Tuple<>>)
...@@ -86,8 +98,8 @@ __host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shap ...@@ -86,8 +98,8 @@ __host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shap
template <typename Shape, typename Strides> template <typename Shape, typename Strides>
__host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides) __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides)
{ {
using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Strides{})); using UnrolledDescriptorType = decltype(MakeUnrolledDescriptor(Shape{}, Strides{}));
return Layout<Shape, UnnestedDescriptorType>(shape, MakeFlattenDescriptor(shape, strides)); return Layout<Shape, UnrolledDescriptorType>(shape, MakeUnrolledDescriptor(shape, strides));
} }
/** /**
...@@ -100,15 +112,19 @@ __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides ...@@ -100,15 +112,19 @@ __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides
template <typename Shape> template <typename Shape>
__host__ __device__ constexpr auto make_layout(const Shape& shape) __host__ __device__ constexpr auto make_layout(const Shape& shape)
{ {
using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Tuple<>{})); using UnrolledDescriptorType = decltype(MakeUnrolledDescriptor(Shape{}, Tuple<>{}));
return Layout<Shape, UnnestedDescriptorType>(shape, MakeFlattenDescriptor(shape, Tuple<>{})); return Layout<Shape, UnrolledDescriptorType>(shape, MakeUnrolledDescriptor(shape, Tuple<>{}));
} }
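// Usage sketch (illustrative): nested shapes are allowed; strides may be given
// explicitly or generated as packed column-major when omitted:
//   const auto explicit_strides = make_layout(make_tuple(4, 8), make_tuple(8, 1));
//   const auto packed           = make_layout(make_tuple(make_tuple(2, 2), 8));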
// Layout helpers // Layout helpers
// get // get
// Get dim (could be returned from get with empty Idxs)
/** /**
* \private * \private
* \brief Get dim.
*
* \param dim Dimension.
* \return Returned the same dimension.
*/ */
template <typename T> template <typename T>
__host__ __device__ T constexpr get(const T& dim) __host__ __device__ T constexpr get(const T& dim)
...@@ -178,7 +194,7 @@ __host__ __device__ constexpr auto get(const Layout<Shape, FlattenDesc>& layout) ...@@ -178,7 +194,7 @@ __host__ __device__ constexpr auto get(const Layout<Shape, FlattenDesc>& layout)
}, },
Number<old_shape_dims>{}); Number<old_shape_dims>{});
const auto& flatten_desc = layout.GetUnnestedDescriptor(); const auto& flatten_desc = layout.GetUnrolledDescriptor();
auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
return Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc); return Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc);
} }
...@@ -197,9 +213,12 @@ __host__ __device__ constexpr auto get(const T& elem) ...@@ -197,9 +213,12 @@ __host__ __device__ constexpr auto get(const T& elem)
} }
// size // size
// Get dim size (could be returned from get function)
/** /**
* \private * \private
* \brief Get size.
*
* \param dim Size.
* \return Returned the same size.
*/ */
template <typename T> template <typename T>
__host__ __device__ T constexpr size(const T& dim) __host__ __device__ T constexpr size(const T& dim)
...@@ -214,8 +233,8 @@ __host__ __device__ T constexpr size(const T& dim) ...@@ -214,8 +233,8 @@ __host__ __device__ T constexpr size(const T& dim)
* \param layout Layout to get Shape of. * \param layout Layout to get Shape of.
 * \return Requested length. * \return Requested length.
*/ */
template <index_t idx, typename Shape, typename UnnestedDescriptorType> template <index_t idx, typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto size(const Layout<Shape, UnnestedDescriptorType>& layout) __host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType>& layout)
{ {
return layout.template GetLength<idx>(); return layout.template GetLength<idx>();
} }
...@@ -240,8 +259,8 @@ __host__ __device__ constexpr auto size(const Tuple<ShapeDims...>& shape) ...@@ -240,8 +259,8 @@ __host__ __device__ constexpr auto size(const Tuple<ShapeDims...>& shape)
* \param layout Layout to calculate shape size. * \param layout Layout to calculate shape size.
 * \return Requested size. * \return Requested size.
*/ */
template <typename Shape, typename UnnestedDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto size(const Layout<Shape, UnnestedDescriptorType>& layout) __host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType>& layout)
{ {
return layout.GetLengths(); return layout.GetLengths();
} }
...@@ -280,9 +299,9 @@ __host__ __device__ constexpr auto size(const T& elem) ...@@ -280,9 +299,9 @@ __host__ __device__ constexpr auto size(const T& elem)
* \param layout Layout to calculate rank. * \param layout Layout to calculate rank.
 * \return Requested rank. * \return Requested rank.
*/ */
template <typename Shape, typename UnnestedDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
rank([[maybe_unused]] const Layout<Shape, UnnestedDescriptorType>& layout) rank([[maybe_unused]] const Layout<Shape, UnrolledDescriptorType>& layout)
{ {
return Shape::Size(); return Shape::Size();
} }
...@@ -302,17 +321,25 @@ __host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...>& t ...@@ -302,17 +321,25 @@ __host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...>& t
/** /**
* \private * \private
* \brief Rank for scalar
*
* \param dim Dimension scalar.
* \return Returned 1.
*/ */
template <index_t IDim> template <index_t IDim>
__host__ __device__ constexpr index_t rank(const Number<IDim>&) __host__ __device__ constexpr index_t rank([[maybe_unused]] const Number<IDim>& dim)
{ {
return 1; return 1;
} }
/** /**
* \private * \private
* \brief Rank for scalar
*
* \param dim Dimension scalar.
* \return Returned 1.
*/ */
__host__ __device__ constexpr index_t rank(const index_t&) { return 1; } __host__ __device__ constexpr index_t rank([[maybe_unused]] const index_t& dim) { return 1; }
/** /**
* \brief Hierarchical rank. * \brief Hierarchical rank.
...@@ -334,8 +361,8 @@ __host__ __device__ constexpr auto rank(const T& elem) ...@@ -334,8 +361,8 @@ __host__ __device__ constexpr auto rank(const T& elem)
* \param layout Layout to calculate depth. * \param layout Layout to calculate depth.
 * \return Requested depth. * \return Requested depth.
*/ */
template <typename Shape, typename UnnestedDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto depth(const Layout<Shape, UnnestedDescriptorType>& layout) __host__ __device__ constexpr auto depth(const Layout<Shape, UnrolledDescriptorType>& layout)
{ {
const auto& shape = layout.GetShape(); const auto& shape = layout.GetShape();
return TupleDepth(shape); return TupleDepth(shape);
...@@ -355,17 +382,25 @@ __host__ __device__ constexpr auto depth(const Tuple<Dims...>& tuple) ...@@ -355,17 +382,25 @@ __host__ __device__ constexpr auto depth(const Tuple<Dims...>& tuple)
/** /**
* \private * \private
* \brief Depth for scalar
*
* \param dim Scalar.
* \return Returned 0.
*/ */
template <index_t IDim> template <index_t IDim>
__host__ __device__ constexpr index_t depth(const Number<IDim>&) __host__ __device__ constexpr index_t depth([[maybe_unused]] const Number<IDim>& dim)
{ {
return 0; return 0;
} }
/** /**
* \private * \private
* \brief Depth for scalar
*
* \param dim Scalar.
* \return Returned 0.
*/ */
__host__ __device__ constexpr index_t depth(const index_t&) { return 0; } __host__ __device__ constexpr index_t depth([[maybe_unused]] const index_t& dim) { return 0; }
/** /**
* \brief Hierarchical depth. * \brief Hierarchical depth.
......
...@@ -6,12 +6,22 @@ ...@@ -6,12 +6,22 @@
#include "tensor_utils.hpp" #include "tensor_utils.hpp"
#include "layout_utils.hpp" #include "layout_utils.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
namespace { namespace {
// Calculate shape for partition based on number of threads per each dim and
// previous shape /**
* \brief Calculate shape for partition based on the number of threads per
* dimension and the previous shape.
*
* \param shape Base tensor shape.
* \param thread_lengths Tuple of thread lengths.
* \return Partition shape.
*/
template <typename... Ts, typename... Ls> template <typename... Ts, typename... Ls>
__host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts...>& shape, __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts...>& shape,
const Tuple<Ls...>& thread_lengths) const Tuple<Ls...>& thread_lengths)
...@@ -20,265 +30,165 @@ __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts.. ...@@ -20,265 +30,165 @@ __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts..
return generate_tuple( return generate_tuple(
[&](auto i) { [&](auto i) {
constexpr auto num_i = Number<i>{}; constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value) const auto slice_len = size<num_i>(shape) / thread_lengths.At(num_i);
{ return slice_len;
// if tuple then recurrence
return CalculateLocalPartitionShape(shape.At(num_i), thread_lengths.At(num_i));
}
else
{
const auto slice_len = shape.At(num_i) / thread_lengths.At(num_i);
return slice_len;
}
},
Number<Tuple<Ts...>::Size()>{});
}
// Calculate shape for partition based on number of threads per each dim,
// previous strides and steps
template <typename... Ts, typename... Ls, typename... Steps, typename FlattenDescType>
__host__ __device__ constexpr auto
CalculateLocalPartitionDescriptor(const Tuple<Ts...>& shape,
const Tuple<Ls...>& thread_lengths,
const Tuple<Steps...>& steps,
const FlattenDescType& flatten_desc)
{
static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "Wrong thread_lengths shape.");
const auto unrolled_thread_lengths = UnrollNestedTuple(thread_lengths);
const auto unrolled_shape = UnrollNestedTuple(shape);
constexpr auto dims = decltype(unrolled_thread_lengths)::Size();
using UnrolledStepsType = decltype(UnrollNestedTuple(steps));
using I1 = Number<1>;
const auto transforms = generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
{
// By default raked partition
const auto partition_stride = unrolled_thread_lengths.At(num_i);
return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
make_tuple(partition_stride));
}
else if constexpr(!is_same_v<tuple_element_t<i.value, UnrolledStepsType>, index_t>)
{
// Compiletime partition
if constexpr(is_same_v<tuple_element_t<i.value, UnrolledStepsType>, I1>)
{
// raked
const auto partition_stride = unrolled_thread_lengths.At(num_i);
return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
make_tuple(partition_stride));
}
else
{
// packed
return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
make_tuple(I1{}));
}
}
else
{
// Runtime partition
if(steps.At(num_i) == 1)
{
// raked
const auto partition_stride = unrolled_thread_lengths.At(num_i);
return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
make_tuple(partition_stride));
}
else
{
// packed
return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
make_tuple(I1{}));
}
}
},
Number<dims>{});
const auto lower_dims =
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<dims>{});
const auto upper_dims =
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<dims>{});
return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
}
template <typename... Ls, typename... Steps>
__host__ __device__ constexpr auto CalculateLayoutOffsetIdxImpl(const Tuple<Ls...>& thread_lengths,
const Tuple<Steps...>& steps,
index_t& thread_id)
{
return generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ls...>>>::value)
{
// if tuple then recurrence
if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
{
return CalculateLayoutOffsetIdxImpl(
thread_lengths.At(num_i), Tuple<>{}, thread_id);
}
else
{
return CalculateLayoutOffsetIdxImpl(
thread_lengths.At(num_i), steps.At(num_i), thread_id);
}
}
else
{
// Update thread_id after each dim
const auto dim_thread_id = thread_id % thread_lengths.At(num_i);
thread_id /= thread_lengths.At(num_i);
if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
{
return dim_thread_id;
}
else
{
// Apply step
return steps.At(num_i) * dim_thread_id;
}
}
}, },
Number<Tuple<Ls...>::Size()>{}); Number<Tuple<Ls...>::Size()>{});
} }
// Convert integer thread_idx to tuple index with steps applied /**
template <typename... Ls, typename... Steps> * \brief Calculate total number of blocks.
__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Ls...>& thread_lengths, *
const Tuple<Steps...>& steps, * \param shape Base tensor shape.
const index_t thread_id) * \param tile_shape Tile shape.
* \return Tuple with blocks number.
*/
template <typename... Ts, typename... Ls>
__host__ __device__ constexpr auto CalculateGridSize(const Tuple<Ts...>& shape,
const Tuple<Ls...>& tile_shape)
{ {
// Create tmp thread_id copy for CalculateLayoutOffsetIdxImpl updates static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "shape and tile_shape must have the same rank.");
index_t thread_id_copy = thread_id; return generate_tuple([&](auto i) { return size<i>(shape) / size<i>(tile_shape); },
return CalculateLayoutOffsetIdxImpl(thread_lengths, steps, thread_id_copy); Number<Tuple<Ls...>::Size()>{});
} }
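For reference, an equivalent standalone C++ sketch of the grid-size calculation above: the number of blocks in each dimension is the tensor extent divided by the tile extent (the values are hypothetical and assumed to divide evenly).

#include <array>
#include <cstdio>

// Illustration only: blocks per dimension = tensor extent / tile extent,
// matching the element-wise division performed by CalculateGridSize.
int main()
{
    const std::array<int, 2> shape      = {512, 256}; // hypothetical tensor extents
    const std::array<int, 2> tile_shape = {128, 64};  // hypothetical tile extents

    std::array<int, 2> grid_size{};
    for(std::size_t i = 0; i < shape.size(); ++i)
    {
        grid_size[i] = shape[i] / tile_shape[i];
    }

    std::printf("grid size: (%d, %d) -> %d blocks total\n",
                grid_size[0], grid_size[1], grid_size[0] * grid_size[1]); // (4, 4) -> 16
    return 0;
}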
// Apply steps to index represented as tuple /**
template <typename... Steps, typename... Idxs> * \brief Calculate scaled offset for new partition/tile.
__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Steps...>& steps, *
const Tuple<Idxs...>& block_idxs) * \param thread_idxs Thread multi index (one index per dimension).
* \param partition_lengths_seq Sequence with the partition lengths.
* \param old_offset_idxs Multi index offset from the base tensor to shift values by.
* \return Offset multi index for the new partition/tile.
*/
template <typename ThreadIdxs, typename PartitionLengthsSeq, typename OldOffsetIdxs>
__host__ __device__ constexpr auto
CalculateOffsetMultiIdxs(const ThreadIdxs& thread_idxs,
const PartitionLengthsSeq& partition_lengths_seq,
const OldOffsetIdxs& old_offset_idxs)
{ {
return generate_tuple( return thread_idxs * partition_lengths_seq + old_offset_idxs;
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Idxs...>>>::value)
{
// if tuple then recurrence
if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
{
return CalculateLayoutOffsetIdx(Tuple<>{}, block_idxs.At(num_i));
}
else
{
return CalculateLayoutOffsetIdx(steps.At(num_i), block_idxs.At(num_i));
}
}
else
{
if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
{
return block_idxs.At(num_i);
}
else
{
// apply step
return steps.At(num_i) * block_idxs.At(num_i);
}
}
},
Number<Tuple<Idxs...>::Size()>{});
} }
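The scaled-offset formula above (thread_idxs * partition_lengths_seq + old_offset_idxs) is an element-wise multiply-add; below is a small standalone sketch with made-up indices and lengths.

#include <array>
#include <cstdio>

// Illustration only: new_offset[d] = thread_idx[d] * partition_length[d] + old_offset[d],
// i.e. the element-wise multiply-add done by CalculateOffsetMultiIdxs.
int main()
{
    const std::array<int, 2> thread_idxs       = {3, 5};   // hypothetical per-dimension thread ids
    const std::array<int, 2> partition_lengths = {64, 2};  // hypothetical partition extents
    const std::array<int, 2> old_offsets       = {0, 128}; // hypothetical prior offsets

    std::array<int, 2> new_offsets{};
    for(std::size_t d = 0; d < new_offsets.size(); ++d)
    {
        new_offsets[d] = thread_idxs[d] * partition_lengths[d] + old_offsets[d];
    }

    std::printf("offset multi index: (%d, %d)\n", new_offsets[0], new_offsets[1]); // (192, 138)
    return 0;
}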
// User passes only shape per block to the make_local_tile function. This function calculates
// block layout based on the shape.
template <typename... Ts, typename... BlockDims>
__host__ __device__ constexpr auto CalculateBlockLengths(const Tuple<Ts...>& shape,
const Tuple<BlockDims...>& tile_shape)
{
return generate_tuple(
[&](auto i) {
constexpr auto num_i = Number<i>{};
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
{
// if tuple then recurrence
return CalculateBlockLengths(shape.At(num_i), tile_shape.At(num_i));
}
else
{
return shape.At(num_i) / tile_shape.At(num_i);
}
},
Number<Tuple<Ts...>::Size()>{});
}
} // namespace } // namespace
/** /**
* \brief Create local partition for thread. * \brief Create local partition for a thread (currently only packed
* partitions are supported).
* *
* \param tensor Tensor for partition. * \param tensor Tensor for partition.
* \param thread_lengths Layout of threads. * \param thread_lengths Layout of threads (must not be nested).
* \param thread_id Thread index represented as integer. * \param thread_id Thread index represented as integer.
* \param steps Thread step (default=1, raked partition)
* \return Partition tensor. * \return Partition tensor.
*/ */
template <typename TensorType, typename ThreadLengthsTuple, typename StepsTuple = Tuple<>> template <typename TensorType, typename ThreadLengthsTuple>
__host__ __device__ constexpr auto make_local_partition(const TensorType& tensor, __host__ __device__ constexpr auto
const ThreadLengthsTuple& thread_lengths, make_local_partition(TensorType& tensor,
const index_t thread_id, [[maybe_unused]] const ThreadLengthsTuple& thread_lengths,
const StepsTuple steps = StepsTuple{}) const index_t thread_id)
{ {
// Create shape, strides and layout for new partition tensor static_assert(!IsNestedTuple(ThreadLengthsTuple{}));
const auto partition_shape = CalculateLocalPartitionShape(shape(tensor), thread_lengths); // Calculate new partition shape
// Create new descriptor and layout const auto& tensor_shape = shape(tensor);
const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor(); constexpr auto partition_shape =
auto partition_desc = CalculateLocalPartitionShape(decltype(tensor_shape){}, ThreadLengthsTuple{});
CalculateLocalPartitionDescriptor(shape(tensor), thread_lengths, steps, flatten_desc); // Create Thread Cluster Descriptor
const auto partition_layout = Layout<decltype(partition_shape), decltype(partition_desc)>( constexpr auto partition_lengths_seq = generate_sequence_v2(
partition_shape, partition_desc); [&](auto I) { return size<I>(partition_shape); }, Number<ThreadLengthsTuple::Size()>{});
// Calculate offset for new partition tensor constexpr auto thread_lengths_seq =
const auto offset_idx = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id); generate_sequence_v2([&](auto I) { return size<I>(ThreadLengthsTuple{}); },
const auto partition_offset = layout(tensor)(offset_idx); Number<ThreadLengthsTuple::Size()>{});
return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + partition_offset, constexpr auto thread_cluster_desc_ = make_cluster_descriptor(thread_lengths_seq);
partition_layout); // Calculate thread idxs and offsets
const auto thread_idxs = thread_cluster_desc_.CalculateBottomIndex(make_multi_index(thread_id));
const auto offset_multi_idxs =
CalculateOffsetMultiIdxs(thread_idxs, partition_lengths_seq, tensor.GetMultiIdxOffsets());
// Create new layout and tensor
auto& flatten_desc = layout(tensor).GetUnrolledDescriptor();
const auto partition_layout =
Layout<remove_reference_t<decltype(partition_shape)>, decltype(flatten_desc)>(
partition_shape, flatten_desc);
auto partition_tensor =
make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), partition_layout);
// Apply offsets
partition_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
return partition_tensor;
} }
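A hedged device-side usage sketch of the new make_local_partition signature. It assumes the CK wrapper headers are available; the kernel name, pointer, extents and strides are hypothetical, and make_layout/make_tensor are used in the same form they appear in this wrapper.

// Hedged sketch (not from this commit): partition a 256x128 global tensor across
// a 4x64 thread grid so each thread owns a 64x2 piece.
__global__ void partition_example(float* p_global)
{
    using namespace ck;
    using namespace ck::wrapper;

    // Compile-time, row-major 256x128 layout (hypothetical extents and strides).
    const auto global_layout = make_layout(make_tuple(Number<256>{}, Number<128>{}),
                                           make_tuple(Number<128>{}, Number<1>{}));
    auto tensor = make_tensor<MemoryTypeEnum::Global>(p_global, global_layout);

    // Non-nested thread lengths and a flat thread id, as required by make_local_partition.
    const auto thread_lengths = make_tuple(Number<4>{}, Number<64>{});
    auto partition            = make_local_partition(tensor, thread_lengths, threadIdx.x);
    (void)partition;
}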
/** /**
* \brief Create local tile for thread block. * \brief Create local tile for a thread block (currently only packed
* tiles are supported).
*
* \note For now, use a 2d tile_shape to achieve the best performance.
*
* *
* \param tensor Tensor for partition. * \param tensor Tensor for partition.
* \param tile_shape Shapes of requested tile. * \param tile_shape Shapes of requested tile.
* \param block_idx Block index represented as tuple. * \param block_id Block index represented as integer.
* \param steps Block step (default=1, raked partition)
* \return Tile tensor. * \return Tile tensor.
*/ */
template <typename TensorType, template <typename TensorType, typename BlockShapeTuple>
typename BlockShapeTuple, __host__ __device__ constexpr auto
typename BlockIdxTuple, make_local_tile(const TensorType& tensor, const BlockShapeTuple& tile_shape, const index_t block_id)
typename StepsTuple = Tuple<>>
__host__ __device__ constexpr auto make_local_tile(const TensorType& tensor,
const BlockShapeTuple& tile_shape,
const BlockIdxTuple& block_idx,
const StepsTuple steps = StepsTuple{})
{ {
// Create block lengths, strides and layout for new tile tensor static_assert(!IsNestedTuple(BlockShapeTuple{}));
const auto block_lengths = CalculateBlockLengths(shape(tensor), tile_shape);
// Create new descriptor and layout constexpr auto I0 = Number<0>{};
const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor(); constexpr auto I1 = Number<1>{};
auto tile_desc = constexpr auto I2 = Number<2>{};
CalculateLocalPartitionDescriptor(tile_shape, block_lengths, steps, flatten_desc);
const auto tile_layout = Layout<remove_reference_t<decltype(tile_shape)>, decltype(tile_desc)>( auto& aligned_desc = layout(tensor).GetMergedNestingDescriptor();
tile_shape, tile_desc);
// Calculate offset for new partition tensor if constexpr(BlockShapeTuple::Size() == I2)
const auto offset_idx = CalculateLayoutOffsetIdx(steps, block_idx); {
const auto tile_offset = layout(tensor)(offset_idx); // Optimized version for 2d tile shape [MxK]
return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + tile_offset, const auto block_2_tile_map =
tile_layout); BlockToCTileMap_M00_N0_M01Adapt<BlockShapeTuple{}.At(I0),
BlockShapeTuple{}.At(I1),
remove_cvref_t<decltype(aligned_desc)>>(aligned_desc);
const auto block_work_idx =
block_2_tile_map.CalculateBottomIndex(make_multi_index(block_id));
const index_t m_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I0] * size<0>(tile_shape));
const index_t k_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * size<1>(tile_shape));
const auto offset_multi_idxs =
make_tuple(m_block_data_idx_on_grid, k_block_data_idx_on_grid);
// Create new layout and tensor
const auto tile_layout =
Layout<remove_reference_t<decltype(tile_shape)>, decltype(aligned_desc)>(tile_shape,
aligned_desc);
auto tile_tensor =
make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), tile_layout);
// Apply offsets
tile_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
return tile_tensor;
}
else
{
// Calculate offsets
// Sequence with data to process per block
constexpr auto tile_shape_seq =
generate_sequence_v2([](auto I) { return size(BlockShapeTuple{}.At(I)); },
Number<BlockShapeTuple::Size()>{});
// Tuple with number of blocks
const auto block_lengths = CalculateGridSize(shape(tensor), tile_shape);
constexpr auto block_cluster_desc_ = make_cluster_descriptor(block_lengths);
const auto block_idxs =
block_cluster_desc_.CalculateBottomIndex(make_multi_index(block_id));
const auto offset_multi_idxs =
CalculateOffsetMultiIdxs(block_idxs, tile_shape_seq, tensor.GetMultiIdxOffsets());
// Create new layout and tensor
const auto tile_layout =
Layout<remove_reference_t<decltype(tile_shape)>, decltype(aligned_desc)>(tile_shape,
aligned_desc);
auto tile_tensor =
make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), tile_layout);
// Apply offsets
tile_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
return tile_tensor;
}
} }
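To make the generic (non-2d) branch concrete, the sketch below decomposes a flat block id into per-dimension block indices and scales them by the tile extents to get the tile origin. The row-major decomposition is an illustrative choice; the actual cluster descriptor and BlockToCTileMap_M00_N0_M01Adapt mapping may order or swizzle blocks differently.

#include <array>
#include <cstdio>

// Illustration only: flat block id -> per-dimension block indices (row-major here)
// -> element offsets, mirroring the offset math of the generic make_local_tile branch.
int main()
{
    const std::array<int, 2> shape      = {512, 256}; // hypothetical tensor extents
    const std::array<int, 2> tile_shape = {128, 64};  // hypothetical tile extents
    const std::array<int, 2> grid       = {shape[0] / tile_shape[0],
                                           shape[1] / tile_shape[1]}; // (4, 4) blocks

    const int block_id = 6; // hypothetical flat block id

    // Row-major decomposition of the flat id (the real mapping may differ).
    const std::array<int, 2> block_idxs = {block_id / grid[1], block_id % grid[1]}; // (1, 2)

    // Tile origin in elements: block index scaled by tile extent per dimension.
    const std::array<int, 2> offsets = {block_idxs[0] * tile_shape[0],
                                        block_idxs[1] * tile_shape[1]}; // (128, 128)

    std::printf("block %d -> tile origin (%d, %d)\n", block_id, offsets[0], offsets[1]);
    return 0;
}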
} // namespace wrapper } // namespace wrapper
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "ck/utility/tuple_helper.hpp" #include "ck/utility/tuple_helper.hpp"
#include "ck/utility/dynamic_buffer.hpp" #include "ck/utility/dynamic_buffer.hpp"
#include "ck/utility/amd_address_space.hpp" #include "ck/utility/amd_address_space.hpp"
#include "ck/utility/multi_index.hpp"
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
...@@ -27,16 +28,12 @@ using MemoryTypeEnum = AddressSpaceEnum; ...@@ -27,16 +28,12 @@ using MemoryTypeEnum = AddressSpaceEnum;
// Disable from doxygen docs generation // Disable from doxygen docs generation
/// @cond /// @cond
// forward declarations // forward declarations
template <typename Shape, typename UnnestedDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
struct Layout; struct Layout;
template <MemoryTypeEnum BufferAddressSpace, template <MemoryTypeEnum BufferAddressSpace,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType, typename UnrolledDescriptorType>
index_t NumVectors, // params for Register memory
index_t ScalarPerVector // param for Register memory
>
struct Tensor; struct Tensor;
template <typename FromType, typename ToType> template <typename FromType, typename ToType>
...@@ -45,13 +42,22 @@ struct Slice ...@@ -45,13 +42,22 @@ struct Slice
__host__ __device__ constexpr Slice() : from_(), to_() {} __host__ __device__ constexpr Slice() : from_(), to_() {}
__host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {} __host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {}
/**
* \brief Calculate slice range.
*
* \param dim Dimension size.
* \return Slice range.
*/
template <typename T> template <typename T>
__host__ __device__ constexpr auto range(const T& dim) const __host__ __device__ constexpr auto range(const T& dim) const
{ {
if constexpr(is_same_v<FromType, index_t> || is_same_v<ToType, index_t> || if constexpr(is_same_v<FromType, index_t> || is_same_v<ToType, index_t> ||
is_same_v<T, index_t>) is_same_v<T, index_t>)
{ {
assert(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_) && "Invalid range"); if(!(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_)))
{
throw std::runtime_error("Invalid range");
}
if(to_ < 0) if(to_ < 0)
{ {
return dim - from_ + to_ + 1; return dim - from_ + to_ + 1;
...@@ -101,40 +107,27 @@ using is_tuple = decltype(std::declval<T&>().IsTuple()); ...@@ -101,40 +107,27 @@ using is_tuple = decltype(std::declval<T&>().IsTuple());
template <MemoryTypeEnum MemoryType, template <MemoryTypeEnum MemoryType,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType> typename UnrolledDescriptorType>
constexpr auto make_tensor(ElementType* pointer, constexpr auto make_tensor(ElementType* pointer,
const Layout<Shape, UnnestedDescriptorType>& layout) const Layout<Shape, UnrolledDescriptorType>& layout)
{ {
return Tensor<MemoryType, return Tensor<MemoryType, ElementType, Shape, UnrolledDescriptorType>(pointer, layout);
ElementType,
Shape,
UnnestedDescriptorType,
0 /*NumVectors*/,
0 /*ScalarPerVector*/>(pointer, layout);
} }
/** /**
* \brief Make SGPR or VGPR tensor function. * \brief Make SGPR or VGPR tensor function.
* *
* \tparam MemoryType Type of memory. * \tparam MemoryType Type of memory.
* \tparam NumVectors Number of vectors.
* \tparam ScalarPerVector Scalars per vector.
* \tparam ElementType Memory data type. * \tparam ElementType Memory data type.
* \return Constructed tensor. * \return Constructed tensor.
*/ */
template <MemoryTypeEnum MemoryType, template <MemoryTypeEnum MemoryType,
index_t NumVectors, typename ElementType,
index_t ScalarPerVector, typename Shape,
typename ElementType> typename UnrolledDescriptorType>
constexpr auto make_register_tensor() constexpr auto make_register_tensor(const Layout<Shape, UnrolledDescriptorType>& layout)
{ {
const auto layout = make_layout(make_tuple(Number<NumVectors>{}), make_tuple(Number<1>{})); return Tensor<MemoryType, ElementType, Shape, UnrolledDescriptorType>(layout);
return Tensor<MemoryType,
ElementType,
Tuple<Number<NumVectors>>,
std::remove_const_t<remove_reference_t<decltype(layout.GetUnnestedDescriptor())>>,
NumVectors,
ScalarPerVector>(layout);
} }
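A hedged sketch of the updated make_register_tensor call, which now takes a layout instead of the NumVectors/ScalarPerVector template parameters. The 8-element VGPR layout, the element type, and the function name below are illustrative assumptions.

// Hedged sketch (illustrative values): build a small compile-time layout and
// request a VGPR-backed tensor for it with the new layout-based signature.
__device__ void register_tensor_example()
{
    using namespace ck;
    using namespace ck::wrapper;

    // Hypothetical 8-element, stride-1 register layout, built as in the previous
    // version of this function.
    const auto reg_layout = make_layout(make_tuple(Number<8>{}), make_tuple(Number<1>{}));
    auto reg_tensor       = make_register_tensor<MemoryTypeEnum::Vgpr, float>(reg_layout);
    (void)reg_tensor;
}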
/** /**
...@@ -146,15 +139,9 @@ constexpr auto make_register_tensor() ...@@ -146,15 +139,9 @@ constexpr auto make_register_tensor()
template <MemoryTypeEnum BufferAddressSpace, template <MemoryTypeEnum BufferAddressSpace,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType, typename UnrolledDescriptorType>
index_t NumVectors, __host__ __device__ constexpr const auto&
index_t ScalarPerVector> layout(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
__host__ __device__ constexpr const auto& layout(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
{ {
return tensor.GetLayout(); return tensor.GetLayout();
} }
...@@ -170,15 +157,9 @@ template <index_t... Idxs, ...@@ -170,15 +157,9 @@ template <index_t... Idxs,
MemoryTypeEnum BufferAddressSpace, MemoryTypeEnum BufferAddressSpace,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType, typename UnrolledDescriptorType>
index_t NumVectors, __host__ __device__ constexpr auto
index_t ScalarPerVector> size(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
__host__ __device__ constexpr auto size(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
{ {
return size<Idxs...>(tensor.GetLayout()); return size<Idxs...>(tensor.GetLayout());
} }
...@@ -194,15 +175,9 @@ template <index_t... Idxs, ...@@ -194,15 +175,9 @@ template <index_t... Idxs,
MemoryTypeEnum BufferAddressSpace, MemoryTypeEnum BufferAddressSpace,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType, typename UnrolledDescriptorType>
index_t NumVectors, __host__ __device__ constexpr auto
index_t ScalarPerVector> rank(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
__host__ __device__ constexpr auto rank(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
{ {
return rank<Idxs...>(tensor.GetLayout()); return rank<Idxs...>(tensor.GetLayout());
} }
...@@ -218,15 +193,9 @@ template <index_t... Idxs, ...@@ -218,15 +193,9 @@ template <index_t... Idxs,
MemoryTypeEnum BufferAddressSpace, MemoryTypeEnum BufferAddressSpace,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType, typename UnrolledDescriptorType>
index_t NumVectors, __host__ __device__ constexpr auto
index_t ScalarPerVector> depth(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
__host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
{ {
return depth<Idxs...>(tensor.GetLayout()); return depth<Idxs...>(tensor.GetLayout());
} }
...@@ -240,15 +209,9 @@ __host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace, ...@@ -240,15 +209,9 @@ __host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
template <MemoryTypeEnum BufferAddressSpace, template <MemoryTypeEnum BufferAddressSpace,
typename ElementType, typename ElementType,
typename Shape, typename Shape,
typename UnnestedDescriptorType, typename UnrolledDescriptorType>
index_t NumVectors, __host__ __device__ constexpr const auto&
index_t ScalarPerVector> shape(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
__host__ __device__ constexpr const auto& shape(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
{ {
return shape(tensor.GetLayout()); return shape(tensor.GetLayout());
} }
......
...@@ -265,6 +265,8 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -265,6 +265,8 @@ struct ReferenceColumnToImage : public device::BaseOperator
return 0; return 0;
} }
throw std::runtime_error("Col2Img: number of dimensions should be between 1 and 3.");
return 1;
} }
float Run(const device::BaseArgument* p_arg, float Run(const device::BaseArgument* p_arg,
......
...@@ -313,6 +313,9 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -313,6 +313,9 @@ struct ReferenceConvBwdData : public device::BaseOperator
return 0; return 0;
} }
throw std::runtime_error(
"Conv_bwd_data: number of dimensions must be between 1 and 3.");
return 1;
} }
float Run(const device::BaseArgument* p_arg, float Run(const device::BaseArgument* p_arg,
......
...@@ -265,6 +265,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator ...@@ -265,6 +265,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
return 0; return 0;
} }
throw std::runtime_error("Conv_bwd: number of dimensions must be between 1 and 3.");
return 1;
} }
float Run(const device::BaseArgument* p_arg, float Run(const device::BaseArgument* p_arg,
......
...@@ -360,6 +360,8 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -360,6 +360,8 @@ struct ReferenceConvFwd : public device::BaseOperator
return 0; return 0;
} }
throw std::runtime_error("Conv_fwd: number of dimensions must be between 1 and 3.");
return 1;
} }
float Run(const device::BaseArgument* p_arg, float Run(const device::BaseArgument* p_arg,
......
...@@ -63,12 +63,11 @@ struct ReferenceGemm : public device::BaseOperator ...@@ -63,12 +63,11 @@ struct ReferenceGemm : public device::BaseOperator
const int K = arg.a_m_k_.mDesc.GetLengths()[1]; const int K = arg.a_m_k_.mDesc.GetLengths()[1];
AccDataType v_acc = 0; AccDataType v_acc = 0;
ComputeTypeA v_a = 0;
ComputeTypeB v_b = 0;
for(int k = 0; k < K; ++k) for(int k = 0; k < K; ++k)
{ {
ComputeTypeA v_a;
ComputeTypeB v_b;
// use PassThrough instead of ConvertBF16RTN for reference calculation // use PassThrough instead of ConvertBF16RTN for reference calculation
if constexpr(is_same_v<AElementwiseOperation, if constexpr(is_same_v<AElementwiseOperation,
ck::tensor_operation::element_wise::ConvertBF16RTN>) ck::tensor_operation::element_wise::ConvertBF16RTN>)
...@@ -94,7 +93,7 @@ struct ReferenceGemm : public device::BaseOperator ...@@ -94,7 +93,7 @@ struct ReferenceGemm : public device::BaseOperator
ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b); ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
} }
CDataType v_c; CDataType v_c = 0;
arg.c_element_op_(v_c, v_acc); arg.c_element_op_(v_c, v_acc);
......
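For context on the hunk above, here is a standalone sketch of a naive reference GEMM inner loop in which the per-k operands live inside the loop and the output value is zero-initialized before the elementwise C operation; the float types and the pass-through assignment are stand-ins for the real template parameters and c_element_op_.

#include <cstdio>
#include <vector>

// Illustration only: naive reference GEMM with per-iteration operand variables
// and a zero-initialized output value, mirroring the intent of this hunk.
int main()
{
    const int M = 2, N = 2, K = 3;
    const std::vector<float> a = {1, 2, 3, 4, 5, 6}; // M x K, row-major
    const std::vector<float> b = {1, 0, 0, 1, 1, 1}; // K x N, row-major
    std::vector<float> c(M * N, 0.0f);

    for(int m = 0; m < M; ++m)
    {
        for(int n = 0; n < N; ++n)
        {
            float v_acc = 0.0f;
            for(int k = 0; k < K; ++k)
            {
                const float v_a = a[m * K + k]; // operands declared inside the k-loop
                const float v_b = b[k * N + n];
                v_acc += v_a * v_b;
            }
            float v_c = 0.0f; // zero-initialized before the elementwise op
            v_c = v_acc;      // stand-in for c_element_op_(v_c, v_acc)
            c[m * N + n] = v_c;
        }
    }

    std::printf("c = [%g %g; %g %g]\n", c[0], c[1], c[2], c[3]);
    return 0;
}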
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/numeric.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -229,6 +230,8 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -229,6 +230,8 @@ struct ReferenceImageToColumn : public device::BaseOperator
return 0; return 0;
} }
throw std::runtime_error("Img2Col: number of dimensions should be between 1 and 3.");
return 1;
} }
float Run(const device::BaseArgument* p_arg, float Run(const device::BaseArgument* p_arg,
......
...@@ -106,9 +106,8 @@ struct DeviceOperationInstanceFactory< ...@@ -106,9 +106,8 @@ struct DeviceOperationInstanceFactory<
return op_ptrs; return op_ptrs;
} }
}; };
#endif
} // namespace instance } // namespace instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
...@@ -114,9 +114,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmSt ...@@ -114,9 +114,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmSt
return op_ptrs; return op_ptrs;
} }
}; };
#endif
} // namespace instance } // namespace instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
#ifdef CK_ENABLE_FP32
// FP32
void add_device_groupnorm_bwd_gamma_beta_f32_instances(
std::vector<std::unique_ptr<DeviceNormalizationBwdGammaBeta<F32, F32, F32, F32, F32, 5, 3>>>&);
#endif
template <typename DYDataType,
typename XDataType,
typename MeanInvStdDataType,
typename DGammaDataType,
typename DBetaDataType>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
5,
3>>
{
using DeviceOp = DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
5,
3>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
#ifdef CK_ENABLE_FP32
if constexpr(is_same_v<DYDataType, F32> && is_same_v<XDataType, F32> &&
is_same_v<MeanInvStdDataType, F32> && is_same_v<DGammaDataType, F32> &&
is_same_v<DBetaDataType, F32>)
{
add_device_groupnorm_bwd_gamma_beta_f32_instances(op_ptrs);
}
#endif
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
#ifdef CK_ENABLE_FP16
// FP16
void add_device_layernorm2d_bwd_gamma_beta_f16_instances(
std::vector<std::unique_ptr<DeviceNormalizationBwdGammaBeta<F16, F16, F16, F16, F16, 2, 1>>>&);
#endif
#ifdef CK_ENABLE_FP32
// FP32
void add_device_layernorm2d_bwd_gamma_beta_f32_instances(
std::vector<std::unique_ptr<DeviceNormalizationBwdGammaBeta<F32, F32, F32, F32, F32, 2, 1>>>&);
#endif
template <typename DYDataType,
typename XDataType,
typename MeanInvStdDataType,
typename DGammaDataType,
typename DBetaDataType,
index_t Rank,
index_t NumReduceDim>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
Rank,
NumReduceDim>>
{
using DeviceOp = DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
Rank,
NumReduceDim>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
#ifdef CK_ENABLE_FP16
if constexpr(is_same_v<DYDataType, F16> && is_same_v<XDataType, F16> &&
is_same_v<MeanInvStdDataType, F16> && is_same_v<DGammaDataType, F16> &&
is_same_v<DBetaDataType, F16>)
{
if constexpr(Rank == 2 && NumReduceDim == 1)
{
add_device_layernorm2d_bwd_gamma_beta_f16_instances(op_ptrs);
}
}
#endif
#ifdef CK_ENABLE_FP32
if constexpr(is_same_v<DYDataType, F32> && is_same_v<XDataType, F32> &&
is_same_v<MeanInvStdDataType, F32> && is_same_v<DGammaDataType, F32> &&
is_same_v<DBetaDataType, F32>)
{
if constexpr(Rank == 2 && NumReduceDim == 1)
{
add_device_layernorm2d_bwd_gamma_beta_f32_instances(op_ptrs);
}
}
#endif
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
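A hedged host-side sketch of how such a factory is typically queried. The include path is an assumption, float stands in for the F32 alias, and the rank/reduce-dim pair matches the layernorm2d bwd gamma/beta instances declared above.

// Hedged sketch: enumerate the registered layernorm2d bwd gamma/beta instances
// for FP32 via the factory above.
#include <cstdio>

#include "ck/library/tensor_operation_instance/gpu/normalization_bwd_gamma_beta.hpp" // assumed header path

int main()
{
    using namespace ck::tensor_operation::device;

    // Rank 2, one reduced dimension, FP32 everywhere (matches the declarations above).
    using DeviceOp = DeviceNormalizationBwdGammaBeta<float, float, float, float, float, 2, 1>;

    const auto op_ptrs = instance::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
    std::printf("found %zu instances\n", op_ptrs.size());

    for(const auto& op : op_ptrs)
    {
        std::printf("  %s\n", op->GetTypeString().c_str()); // BaseOperator helper
    }
    return 0;
}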
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
...@@ -57,7 +58,8 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances = std::tuple< ...@@ -57,7 +58,8 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances = std::tuple<
DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffleV2< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 2, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
// pipeline v1, 2 waves // pipeline v1, 2 waves
, ,
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
...@@ -52,7 +53,8 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances = std::tuple< ...@@ -52,7 +53,8 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances = std::tuple<
DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1> DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffleV2< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 2, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
// pipeline v1, 2 waves // pipeline v1, 2 waves
, ,
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
...@@ -57,7 +58,8 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple< ...@@ -57,7 +58,8 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple<
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffleV2< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 2, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
// pipeline v1, 2 waves // pipeline v1, 2 waves
, ,
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
...@@ -52,7 +53,8 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple< ...@@ -52,7 +53,8 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple<
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1> DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>,
DeviceGemm_Xdl_CShuffleV2< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 2, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
// pipeline v1, 2 waves // pipeline v1, 2 waves
, ,
......