gaoqiong / composable_kernel — Commits
Commit 52423948, authored Sep 27, 2019 by Jehandad Khan

    Merge branch 'master' into jd_redux

Parents: b97af4ec, 98a2cfcc

Changes: 67 files in total. Showing 20 changed files with 4563 additions and 1673 deletions (+4563, -1673).
composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp   +75   -17
composable_kernel/include/tensor_description/dimension.hpp   +30   -0
composable_kernel/include/tensor_description/multi_index_transform.hpp   +421   -0
composable_kernel/include/tensor_description/tensor_coordinate.hpp   +135   -221
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp   +346   -0
composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp   +16   -0
composable_kernel/include/tensor_description/tensor_descriptor.hpp   +502   -0
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp   +374   -0
composable_kernel/include/tensor_description/tensor_view.hpp   +100   -0
composable_kernel/include/tensor_description/tensor_visit.hpp   +124   -0
composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp   +9   -9
composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp   +6   -6
composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp   +6   -6
composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp   +6   -139
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp   +108   -487
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp   +692   -0
composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp   +8   -8
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp   +371   -780
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp   +902   -0
composable_kernel/include/utility/amd_intrinsic.hpp   +332   -0
composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp

@@ -6,7 +6,7 @@
 namespace ck {

 template <class Lengths>
-__host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
+__host__ __device__ constexpr auto calculate_tensor_strides_packed_old(Lengths)
 {
     return reverse_inclusive_scan_sequence(
         Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
@@ -14,12 +14,12 @@ __host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
 }

 template <class Lengths, index_t Align>
-__host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number<Align>)
+__host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, Number<Align>)
 {
     constexpr index_t L_back_align =
         Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);

-    return calculate_tensor_strides_packed(
+    return calculate_tensor_strides_packed_old(
         Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
 }
@@ -96,13 +96,12 @@ struct ConstantTensorDescriptor
     __host__ __device__ static constexpr auto GetElementSize()
     {
-        return Number<accumulate_on_sequence(
-            Lengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
+        return Number<reduce_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
     }

     __host__ __device__ static constexpr auto GetElementSpace()
     {
-        constexpr index_t element_space_unaligned = accumulate_on_sequence(
+        constexpr index_t element_space_unaligned = reduce_on_sequence(
             (GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});

         return Number<element_space_unaligned>{};
@@ -155,7 +154,7 @@ struct ConstantTensorDescriptor
         constexpr auto multi_id = Sequence<Is...>{};

-        return Number<accumulate_on_sequence(
+        return Number<reduce_on_sequence(
             multi_id * GetStrides(), math::plus<index_t>{}, Number<0>{})>{};
     }
@@ -178,7 +177,7 @@ struct ConstantTensorDescriptor
     {
         constexpr auto IDim = IDim_{};
         constexpr index_t stride = PackedStrides::Get(IDim);
-        multi_id.Set(IDim, id / stride);
+        multi_id(IDim) = id / stride;
         id -= multi_id[IDim] * stride;
     }
 };
@@ -187,12 +186,12 @@ struct ConstantTensorDescriptor
     {
         Array<index_t, nDim> multi_id;

-        using PackedStrides = decltype(calculate_tensor_strides_packed(GetLengths()));
+        using PackedStrides = decltype(calculate_tensor_strides_packed_old(GetLengths()));

         // calculate index in each of the dimensions in the order of their dimension
         static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id));

-        multi_id.Set(Number<nDim - 1>{}, id / PackedStrides::Get(Number<nDim - 1>{}));
+        multi_id(Number<nDim - 1>{}) = id / PackedStrides::Get(Number<nDim - 1>{});

         return multi_id;
     }
@@ -204,7 +203,7 @@ struct ConstantTensorDescriptor
     }

     // This function doesn't do carry check on the highest dimension for positive stepping (or
-    // borrow check on the lowest dimension for negative stepping) , for performance reason. It is
+    // borrow check on the highest dimension for negative stepping) , for performance reason. It is
     // the user's responsibility to make sure the result "new_mutli_id" is not out-of-bound on the
     // highest dimension for positive stepping (or on the lowest dimension for negative stepping)
     template <bool PositiveDirection>
@@ -304,14 +303,73 @@ struct ConstantTensorDescriptor
             GetStrides().PushBack(leaf_tensor::GetStrides()))>{};
     }

+    template <index_t IDimVector, index_t DataPerVector>
+    struct lambda_IsVectorizationAllowed
+    {
+        bool& is_allowed;
+
+        __host__ __device__ constexpr lambda_IsVectorizationAllowed(bool& is_allowed_)
+            : is_allowed(is_allowed_)
+        {
+        }
+
+        template <index_t IDim_>
+        __host__ __device__ constexpr void operator()(Number<IDim_>) const
+        {
+            constexpr auto IDim = Number<IDim_>{};
+
+            if(IDimVector != IDim && Strides::Get(IDim) % DataPerVector != 0)
+            {
+                is_allowed = false;
+            }
+        }
+    };
+
+    template <index_t IDimVector, index_t DataPerVector>
+    __host__ __device__ static constexpr bool IsVectorizationAllowed(Number<IDimVector>,
+                                                                     Number<DataPerVector>)
+    {
+        bool is_allowed = (Strides{}[IDimVector] == 1 || DataPerVector == 1) &&
+                          Lengths{}[IDimVector] % DataPerVector == 0;
+
+        static_for<0, nDim, 1>{}(
+            lambda_IsVectorizationAllowed<IDimVector, DataPerVector>{is_allowed});
+
+        return is_allowed;
+    }
+
+    template <index_t IDim, index_t DataPerVector>
+    __host__ __device__ static constexpr auto Vectorize(Number<IDim>, Number<DataPerVector>)
+    {
+        constexpr auto idim            = Number<IDim>{};
+        constexpr auto data_per_vector = Number<DataPerVector>{};
+
+        static_assert(IsVectorizationAllowed(idim, data_per_vector), "wrong!");
+
+        using vectorized_lengths =
+            decltype(Lengths::Modify(Number<IDim>{}, Number<Lengths{}[IDim] / DataPerVector>{}));
+        using vectorized_strides =
+            decltype((Strides{} / Number<DataPerVector>{}).Modify(Number<IDim>{}, Number<1>{}));
+
+        return ConstantTensorDescriptor<vectorized_lengths, vectorized_strides>{};
+    }
+
     template <index_t IDim, index_t SliceLen>
     __host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
     {
-        using slice_lengths = decltype(Lengths{}.Modify(Number<IDim>{}, Number<SliceLen>{}));
+        using slice_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLen>{}));

         return ConstantTensorDescriptor<slice_lengths, Strides>{};
     }

+    template <index_t... Is>
+    __host__ __device__ static constexpr auto Slice(Sequence<Is...> slice_lengths)
+    {
+        static_assert(slice_lengths.GetSize() == nDim, "wrong!");
+
+        return ConstantTensorDescriptor<decltype(slice_lengths), Strides>{};
+    }
+
     template <index_t IDim, index_t SliceLength, index_t SliceStride>
     __host__ __device__ static constexpr auto
     StridedSlice(Number<IDim>, Number<SliceLength>, Number<SliceStride>)
@@ -330,7 +388,7 @@ struct ConstantTensorDescriptor
         constexpr auto fold_intervals = Sequence<FoldIntervals...>{};

         constexpr index_t fold_intervals_product =
-            accumulate_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});
+            reduce_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});

         constexpr auto unfold_length = GetLength(Number<IDim>{});
         constexpr auto unfold_stride = GetStride(Number<IDim>{});
@@ -388,7 +446,7 @@ struct ConstantTensorDescriptor
         static_assert(Type::Extract(middle).AreDimensionsContinuous(), "wrong! not unfoldable");

         // unfolded length, stride
-        constexpr index_t unfold_length = accumulate_on_sequence(
+        constexpr index_t unfold_length = reduce_on_sequence(
             GetLengths().Extract(middle), math::multiplies<index_t>{}, Number<1>{});

         constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
@@ -409,7 +467,7 @@ struct ConstantTensorDescriptor
     __host__ __device__ static constexpr auto Pack()
     {
-        using packed_strides = decltype(calculate_tensor_strides_packed(Lengths{}));
+        using packed_strides = decltype(calculate_tensor_strides_packed_old(Lengths{}));

         return ConstantTensorDescriptor<Lengths, packed_strides>{};
     }
@@ -431,7 +489,7 @@ struct ConstantTensorDescriptor
 template <class Lengths>
 __host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths)
 {
-    using Strides = decltype(calculate_tensor_strides_packed(Lengths{}));
+    using Strides = decltype(calculate_tensor_strides_packed_old(Lengths{}));

     return ConstantTensorDescriptor<Lengths, Strides>{};
 }
@@ -444,7 +502,7 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
 template <class Lengths, index_t Align>
 __host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>)
 {
-    using Strides = decltype(calculate_tensor_strides_aligned(Lengths{}, Number<Align>{}));
+    using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number<Align>{}));

     return ConstantTensorDescriptor<Lengths, Strides>{};
 }
...
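For orientation, here is a small plain-C++ sketch (host-only, using std::array in place of the compile-time Sequence types; the function name is illustrative, not the library API) of the rule the new IsVectorizationAllowed / Vectorize members encode: the vectorized dimension must have unit stride (or DataPerVector == 1) and a length divisible by DataPerVector, and every other stride must be divisible by DataPerVector.

    #include <array>
    #include <cassert>
    #include <cstddef>

    // Mirrors the check performed by IsVectorizationAllowed above.
    template <std::size_t N>
    bool is_vectorization_allowed(const std::array<int, N>& lengths,
                                  const std::array<int, N>& strides,
                                  std::size_t idim,
                                  int data_per_vector)
    {
        bool ok = (strides[idim] == 1 || data_per_vector == 1) &&
                  lengths[idim] % data_per_vector == 0;

        for(std::size_t d = 0; d < N; ++d)
            if(d != idim && strides[d] % data_per_vector != 0)
                ok = false;

        return ok;
    }

    int main()
    {
        // A packed 8x4 row-major descriptor: lengths {8,4}, strides {4,1}.
        std::array<int, 2> lengths{8, 4}, strides{4, 1};

        assert(is_vectorization_allowed(lengths, strides, 1, 4));  // innermost dim, unit stride
        assert(!is_vectorization_allowed(lengths, strides, 0, 4)); // stride 4, not unit

        // Vectorize(idim=1, 4) would then divide length 4 by 4, divide all strides by 4,
        // and set the vectorized stride to 1: lengths {8,1}, strides {1,1}.
        return 0;
    }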
composable_kernel/include/tensor_description/dimension.hpp (new file)

#ifndef CK_DIMENSION_HPP
#define CK_DIMENSION_HPP

#include "common_header.hpp"

namespace ck {

template <index_t Length>
struct Dimension
{
    __host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
};

template <index_t Length, index_t Stride>
struct NativeDimension
{
    __host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }

    __host__ __device__ static constexpr auto GetStride() { return Number<Stride>{}; }

    __host__ __device__ static constexpr index_t CalculateOffset(index_t i) { return i * Stride; }

    __host__ __device__ static constexpr index_t CalculateOffsetDiff(index_t i_diff)
    {
        return i_diff * Stride;
    }
};

} // namespace ck
#endif
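A NativeDimension pairs a compile-time length with a compile-time stride and maps an index i to the element offset i * Stride (and an index difference to i_diff * Stride). A minimal host-side check of that rule, as a plain-C++ sketch:

    #include <cassert>

    // Same arithmetic as NativeDimension::CalculateOffset, written as a free function.
    constexpr int calculate_offset(int i, int stride) { return i * stride; }

    int main()
    {
        static_assert(calculate_offset(3, 8) == 24, "offset is i * Stride");
        assert(calculate_offset(5, 1) == 5); // unit stride: offset equals the index
        return 0;
    }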
composable_kernel/include/tensor_description/multi_index_transform.hpp (new file)

#ifndef CK_MULTI_INDEX_TRANSFORM_HPP
#define CK_MULTI_INDEX_TRANSFORM_HPP

#include "common_header.hpp"

namespace ck {

template <index_t N>
using MultiIndex = Array<index_t, N>;

template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
{
    return MultiIndex<sizeof...(Xs)>(xs...);
}

template <index_t Length>
struct PassThrough
{
    using LowerIndex = MultiIndex<1>;
    using UpperIndex = MultiIndex<1>;

    __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
    __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
    __host__ __device__ static constexpr auto GetUpperLengths() { return Sequence<Length>{}; }

    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        return idx_up;
    }

    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(
        const UpperIndex& idx_up_diff, const UpperIndex& /* idx_up_old */, const LowerIndex& /* idx_low_old */)
    {
        return idx_up_diff;
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    __host__ __device__ static constexpr bool
    IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
    {
        return true;
    }
};

// LowerLengths: Sequence<...>
template <typename LowerLengths, typename LeftPads, typename RightPads>
struct Pad
{
    static constexpr index_t nDim = LowerLengths::Size();

    using LowerIndex = MultiIndex<nDim>;
    using UpperIndex = MultiIndex<nDim>;

    __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<nDim>{}; }
    __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<nDim>{}; }

    __host__ __device__ static constexpr auto GetUpperLengths()
    {
        return LowerLengths{} + LeftPads{} + RightPads{};
    }

    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        return idx_up - LeftPads{};
    }

    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(
        const UpperIndex& idx_up_diff, const UpperIndex& /* idx_up_old */, const LowerIndex& /* idx_low_old */)
    {
        return idx_up_diff;
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    __host__ __device__ constexpr bool
    IsUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
    {
        bool flag = true;

        static_for<0, nDim, 1>{}([&](auto idim) {
            // only check if there is left-padding
            static_if<(LeftPads::At(idim) != 0)>{}(
                [&](auto) { flag = flag && idx_up[idim] >= LeftPads::At(idim); });

            // only check if there is right-padding
            static_if<(RightPads::At(idim) != 0)>{}([&](auto) {
                flag = flag && (idx_up[idim] < LeftPads::At(idim) + LowerLengths::At(idim));
            });
        });

        return flag;
    }
};

// LowerLengths: Sequence<...>
template <typename LowerLengths>
struct Merge
{
    static constexpr index_t nDimLow = LowerLengths::Size();
    static constexpr index_t nDimUp  = 1;

    using LowerIndex = MultiIndex<nDimLow>;
    using UpperIndex = MultiIndex<nDimUp>;

    __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<nDimLow>{}; }
    __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<nDimUp>{}; }

    __host__ __device__ static constexpr auto GetUpperLengths()
    {
        return Sequence<reduce_on_sequence(
            LowerLengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
    }

    // emulate constexpr lambda
    template <typename PseudoLowStrides>
    struct lambda_CalculateLowerIndex
    {
        index_t& itmp;
        LowerIndex& idx_low;

        __host__ __device__ explicit constexpr lambda_CalculateLowerIndex(index_t& itmp_,
                                                                          LowerIndex& idx_low_)
            : itmp(itmp_), idx_low(idx_low_)
        {
        }

        template <typename IDim>
        __host__ __device__ constexpr void operator()(IDim idim) const
        {
            constexpr index_t stride = PseudoLowStrides::At(idim);
            idx_low(idim) = itmp / stride;
            itmp -= idx_low[idim] * stride;
        }
    };

    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        LowerIndex idx_low;

        index_t itmp = idx_up[0];

        constexpr auto pseudo_low_strides =
            reverse_inclusive_scan_sequence(
                LowerLengths::PopFront(), math::multiplies<index_t>{}, Number<1>{})
                .PushBack(Number<1>{});

        static_for<0, nDimLow - 1, 1>{}(
            lambda_CalculateLowerIndex<decltype(pseudo_low_strides)>(itmp, idx_low));

        idx_low(nDimLow - 1) = itmp / pseudo_low_strides[nDimLow - 1];

        return idx_low;
    }

    // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
    // If idx_up_diff is known at compile-time, many calculations can be optimized
    // away by compiler
    // This function assume idx_low_old is not out-of-bound
    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(
        const UpperIndex& idx_up_diff, const UpperIndex& /* idx_up_old */, const LowerIndex& idx_low_old)
    {
        // do nothing if idx_up_diff == 0
        if(idx_up_diff[0] == 0)
        {
            return make_zero_array<index_t, nDimLow>();
        }

        // CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
        // If idx_up_diff is known at compile-time, the calculation can
        // be done at compile-time. However, if idx_up_diff is only known
        // at run-time, then the calculation will also be computed at
        // run-time, and can be very expensive.
        LowerIndex idx_low_new = idx_low_old + CalculateLowerIndex(idx_up_diff);

        if(idx_up_diff[0] > 0)
        {
            bool carry = false;

            // do carry check in reversed order, starting from lowest dimension
            // don't check the highest dimension
            static_for<0, nDimLow - 1, 1>{}([&](auto ireverse) {
                constexpr index_t i = nDimLow - 1 - ireverse;

                if(carry) { ++idx_low_new(i); }
                carry = false;

                if(idx_low_new[i] >= LowerLengths::At(i))
                {
                    idx_low_new(i) -= LowerLengths::At(i);
                    carry = true;
                }
            });

            // highest dimension, no out-of-bound check
            if(carry) { ++idx_low_new(0); }
        }
        else if(idx_up_diff[0] < 0)
        {
            bool borrow = false;

            // do borrow check in reversed order, starting from lowest dimension
            // don't check the highest dimension
            static_for<0, nDimLow - 1, 1>{}([&](auto ireverse) {
                constexpr index_t i = nDimLow - 1 - ireverse;

                if(borrow) { --idx_low_new(i); }
                borrow = false;

                if(idx_low_new[i] < 0)
                {
                    idx_low_new(i) += LowerLengths::At(i);
                    borrow = true;
                }
            });

            // highest dimension, no out-of-bound check
            if(borrow) { --idx_low_new(0); }
        }

        return idx_low_new - idx_low_old;
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

    __host__ __device__ static constexpr bool
    IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
    {
        return true;
    }
};

// UpperLengths: Sequence<...>
template <typename UpperLengths>
struct UnMerge
{
    static constexpr index_t nDimLow = 1;
    static constexpr index_t nDimUp  = UpperLengths::Size();

    using LowerIndex = MultiIndex<nDimLow>;
    using UpperIndex = MultiIndex<nDimUp>;

    __host__ __device__ static constexpr auto GetUpperLengths() { return UpperLengths{}; }

    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        LowerIndex idx_low{0};

        constexpr auto pseudo_up_strides =
            reverse_inclusive_scan_sequence(
                UpperLengths::PopFront(), math::multiplies<index_t>{}, Number<1>{})
                .PushBack(Number<1>{});

        static_for<0, nDimUp, 1>{}(
            [&](auto idim) { idx_low(0) += idx_up[idim] * pseudo_up_strides[idim]; });

        return idx_low;
    }

    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(
        const UpperIndex& idx_up_diff, const UpperIndex& /* idx_up_old */, const LowerIndex& /* idx_low_old */)
    {
        return CalculateLowerIndex(idx_up_diff);
    }

    // GetNumOfLowerDimension / GetNumOfUpperDimension, IsLinearTransform (true) and
    // IsUpperIndexMappedToValidLowerIndex (true) follow the same pattern as PassThrough.
};

// UpperLengths: Sequence<...>
// Coefficients: Sequence<...>
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] + coefficients[nDimUp]
template <typename UpperLengths, typename Coefficients>
struct Embed
{
    static constexpr index_t nDimLow = 1;
    static constexpr index_t nDimUp  = UpperLengths::Size();

    using LowerIndex = MultiIndex<nDimLow>;
    using UpperIndex = MultiIndex<nDimUp>;

    __host__ __device__ explicit constexpr Embed()
    {
        static_assert(UpperLengths::GetSize() == nDimUp && Coefficients::GetSize() == nDimUp + 1,
                      "wrong! # of dimensions not consistent");
    }

    __host__ __device__ static constexpr auto GetUpperLengths() { return UpperLengths{}; }

    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        LowerIndex idx_low(Coefficients{}[nDimUp]);

        static_for<0, nDimUp, 1>{}(
            [&](auto idim) { idx_low(0) += idx_up[idim] * Coefficients{}[idim]; });

        return idx_low;
    }

    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(
        const UpperIndex& idx_up_diff, const UpperIndex& /* idx_up_old */, const LowerIndex& /* idx_low_old */)
    {
        LowerIndex idx_low_diff{0};

        static_for<0, nDimUp, 1>{}(
            [&](auto idim) { idx_low_diff(0) += idx_up_diff[idim] * Coefficients{}[idim]; });

        return idx_low_diff;
    }

    // GetNumOfLowerDimension / GetNumOfUpperDimension, IsLinearTransform (true) and
    // IsUpperIndexMappedToValidLowerIndex (true) follow the same pattern as PassThrough.
};

template <index_t LowerLength, index_t VectorSize>
struct Vectorize
{
    using LowerIndex = MultiIndex<1>;
    using UpperIndex = MultiIndex<1>;

    __host__ __device__ constexpr Vectorize()
    {
        static_assert(VectorSize > 0 && LowerLength % VectorSize == 0,
                      "wrong! cannot evenly divide");
    }

    __host__ __device__ static constexpr auto GetUpperLengths()
    {
        return Sequence<LowerLength / VectorSize>{};
    }

    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        return VectorSize * idx_up;
    }

    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(
        const UpperIndex& idx_up_diff, const UpperIndex& /* idx_up_old */, const LowerIndex& /* idx_low_old */)
    {
        return VectorSize * idx_up_diff;
    }

    // GetNumOfLowerDimension / GetNumOfUpperDimension, IsLinearTransform (true) and
    // IsUpperIndexMappedToValidLowerIndex (true) follow the same pattern as PassThrough.
};

} // namespace ck
#endif
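As a worked example of the Merge transform above, here is a plain-C++ sketch (host-only; the lengths and index values are illustrative) of how one merged upper index is decomposed into the lower multi-index using pseudo strides built from a reverse inclusive scan of the lower lengths:

    #include <array>
    #include <cassert>

    int main()
    {
        const std::array<int, 3> lower_lengths{2, 3, 4};
        // pseudo strides: reverse inclusive scan of {3, 4} with multiply, then push back 1
        const std::array<int, 3> pseudo_strides{12, 4, 1};

        int itmp = 17; // merged (upper) index in [0, 2*3*4)
        std::array<int, 3> idx_low{};
        for(int d = 0; d < 3; ++d)
        {
            idx_low[d] = itmp / pseudo_strides[d];
            itmp -= idx_low[d] * pseudo_strides[d];
        }

        assert(idx_low[0] == 1 && idx_low[1] == 1 && idx_low[2] == 1); // 17 = 1*12 + 1*4 + 1
        return 0;
    }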
composable_kernel/include/tensor_description/tensor_coordinate.hpp

The previous NormalTensorCoordinate / MergedTensorCoordinate implementations (carried over in tensor_coordinate_deprecated.hpp, next file) are replaced by coordinates built on the new descriptor types. The new version of the file:

#ifndef CK_TENSOR_COORDINATE_V2_HPP
#define CK_TENSOR_COORDINATE_V2_HPP

#include "common_header.hpp"
#include "dimension.hpp"
#include "multi_index_transform.hpp"
#include "tensor_descriptor.hpp"

namespace ck {

template <typename TensorDesc>
struct TensorCoordinate;

template <typename NativeTensorDesc>
struct NativeTensorCoordinate
{
    using type             = NativeTensorCoordinate;
    using tensor_desc_type = NativeTensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

    using Index = MultiIndex<nDim>;

    __host__ __device__ constexpr NativeTensorCoordinate(Index idx)
        : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx))
    {
    }

    template <typename... Xs>
    __host__ __device__ constexpr NativeTensorCoordinate(Xs... xs)
        : NativeTensorCoordinate(Index{xs...})
    {
    }

    template <index_t... Xs>
    __host__ __device__ constexpr NativeTensorCoordinate(Sequence<Xs...>)
        : NativeTensorCoordinate(Index{Xs...})
    {
    }

    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }
    __host__ __device__ constexpr const Index& GetIndex() const { return mIndex; }
    __host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; }

    __host__ __device__ constexpr type operator+=(const Index& idx_diff)
    {
        // mIndex is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndex += idx_diff;
        mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff);
        return *this;
    }

    __host__ __device__ constexpr type operator-=(const Index& idx_diff)
    {
        mIndex -= idx_diff;
        mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff);
        return *this;
    }

    // operator+ / operator- copy *this and apply += / -=

    __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }

    private:
    // mIndex may be saved and updated, however, the value of some (or all) of its entries may
    // never be used. Compiler should be able to remove these entries as well as its calculation
    // as dead code.
    // TODO: make sure compiler indeed remove these dead code
    Index mIndex;
    index_t mOffset;
};

template <typename TransformedTensorDesc>
struct TransformedTensorCoordinate
{
    using tensor_desc_type = TransformedTensorDesc;
    using LowerCoord =
        typename TensorCoordinate<decltype(tensor_desc_type::GetLowerTensorDescriptor())>::type;
    using UpperCoord = TransformedTensorCoordinate;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

    using UpperIndex = MultiIndex<nDim>;

    __host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx)
        : mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)}
    {
    }

    // variadic and Sequence<...> constructors forward to the UpperIndex constructor,
    // as in NativeTensorCoordinate

    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }
    __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const { return mCoordLow; }
    __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; }
    __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); }

    __host__ __device__ constexpr const index_t& GetOffset() const
    {
        return GetLowerCoordinate().GetOffset();
    }

    __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
    {
        // For transformation of multi-index difference, not all transformation functions need to
        // know the old lower-index or the old upper-index. We pass both of them to the
        // transformation function. The transformation function itself decides to use them or not.
        mCoordLow += tensor_desc_type::CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

        // mIndexUp is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndexUp += idx_up_diff;

        return *this;
    }

    // operator-=, operator+ and operator- follow the same pattern

    // this function should be inexpensive, because there is no upper-to-lower index transformation
    __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
    {
        return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) &&
               mCoordLow.IsUpperIndexMappedToValidOffset();
    }

    private:
    UpperIndex mIndexUp;
    LowerCoord mCoordLow;
};

template <typename TensorDesc>
struct TensorCoordinate
{
    private:
    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
    }

    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {
        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
    }

    public:
    using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};

} // namespace ck
#endif
...
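The coordinate classes above pair a multi-index with its linear offset and update both on each step, so the offset difference is computed from the index difference rather than recomputed from scratch. A minimal plain-C++ sketch of that idea (types and names here are illustrative, not the library API):

    #include <array>
    #include <cassert>

    struct Coord2D
    {
        std::array<int, 2> idx;
        std::array<int, 2> strides;
        int offset;

        Coord2D& operator+=(const std::array<int, 2>& d)
        {
            for(int i = 0; i < 2; ++i)
            {
                idx[i] += d[i];
                offset += d[i] * strides[i]; // offset diff is linear in the index diff
            }
            return *this;
        }
    };

    int main()
    {
        Coord2D c{{1, 2}, {4, 1}, 1 * 4 + 2 * 1};
        c += std::array<int, 2>{0, 1}; // step one element along the innermost dimension
        assert(c.offset == 7 && c.idx[1] == 3);
        return 0;
    }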
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp (new file)

This file carries over the previous NormalTensorCoordinate and MergedTensorCoordinate implementations under deprecated names:

#ifndef CK_TENSOR_COORDINATE_DEPRECATED_HPP
#define CK_TENSOR_COORDINATE_DEPRECATED_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"

namespace ck {

// TensorDesc is ConstantTensorDescriptor
template <class TensorDesc>
struct NormalTensorCoordinate_deprecated
{
    using type             = NormalTensorCoordinate_deprecated;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

    __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Array<index_t, nDim> tensor_index)
        : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
    {
    }

    // variadic and Sequence<...> constructors forward to the Array constructor

    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

    // T is Array or Sequence
    template <class T>
    __host__ __device__ type operator+=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);

        return *this;
    }

    // operator-=, operator+ and operator- follow the same pattern

    // reposition point of origin, and return compensated offset.
    // This is a hack to reduce index calculation during looping over
    // a tensor whose origin is this TensorCoordinate. It does so, by spitting
    // out the run-time offset to the pointer (to the tensor data) held by this
    // TensorCoordiante, so the caller can add the offset into the run-time pointer of
    // the data, so only 1 run-time variable (update pointer) is needed, instead
    // of 2 run-time variables (old pointer and this offset)
    // TODO: after introducing the concept of "run-time tensor view", which contains the
    // run-time pointer to the data, always keep track of the pointer, instead of both
    // offset and the pointer. This also bring additional benefit that we don't need to
    // worry the offset might underflow (because offset is unsigned integer) when updating it.
    __host__ __device__ constexpr index_t RepositionOrigin()
    {
        index_t offset_diff = mOffset;
        mOffset             = 0;
        return offset_diff;
    }

    private:
    index_t mOffset;
};

// TensorDesc is ConstantMergedTensorDescriptor
template <class TensorDesc>
struct MergedTensorCoordinate
{
    using type             = MergedTensorCoordinate;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
    static constexpr index_t nOriginalDim =
        tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();

    __host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
        : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
    {
        // partial offset on each dimension
        static_for<0, nDim, 1>{}([&](auto idim) {
            constexpr auto partial_original_dims =
                tensor_desc_type::GetContainedOriginalDimensions(idim);

            constexpr auto partial_original_desc =
                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

            mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex(
                extract_array(mOriginalIndex, partial_original_dims));
        });

        // complete offset
        mOffset =
            accumulate_on_array(mPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
    }

    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

    template <class IDim, class T, bool PositiveDirection>
    __host__ __device__ void
    MoveOnDimension(IDim idim_, T step_size, integral_constant<bool, PositiveDirection>)
    {
        // For a merged dimension: convert the 1-d step into a step on the contained original
        // dimensions, add (or subtract) it, propagate carries (or borrows) from the lowest
        // original dimension upward (the highest dimension is not range-checked), then update
        // "mOriginalIndex", this dimension's entry in mPartialOffsets, and mOffset
        // (doing "+" before "-" to avoid underflow).
        // For a normal dimension: mOffset +/-= step_size * stride of that dimension.
        ...
    }

    // operator+= / operator-= iterate over the dimensions with static_for and call
    // MoveOnDimension for every dimension whose step size is non-zero (the compiler should
    // remove dead code paths, because step_sizes is known at compile time);
    // operator+ / operator- copy *this and apply them.

    __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }

    private:
    // Allocate register memory for all merged dimensions and normal dimensions.
    // However, only those merged dimensions, whose index will be involved in arithmetic
    // after the construction of this TensorCoordinate (e.g. when user move a slicing
    // window on the merged dimension), will use these register memory.
    // Let's hope compiler will optimize away those register memory allocated for normal
    // dimensions, and those merged dimensions, that would never be involved in index
    // arithmetic after construction of TensorCoordinate.
    // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions"
    // and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to
    // count on compiler to optimize away those register memory for us
    Array<index_t, nOriginalDim> mOriginalIndex;
    Array<index_t, nDim> mPartialOffsets;

    // complete offset
    index_t mOffset;
};

template <class TensorDesc>
struct TensorCoordinate_deprecated
{
    private:
    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
    {
        return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
    }

    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
    {
        return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
    }

    public:
    using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};

} // namespace ck
#endif
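The MoveOnDimension body summarized above steps a merged dimension by adding the step to the contained original indices and then propagating carries (or borrows) from the lowest original dimension upward, leaving the highest dimension unchecked. A plain-C++ sketch of that carry pass, under the assumption of a positive step (lengths and values are illustrative):

    #include <array>
    #include <cassert>

    int main()
    {
        std::array<int, 3> lengths{2, 3, 4};
        std::array<int, 3> id{1, 2, 3};   // current multi-index
        std::array<int, 3> step{0, 0, 2}; // step decomposed onto the original dimensions

        for(int i = 0; i < 3; ++i) id[i] += step[i];

        bool carry = false;
        for(int i = 2; i >= 1; --i) // reversed order; skip dimension 0 (highest)
        {
            if(carry) ++id[i];
            carry = false;
            if(id[i] >= lengths[i]) { id[i] -= lengths[i]; carry = true; }
        }
        if(carry) ++id[0]; // highest dimension, no out-of-bound check

        assert(id[0] == 2 && id[1] == 0 && id[2] == 1); // {1,2,3} + {0,0,2} -> {2,0,1}
        return 0;
    }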
composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp (new file)

#ifndef CK_TENSOR_COORDINATE_HELPER_HPP
#define CK_TENSOR_COORDINATE_HELPER_HPP

#include "tensor_coordiante_v2.hpp"

namespace ck {

template <typename TensorDesc>
__host__ __device__ constexpr auto
make_tensor_coordinate_v2(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx)
{
    return typename TensorCoordinate<TensorDesc>::type(idx);
}

} // namespace ck
#endif
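make_tensor_coordinate_v2 is a thin factory: the TensorCoordinate trait selects the coordinate type for a given descriptor, and the helper constructs it so the caller never has to spell that type. A plain-C++ sketch of the same pattern with illustrative stand-in types (not the library's):

    #include <array>
    #include <cassert>

    struct DummyDesc { static constexpr int nDim = 2; };

    // Trait mapping a descriptor type to its coordinate type.
    template <typename Desc>
    struct CoordinateFor
    {
        struct type
        {
            std::array<int, Desc::nDim> idx;
        };
    };

    // Factory: the coordinate type is deduced from the descriptor.
    template <typename Desc>
    typename CoordinateFor<Desc>::type make_coordinate(Desc, std::array<int, Desc::nDim> idx)
    {
        return typename CoordinateFor<Desc>::type{idx};
    }

    int main()
    {
        auto coord = make_coordinate(DummyDesc{}, {1, 2});
        assert(coord.idx[0] == 1 && coord.idx[1] == 2);
        return 0;
    }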
composable_kernel/include/tensor_description/tensor_descriptor.hpp
0 → 100644
View file @
52423948
#ifndef CK_TENSOR_DESCRIPTOR_HPP
#define CK_TENSOR_DESCRIPTOR_HPP
#include "common_header.hpp"
#include "dimension.hpp"
#include "multi_index_transform.hpp"
namespace
ck
{
template
<
typename
...
NativeDimensions
>
struct
NativeTensorDescriptor
{
using
type
=
NativeTensorDescriptor
;
static
constexpr
index_t
nDim
=
sizeof
...(
NativeDimensions
);
static
constexpr
auto
mDimensions
=
make_tuple
(
NativeDimensions
{}...);
using
Index
=
MultiIndex
<
nDim
>
;
__host__
__device__
static
constexpr
auto
GetNumOfDimension
()
{
return
Number
<
nDim
>
{};
}
template
<
index_t
IDim
>
__host__
__device__
static
constexpr
auto
GetLength
(
Number
<
IDim
>
)
{
return
mDimensions
.
At
(
Number
<
IDim
>
{}).
GetLength
();
}
template
<
index_t
IDim
>
__host__
__device__
static
constexpr
auto
GetStride
(
Number
<
IDim
>
)
{
return
mDimensions
.
At
(
Number
<
IDim
>
{}).
GetStride
();
}
template
<
index_t
...
IDims
>
__host__
__device__
static
constexpr
auto
GetLengths
(
Sequence
<
IDims
...
>
)
{
return
Sequence
<
GetLength
(
Number
<
IDims
>
{})...
>
{};
}
template
<
index_t
...
IDims
>
__host__
__device__
static
constexpr
auto
GetStrides
(
Sequence
<
IDims
...
>
)
{
return
Sequence
<
GetStride
(
Number
<
IDims
>
{})...
>
{};
}
template
<
index_t
IDim
,
index_t
...
IDims
>
__host__
__device__
static
constexpr
auto
GetLengths
(
Number
<
IDim
>
,
Number
<
IDims
>
...)
{
return
GetLengths
(
Sequence
<
IDim
,
IDims
...
>
{});
}
template
<
index_t
IDim
,
index_t
...
IDims
>
__host__
__device__
static
constexpr
auto
GetStrides
(
Number
<
IDim
>
,
Number
<
IDims
>
...)
{
return
GetStrides
(
Sequence
<
IDim
,
IDims
...
>
{});
}
__host__
__device__
static
constexpr
auto
GetLengths
()
{
return
GetLengths
(
typename
arithmetic_sequence_gen
<
0
,
nDim
,
1
>::
type
{});
}
__host__
__device__
static
constexpr
auto
GetStrides
()
{
return
GetStrides
(
typename
arithmetic_sequence_gen
<
0
,
nDim
,
1
>::
type
{});
}
__host__
__device__
static
constexpr
index_t
GetElementSize
()
{
return
reduce_on_sequence
(
GetLengths
(),
math
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
}
__host__
__device__
static
constexpr
index_t
GetElementSpace
()
{
return
reduce_on_sequence
(
(
GetLengths
()
-
Number
<
1
>
{})
*
GetStrides
(),
math
::
plus
<
index_t
>
{},
Number
<
1
>
{});
}
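// A minimal standalone sketch (plain C++, independent of the ck:: types above) of the
// element-size and element-space formulas used here: the size is the product of the
// lengths, while the space is the largest reachable offset plus one,
// i.e. sum((length[i] - 1) * stride[i]) + 1. The concrete lengths below are made up
// for illustration only.
constexpr unsigned element_size_example(const unsigned* len, unsigned ndim)
{
    unsigned size = 1;
    for(unsigned i = 0; i < ndim; ++i)
        size *= len[i];
    return size;
}
constexpr unsigned element_space_example(const unsigned* len, const unsigned* stride, unsigned ndim)
{
    unsigned space = 1;
    for(unsigned i = 0; i < ndim; ++i)
        space += (len[i] - 1) * stride[i];
    return space;
}
// e.g. lengths {2, 3} with strides {4, 1} (a padded row-major 2x3 view):
// element size = 6, element space = (2-1)*4 + (3-1)*1 + 1 = 7
constexpr unsigned len_ex[]    = {2, 3};
constexpr unsigned stride_ex[] = {4, 1};
static_assert(element_size_example(len_ex, 2) == 6, "element size mismatch");
static_assert(element_space_example(len_ex, stride_ex, 2) == 7, "element space mismatch");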
// TODO: this cannot be evaluated as constexpr because of the use of a lambda
__host__
__device__
static
constexpr
index_t
CalculateOffset
(
const
Index
&
idx
)
{
index_t
offset
=
0
;
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
idim
)
{
offset
+=
idx
[
idim
]
*
GetStride
(
idim
);
});
return
offset
;
}
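// A minimal standalone sketch (plain C++, independent of the ck:: types above) of the
// offset calculation above: a native tensor maps a multi-index to a linear offset by a
// dot product with the strides, offset = sum(idx[i] * stride[i]). The example index and
// strides are made up for illustration.
constexpr unsigned offset_example(const unsigned* idx, const unsigned* stride, unsigned ndim)
{
    unsigned offset = 0;
    for(unsigned i = 0; i < ndim; ++i)
        offset += idx[i] * stride[i];
    return offset;
}
// e.g. a packed 4x3 row-major tensor has strides {3, 1}; index (2, 1) lands at offset 7
constexpr unsigned idx_ex[]        = {2, 1};
constexpr unsigned row_major_str[] = {3, 1};
static_assert(offset_example(idx_ex, row_major_str, 2) == 2 * 3 + 1, "offset mismatch");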
__host__
__device__
static
constexpr
index_t
CalculateOffsetDiff
(
const
Index
&
idx_diff
)
{
index_t
offset_diff
=
0
;
static_for
<
0
,
nDim
,
1
>
{}(
[
&
](
auto
idim
)
{
offset_diff
+=
idx_diff
[
idim
]
*
GetStride
(
idim
);
});
return
offset_diff
;
}
template
<
index_t
IDim
>
__host__
__device__
static
constexpr
bool
IsLinearDimension
(
Number
<
IDim
>
)
{
return
true
;
}
__host__
__device__
static
constexpr
auto
GetLinearDimensionMask
()
{
return
typename
uniform_sequence_gen
<
nDim
,
1
>::
type
{};
}
__host__
__device__
static
constexpr
auto
GetNonLinearDimensionMask
()
{
return
typename
uniform_sequence_gen
<
nDim
,
0
>::
type
{};
}
__host__
__device__
static
constexpr
auto
GetNonLinearDimensions
()
{
return
Sequence
<>
{};
}
#if 0
__host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
{
return Tuple<>{};
}
#endif
__host__
__device__
static
constexpr
bool
IsUpperIndexMappedToValidOffset
(
const
Index
&
/* idx */
)
{
return
true
;
}
};
// LowerTensorDescriptor
// Transforms: Tuple<DimensionTransforms...>
// LowerDimensionIds: Tuple<Sequence<...>>
// UpperDimensionIds: Tuple<Sequence<...>>
template
<
typename
LowTensorDescriptor
,
typename
Transforms
,
typename
LowDimensionIds
,
typename
UpDimensionIds
>
struct
TransformedTensorDescriptor
{
using
type
=
TransformedTensorDescriptor
;
static
constexpr
index_t
nTransform
=
Transforms
::
Size
();
struct
lambda_merge_sequences
{
template
<
typename
...
Seqs
>
__host__
__device__
constexpr
auto
operator
()(
Seqs
...
seqs
)
const
{
return
merge_sequences
(
seqs
...);
}
};
__host__
__device__
static
constexpr
auto
GetNumOfLowerDimension
()
{
// Here, we assume all lower-dimensions are active
// TODO: sanity-check that all lower-dimensions are indeed active
using
duplicated_low_active_dims
=
decltype
(
unpack
(
lambda_merge_sequences
{},
LowDimensionIds
{}));
using
low_active_dims
=
typename
sequence_unique_sort
<
duplicated_low_active_dims
,
math
::
less
<
index_t
>
,
math
::
equal
<
index_t
>>::
type
;
return
low_active_dims
::
Size
();
}
__host__
__device__
static
constexpr
auto
GetNumOfUpperDimension
()
{
using
duplicated_up_active_dims
=
decltype
(
unpack
(
lambda_merge_sequences
{},
UpDimensionIds
{}));
using
up_active_dims
=
typename
sequence_unique_sort
<
duplicated_up_active_dims
,
math
::
less
<
index_t
>
,
math
::
equal
<
index_t
>>::
type
;
return
up_active_dims
::
Size
();
}
static
constexpr
index_t
nDimUp
=
GetNumOfUpperDimension
();
static
constexpr
index_t
nDimLow
=
GetNumOfLowerDimension
();
using
UpperIndex
=
MultiIndex
<
nDimUp
>
;
using
LowerIndex
=
MultiIndex
<
nDimLow
>
;
__host__
__device__
constexpr
TransformedTensorDescriptor
()
{
static_assert
(
nTransform
==
Transforms
::
Size
()
&&
nTransform
==
LowDimensionIds
::
Size
()
&&
nTransform
==
UpDimensionIds
::
Size
(),
"wrong! # of transformations not the same"
);
// sanity check:
// LowDimensionIds should include all low-dimensions,
// UpDimensionIds should include all up-dimensions
using
mingled_up_dimension_ids
=
decltype
(
unpack
(
lambda_merge_sequences
{},
UpDimensionIds
{}));
using
sorted_up_dimension_ids
=
typename
sequence_sort
<
mingled_up_dimension_ids
,
math
::
less
<
index_t
>>::
type
;
static_assert
(
sorted_up_dimension_ids
::
Size
()
==
nDimUp
&&
is_valid_sequence_map
<
sorted_up_dimension_ids
>
{},
"wrong! UpDimensionIds is not configured correctly"
);
using
mingled_low_dimension_ids
=
decltype
(
unpack
(
lambda_merge_sequences
{},
LowDimensionIds
{}));
using
sorted_low_dimension_ids
=
typename
sequence_sort
<
mingled_low_dimension_ids
,
math
::
less
<
index_t
>>::
type
;
static_assert
(
sorted_low_dimension_ids
::
Size
()
==
nDimLow
&&
is_valid_sequence_map
<
sorted_low_dimension_ids
>
{},
"wrong! LowDimensionIds is not configured correctly"
);
// TODO: sanity-check: while an up-dimension could be associated with multiple
// transformations, a low-dimension should be associated with only one transformation
// TODO: sanity-check: GetLowerLengths of each transform should be consistent with lengths
// of lower-tensor-descriptor
}
__host__
__device__
static
constexpr
auto
GetNumOfDimension
()
{
return
GetNumOfUpperDimension
();
}
__host__
__device__
static
constexpr
auto
GetLowerTensorDescriptor
()
{
return
LowTensorDescriptor
{};
}
struct
lambda_GetUpperLengths
{
template
<
typename
Transform
>
__host__
__device__
constexpr
auto
operator
()(
const
Transform
&
tran
)
const
{
return
tran
.
GetUpperLengths
();
}
};
__host__
__device__
static
constexpr
auto
GetUpperLengths
()
{
constexpr
auto
tuple_of_up_lengths
=
transform_tuples
(
lambda_GetUpperLengths
{},
Transforms
{});
constexpr
auto
mingled_up_lengths
=
unpack
(
lambda_merge_sequences
{},
tuple_of_up_lengths
);
constexpr
auto
mingled_up_dimension_ids
=
unpack
(
lambda_merge_sequences
{},
UpDimensionIds
{});
// TODO: sanity-check that mingled_up_dimension_ids contains all upper-dimensions
// TODO: sanity-check that mingled_up_lengths has no conflicting upper-length
// sort by upper-dimension-ids
using
sort_up_dimension_ids
=
sequence_unique_sort
<
decltype
(
mingled_up_dimension_ids
),
math
::
less
<
index_t
>
,
math
::
equal
<
index_t
>>
;
// sanity-check sorted-upper-dimension-ids should be Sequence<0, 1, ... nDimUp-1>
static_assert
(
is_same
<
typename
sort_up_dimension_ids
::
type
,
typename
arithmetic_sequence_gen
<
0
,
nDimUp
,
1
>::
type
>
{},
"wrong! UpDimensionIds is not configured correctly"
);
constexpr
auto
sorted2unsorted_map
=
typename
sort_up_dimension_ids
::
sorted2unsorted_map
{};
constexpr
auto
sorted_up_lengths
=
pick_sequence_elements_by_ids
(
mingled_up_lengths
,
sorted2unsorted_map
);
return
sorted_up_lengths
;
}
__host__
__device__
static
constexpr
auto
GetLengths
()
{
return
GetUpperLengths
();
}
template
<
index_t
IDim
>
__host__
__device__
static
constexpr
auto
GetLength
(
Number
<
IDim
>
)
{
return
GetLengths
()[
IDim
];
}
template
<
index_t
...
IDims
>
__host__
__device__
static
constexpr
auto
GetLengths
(
Sequence
<
IDims
...
>
)
{
return
Sequence
<
GetLength
(
Number
<
IDims
>
{})...
>
{};
}
template
<
index_t
IDim
,
index_t
...
IDims
>
__host__
__device__
static
constexpr
auto
GetLengths
(
Number
<
IDim
>
,
Number
<
IDims
>
...)
{
return
GetLengths
(
Sequence
<
IDim
,
IDims
...
>
{});
}
__host__
__device__
static
constexpr
index_t
GetElementSize
()
{
return
reduce_on_sequence
(
GetLengths
(),
math
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
}
__host__
__device__
static
constexpr
index_t
GetElementSpace
()
{
// TODO: Is this the correct definition for transformed tensor?
return
GetLowerTensorDescriptor
().
GetElementSpace
();
}
// TODO: right now the return value is not constexpr because of the use of a non-constexpr lambda
__host__
__device__
static
constexpr
LowerIndex
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
{
LowerIndex
idx_low
;
static_for
<
0
,
nTransform
,
1
>
{}([
&
](
auto
itran
)
{
constexpr
auto
tran
=
Transforms
{}.
At
(
itran
);
const
auto
idx_up_part
=
pick_array_element
(
idx_up
,
UpDimensionIds
{}.
At
(
itran
));
auto
idx_low_part
=
pick_array_element
(
idx_low
,
LowDimensionIds
{}.
At
(
itran
));
// this assumes each lower (single) index is only associated with one transformation,
// which is required for the index transformation, and has been checked in the constructor
// of TransformedTensorDescriptor
idx_low_part
=
tran
.
CalculateLowerIndex
(
to_array
(
idx_up_part
));
});
return
idx_low
;
}
// TODO: right now the return value is not constexpr because of the use of a non-constexpr lambda
__host__
__device__
static
constexpr
LowerIndex
CalculateLowerIndexDiff
(
const
UpperIndex
&
idx_up_diff
,
const
UpperIndex
&
idx_up_old
,
const
LowerIndex
&
idx_low_old
)
{
LowerIndex
idx_low_diff
;
static_for
<
0
,
nTransform
,
1
>
{}([
&
](
auto
itran
)
{
constexpr
auto
tran
=
Transforms
{}.
At
(
itran
);
const
auto
idx_up_diff_part
=
pick_array_element
(
idx_up_diff
,
UpDimensionIds
{}.
At
(
itran
));
const
auto
idx_up_old_part
=
pick_array_element
(
idx_up_old
,
UpDimensionIds
{}.
At
(
itran
));
const
auto
idx_low_old_part
=
pick_array_element
(
idx_low_old
,
LowDimensionIds
{}.
At
(
itran
));
auto
idx_low_diff_part
=
pick_array_element
(
idx_low_diff
,
LowDimensionIds
{}.
At
(
itran
));
// this assumes each lower (single) index is associated with only one transformation,
// which is required for the index transformation, and has been checked in the constructor
// of TransformedTensorDescriptor
idx_low_diff_part
=
tran
.
CalculateLowerIndexDiff
(
to_array
(
idx_up_diff_part
),
to_array
(
idx_up_old_part
),
to_array
(
idx_low_old_part
));
});
return
idx_low_diff
;
}
__host__
__device__
static
constexpr
index_t
CalculateOffset
(
const
UpperIndex
&
idx_up
)
{
return
GetLowerTensorDescriptor
().
CalculateOffset
(
CalculateLowerIndex
(
idx_up
));
}
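// A minimal standalone sketch (plain C++) of how a transformed descriptor composes its
// offset: the upper index is first mapped to a lower index by the transforms, and the
// lower descriptor then turns that lower index into an offset. Here this is illustrated
// with a merge-style transform that folds two lower dimensions of lengths {H, W} into a
// single upper dimension, assuming the usual row-major split m -> (m / W, m % W).
constexpr unsigned merged_to_offset_example(unsigned m, // upper index on the merged dimension
                                            unsigned W, // length of the faster lower dimension
                                            unsigned stride_h,
                                            unsigned stride_w)
{
    const unsigned h = m / W;               // lower index, slower dimension
    const unsigned w = m % W;               // lower index, faster dimension
    return h * stride_h + w * stride_w;     // offset via the lower descriptor's strides
}
// e.g. H = 2, W = 3, packed strides {3, 1}: upper index 4 -> lower (1, 1) -> offset 4
static_assert(merged_to_offset_example(4, 3, 3, 1) == 4, "merge example mismatch");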
struct
lambda_sequence_logical_and
{
template
<
typename
...
Seqs
>
__host__
__device__
constexpr
auto
operator
()(
Seqs
...)
const
{
return
typename
sequence_reduce
<
logical_and
<
index_t
>
,
Seqs
...
>::
type
{};
}
};
template
<
typename
T
>
struct
lambda_is_true
{
__host__
__device__
constexpr
auto
operator
()(
const
T
&
x
)
const
{
// TODO: remove static_cast once Sequence can take bool as entries
return
static_cast
<
bool
>
(
x
)
==
true
;
}
};
struct
lambda_get_linear_dimension_mask_of_single_tranform
{
// check only one transform at a time
template
<
typename
Transform
,
typename
LowDimensionId
,
typename
UpDimensionId
>
__host__
__device__
constexpr
auto
operator
()(
Transform
,
LowDimensionId
,
UpDimensionId
)
const
{
// determine whether the transformation itself is linear
constexpr
bool
is_linear_transform
=
Transform
::
IsLinearTransform
();
// determine whether all lower dimensions are linear
constexpr
bool
are_all_low_dim_linear
=
sequence_all_of
(
pick_sequence_elements_by_ids
(
GetLowerTensorDescriptor
().
GetLinearDimensionMask
(),
LowDimensionId
{}),
lambda_is_true
<
index_t
>
{});
// create linear mask for upper dimensions
constexpr
bool
are_up_dim_linear
=
is_linear_transform
&&
are_all_low_dim_linear
;
constexpr
auto
mask_of_up_linear_dims
=
modify_sequence_elements_by_ids
(
typename
uniform_sequence_gen
<
nDimUp
,
1
>::
type
{},
typename
uniform_sequence_gen
<
UpDimensionId
::
Size
(),
are_up_dim_linear
>::
type
{},
UpDimensionId
{});
return
mask_of_up_linear_dims
;
}
};
// TODO: this is a hack, transform_tuples() doesn't compile, would complain about constexpr
template
<
typename
F
,
typename
X
,
typename
Y
,
typename
Z
,
index_t
...
Is
>
__host__
__device__
static
constexpr
auto
dummy_transform_tuples_impl
(
F
f
,
X
x
,
Y
y
,
Z
z
,
Sequence
<
Is
...
>
)
{
return
make_tuple
(
f
(
x
.
At
(
Number
<
Is
>
{}),
y
.
At
(
Number
<
Is
>
{}),
z
.
At
(
Number
<
Is
>
{}))...);
}
__host__
__device__
static
constexpr
auto
GetLinearDimensionMask
()
{
#if 0
// create tuple of linear dimension masks, for all transformations
constexpr auto tuple_of_linear_dimension_mask =
transform_tuples(lambda_get_linear_dimension_mask_of_single_tranform{},
Transforms{},
LowDimensionIds{},
UpDimensionIds{});
#else
// create tuple of linear dimension masks, for all transformations
// TODO: this is a hack; transform_tuples() doesn't compile here, it complains about constexpr
constexpr
auto
tuple_of_linear_dimension_mask
=
dummy_transform_tuples_impl
(
lambda_get_linear_dimension_mask_of_single_tranform
{},
Transforms
{},
LowDimensionIds
{},
UpDimensionIds
{},
typename
arithmetic_sequence_gen
<
0
,
Transforms
::
Size
(),
1
>::
type
{});
#endif
// reduce tuple of masks into one mask
constexpr
auto
linear_dimension_mask
=
unpack
(
lambda_sequence_logical_and
{},
tuple_of_linear_dimension_mask
);
return
linear_dimension_mask
;
}
__host__
__device__
static
constexpr
auto
GetNonLinearDimensionMask
()
{
return
GetLinearDimensionMask
().
Transform
(
logical_not
<
index_t
>
{});
}
template
<
index_t
IDim
>
__host__
__device__
static
constexpr
bool
IsLinearDimension
(
Number
<
IDim
>
)
{
return
GetLinearDimensionMask
().
At
(
Number
<
IDim
>
{});
}
__host__
__device__
static
constexpr
auto
GetLinearDimensions
()
{
constexpr
auto
linear_dimension_mask
=
GetLinearDimensionMask
();
return
pick_sequence_elements_by_mask
(
typename
arithmetic_sequence_gen
<
0
,
nDimUp
,
1
>::
type
{},
linear_dimension_mask
);
}
__host__
__device__
static
constexpr
auto
GetNonLinearDimensions
()
{
constexpr
auto
nonlinear_dimension_mask
=
GetNonLinearDimensionMask
();
return
pick_sequence_elements_by_mask
(
typename
arithmetic_sequence_gen
<
0
,
nDimUp
,
1
>::
type
{},
nonlinear_dimension_mask
);
}
#if 0
__host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
{
// not implemented
}
#endif
__host__
__device__
static
constexpr
bool
IsUpperIndexMappedToValidLowerIndex
(
const
UpperIndex
&
idx_up
)
{
bool
flag
=
true
;
static_for
<
0
,
nTransform
,
1
>
{}([
&
](
auto
itran
)
{
constexpr
auto
tran
=
Transforms
{}.
At
(
itran
);
const
auto
idx_up_part
=
pick_array_element
(
idx_up
,
UpDimensionIds
{}.
At
(
itran
));
flag
=
flag
&&
tran
.
IsUpperIndexMappedToValidLowerIndex
(
to_array
(
idx_up_part
));
});
return
flag
;
}
// Whenever this function is called, it will call CalculateLowerIndex() recursively.
// If you have created a tensor coordinate already, instead of calling this function,
// you should call TensorCoordinate::IsUpperIndexMappedToValidOffset() which would
// be less expensive.
__host__
__device__
static
constexpr
bool
IsUpperIndexMappedToValidOffset
(
const
UpperIndex
&
idx_up
)
{
return
IsUpperIndexMappedToValidLowerIndex
(
idx_up
)
&&
GetLowerTensorDescriptor
().
IsUpperIndexMappedToValidOffset
(
CalculateLowerIndex
(
idx_up
));
}
};
}
// namespace ck
#endif
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
0 → 100644
View file @
52423948
#ifndef CK_TENSOR_DESCRIPTOR_HELPER_HPP
#define CK_TENSOR_DESCRIPTOR_HELPER_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
namespace
ck
{
template
<
typename
Lengths
>
__host__
__device__
constexpr
auto
calculate_tensor_strides_packed
(
Lengths
)
{
return
reverse_inclusive_scan_sequence
(
Lengths
{}.
PopFront
(),
math
::
multiplies
<
index_t
>
{},
Number
<
1
>
{})
.
PushBack
(
Number
<
1
>
{});
}
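// A minimal standalone sketch (plain C++) of the packed-stride rule implemented above:
// drop the first length, do a reverse inclusive product scan, and append 1, so that
// stride[i] = product of lengths[i+1 .. ndim-1]. The NCHW lengths are made up for
// illustration.
constexpr unsigned packed_stride_example(const unsigned* len, unsigned ndim, unsigned idim)
{
    unsigned stride = 1;
    for(unsigned i = idim + 1; i < ndim; ++i)
        stride *= len[i];
    return stride;
}
// e.g. NCHW lengths {8, 16, 28, 28} give strides {16*28*28, 28*28, 28, 1}
constexpr unsigned nchw_len[] = {8, 16, 28, 28};
static_assert(packed_stride_example(nchw_len, 4, 0) == 16 * 28 * 28, "N stride");
static_assert(packed_stride_example(nchw_len, 4, 3) == 1, "W stride");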
template
<
typename
Lengths
,
index_t
Align
>
__host__
__device__
constexpr
auto
calculate_tensor_strides_aligned
(
Lengths
,
Number
<
Align
>
)
{
constexpr
index_t
L_back_align
=
Align
*
math
::
integer_divide_ceiler
<
index_t
>
{}(
Lengths
{}.
Back
(),
Align
);
return
calculate_tensor_strides_packed
(
Lengths
{}.
Modify
(
Number
<
Lengths
{}.
GetSize
()
-
1
>
{},
Number
<
L_back_align
>
{}));
}
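// A minimal standalone sketch (plain C++) of the aligned-stride rule above: the last
// (fastest) length is rounded up to a multiple of Align before the packed strides are
// computed, so every other stride becomes a multiple of Align.
constexpr unsigned round_up_example(unsigned x, unsigned align)
{
    return align * ((x + align - 1) / align);
}
// e.g. lengths {4, 30} with Align = 8: the last length is padded to 32,
// so the resulting strides are {32, 1} instead of the packed {30, 1}
static_assert(round_up_example(30, 8) == 32, "rounded last length");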
template
<
index_t
...
Lengths
,
index_t
...
Strides
>
__host__
__device__
constexpr
auto
make_native_tensor_descriptor
(
Sequence
<
Lengths
...
>
,
Sequence
<
Strides
...
>
)
{
return
NativeTensorDescriptor
<
NativeDimension
<
Lengths
,
Strides
>
...
>
{};
}
template
<
typename
Lengths
>
__host__
__device__
constexpr
auto
make_native_tensor_descriptor_packed
(
Lengths
)
{
constexpr
auto
strides
=
calculate_tensor_strides_packed
(
Lengths
{});
return
make_native_tensor_descriptor
(
Lengths
{},
strides
);
}
template
<
typename
Lengths
,
index_t
Align
>
__host__
__device__
constexpr
auto
make_native_tensor_descriptor_aligned
(
Lengths
,
Number
<
Align
>
)
{
constexpr
auto
strides
=
calculate_tensor_strides_aligned
(
Lengths
{},
Number
<
Align
>
{});
return
make_native_tensor_descriptor
(
Lengths
{},
strides
);
}
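// A short usage sketch of the helpers above, assuming this header and its dependencies
// are included; the concrete lengths, strides and alignment are made up for illustration
// only. The same lengths give a packed descriptor, an explicitly strided descriptor, and
// a descriptor whose innermost dimension is padded to a multiple of 4.
namespace ck_example {
using namespace ck;
constexpr auto desc_packed =
    make_native_tensor_descriptor_packed(Sequence<8, 16, 28, 28>{});
constexpr auto desc_strided =
    make_native_tensor_descriptor(Sequence<8, 16, 28, 28>{},
                                  Sequence<16 * 32 * 32, 32 * 32, 32, 1>{});
constexpr auto desc_aligned =
    make_native_tensor_descriptor_aligned(Sequence<8, 16, 28, 28>{}, Number<4>{});
} // namespace ck_example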
template
<
typename
LowTensorDescriptor
,
typename
Transforms
,
typename
LowDimensionIds
,
typename
UpDimensionIds
>
__host__
__device__
constexpr
auto
transform_tensor_descriptor
(
LowTensorDescriptor
,
Transforms
,
LowDimensionIds
,
UpDimensionIds
)
{
return
TransformedTensorDescriptor
<
LowTensorDescriptor
,
Transforms
,
LowDimensionIds
,
UpDimensionIds
>
{};
}
template
<
typename
LowerTensorDescriptor
,
index_t
...
LowerLengths
,
index_t
...
LowerDimensionIds
,
index_t
...
UpperDimensionIds
>
__host__
__device__
constexpr
auto
reorder_tensor_descriptor_impl
(
LowerTensorDescriptor
,
Sequence
<
LowerLengths
...
>
,
Sequence
<
LowerDimensionIds
...
>
,
Sequence
<
UpperDimensionIds
...
>
)
{
return
TransformedTensorDescriptor
<
LowerTensorDescriptor
,
Tuple
<
PassThrough
<
LowerLengths
>
...
>
,
Tuple
<
Sequence
<
LowerDimensionIds
>
...
>
,
Tuple
<
Sequence
<
UpperDimensionIds
>
...
>>
{};
}
template
<
typename
LowerTensorDescriptor
,
typename
MapLower2Upper
>
__host__
__device__
constexpr
auto
reorder_tensor_descriptor_given_lower2upper
(
LowerTensorDescriptor
,
MapLower2Upper
)
{
static_assert
(
is_valid_sequence_map
<
MapLower2Upper
>
{},
"wrong! MapLower2Upper is not a valid map"
);
return
reorder_tensor_descriptor_impl
(
LowerTensorDescriptor
{},
LowerTensorDescriptor
::
GetLengths
(),
typename
arithmetic_sequence_gen
<
0
,
LowerTensorDescriptor
::
GetNumOfDimension
(),
1
>::
type
{},
MapLower2Upper
{});
}
template
<
typename
LowerTensorDescriptor
,
typename
MapUpper2Lower
>
__host__
__device__
constexpr
auto
reorder_tensor_descriptor_given_upper2lower
(
LowerTensorDescriptor
,
MapUpper2Lower
)
{
return
reorder_tensor_descriptor_given_lower2upper
(
LowerTensorDescriptor
{},
typename
sequence_map_inverse
<
MapUpper2Lower
>::
type
{});
}
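// A minimal standalone sketch (plain C++) of the map inversion used above: given a
// lower-to-upper dimension map (map[lower] = upper), the upper-to-lower map is its
// inverse permutation, inv[map[i]] = i. For example, viewing a lower NCHW descriptor as
// an upper NHWC one uses the lower-to-upper map {0, 3, 1, 2} (C becomes upper dim 3,
// H dim 1, W dim 2), whose inverse is {0, 2, 3, 1}.
constexpr unsigned invert_map_example(const unsigned* map, unsigned n, unsigned i)
{
    unsigned inv = 0;
    for(unsigned j = 0; j < n; ++j)
        if(map[j] == i)
            inv = j;
    return inv;
}
constexpr unsigned low2up_nchw_to_nhwc[] = {0, 3, 1, 2};
static_assert(invert_map_example(low2up_nchw_to_nhwc, 4, 1) == 2, "upper dim 1 comes from lower dim 2 (H)");
static_assert(invert_map_example(low2up_nchw_to_nhwc, 4, 3) == 1, "upper dim 3 comes from lower dim 1 (C)");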
template
<
typename
Lengths
,
typename
Strides
>
__host__
__device__
constexpr
bool
AreDimensionsUnfoldable
(
Lengths
,
Strides
)
{
static_assert
(
Lengths
::
Size
()
==
Strides
::
Size
(),
"wrong!"
);
bool
flag
=
true
;
for
(
index_t
i
=
0
;
i
<
Lengths
::
Size
()
-
1
;
++
i
)
{
flag
=
flag
&&
Strides
::
At
(
i
)
==
Strides
::
At
(
i
+
1
)
*
Lengths
::
At
(
i
+
1
);
}
return
flag
;
}
// unfold only supports NativeTensorDescriptor, for now
template
<
index_t
FirstUnfoldDim
,
index_t
LastUnfoldDim
,
typename
...
Ts
>
__host__
__device__
constexpr
auto
unfold_tensor_descriptor
(
NativeTensorDescriptor
<
Ts
...
>
desc
,
Number
<
FirstUnfoldDim
>
,
Number
<
LastUnfoldDim
>
)
{
constexpr
index_t
nDim
=
desc
.
GetNumOfDimension
();
static_assert
(
FirstUnfoldDim
>=
0
&&
LastUnfoldDim
<
nDim
&&
FirstUnfoldDim
<=
LastUnfoldDim
,
"wrong! should have FirstUnfoldDim <= LastUnfoldDim!"
);
// left, middle and right dimension ranges
constexpr
auto
left
=
typename
arithmetic_sequence_gen
<
0
,
FirstUnfoldDim
,
1
>::
type
{};
constexpr
auto
middle
=
typename
arithmetic_sequence_gen
<
FirstUnfoldDim
,
LastUnfoldDim
+
1
,
1
>::
type
{};
constexpr
auto
right
=
typename
arithmetic_sequence_gen
<
LastUnfoldDim
+
1
,
nDim
,
1
>::
type
{};
// sanity-check: unfoldable
static_assert
(
AreDimensionsUnfoldable
(
desc
.
GetLengths
(
middle
),
desc
.
GetStrides
(
middle
)),
"wrong! not unfoldable"
);
// unfolded length, stride
constexpr
index_t
unfold_length
=
reduce_on_sequence
(
desc
.
GetLengths
(
middle
),
math
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
constexpr
index_t
unfold_stride
=
desc
.
GetStride
(
Number
<
LastUnfoldDim
>
{});
// new lengths, strides
constexpr
auto
new_lengths
=
desc
.
GetLengths
(
left
).
PushBack
(
Number
<
unfold_length
>
{}).
PushBack
(
desc
.
GetLengths
(
right
));
constexpr
auto
new_strides
=
desc
.
GetStrides
(
left
).
PushBack
(
Number
<
unfold_stride
>
{}).
PushBack
(
desc
.
GetStrides
(
right
));
return
make_native_tensor_descriptor
(
new_lengths
,
new_strides
);
}
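// A minimal standalone sketch (plain C++) of the unfold rule above: a contiguous run of
// dimensions can be folded into one when stride[i] == stride[i+1] * length[i+1] for every
// adjacent pair in the run; the unfolded dimension then has length = product of the run's
// lengths and stride = stride of the last dimension in the run. The example shapes are
// made up for illustration.
constexpr bool unfoldable_pair_example(unsigned stride_i, unsigned stride_ip1, unsigned len_ip1)
{
    return stride_i == stride_ip1 * len_ip1;
}
// e.g. a packed {8, 16, 28, 28} tensor has strides {12544, 784, 28, 1}; dims 2 and 3 are
// unfoldable (28 == 1 * 28) into a single dimension of length 784 and stride 1
static_assert(unfoldable_pair_example(28, 1, 28), "H and W are unfoldable");
static_assert(!unfoldable_pair_example(32, 1, 28), "a padded H stride breaks unfoldability");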
#if 0
// not implemented
template <typename LowerTensorDescriptor,
typename PadDimensionIds,
typename LeftPads,
typename RightPads>
__host__ __device__ constexpr auto
pad_tensor_descriptor(LowerTensorDescriptor, PadLowerDimensionIds, LeftPads, RightPads)
{
constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension();
constexpr auto non_pad_low_dim_ids = xxx;
return transform_tensor_descriptor(
LowerTensorDescriptor{},
make_tuple(Pad<decltype(LowerTensorDescriptor::GetLengths(PadLowerDimensionIds{})),
LeftPads,
RightPads>{})
.PushBack(PassThrough<xxxx>...),
make_tuple(PadLowerDimensionIds{}).PushBack(xxxx),
sequence_to_tuple(typename arithmetic_sequence_gen<0, nDim, 1> i::type{}));
}
#endif
// a cluster maps a 1d index to an N-d index
template
<
typename
Lengths
,
typename
ArrangeOrder
>
struct
ClusterDescriptor
{
static
constexpr
index_t
nDim
=
Lengths
::
Size
();
static
constexpr
auto
mDesc
=
transform_tensor_descriptor
(
make_native_tensor_descriptor_packed
(
Lengths
{}),
make_tuple
(
Merge
<
decltype
(
Lengths
::
ReorderGivenNew2Old
(
ArrangeOrder
{}))
>
{}),
make_tuple
(
ArrangeOrder
{}),
make_tuple
(
Sequence
<
0
>
{}));
__host__
__device__
constexpr
ClusterDescriptor
()
{
static_assert
(
Lengths
::
Size
()
==
nDim
&&
ArrangeOrder
::
Size
()
==
nDim
,
"wrong! size not the same"
);
static_assert
(
is_valid_sequence_map
<
ArrangeOrder
>
{},
"wrong! ArrangeOrder is wrong"
);
}
__host__
__device__
static
constexpr
index_t
GetElementSize
()
{
return
mDesc
.
GetElementSize
();
}
__host__
__device__
static
constexpr
auto
CalculateClusterIndex
(
index_t
idx_1d
)
{
return
mDesc
.
CalculateLowerIndex
(
MultiIndex
<
1
>
{
idx_1d
});
}
};
template
<
typename
Lengths
,
typename
ArrangeOrder
=
typename
arithmetic_sequence_gen
<
0
,
Lengths
::
Size
(),
1
>
::
type
>
__host__
__device__
constexpr
auto
make_cluster_descriptor
(
Lengths
,
ArrangeOrder
order
=
typename
arithmetic_sequence_gen
<
0
,
Lengths
::
Size
(),
1
>::
type
{})
{
return
ClusterDescriptor
<
Lengths
,
ArrangeOrder
>
{};
}
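// A minimal standalone sketch (plain C++) of what the cluster descriptor above provides
// with the default (identity) arrange order: decomposing a 1-d thread id into an N-d
// cluster index over the given lengths, fastest dimension last, assuming the usual
// mixed-radix (divmod) decomposition performed by the Merge transform.
constexpr unsigned cluster_index_example(unsigned id_1d, const unsigned* len, unsigned ndim, unsigned idim)
{
    // strip off the dimensions that vary faster than idim, then reduce modulo len[idim]
    unsigned remaining = id_1d;
    for(unsigned i = ndim; i-- > idim + 1;)
        remaining /= len[i];
    return remaining % len[idim];
}
// e.g. lengths {4, 8}: thread id 13 maps to cluster index (1, 5)
constexpr unsigned cluster_len[] = {4, 8};
static_assert(cluster_index_example(13, cluster_len, 2, 0) == 1, "slow dimension");
static_assert(cluster_index_example(13, cluster_len, 2, 1) == 5, "fast dimension");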
template
<
typename
...
NativeDimensions
>
__host__
__device__
void
print_tensor_descriptor
(
const
char
*
s
,
const
NativeTensorDescriptor
<
NativeDimensions
...
>&
desc
)
{
print_tensor_descriptor_impl
(
s
,
desc
.
GetLengths
(),
desc
.
GetStrides
());
}
template
<
typename
...
Ts
>
__host__
__device__
void
print_tensor_descriptor
(
const
char
*
s
,
const
TransformedTensorDescriptor
<
Ts
...
>&
desc
)
{
print_tensor_descriptor_impl
(
s
,
desc
.
GetLengths
());
}
template
<
index_t
...
Lengths
,
index_t
...
Strides
>
__host__
__device__
void
print_tensor_descriptor_impl
(
const
char
*
s
,
Sequence
<
Lengths
...
>
,
Sequence
<
Strides
...
>
)
{
constexpr
index_t
nDim
=
sizeof
...(
Lengths
);
static_assert
(
nDim
>
0
&&
nDim
<=
12
,
"wrong!"
);
static_if
<
nDim
==
1
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u}, strides {%u}
\n
"
,
s
,
nDim
,
Lengths
...,
Strides
...);
});
static_if
<
nDim
==
2
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u}, strides {%u %u}
\n
"
,
s
,
nDim
,
Lengths
...,
Strides
...);
});
static_if
<
nDim
==
3
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u}, strides {%u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...,
Strides
...);
});
static_if
<
nDim
==
4
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...,
Strides
...);
});
static_if
<
nDim
==
5
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...,
Strides
...);
});
static_if
<
nDim
==
6
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...,
Strides
...);
});
static_if
<
nDim
==
7
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...,
Strides
...);
});
static_if
<
nDim
==
8
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...,
Strides
...);
});
static_if
<
nDim
==
9
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u "
"%u}
\n
"
,
s
,
nDim
,
Lengths
...,
Strides
...);
});
static_if
<
nDim
==
10
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u "
"%u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...,
Strides
...);
});
static_if
<
nDim
==
11
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u "
"%u %u "
"%u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...,
Strides
...);
});
static_if
<
nDim
==
12
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u "
"%u %u %u %u "
"%u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...,
Strides
...);
});
}
template
<
index_t
...
Lengths
>
__host__
__device__
void
print_tensor_descriptor_impl
(
const
char
*
s
,
Sequence
<
Lengths
...
>
)
{
constexpr
index_t
nDim
=
sizeof
...(
Lengths
);
static_assert
(
nDim
>
0
&&
nDim
<=
12
,
"wrong!"
);
static_if
<
nDim
==
1
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u}
\n
"
,
s
,
nDim
,
Lengths
...);
});
static_if
<
nDim
==
2
>
{}(
[
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u}
\n
"
,
s
,
nDim
,
Lengths
...);
});
static_if
<
nDim
==
3
>
{}(
[
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...);
});
static_if
<
nDim
==
4
>
{}(
[
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...);
});
static_if
<
nDim
==
5
>
{}(
[
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...);
});
static_if
<
nDim
==
6
>
{}(
[
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u},
\n
"
,
s
,
nDim
,
Lengths
...);
});
static_if
<
nDim
==
7
>
{}(
[
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...);
});
static_if
<
nDim
==
8
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...);
});
static_if
<
nDim
==
9
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...);
});
static_if
<
nDim
==
10
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...);
});
static_if
<
nDim
==
11
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...);
});
static_if
<
nDim
==
12
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}
\n
"
,
s
,
nDim
,
Lengths
...);
});
}
}
// namespace ck
#endif
composable_kernel/include/tensor_description/tensor_view.hpp
0 → 100644
View file @
52423948
#ifndef CK_TENSOR_VIEW_HPP
#define CK_TENSOR_VIEW_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_coordinate_deprecated.hpp"
namespace
ck
{
// TensorDesc is ConstantTensorDescriptor or ConstantMergedTensorDescriptor
template
<
class
TensorDesc
,
class
TData
>
struct
NormalTensorView
{
using
type
=
NormalTensorView
;
using
tensor_desc_type
=
TensorDesc
;
using
coordinate_type
=
typename
NormalTensorCoordinate_deprecated
<
TensorDesc
>::
type
;
using
data_type
=
TData
;
static
constexpr
auto
nDim
=
TensorDesc
::
GetNumOfDimension
();
__host__
__device__
constexpr
NormalTensorView
(
TData
*
p_data
)
:
mpData
{
p_data
}
{}
__host__
__device__
constexpr
NormalTensorView
()
:
NormalTensorView
{
nullptr
}
{}
__host__
__device__
static
constexpr
auto
GetNumOfDimension
()
{
return
nDim
;
}
__host__
__device__
static
constexpr
auto
GetLengths
()
{
return
TensorDesc
::
GetLengths
();
}
__host__
__device__
const
TData
&
operator
[](
coordinate_type
coord
)
const
{
return
mpData
[
coord
.
GetOffset
()];
}
__host__
__device__
TData
&
operator
()(
coordinate_type
coord
)
const
{
return
mpData
[
coord
.
GetOffset
()];
}
template
<
class
IDim
,
class
DataPerVector
>
__host__
__device__
static
constexpr
auto
IsVectorizationAllowed
(
IDim
,
DataPerVector
)
{
return
TensorDesc
::
IsVectorizationAllowed
(
IDim
{},
DataPerVector
{});
}
template
<
class
IDim
,
class
DataPerVector
>
__host__
__device__
auto
Vectorize
(
IDim
idim
,
DataPerVector
data_per_vector
)
const
{
static_assert
(
IsVectorizationAllowed
(
idim
,
data_per_vector
),
"wrong!"
);
using
vector_t
=
typename
vector_type
<
TData
,
data_per_vector
>::
MemoryType
;
return
NormalTensorView
<
decltype
(
TensorDesc
::
Vectorize
(
idim
,
data_per_vector
)),
vector_t
>
(
reinterpret_cast
<
vector_t
*>
(
mpData
));
}
template
<
index_t
...
Is
>
__host__
__device__
auto
Slice
(
coordinate_type
slice_origin
,
Sequence
<
Is
...
>
slice_lengths
)
{
static_assert
(
slice_lengths
.
GetSize
()
==
nDim
,
"wrong!"
);
return
NormalTensorView
<
decltype
(
TensorDesc
::
Slice
(
slice_lengths
)),
TData
>
(
mpData
+
slice_origin
.
GetOffset
());
}
template
<
class
IDim
,
class
SliceLen
>
__host__
__device__
auto
Slice
(
coordinate_type
slice_origin
,
IDim
idim
,
SliceLen
slice_len
)
const
{
return
NormalTensorView
<
decltype
(
TensorDesc
::
Slice
(
idim
,
slice_len
)),
TData
>
(
mpData
+
slice_origin
.
GetOffset
());
}
// slice_window is a slicing window on "*this"
template
<
class
SliceWindow
,
class
T
,
bool
PositiveDirection
>
__device__
void
MoveSliceWindow
(
SliceWindow
&
slice_window
,
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
{
if
(
PositiveDirection
)
{
slice_window
.
mpData
+=
coordinate_type
{
step_sizes
}.
GetOffset
();
}
else
{
slice_window
.
mpData
-=
coordinate_type
{
step_sizes
}.
GetOffset
();
}
}
// private:
data_type
*
mpData
;
};
template
<
class
...
Xs
,
class
TData
>
__host__
__device__
constexpr
auto
make_TensorView
(
ConstantTensorDescriptor
<
Xs
...
>
,
TData
*
p_data
)
{
return
NormalTensorView
<
ConstantTensorDescriptor
<
Xs
...
>
,
TData
>
{
p_data
};
}
}
// namespace ck
#endif
composable_kernel/include/tensor_description/tensor_visit.hpp
0 → 100644
View file @
52423948
#ifndef CK_TENSOR_VISIT_HPP
#define CK_TENSOR_VISIT_HPP
#include "common_header.hpp"
#include "dimension.hpp"
#include "dimension_transform.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_coordinate.hpp"
namespace
ck
{
template
<
class
TensorDescriptor
>
struct
TensorVisit
{
using
Index
=
typename
TensorDescriptor
::
Index
;
using
Coordinate
=
typename
TensorCoordinate
<
TensorDescriptor
>::
type
;
__host__
__device__
static
void
Run_v1
(
Index
idx_begin
)
{
const
auto
coord_begin
=
Coordinate
(
idx_begin
);
ford
<
TensorDescriptor
::
GetLengths
()
>
{}(
[
&
](
auto
idx_diff
)
{
index_t
offset
=
(
coord_begin
+
idx_diff
).
GetOffset
();
});
}
__host__
__device__
static
void
Run_v2
(
Index
idx_begin
)
{
const
auto
coord_begin
=
Coordinate
(
idx_begin
);
ford
<
TensorDescriptor
::
GetLengths
()
>
{}([
&
](
auto
idx_diff
)
{
index_t
offset_diff
=
coord_begin
.
GetOffsetDiff
(
idx_diff
);
index_t
offset
=
coord_begin
.
GetOffset
()
+
offset_diff
;
});
}
__host__
__device__
static
void
Run_v3
(
Index
idx_begin
)
{
const
auto
coord_begin
=
Coordinate
(
idx_begin
);
constexpr
auto
linear_dimensions
=
TensorDescriptor
::
GetLinearDimensions
();
constexpr
auto
nonlinear_dimensions
=
TensorDescriptor
::
GetNonLinearDimensions
();
constexpr
auto
lengths
=
TensorDescriptor
::
GetLengths
();
constexpr
auto
linear_dimension_lengths_hack
=
lambda_HackLengths
{}(
lengths
,
linear_dimensions
);
constexpr
auto
nonlinear_dimension_lengths_hack
=
lambda_HackLengths
{}(
lengths
,
nonlinear_dimensions
);
ford
<
nonlinear_dimension_lengths_hack
>
{}([
&
](
auto
idx_diff_nonlinear_hack
)
{
// run-time component
index_t
offset_diff_nonlinear
=
coord_begin
.
GetOffsetDiff
(
idx_diff_nonlinear_hack
);
ford
<
linear_dimension_lengths_hack
>
{}([
&
](
auto
idx_diff_linear_hack
)
{
// compile-time component
index_t
offset_diff_linear
=
coord_begin
.
GetOffsetDiff
(
idx_diff_linear_hack
);
index_t
offset
=
coord_begin
.
GetOffset
()
+
offset_diff_nonlinear
+
offset_diff_linear
;
});
});
}
__host__
__device__
static
void
Run_v4
(
Index
idx_begin
)
{
const
auto
coord_begin
=
Coordinate
(
idx_begin
);
constexpr
auto
linear_dimensions
=
TensorDescriptor
::
GetLinearDimensions
();
constexpr
auto
nonlinear_independent_dimension_groups
=
TensorDescriptor
::
GetNonLinearIndependentDimensionGroups
();
constexpr
auto
lengths
=
TensorDescriptor
::
GetLengths
();
constexpr
auto
linear_dimension_lengths
=
lambda_HackLengths
{}(
lengths
,
linear_dimensions
);
// run-time component
index_t
offset_diff_nonlinear
=
0
;
template
<
index_t
NGroup
>
struct
f_recursion
{
template
<
index_t
IGroup
>
__host__
__device__
void
Run
(
Number
<
IGroup
>
)
{
constexpr
auto
nonlinear_independent_dimensions_igroup
=
nonlinear_independent_dimension_groups
.
Get
(
igroup
);
constexpr
auto
nonlinear_independent_lengths_igroup
=
lambda_HackLengths
{}(
lengths
,
nonlinear_independent_dimensions_igroup
);
ford
<
nonlinear_independent_lengths_igroup
>
{}(
[
&
](
auto
idx_diff_nonlinear_igroup_hack
)
{
// run-time component
offset_diff_nonlinear
+=
coord_begin
.
GetOffsetDiff
(
idx_diff_nonlinear_igroup_hack
);
Run
(
Number
<
IGroup
+
1
>
{});
});
};
// inner-most work
template
<
>
__host__
__device__
void
Run
(
Number
<
NGroup
>
)
{
ford
<
linear_dimension_lengths
>
{}([
&
](
auto
idx_diff_linear_hack
)
{
// compile-time component
index_t
offset_diff_linear
=
coord_begin
.
GetOffsetDiff
(
idx_diff_linear_hack
);
index_t
offset
=
coord_begin
.
GetOffset
()
+
offset_diff_nonlinear
+
offset_diff_linear
;
});
}
};
// run-time component
index_t
offset_diff_nonlinear
=
0
;
f_recursion
<
nonlinear_independent_dimension_groups
.
GetSize
()
>
{}.
Run
();
}
};
}
// namespace ck
#endif
composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp
View file @
52423948
...
@@ -563,7 +563,7 @@ struct Blockwise2dTensorCopy3
...
@@ -563,7 +563,7 @@ struct Blockwise2dTensorCopy3
}
}
}
}
__device__
constexpr
index_t
GetRegister
Clipboard
Size
()
const
__device__
constexpr
index_t
GetRegister
Buffer
Size
()
const
{
{
static_assert
(
is_same
<
Float
,
float
>
{},
"wrong! only support float!
\n
"
);
static_assert
(
is_same
<
Float
,
float
>
{},
"wrong! only support float!
\n
"
);
...
@@ -579,8 +579,8 @@ struct Blockwise2dTensorCopy3
...
@@ -579,8 +579,8 @@ struct Blockwise2dTensorCopy3
return
DataPerRead
*
(
L0
+
thread_per_d0
-
1
)
/
thread_per_d0
;
return
DataPerRead
*
(
L0
+
thread_per_d0
-
1
)
/
thread_per_d0
;
}
}
__device__
void
RunLoadRegister
Clipboard
(
const
Float
*
__restrict__
p_src
,
__device__
void
RunLoadRegister
Buffer
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_clipboard
)
const
Float
*
__restrict__
p_clipboard
)
const
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -630,8 +630,8 @@ struct Blockwise2dTensorCopy3
...
@@ -630,8 +630,8 @@ struct Blockwise2dTensorCopy3
}
}
}
}
__device__
void
RunStoreRegister
Clipboard
(
const
Float
*
__restrict__
p_clipboard
,
__device__
void
RunStoreRegister
Buffer
(
const
Float
*
__restrict__
p_clipboard
,
Float
*
__restrict__
p_dst
)
const
Float
*
__restrict__
p_dst
)
const
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -681,8 +681,8 @@ struct Blockwise2dTensorCopy3
...
@@ -681,8 +681,8 @@ struct Blockwise2dTensorCopy3
}
}
#if CK_USE_AMD_INLINE_ASM
#if CK_USE_AMD_INLINE_ASM
__device__
void
RunLoadRegister
Clipboard
_asm
(
const
Float
*
__restrict__
p_src
,
__device__
void
RunLoadRegister
Buffer
_asm
(
const
Float
*
__restrict__
p_src
,
Float
*
p_clipboard
)
const
Float
*
p_clipboard
)
const
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -741,8 +741,8 @@ struct Blockwise2dTensorCopy3
...
@@ -741,8 +741,8 @@ struct Blockwise2dTensorCopy3
}
}
}
}
__device__
void
RunStoreRegister
Clipboard
_asm
(
const
Float
*
__restrict__
p_clipboard
,
__device__
void
RunStoreRegister
Buffer
_asm
(
const
Float
*
__restrict__
p_clipboard
,
Float
*
__restrict__
p_dst
)
const
Float
*
__restrict__
p_dst
)
const
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
...
composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp
View file @
52423948
...
@@ -162,7 +162,7 @@ struct Blockwise3dTensorCopy3
...
@@ -162,7 +162,7 @@ struct Blockwise3dTensorCopy3
"wrrong! BlockSize is not big enough for ThreadPerDims!"
);
"wrrong! BlockSize is not big enough for ThreadPerDims!"
);
constexpr
index_t
num_active_thread
=
constexpr
index_t
num_active_thread
=
accumulate_on_sequence
(
ThreadPerDims
{},
math
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
reduce_on_sequence
(
ThreadPerDims
{},
math
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
if
(
BlockSize
>
num_active_thread
)
if
(
BlockSize
>
num_active_thread
)
{
{
...
@@ -237,7 +237,7 @@ struct Blockwise3dTensorCopy3
...
@@ -237,7 +237,7 @@ struct Blockwise3dTensorCopy3
}
}
}
}
__device__
static
constexpr
index_t
GetRegister
Clipboard
Size
()
__device__
static
constexpr
index_t
GetRegister
Buffer
Size
()
{
{
static_assert
(
is_same
<
Float
,
float
>
{},
"wrong! only support float!
\n
"
);
static_assert
(
is_same
<
Float
,
float
>
{},
"wrong! only support float!
\n
"
);
...
@@ -260,8 +260,8 @@ struct Blockwise3dTensorCopy3
...
@@ -260,8 +260,8 @@ struct Blockwise3dTensorCopy3
return
DataPerRead
*
nloop_d0
*
nloop_d1
*
nloop_d2
;
return
DataPerRead
*
nloop_d0
*
nloop_d1
*
nloop_d2
;
}
}
__device__
void
RunLoadRegister
Clipboard
(
const
Float
*
__restrict__
p_src
,
__device__
void
RunLoadRegister
Buffer
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_clipboard
)
const
Float
*
__restrict__
p_clipboard
)
const
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -316,8 +316,8 @@ struct Blockwise3dTensorCopy3
...
@@ -316,8 +316,8 @@ struct Blockwise3dTensorCopy3
}
}
}
}
__device__
void
RunStoreRegister
Clipboard
(
const
Float
*
__restrict__
p_clipboard
,
__device__
void
RunStoreRegister
Buffer
(
const
Float
*
__restrict__
p_clipboard
,
Float
*
__restrict__
p_dst
)
const
Float
*
__restrict__
p_dst
)
const
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
...
composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp
View file @
52423948
...
@@ -505,7 +505,7 @@ struct Blockwise4dTensorCopy3
...
@@ -505,7 +505,7 @@ struct Blockwise4dTensorCopy3
"wrrong! BlockSize is not big enough for ThreadPerDims!"
);
"wrrong! BlockSize is not big enough for ThreadPerDims!"
);
constexpr
index_t
num_active_thread
=
constexpr
index_t
num_active_thread
=
accumulate_on_sequence
(
ThreadPerDims
{},
math
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
reduce_on_sequence
(
ThreadPerDims
{},
math
::
multiplies
<
index_t
>
{},
Number
<
1
>
{});
if
(
BlockSize
>
num_active_thread
)
if
(
BlockSize
>
num_active_thread
)
{
{
...
@@ -596,7 +596,7 @@ struct Blockwise4dTensorCopy3
...
@@ -596,7 +596,7 @@ struct Blockwise4dTensorCopy3
}
}
}
}
__device__
constexpr
index_t
GetRegister
Clipboard
Size
()
const
__device__
constexpr
index_t
GetRegister
Buffer
Size
()
const
{
{
static_assert
(
is_same
<
Float
,
float
>
{},
"wrong! only support float!
\n
"
);
static_assert
(
is_same
<
Float
,
float
>
{},
"wrong! only support float!
\n
"
);
...
@@ -623,8 +623,8 @@ struct Blockwise4dTensorCopy3
...
@@ -623,8 +623,8 @@ struct Blockwise4dTensorCopy3
return
DataPerRead
*
nloop_d0
*
nloop_d1
*
nloop_d2
*
nloop_d3
;
return
DataPerRead
*
nloop_d0
*
nloop_d1
*
nloop_d2
*
nloop_d3
;
}
}
__device__
void
RunLoadRegister
Clipboard
(
const
Float
*
__restrict__
p_src
,
__device__
void
RunLoadRegister
Buffer
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_clipboard
)
const
Float
*
__restrict__
p_clipboard
)
const
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -690,8 +690,8 @@ struct Blockwise4dTensorCopy3
...
@@ -690,8 +690,8 @@ struct Blockwise4dTensorCopy3
}
}
}
}
__device__
void
RunStoreRegister
Clipboard
(
const
Float
*
__restrict__
p_clipboard
,
__device__
void
RunStoreRegister
Buffer
(
const
Float
*
__restrict__
p_clipboard
,
Float
*
__restrict__
p_dst
)
const
Float
*
__restrict__
p_dst
)
const
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
...
composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp
View file @
52423948
...
@@ -5,6 +5,10 @@
...
@@ -5,6 +5,10 @@
#include "ConstantMatrixDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "threadwise_gemm.hpp"
#include "threadwise_gemm.hpp"
#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif
namespace
ck
{
namespace
ck
{
template
<
index_t
BlockSize
,
template
<
index_t
BlockSize
,
...
@@ -97,24 +101,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
...
@@ -97,24 +101,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
mMyThreadOffsetB
=
c_thread_mtx_index
.
batch
*
BlockMatrixStrideB
+
mMyThreadOffsetB
=
c_thread_mtx_index
.
batch
*
BlockMatrixStrideB
+
b_block_mtx
.
GetOffsetFromMultiIndex
(
0
,
c_thread_mtx_index
.
col
);
b_block_mtx
.
GetOffsetFromMultiIndex
(
0
,
c_thread_mtx_index
.
col
);
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantMatrixDescriptor(BlockMatrixA{}, "a_block_mtx: ");
print_ConstantMatrixDescriptor(BlockMatrixB{}, "b_block_mtx: ");
print_ConstantMatrixDescriptor(ThreadMatrixC{}, "c_thread_mtx: ");
printf("%u %u, %u %u %u, %u %u\n",
get_block_1d_id(),
get_thread_local_1d_id(),
c_thread_mtx_index.batch,
c_thread_mtx_index.row,
c_thread_mtx_index.col,
mMyThreadOffsetA,
mMyThreadOffsetB);
}
#endif
}
}
__device__
MatrixIndex
GetBeginOfThreadMatrixC
(
index_t
thread_id
)
const
__device__
MatrixIndex
GetBeginOfThreadMatrixC
(
index_t
thread_id
)
const
...
@@ -257,29 +243,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
...
@@ -257,29 +243,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
}
}
}
}
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
printf("a: %f %f %f %f %f %f %f %f, b: %f %f %f %f %f %f %f %f\n",
p_a_thread[0],
p_a_thread[1],
p_a_thread[2],
p_a_thread[3],
p_a_thread[4],
p_a_thread[5],
p_a_thread[6],
p_a_thread[7],
p_b_thread[0],
p_b_thread[1],
p_b_thread[2],
p_b_thread[3],
p_b_thread[4],
p_b_thread[5],
p_b_thread[6],
p_b_thread[7]);
}
#endif
threadwise_gemm
(
a_thread_mtx
,
threadwise_gemm
(
a_thread_mtx
,
True
,
True
,
p_a_thread
,
p_a_thread
,
...
@@ -311,10 +274,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
...
@@ -311,10 +274,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
// thread A, B for GEMM
// thread A, B for GEMM
// A is transposed, b is not
// A is transposed, b is not
constexpr
auto
a_thread_mtx
=
constexpr
auto
a_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
MPerThread
>
{});
make_ConstantMatrixDescriptor
_packed
(
Number
<
KPerThreadLoop
>
{},
Number
<
MPerThread
>
{});
constexpr
auto
b_thread_mtx
=
constexpr
auto
b_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThread
>
{});
make_ConstantMatrixDescriptor
_packed
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThread
>
{});
// thread A-sub, B-sub for copy
// thread A-sub, B-sub for copy
constexpr
auto
a_thread_sub_mtx
=
make_ConstantMatrixDescriptor
(
constexpr
auto
a_thread_sub_mtx
=
make_ConstantMatrixDescriptor
(
...
@@ -382,102 +345,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
...
@@ -382,102 +345,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
0
],
reg_c
[
8
],
reg_c
[
10
],
reg_c
[
12
],
reg_c
[
14
]);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
0
],
reg_c
[
8
],
reg_c
[
10
],
reg_c
[
12
],
reg_c
[
14
]);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
1
],
reg_c
[
9
],
reg_c
[
11
],
reg_c
[
13
],
reg_c
[
15
]);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
1
],
reg_c
[
9
],
reg_c
[
11
],
reg_c
[
13
],
reg_c
[
15
]);
}
}
template
<
class
FloatA
,
class
FloatB
,
class
FloatC
>
__device__
void
Run_asm_v2
(
const
FloatA
*
__restrict__
p_a_block
,
const
FloatB
*
__restrict__
p_b_block
,
FloatC
*
__restrict__
p_c_thread
)
const
{
constexpr
auto
a_block_mtx
=
BlockMatrixA
{};
constexpr
auto
b_block_mtx
=
BlockMatrixB
{};
constexpr
auto
c_thread_mtx
=
ThreadMatrixC
{};
constexpr
index_t
M
=
a_block_mtx
.
NCol
();
constexpr
index_t
N
=
b_block_mtx
.
NCol
();
constexpr
index_t
K
=
a_block_mtx
.
NRow
();
// A is transposed
constexpr
index_t
MPerThread
=
c_thread_mtx
.
NRow
();
constexpr
index_t
NPerThread
=
c_thread_mtx
.
NCol
();
// thread A, B for GEMM
// A is transposed, b is not
constexpr
auto
a_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
MPerThread
>
{});
constexpr
auto
b_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThread
>
{});
// thread A-sub, B-sub for copy
constexpr
auto
a_thread_sub_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
MPerThreadSubC
>
{},
Number
<
MPerThread
>
{});
constexpr
auto
b_thread_sub_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThreadSubC
>
{},
Number
<
NPerThread
>
{});
FloatA
p_a_thread
[
a_thread_mtx
.
GetElementSpace
()];
FloatB
p_b_thread
[
b_thread_mtx
.
GetElementSpace
()];
constexpr
index_t
MPerLevel1Cluster
=
MPerThreadSubC
*
MLevel0Cluster
*
MLevel1Cluster
;
constexpr
index_t
NPerLevel1Cluster
=
NPerThreadSubC
*
NLevel0Cluster
*
NLevel1Cluster
;
// assertion for inline asm
static_assert
(
is_same
<
FloatA
,
float
>
{}
&&
is_same
<
FloatB
,
float
>
{}
&&
is_same
<
FloatC
,
float
>
{},
"Run_amd_asm only deal with float
\n
"
);
static_assert
(
MPerThreadSubC
==
4
&&
NPerThreadSubC
==
4
&&
KPerThreadLoop
==
1
&&
MPerThread
==
8
&&
NPerThread
==
8
,
"Run_amd_asm cannot deal with this GEMM shape yet
\n
"
);
static_assert
(
DataPerReadA
==
4
&&
DataPerReadB
==
4
,
"Run_amd_asm only do float4 read
\n
"
);
static_assert
(
BlockMatrixStrideA
==
0
&&
BatchPerThread
==
1
,
"Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == "
"1 for now
\n
"
);
using
Float4
=
vector_type
<
float
,
4
>::
MemoryType
;
Float4
*
reg_a
=
(
Float4
*
)(
p_a_thread
);
Float4
*
reg_b
=
(
Float4
*
)(
p_b_thread
);
Float4
*
reg_c
=
(
Float4
*
)(
p_c_thread
);
void
*
a_lds_loc
=
(
void
*
)(
p_a_block
+
mMyThreadOffsetA
);
void
*
b_lds_loc
=
(
void
*
)(
p_b_block
+
mMyThreadOffsetB
);
constexpr
index_t
a_lds_row_stride
=
sizeof
(
float
)
*
a_block_mtx
.
RowStride
();
constexpr
index_t
b_lds_row_stride
=
sizeof
(
float
)
*
b_block_mtx
.
RowStride
();
constexpr
index_t
a_lds_cluster_col_stride
=
sizeof
(
float
)
*
MPerLevel1Cluster
;
constexpr
index_t
b_lds_cluster_col_stride
=
sizeof
(
float
)
*
NPerLevel1Cluster
;
ds_read_b128
(
reg_a
[
0
],
a_lds_loc
,
0
);
ds_read_b128
(
reg_b
[
0
],
b_lds_loc
,
0
);
ds_read_b128
(
reg_b
[
1
],
b_lds_loc
,
b_lds_cluster_col_stride
);
ds_read_b128
(
reg_a
[
1
],
a_lds_loc
,
a_lds_cluster_col_stride
);
lgkmcnt
(
2
);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
0
],
reg_c
[
0
],
reg_c
[
2
],
reg_c
[
4
],
reg_c
[
6
]);
lgkmcnt
(
1
);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
1
],
reg_c
[
1
],
reg_c
[
3
],
reg_c
[
5
],
reg_c
[
7
]);
#pragma unroll
for
(
index_t
k
=
1
;
k
<
K
;
++
k
)
{
ds_read_b128
(
reg_a
[
0
],
a_lds_loc
,
k
*
a_lds_row_stride
);
lgkmcnt
(
1
);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
0
],
reg_c
[
8
],
reg_c
[
10
],
reg_c
[
12
],
reg_c
[
14
]);
ds_read_b128
(
reg_b
[
0
],
b_lds_loc
,
k
*
b_lds_row_stride
);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
1
],
reg_c
[
9
],
reg_c
[
11
],
reg_c
[
13
],
reg_c
[
15
]);
ds_read_b128
(
reg_b
[
1
],
b_lds_loc
,
b_lds_cluster_col_stride
+
k
*
b_lds_row_stride
);
ds_read_b128
(
reg_a
[
1
],
a_lds_loc
,
a_lds_cluster_col_stride
+
k
*
a_lds_row_stride
);
lgkmcnt
(
2
);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
0
],
reg_c
[
0
],
reg_c
[
2
],
reg_c
[
4
],
reg_c
[
6
]);
lgkmcnt
(
1
);
outerProduct4x4
(
reg_a
[
0
],
reg_b
[
1
],
reg_c
[
1
],
reg_c
[
3
],
reg_c
[
5
],
reg_c
[
7
]);
}
lgkmcnt
(
0
);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
0
],
reg_c
[
8
],
reg_c
[
10
],
reg_c
[
12
],
reg_c
[
14
]);
outerProduct4x4
(
reg_a
[
1
],
reg_b
[
1
],
reg_c
[
9
],
reg_c
[
11
],
reg_c
[
13
],
reg_c
[
15
]);
}
#endif
#endif
template
<
class
FloatA
,
class
FloatB
,
class
FloatC
>
template
<
class
FloatA
,
class
FloatB
,
class
FloatC
>
...
...
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
View file @
52423948
...
@@ -2,466 +2,57 @@
...
@@ -2,466 +2,57 @@
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "common_header.hpp"
#include "
ConstantT
ensor
D
escriptor.hpp"
#include "
t
ensor
_d
escriptor.hpp"
#include "
ConstantMergedT
ensor
D
escriptor.hpp"
#include "
t
ensor
_d
escriptor
_helper
.hpp"
#include "tensor_coordinate.hpp"
#include "tensor_coordinate.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif
namespace
ck
{
namespace
ck
{
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst.
// This function assumes each thread is reading and writing a normal (not merged) tensor,
// to simplify index calculations. To satisfy this assumption, the user needs to make sure
// that, on a merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-lengths. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are performed
// in the constructor of BlockwiseGenericTensorSliceCopy_v1
template
<
index_t
BlockSize
,
class
SrcDesc
,
class
DstDesc
,
class
SliceLengths
,
class
SubLengths
,
class
ThreadClusterLengths
,
class
ThreadClusterArrangeOrder
,
class
SrcDimAccessOrder
,
class
DstDimAccessOrder
,
index_t
SrcVectorAccessDim
,
index_t
DstVectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
BlockwiseGenericTensorSliceCopy_v1
{
static
constexpr
index_t
nDim
=
SrcDesc
::
GetNumOfDimension
();
static
constexpr
index_t
nOriginalDimSrc
=
SrcDesc
::
GetOriginalTensorDescriptor
().
GetNumOfDimension
();
static
constexpr
index_t
nOriginalDimDst
=
DstDesc
::
GetOriginalTensorDescriptor
().
GetNumOfDimension
();
// per-thread offset
index_t
mThreadSrcOffset
;
index_t
mThreadDstOffset
;
// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside constructor, and would be
// updated if slicing-window is moved. However, they will not be used if you always move
// the slicing-window along a non-merged dimension. In that case, compiler should be
// able to remove these calculation.
// TODO: make sure compiler would actually remove them in that case
// partial offset in each (merged) dimension
Array
<
index_t
,
nDim
>
mThreadSrcPartialOffsets
;
Array
<
index_t
,
nDim
>
mThreadDstPartialOffsets
;
// multi-id of original tensor
Array
<
index_t
,
nOriginalDimSrc
>
mThreadSrcOriginalMultiId
;
Array
<
index_t
,
nOriginalDimDst
>
mThreadDstOriginalMultiId
;
__device__
BlockwiseGenericTensorSliceCopy_v1
(
Array
<
index_t
,
nDim
>
src_block_data_id_begin
,
Array
<
index_t
,
nDim
>
dst_block_data_id_begin
)
{
// check NDim consistency
static_assert
(
nDim
==
SrcDesc
::
GetNumOfDimension
()
&&
nDim
==
DstDesc
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
SubLengths
::
GetSize
()
&&
nDim
==
ThreadClusterLengths
::
GetSize
()
&&
nDim
==
ThreadClusterArrangeOrder
::
GetSize
()
&&
nDim
==
SrcDimAccessOrder
::
GetSize
()
&&
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong"
);
// check thread arrange order and read/write access order are valid
static_assert
(
is_valid_sequence_map
<
ThreadClusterArrangeOrder
>::
value
&&
is_valid_sequence_map
<
SrcDimAccessOrder
>::
value
&&
is_valid_sequence_map
<
DstDimAccessOrder
>::
value
,
"wrong!"
);
// thread cluster
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_packed
(
ThreadClusterLengths
::
ReorderGivenNew2Old
(
ThreadClusterArrangeOrder
{}));
// BlockSize
static_assert
(
BlockSize
==
thread_cluster_desc
.
GetElementSize
(),
"wrong! BlockSize"
);
// divide work
constexpr
auto
data_per_cluster_per_dims
=
SubLengths
{}
*
ThreadClusterLengths
{};
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
static_assert
(
SliceLengths
::
Get
(
IDim
)
%
data_per_cluster_per_dims
.
Get
(
IDim
)
==
0
,
"wrong! cannot evenly divide sliced tensor into cluster"
);
});
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
data_per_cluster_per_dims
;
// additional check for merged dimension
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim_
)
{
// src
static_if
<
SrcDesc
::
ContainMultipleOriginalDimensions
(
IDim_
)
>
{}([
&
](
auto
)
{
constexpr
auto
IDim
=
decltype
(
IDim_
){};
// on a merged dimension that contains multiple original dimensions,
// the length of the last original dimension needs to be evenly divisible by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr
auto
idim_last_original_src
=
SrcDesc
::
GetContainedOriginalDimensions
(
IDim
).
Back
();
static_assert
(
SrcDesc
::
GetOriginalTensorDescriptor
().
GetLength
(
idim_last_original_src
)
%
SubLengths
::
Get
(
IDim
)
==
0
,
"wrong!"
);
// merged dimension should have repeat_lengths = 1
static_assert
(
repeat_lengths
[
IDim
]
==
1
,
"wrong! repeat_lengths shoud be 1 on merged dimension"
);
});
// dst
static_if
<
DstDesc
::
ContainMultipleOriginalDimensions
(
IDim_
)
>
{}([
&
](
auto
)
{
constexpr
auto
IDim
=
decltype
(
IDim_
){};
// on a merged dimension that contains multiple original dimensions,
// the length of the last original dimension needs to be evenly divisible by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr
auto
idim_last_original_dst
=
DstDesc
::
GetContainedOriginalDimensions
(
IDim
).
Back
();
static_assert
(
DstDesc
::
GetOriginalTensorDescriptor
().
GetLength
(
idim_last_original_dst
)
%
SubLengths
::
Get
(
IDim
)
==
0
,
"wrong!"
);
// merged dimension should have repeat_lengths = 1
static_assert
(
repeat_lengths
[
IDim
]
==
1
,
"wrong! repeat_lengths shoud be 1 on merged dimension"
);
});
});
// calculate mThreadSrcOffset, mThreadDstOffset
const
auto
thread_cluster_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
const
auto
data_cluster_id
=
reorder_array_given_old2new
(
thread_cluster_id
,
ThreadClusterArrangeOrder
{});
const
auto
thread_data_id_begin
=
data_cluster_id
*
SubLengths
{};
// original multi-id
mThreadSrcOriginalMultiId
=
SrcDesc
::
GetOriginalMultiIndexFromMultiIndex
(
src_block_data_id_begin
+
thread_data_id_begin
);
mThreadDstOriginalMultiId
=
DstDesc
::
GetOriginalMultiIndexFromMultiIndex
(
dst_block_data_id_begin
+
thread_data_id_begin
);
// partial offset on each dimension
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
constexpr
auto
src_partial_original_dims
=
SrcDesc
::
GetContainedOriginalDimensions
(
IDim
);
constexpr
auto
src_partial_original_desc
=
SrcDesc
::
GetOriginalTensorDescriptor
().
Extract
(
src_partial_original_dims
);
mThreadSrcPartialOffsets
(
IDim
)
=
src_partial_original_desc
.
GetOffsetFromMultiIndex
(
extract_array
(
mThreadSrcOriginalMultiId
,
src_partial_original_dims
));
});
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
constexpr
auto
dst_partial_original_dims
=
DstDesc
::
GetContainedOriginalDimensions
(
IDim
);
constexpr
auto
dst_partial_original_desc
=
DstDesc
::
GetOriginalTensorDescriptor
().
Extract
(
dst_partial_original_dims
);
mThreadDstPartialOffsets
(
IDim
)
=
dst_partial_original_desc
.
GetOffsetFromMultiIndex
(
extract_array
(
mThreadDstOriginalMultiId
,
dst_partial_original_dims
));
});
// complete offset
mThreadSrcOffset
=
accumulate_on_array
(
mThreadSrcPartialOffsets
,
math
::
plus
<
index_t
>
{},
static_cast
<
index_t
>
(
0
));
mThreadDstOffset
=
accumulate_on_array
(
mThreadDstPartialOffsets
,
math
::
plus
<
index_t
>
{},
static_cast
<
index_t
>
(
0
));
}
__device__
static
constexpr
auto
GetRegisterBufferDescriptor
()
{
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ThreadClusterLengths
{});
return
make_ConstantTensorDescriptor_packed
(
SubLengths
{}
*
repeat_lengths
);
}
__device__
static
constexpr
index_t
GetRegisterBufferSize
()
{
return
GetRegisterBufferDescriptor
().
GetElementSpace
();
}
template
<
class
TData
>
__device__
void
RunLoadRegisterBuffer
(
const
TData
*
__restrict__
p_src
,
TData
*
__restrict__
p_buffer
)
const
{
constexpr
auto
thread_sub_tensor_lengths
=
SubLengths
{};
constexpr
auto
data_per_cluster_per_dims
=
thread_sub_tensor_lengths
*
ThreadClusterLengths
{};
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ThreadClusterLengths
{});
constexpr
auto
thread_buffer_desc
=
GetRegisterBufferDescriptor
();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
constexpr
auto
src_thread_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
constexpr
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
constexpr
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
src_thread_data_id_begin
);
constexpr
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
#else
ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
const
auto
src_thread_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
const
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
const
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
src_thread_data_id_begin
);
const
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
#endif
// By position the origin of the per-thread window at the point, where multi-index
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
// is assuming each thread is copy a noraml (not merged) tensor.
// To satisfy this assumption, the user need to make sure that, on a merged dimension
// that constains multiple original dimensions, the length of the last original
// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
// the merged dimension need to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1
ThreadwiseGenericTensorSliceCopy_v1r2
<
SrcDesc
,
decltype
(
thread_buffer_desc
),
SubLengths
,
SrcDimAccessOrder
,
SrcVectorAccessDim
,
SrcDataPerAccess
,
1
>
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
.
Run
(
p_src
+
src_offset
+
mThreadSrcOffset
,
p_buffer
+
buffer_offset
);
});
}
template
<
class
TData
>
__device__
void
RunStoreRegisterBuffer
(
const
TData
*
__restrict__
p_buffer
,
TData
*
__restrict__
p_dst
)
const
{
constexpr
auto
thread_sub_tensor_lengths
=
SubLengths
{};
constexpr
auto
data_per_cluster_per_dims
=
thread_sub_tensor_lengths
*
ThreadClusterLengths
{};
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ThreadClusterLengths
{});
constexpr
auto
thread_buffer_desc
=
GetRegisterBufferDescriptor
();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
constexpr
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
constexpr
auto
dst_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
constexpr
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
constexpr
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_data_id_begin
);
#else
ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
const
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
const
auto
dst_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
const
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_data_id_begin
);
#endif
// By position the origin of the per-thread window at the point, where multi-index
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
// is assuming each thread is copy a noraml (not merged) tensor.
// To satisfy this assumption, the user need to make sure that, on a merged dimension
// that constains multiple original dimensions, the length of the last original
// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
// the merged dimension need to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1
ThreadwiseGenericTensorSliceCopy_v1r2
<
decltype
(
thread_buffer_desc
),
DstDesc
,
SubLengths
,
DstDimAccessOrder
,
DstVectorAccessDim
,
1
,
DstDataPerAccess
>
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
.
Run
(
p_buffer
+
buffer_offset
,
p_dst
+
dst_offset
+
mThreadDstOffset
);
});
}
template
<
class
TData
>
__device__
void
Run
(
const
TData
*
__restrict__
p_src
,
TData
*
__restrict__
p_dst
)
const
{
TData
p_buffer
[
GetRegisterBufferSize
()];
RunLoadRegisterBuffer
(
p_src
,
p_buffer
);
RunStoreRegisterBuffer
(
p_buffer
,
p_dst
);
}
// When moving the slicing windows along a merged dimension, if the strides of the
// contained (by the merged dimension) original dimensions are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in positive direction (vice versue for movement in negative direction).
// As a result, there is the possiblity that the offset calculation may result in
// unsigned integer underflow (due to "-" operation). However, this hazard should not
// happen, as long as the users make sure the slicing window would not be moved out of
// the boundary of the tensor being sliced. This functions doesn't do runtime sanity
// check on out-of-bound slicing window, for performance reason
template
<
index_t
IDim_
,
index_t
StepSize
,
bool
PositiveDirection
>
__device__
void
MoveSlicingWindowOnSourceTensor
(
Number
<
IDim_
>
,
Number
<
StepSize
>
,
integral_constant
<
bool
,
PositiveDirection
>
direction
)
{
constexpr
auto
IDim
=
Number
<
IDim_
>
{};
static_if
<
SrcDesc
::
ContainMultipleOriginalDimensions
(
IDim
)
>
{}([
&
](
auto
)
{
// logic for a merged dimension, also works for non-merged dimension, but its logic may
// be unncessarily complicated for compiler to remove calculations that are useless for
// a non-merged dimension
// extract partial original dimensions
constexpr
auto
src_partial_original_dims
=
SrcDesc
::
GetContainedOriginalDimensions
(
IDim
);
constexpr
auto
src_partial_original_desc
=
SrcDesc
::
GetOriginalTensorDescriptor
().
Extract
(
src_partial_original_dims
);
// calculate new partial original multi-id
auto
old_src_partial_original_id
=
extract_array
(
mThreadSrcOriginalMultiId
,
src_partial_original_dims
);
auto
new_src_partial_original_id
=
src_partial_original_desc
.
UpdateMultiIndexGivenStepSizeOf1dIndex
(
old_src_partial_original_id
,
StepSize
,
direction
);
// update "mThreadSrcOriginalMultiId"
static_for
<
0
,
decltype
(
src_partial_original_dims
)
::
GetSize
(),
1
>
{}([
&
](
auto
I
)
{
constexpr
auto
IDimOriginal
=
src_partial_original_dims
[
I
];
mThreadSrcOriginalMultiId
(
IDimOriginal
)
=
new_src_partial_original_id
[
I
];
});
// calculate new partial offset on this merged dimension
const
index_t
old_src_partial_offset
=
mThreadSrcPartialOffsets
[
IDim
];
const
index_t
new_src_partial_offset
=
src_partial_original_desc
.
GetOffsetFromMultiIndex
(
new_src_partial_original_id
);
// update "mThreadSrcPartialOffsets"
mThreadSrcPartialOffsets
(
IDim
)
=
new_src_partial_offset
;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset
=
(
mThreadSrcOffset
+
new_src_partial_offset
)
-
old_src_partial_offset
;
}).
Else
([
&
](
auto
)
{
// Logic for non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
// which are being calculated here, will never be used later. In this case, compiler
// should be able to remove these calculations.
// TODO: make sure compiler would actually remove them in this case.
// It is the user's responsiblity to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be hazard like
// unsigned integer underflow. That is NO runtime sanity check to prevent the hazard
constexpr
auto
IDimOriginal
=
SrcDesc
::
GetContainedOriginalDimensions
(
IDim
).
Front
();
static_if
<
PositiveDirection
>
{}([
&
](
auto
fwd
)
{
mThreadSrcOffset
+=
StepSize
*
fwd
(
SrcDesc
{}).
GetStride
(
IDim
);
mThreadSrcOriginalMultiId
(
IDimOriginal
)
+=
StepSize
;
mThreadSrcPartialOffsets
(
IDim
)
+=
StepSize
*
fwd
(
SrcDesc
{}).
GetStride
(
IDim
);
}).
Else
([
&
](
auto
fwd
)
{
mThreadSrcOffset
-=
StepSize
*
fwd
(
SrcDesc
{}).
GetStride
(
IDim
);
mThreadSrcOriginalMultiId
(
IDimOriginal
)
-=
StepSize
;
mThreadSrcPartialOffsets
(
IDim
)
-=
StepSize
*
fwd
(
SrcDesc
{}).
GetStride
(
IDim
);
});
});
}
template
<
class
T
,
bool
PositiveDirection
>
__device__
void
MoveSrcSlicingWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
idim
)
{
if
(
step_sizes
[
idim
]
!=
0
)
{
MoveSlicingWindowOnSourceTensor
(
idim
,
step_sizes
[
idim
],
positive_direction
);
}
});
}
};
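To make the work division the constructor asserts more concrete, here is a minimal host-side sketch with made-up lengths (they are not taken from any kernel in this commit): each dimension of the block's slice window must be divisible by SubLengths * ThreadClusterLengths, and the per-dimension quotient is the repeat length a thread iterates over.

    // Hypothetical 2-D configuration illustrating the divisibility rule and repeat_lengths.
    #include <array>
    #include <cassert>

    int main()
    {
        std::array<int, 2> slice_lengths          = {128, 8}; // per-block slice window
        std::array<int, 2> sub_lengths            = {4, 1};   // per-thread sub-tensor
        std::array<int, 2> thread_cluster_lengths = {16, 8};  // threads along each dimension

        for(int i = 0; i < 2; ++i)
        {
            const int data_per_cluster = sub_lengths[i] * thread_cluster_lengths[i];
            assert(slice_lengths[i] % data_per_cluster == 0);

            // each thread revisits its sub-tensor this many times along dimension i
            const int repeat_length = slice_lengths[i] / data_per_cluster;
            (void)repeat_length; // 2 for dimension 0, 1 for dimension 1 in this example
        }
    }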
 template <index_t BlockSize,
-          class SrcDesc,
-          class DstDesc,
-          class SrcCoordinate,
-          class DstCoordinate,
-          class SliceLengths,
-          class SubLengths,
-          class ThreadClusterLengths,
-          class ThreadClusterArrangeOrder,
-          class SrcDimAccessOrder,
-          class DstDimAccessOrder,
+          typename BlockSrcDesc,
+          typename BlockDstDesc,
+          typename BlockSliceLengths,
+          typename ThreadSliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
           index_t SrcVectorAccessDim,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
           index_t DstDataPerAccess>
-struct BlockwiseGenericTensorSliceCopy_v2
+struct BlockwiseGenericTensorSliceCopy_v4
 {
-    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+    static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension();
+
+    using Index = MultiIndex<nDim>;
 
-    __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
-                                                            DstCoordinate dst_block_slice_origin)
+    __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(const Index& src_block_slice_origin,
+                                                            const Index& dst_block_slice_origin)
     {
-        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
-                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
-                          nDim == SubLengths::GetSize() &&
-                          nDim == ThreadClusterLengths::GetSize() &&
-                          nDim == ThreadClusterArrangeOrder::GetSize(),
-                      "wrong! nDim not consistent");
+        static_assert(nDim == BlockSrcDesc::GetNumOfDimension() &&
+                          nDim == BlockDstDesc::GetNumOfDimension() &&
+                          nDim == BlockSliceLengths::Size() &&
+                          nDim == ThreadSliceLengths::Size() &&
+                          nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == SrcDimAccessOrder::Size() &&
+                          nDim == DstDimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
 
-        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
-                      "wrong! threads should be mapped to cover entire slicing window");
+        static_assert(
+            is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
+            "wrong! threads should be mapped to cover entire slicing window");
 
-        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
-            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+        // map threads to cluster
+        constexpr auto thread_cluster_desc =
+            make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
 
         static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                       "wrong! BlockSize not consistent with ThreadClusterLengths");
 
         const auto thread_cluster_id =
-            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
+            thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());
 
-        const auto data_cluster_id =
-            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
-
-        const auto thread_data_id_begin = data_cluster_id * SubLengths{};
+        const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
 
         mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
         mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
...
@@ -470,76 +61,106 @@ struct BlockwiseGenericTensorSliceCopy_v2
         mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
     }
 
-    __device__ static constexpr index_t GetRegisterBufferSize()
+    __device__ static constexpr index_t GetThreadBufferSize()
     {
-        return RegisterBufferDesc::GetElementSpace();
+        return ThreadBufferDesc::GetElementSpace();
     }
 
-    template <class TData>
-    __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
+    template <typename BlockSrcData,
+              typename ThreadBufferData,
+              address_space_t BlockSrcAddressSpace     = address_space_t::generic,
+              address_space_t ThreadBufferAddressSpace = address_space_t::generic>
+    __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
+                                        ThreadBufferData* p_thread_buffer) const
     {
-        mThreadwiseLoad.Run(p_src, p_buffer);
+#if 0
+        mThreadwiseLoad.template Run<BlockSrcData,
+                                     ThreadBufferData,
+                                     BlockSrcAddressSpace,
+                                     ThreadBufferAddressSpace>(p_block_src, p_thread_buffer);
+#else
+        // tweaking
+        mThreadwiseLoad.template Run_optimized_src_address_calculation<BlockSrcData,
+                                                                       ThreadBufferData,
+                                                                       BlockSrcAddressSpace,
+                                                                       ThreadBufferAddressSpace>(
+            p_block_src, p_thread_buffer);
+#endif
     }
 
-    template <class TData>
-    __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
+    template <typename ThreadBufferData,
+              typename BlockDstData,
+              address_space_t ThreadBufferAddressSpace = address_space_t::generic,
+              address_space_t BlockDstAddressSpace     = address_space_t::generic>
+    __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
+                                         BlockDstData* p_block_dst) const
    {
-        mThreadwiseStore.Run(p_buffer, p_dst);
+#if 0
+        mThreadwiseStore.template Run<ThreadBufferData,
+                                      BlockDstData,
+                                      ThreadBufferAddressSpace,
+                                      BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
+#else
+        // tweaking
+        mThreadwiseStore.template Run_optimized_dst_address_calculation<ThreadBufferData,
+                                                                        BlockDstData,
+                                                                        ThreadBufferAddressSpace,
+                                                                        BlockDstAddressSpace>(
+            p_thread_buffer, p_block_dst);
+#endif
     }
 
-    template <class TData>
-    __device__ void Run(const TData* p_src, TData* p_dst) const
+    template <typename BlockSrcData,
+              typename BlockDstData,
+              address_space_t BlockSrcAddressSpace = address_space_t::generic,
+              address_space_t BlockDstAddressSpace = address_space_t::generic>
+    __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
     {
-        TData p_buffer[GetRegisterBufferSize()];
+        BlockSrcData p_thread_buffer[GetThreadBufferSize()];
 
-        mThreadwiseLoad.Run(p_src, p_buffer);
-        mThreadwiseStore.Run(p_buffer, p_dst);
+        RunLoadThreadBuffer<BlockSrcData,
+                            BlockSrcData,
+                            BlockSrcAddressSpace,
+                            address_space_t::generic>(p_block_src, p_thread_buffer);
+
+        RunStoreThreadBuffer<BlockSrcData,
+                             BlockDstData,
+                             address_space_t::generic,
+                             BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
     }
 
-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
-    MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
+    MoveSrcSliceWindow(const T& step_sizes,
+                       integral_constant<bool, PositiveDirection> positive_direction)
     {
-        mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
+        mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
     }
 
-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
-    MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
+    MoveDstSliceWindow(const T& step_sizes,
+                       integral_constant<bool, PositiveDirection> positive_direction)
     {
-        mThreadwiseLoad.MoveDstSlicingWindow(step_sizes, positive_direction);
+        mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
     }
 
     private:
-    using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
+    using ThreadBufferDesc = decltype(make_native_tensor_descriptor_packed(ThreadSliceLengths{}));
 
-    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
-                                                                 RegisterBufferDesc,
-                                                                 SrcCoordinate,
-                                                                 NormalTensorCoordinate<RegisterBufferDesc>,
-                                                                 SubLengths,
-                                                                 SrcDimAccessOrder,
-                                                                 SrcDimAccessOrder,
-                                                                 SrcVectorAccessDim,
-                                                                 SrcVectorAccessDim,
-                                                                 SrcDataPerAccess,
-                                                                 1>;
+    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<BlockSrcDesc,
+                                                                 ThreadBufferDesc,
+                                                                 ThreadSliceLengths,
+                                                                 SrcDimAccessOrder,
+                                                                 SrcVectorAccessDim,
+                                                                 SrcDataPerAccess,
+                                                                 1>;
 
-    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc,
-                                                                  DstDesc,
-                                                                  NormalTensorCoordinate<RegisterBufferDesc>,
-                                                                  DstCoordinate,
-                                                                  SubLengths,
-                                                                  DstDimAccessOrder,
-                                                                  DstDimAccessOrder,
-                                                                  DstVectorAccessDim,
-                                                                  DstVectorAccessDim,
-                                                                  1,
-                                                                  DstDataPerAccess>;
+    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
+                                                                  BlockDstDesc,
+                                                                  ThreadSliceLengths,
+                                                                  DstDimAccessOrder,
+                                                                  DstVectorAccessDim,
+                                                                  1,
+                                                                  DstDataPerAccess>;
 
     ThreadwiseLoad mThreadwiseLoad;
     ThreadwiseStore mThreadwiseStore;
...
...
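A rough usage sketch of the load-then-store pattern exposed by these blockwise copies. The wrapper below is illustrative only: "BlockCopy" stands for some fully specialized BlockwiseGenericTensorSliceCopy_v4<...>, and the pointers, origins, and descriptor choices are assumed to come from the surrounding kernel.

    // Sketch only: stage one block-sized tile from global memory into LDS through registers.
    template <typename BlockCopy, typename Float>
    __device__ void copy_tile(const Float* p_global_src,
                              Float* p_lds_dst,
                              typename BlockCopy::Index src_origin,
                              typename BlockCopy::Index dst_origin)
    {
        // each thread computes its own sub-slice origin inside the constructor
        BlockCopy blockwise_copy(src_origin, dst_origin);

        // load the block's slice into per-thread registers, then store it to the destination
        blockwise_copy.Run(p_global_src, p_lds_dst);
    }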
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp
0 → 100644
View file @
52423948
#ifndef CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_view.hpp"
#include "tensor_coordinate_deprecated.hpp"
#include "threadwise_generic_tensor_slice_copy_deprecated.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif
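The macro above selects between a fully compile-time loop (static_ford) and a runtime loop (ford) over the repeat lengths. The standalone analogy below is not the CK implementation; it only shows the difference the flag is buying: with compile-time indices every derived offset is a constant expression.

    // Standalone analogy of static vs. dynamic iteration (not CK's static_ford/ford).
    #include <cstdio>
    #include <initializer_list>
    #include <utility>

    template <typename F, int... Is>
    void static_loop_impl(F f, std::integer_sequence<int, Is...>)
    {
        // expand over compile-time indices; each call sees its index as a constant expression
        (void)std::initializer_list<int>{(f(std::integral_constant<int, Is>{}), 0)...};
    }

    template <int N, typename F>
    void static_loop(F f)
    {
        static_loop_impl(f, std::make_integer_sequence<int, N>{});
    }

    int main()
    {
        static_loop<4>([](auto i) { std::printf("static iteration %d\n", int(i)); });

        for(int i = 0; i < 4; ++i) // runtime counterpart
            std::printf("dynamic iteration %d\n", i);
    }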
namespace ck {

// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor.
// The memory layout (ordering of dimensions) can be different between src and dst.
// This function assumes each thread is reading and writing a normal (not merged) tensor,
// to simplify index calculations. To satisfy this assumption, the user needs to make sure
// that, on a merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-length. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are performed
// in the constructor of BlockwiseGenericTensorSliceCopy_v1.
template <index_t BlockSize,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename SubLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v1
{
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

    static constexpr index_t nOriginalDimSrc =
        SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
    static constexpr index_t nOriginalDimDst =
        DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension();

    // per-thread offset
    index_t mThreadSrcOffset;
    index_t mThreadDstOffset;

    // "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets", "mThreadDstOriginalMultiId" and
    // "mThreadDstPartialOffsets" are always calculated inside the constructor, and are updated
    // if the slicing window is moved. However, they will not be used if the slicing window is
    // only ever moved along a non-merged dimension; in that case, the compiler should be able
    // to remove these calculations.
    // TODO: make sure the compiler actually removes them in that case.

    // partial offset in each (merged) dimension
    Array<index_t, nDim> mThreadSrcPartialOffsets;
    Array<index_t, nDim> mThreadDstPartialOffsets;

    // multi-id of original tensor
    Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
    Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;

    __device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_id_begin,
                                                  Array<index_t, nDim> dst_block_data_id_begin)
    {
        // check NDim consistency
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
                          nDim == ThreadClusterLengths::GetSize() &&
                          nDim == ThreadClusterArrangeOrder::GetSize() &&
                          nDim == SrcDimAccessOrder::GetSize() &&
                          nDim == DstDimAccessOrder::GetSize(),
                      "wrong");

        // check thread arrange order and read/write access order are valid
        static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
                          is_valid_sequence_map<SrcDimAccessOrder>::value &&
                          is_valid_sequence_map<DstDimAccessOrder>::value,
                      "wrong!");

        // thread cluster
        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        // BlockSize
        static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");

        // divide work
        constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};

        static_for<0, nDim, 1>{}([&](auto IDim) {
            static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
                          "wrong! cannot evenly divide sliced tensor into cluster");
        });

        constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;

        // additional check for merged dimension
        static_for<0, nDim, 1>{}([&](auto IDim_) {
            // src
            static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
                constexpr auto IDim = decltype(IDim_){};

                // On a merged dimension that contains multiple original dimensions, the length
                // of the last original dimension needs to be evenly divisible by its sub-length,
                // so each thread is effectively reading a normal (not merged) tensor.
                constexpr auto idim_last_original_src =
                    SrcDesc::GetContainedOriginalDimensions(IDim).Back();

                static_assert(
                    SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) %
                            SubLengths::Get(IDim) ==
                        0,
                    "wrong!");

                // merged dimension should have repeat_lengths = 1
                static_assert(repeat_lengths[IDim] == 1,
                              "wrong! repeat_lengths should be 1 on merged dimension");
            });

            // dst
            static_if<DstDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
                constexpr auto IDim = decltype(IDim_){};

                // On a merged dimension that contains multiple original dimensions, the length
                // of the last original dimension needs to be evenly divisible by its sub-length,
                // so each thread is effectively reading a normal (not merged) tensor.
                constexpr auto idim_last_original_dst =
                    DstDesc::GetContainedOriginalDimensions(IDim).Back();

                static_assert(
                    DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) %
                            SubLengths::Get(IDim) ==
                        0,
                    "wrong!");

                // merged dimension should have repeat_lengths = 1
                static_assert(repeat_lengths[IDim] == 1,
                              "wrong! repeat_lengths should be 1 on merged dimension");
            });
        });

        // calculate mThreadSrcOffset, mThreadDstOffset
        const auto thread_cluster_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

        const auto data_cluster_id =
            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

        const auto thread_data_id_begin = data_cluster_id * SubLengths{};

        // original multi-id
        mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
            src_block_data_id_begin + thread_data_id_begin);

        mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
            dst_block_data_id_begin + thread_data_id_begin);

        // partial offset on each dimension
        static_for<0, nDim, 1>{}([&](auto IDim) {
            constexpr auto src_partial_original_dims =
                SrcDesc::GetContainedOriginalDimensions(IDim);

            constexpr auto src_partial_original_desc =
                SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);

            mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex(
                extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
        });

        static_for<0, nDim, 1>{}([&](auto IDim) {
            constexpr auto dst_partial_original_dims =
                DstDesc::GetContainedOriginalDimensions(IDim);

            constexpr auto dst_partial_original_desc =
                DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);

            mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
                extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
        });

        // complete offset
        mThreadSrcOffset = accumulate_on_array(
            mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));

        mThreadDstOffset = accumulate_on_array(
            mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
    }

    __device__ static constexpr auto GetRegisterBufferDescriptor()
    {
        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

        return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
    }

    __device__ static constexpr index_t GetRegisterBufferSize()
    {
        return GetRegisterBufferDescriptor().GetElementSpace();
    }

    template <typename TData>
    __device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
                                          TData* __restrict__ p_buffer) const
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};

        constexpr auto data_per_cluster_per_dims =
            thread_sub_tensor_lengths * ThreadClusterLengths{};

        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

        constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();

#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
            constexpr auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;

            constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;

            constexpr index_t src_offset =
                SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);

            constexpr index_t buffer_offset =
                thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#else
        ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
            const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;

            const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;

            const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);

            const index_t buffer_offset =
                thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#endif

            // By positioning the origin of the per-thread window at the point where the
            // multi-index of SrcDesc (which might be a merged tensor) is all-zero, this
            // threadwise slice copy assumes each thread is copying a normal (not merged) tensor.
            // To satisfy this assumption, the user needs to make sure that, on a merged
            // dimension that contains multiple original dimensions, the length of the last
            // original dimension is evenly divisible by its sub-length. Also, the repeat-length
            // on the merged dimension needs to be 1. These sanity checks are performed in the
            // constructor of BlockwiseGenericTensorSliceCopy_v1.
            ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
                                                  decltype(thread_buffer_desc),
                                                  SubLengths,
                                                  SrcDimAccessOrder,
                                                  SrcVectorAccessDim,
                                                  SrcDataPerAccess,
                                                  1>(make_zero_array<index_t, nDim>(),
                                                     make_zero_array<index_t, nDim>())
                .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
        });
    }

    template <typename TData>
    __device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
                                           TData* __restrict__ p_dst) const
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};

        constexpr auto data_per_cluster_per_dims =
            thread_sub_tensor_lengths * ThreadClusterLengths{};

        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

        constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();

#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
            constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;

            constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;

            constexpr index_t buffer_offset =
                thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);

            constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#else
        ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
            const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;

            const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;

            const index_t buffer_offset =
                thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);

            const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#endif

            // Same assumption as in RunLoadRegisterBuffer: each thread copies a normal
            // (not merged) tensor; the sanity checks are performed in the constructor of
            // BlockwiseGenericTensorSliceCopy_v1.
            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
                                                  DstDesc,
                                                  SubLengths,
                                                  DstDimAccessOrder,
                                                  DstVectorAccessDim,
                                                  1,
                                                  DstDataPerAccess>(
                make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
                .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
        });
    }

    template <typename TData>
    __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
    {
        TData p_buffer[GetRegisterBufferSize()];

        RunLoadRegisterBuffer(p_src, p_buffer);
        RunStoreRegisterBuffer(p_buffer, p_dst);
    }

    // When moving the slicing window along a merged dimension, if the strides of the original
    // dimensions contained by the merged dimension are not in descending order, then there is
    // no guarantee that the new offset will be larger than the old offset for movement in the
    // positive direction (and vice versa for movement in the negative direction). As a result,
    // the offset calculation may cause unsigned integer underflow (due to the "-" operation).
    // However, this hazard should not happen as long as the user makes sure the slicing window
    // is never moved out of the boundary of the tensor being sliced. For performance reasons,
    // this function does not do a runtime sanity check for an out-of-bound slicing window.
    template <index_t IDim_, index_t StepSize, bool PositiveDirection>
    __device__ void MoveSlicingWindowOnSourceTensor(
        Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
    {
        constexpr auto IDim = Number<IDim_>{};

        static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
            // Logic for a merged dimension. It also works for a non-merged dimension, but then
            // it may be unnecessarily complicated for the compiler to remove calculations that
            // are useless for a non-merged dimension.

            // extract partial original dimensions
            constexpr auto src_partial_original_dims =
                SrcDesc::GetContainedOriginalDimensions(IDim);

            constexpr auto src_partial_original_desc =
                SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);

            // calculate new partial original multi-id
            auto old_src_partial_original_id =
                extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);

            auto new_src_partial_original_id =
                src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
                    old_src_partial_original_id, StepSize, direction);

            // update "mThreadSrcOriginalMultiId"
            static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) {
                constexpr auto IDimOriginal = src_partial_original_dims[I];

                mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I];
            });

            // calculate new partial offset on this merged dimension
            const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim];

            const index_t new_src_partial_offset =
                src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id);

            // update "mThreadSrcPartialOffsets"
            mThreadSrcPartialOffsets(IDim) = new_src_partial_offset;

            // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
            mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
        }).Else([&](auto) {
            // Logic for a non-merged dimension. If the slicing window is never moved along a
            // merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
            // which are calculated here, will never be used later, and the compiler should be
            // able to remove these calculations.
            // TODO: make sure the compiler actually removes them in this case.
            // It is the user's responsibility to make sure the slicing window is not moved out
            // of the boundary of the tensor being sliced. Otherwise, there may be hazards like
            // unsigned integer underflow. There is NO runtime sanity check to prevent the hazard.
            constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front();

            static_if<PositiveDirection>{}([&](auto fwd) {
                mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);

                mThreadSrcOriginalMultiId(IDimOriginal) += StepSize;

                mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
            }).Else([&](auto fwd) {
                mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);

                mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize;

                mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
            });
        });
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        static_for<0, nDim, 1>{}([&](auto idim) {
            if(step_sizes[idim] != 0)
            {
                MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction);
            }
        });
    }
};
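To illustrate the thread-to-data mapping computed in the constructor above, here is a small host-side sketch with a made-up 2-D configuration and an identity ThreadClusterArrangeOrder (the real code additionally reorders the cluster index according to the arrange order before scaling by SubLengths).

    // Flat thread id -> cluster multi-index -> starting data coordinate of the thread's sub-tensor.
    #include <array>
    #include <cstdio>

    int main()
    {
        const std::array<int, 2> thread_cluster_lengths = {16, 8};
        const std::array<int, 2> sub_lengths            = {4, 1};

        const int block_size = thread_cluster_lengths[0] * thread_cluster_lengths[1]; // 128 threads

        for(int tid = 0; tid < block_size; tid += 37) // sample a few thread ids
        {
            // row-major decomposition of the flat thread id into a cluster multi-index
            const int cluster_id0 = tid / thread_cluster_lengths[1];
            const int cluster_id1 = tid % thread_cluster_lengths[1];

            // each thread starts at cluster_id * SubLengths within the block's slice window
            const int data_begin0 = cluster_id0 * sub_lengths[0];
            const int data_begin1 = cluster_id1 * sub_lengths[1];

            std::printf("thread %3d -> data begin (%d, %d)\n", tid, data_begin0, data_begin1);
        }
    }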
// This version uses TensorCoordinate.
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor.
// The memory layout (ordering of dimensions) can be different between src and dst.
template <index_t BlockSize,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename SubLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v2
{
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

    using Index = MultiIndex<nDim>;

    __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin,
                                                            const Index& dst_block_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
                          nDim == ThreadClusterLengths::GetSize() &&
                          nDim == ThreadClusterArrangeOrder::GetSize() &&
                          nDim == SrcDimAccessOrder::GetSize() &&
                          nDim == DstDimAccessOrder::GetSize(),
                      "wrong! nDim not consistent");

        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
                      "wrong! threads should be mapped to cover entire slicing window");

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                      "wrong! BlockSize not consistent with ThreadClusterLengths");

        const auto thread_cluster_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

        const auto data_cluster_id =
            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

        const auto thread_data_id_begin = data_cluster_id * SubLengths{};

        mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
        mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());

        mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
        mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
    }

    __device__ static constexpr index_t GetThreadBufferSize()
    {
        return ThreadBufferDesc::GetElementSpace();
    }

    template <typename TData,
              address_space_t BlockSrcAddressSpace     = address_space_t::generic,
              address_space_t ThreadBufferAddressSpace = address_space_t::generic>
    __device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const
    {
#if 0
        mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
#else
        // tweaking
        mThreadwiseLoad.template Run_optimized_address_calculation<TData,
                                                                   BlockSrcAddressSpace,
                                                                   ThreadBufferAddressSpace>(
            p_block_src, p_thread_buffer);
#endif
    }

    template <typename TData,
              address_space_t ThreadBufferAddressSpace = address_space_t::generic,
              address_space_t BlockDstAddressSpace     = address_space_t::generic>
    __device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const
    {
#if 0
        mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
#else
        // tweaking
        mThreadwiseStore.template Run_optimized_address_calculation<TData,
                                                                    ThreadBufferAddressSpace,
                                                                    BlockDstAddressSpace>(
            p_thread_buffer, p_block_dst);
#endif
    }

    template <typename TData,
              address_space_t BlockSrcAddressSpace = address_space_t::generic,
              address_space_t BlockDstAddressSpace = address_space_t::generic>
    __device__ void Run(const TData* p_block_src, TData* p_block_dst) const
    {
        TData p_thread_buffer[GetThreadBufferSize()];

        RunLoadThreadBuffer<TData, BlockSrcAddressSpace, address_space_t::generic>(
            p_block_src, p_thread_buffer);
        RunStoreThreadBuffer<TData, address_space_t::generic, BlockDstAddressSpace>(
            p_thread_buffer, p_block_dst);
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
    }

    private:
    using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));

    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
                                                                 ThreadBufferDesc,
                                                                 SubLengths,
                                                                 SrcDimAccessOrder,
                                                                 SrcDimAccessOrder,
                                                                 SrcVectorAccessDim,
                                                                 SrcVectorAccessDim,
                                                                 SrcDataPerAccess,
                                                                 1>;

    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1<ThreadBufferDesc,
                                                                  DstDesc,
                                                                  SubLengths,
                                                                  DstDimAccessOrder,
                                                                  DstDimAccessOrder,
                                                                  DstVectorAccessDim,
                                                                  DstVectorAccessDim,
                                                                  1,
                                                                  DstDataPerAccess>;

    ThreadwiseLoad mThreadwiseLoad;
    ThreadwiseStore mThreadwiseStore;
};
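For context, a typical (purely illustrative) way a kernel drives such a blockwise copy is a GEMM-style main loop: stage a tile, advance the source slice window along one dimension, and repeat. Every name below ("blockwise_copy", "p_global", "p_lds", the step sizes, the loop count) is assumed to be provided by the surrounding kernel.

    // Sketch of a main loop that advances the source slice window between iterations.
    template <typename BlockwiseCopy, typename Float, typename Step>
    __device__ void gemm_like_main_loop(BlockwiseCopy& blockwise_copy,
                                        const Float* p_global,
                                        Float* p_lds,
                                        Step step_sizes, // e.g. an Array/Sequence with KPerBlock on one dim
                                        int num_k_blocks)
    {
        for(int k = 0; k < num_k_blocks; ++k)
        {
            blockwise_copy.Run(p_global, p_lds); // stage one tile into LDS through registers

            __syncthreads();

            // ... blockwise GEMM on the tile resident in p_lds would go here ...

            // advance the source slice window to the next tile (positive direction)
            blockwise_copy.MoveSrcSliceWindow(step_sizes, integral_constant<bool, true>{});

            __syncthreads();
        }
    }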
// this version uses TensorView and TensorCoordinate_deprecated
template <index_t BlockSize,
          typename SrcTensor,
          typename DstTensor,
          typename SliceLengths,
          typename SubLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v3
{
    static constexpr index_t nDim = SrcTensor::GetNumOfDimension();

    using data_type     = remove_cv_t<typename SrcTensor::data_type>;
    using SrcCoordinate = typename SrcTensor::coordinate_type;
    using DstCoordinate = typename DstTensor::coordinate_type;

    __device__ constexpr BlockwiseGenericTensorSliceCopy_v3(SrcTensor src_block,
                                                            SrcCoordinate src_block_slice_origin,
                                                            DstTensor dst_block,
                                                            DstCoordinate dst_block_slice_origin)
        : mThreadBuffer{make_TensorView(ThreadBufferDesc{}, mpBuffer)}
    {
        static_assert(nDim == SrcTensor::GetNumOfDimension() &&
                          nDim == DstTensor::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
                          nDim == ThreadClusterLengths::GetSize() &&
                          nDim == ThreadClusterArrangeOrder::GetSize() &&
                          nDim == SrcDimAccessOrder::GetSize() &&
                          nDim == DstDimAccessOrder::GetSize(),
                      "wrong! nDim not consistent");

        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
                      "wrong! threads should be mapped to cover entire slicing window");

        static_assert(is_same<remove_cv_t<typename SrcTensor::data_type>,
                              remove_cv_t<typename DstTensor::data_type>>{},
                      "wrong! type conversion not supported yet");

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                      "wrong! BlockSize not consistent with ThreadClusterLengths");

        const auto thread_cluster_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

        const auto data_cluster_id =
            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

        const auto thread_data_id_begin = data_cluster_id * SubLengths{};

        mThreadwiseLoad = ThreadwiseLoad(src_block,
                                         src_block_slice_origin + thread_data_id_begin,
                                         mThreadBuffer,
                                         make_zero_array<index_t, nDim>());

        mThreadwiseStore = ThreadwiseStore(mThreadBuffer,
                                           make_zero_array<index_t, nDim>(),
                                           dst_block,
                                           dst_block_slice_origin + thread_data_id_begin);
    }

    __device__ void RunLoadRegisterBuffer() { mThreadwiseLoad.Run(); }

    __device__ void RunStoreRegisterBuffer() const { mThreadwiseStore.Run(); }

    __device__ void Run()
    {
        mThreadwiseLoad.Run();
        mThreadwiseStore.Run();
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
    }

    private:
    using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));

    using ThreadBufferTensor = NormalTensorView<ThreadBufferDesc, data_type>;

    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3r1<SrcTensor,
                                                                 ThreadBufferTensor,
                                                                 SubLengths,
                                                                 SrcDimAccessOrder,
                                                                 SrcDimAccessOrder,
                                                                 SrcVectorAccessDim,
                                                                 SrcVectorAccessDim,
                                                                 SrcDataPerAccess,
                                                                 1>;

    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3r1<ThreadBufferTensor,
                                                                  DstTensor,
                                                                  SubLengths,
                                                                  DstDimAccessOrder,
                                                                  DstDimAccessOrder,
                                                                  DstVectorAccessDim,
                                                                  DstVectorAccessDim,
                                                                  1,
                                                                  DstDataPerAccess>;

    data_type mpBuffer[ThreadBufferDesc::GetElementSpace()];

    ThreadBufferTensor mThreadBuffer;

    ThreadwiseLoad mThreadwiseLoad;
    ThreadwiseStore mThreadwiseStore;
};

} // namespace ck
#endif
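A small standalone sketch of the partial-offset bookkeeping used by the deprecated copies above: a thread's total offset is the sum of per-dimension partial offsets, so when the slice window moves along one (merged) dimension only that partial term is replaced, adding the new value before subtracting the old one, as the kernel code does. The values are made up.

    #include <array>
    #include <cassert>
    #include <numeric>

    int main()
    {
        std::array<unsigned, 3> partial_offsets = {12, 40, 5};
        unsigned total = std::accumulate(partial_offsets.begin(), partial_offsets.end(), 0u);

        // the window moves along dimension 1: its partial offset becomes 24
        const unsigned old_partial = partial_offsets[1];
        const unsigned new_partial = 24;
        partial_offsets[1]         = new_partial;

        // incremental update, "+" before "-" as in the kernel code
        total = (total + new_partial) - old_partial;

        assert(total == std::accumulate(partial_offsets.begin(), partial_offsets.end(), 0u));
    }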
composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp
View file @
52423948
...
@@ -165,7 +165,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
 #endif
     }

-    __device__ static constexpr index_t GetRegisterClipboardSize()
+    __device__ static constexpr index_t GetRegisterBufferSize()
     {
         constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
...
@@ -183,8 +183,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
         return thread_tensor_desc.GetElementSpace();
     }

-    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
-                                             Float* __restrict__ p_clipboard) const
+    __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
+                                          Float* __restrict__ p_clipboard) const
     {
         constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
...
@@ -219,8 +219,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
         });
     }

-    __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
-                                              Float* __restrict__ p_dst) const
+    __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
+                                           Float* __restrict__ p_dst) const
     {
         constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
...
@@ -274,10 +274,10 @@ struct BlockwiseTensorSliceReorderCopy_v3
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
-        Float p_clipboard[GetRegisterClipboardSize()];
+        Float p_clipboard[GetRegisterBufferSize()];

-        RunLoadRegisterClipboard(p_src, p_clipboard);
-        RunStoreRegisterClipboard(p_clipboard, p_dst);
+        RunLoadRegisterBuffer(p_src, p_clipboard);
+        RunStoreRegisterBuffer(p_clipboard, p_dst);
     }

     // this function doesn't do santiy check on whether the slicing window is out of the boundary
...
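The load and store halves of the register staging are exposed separately (rather than only through Run) so a kernel can software-pipeline them. The sketch below is illustrative only; the copy object, pointers, and buffer size are assumed to come from elsewhere.

    // Prefetch the next tile into registers while the tile already in LDS is being consumed.
    template <typename BlockwiseCopy, typename Float, int BufferSize>
    __device__ void pipelined_step(BlockwiseCopy& copy, const Float* p_src, Float* p_lds)
    {
        Float p_register_buffer[BufferSize];

        copy.RunLoadRegisterBuffer(p_src, p_register_buffer); // global -> registers (prefetch)

        // ... math on the tile currently resident in LDS can overlap with the loads above ...
        __syncthreads();

        copy.RunStoreRegisterBuffer(p_register_buffer, p_lds); // registers -> LDS
    }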
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
View file @
52423948
...
@@ -2,334 +2,86 @@
 #define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP

 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "tensor_coordinate.hpp"

-#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
-#endif
-
-#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
-#endif
-
-#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
-#endif
-
-#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
-#endif
+#ifndef CK_USE_AMD_INTRINSIC
+#define CK_USE_AMD_INTRINSIC 1
+#endif
+
+#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
+#endif

 namespace ck {
// This threadwise copy allow vector access of src and dst.
// This version use multi-index transformation
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows order of access to be different on src and dst.
// It use register as buffer to hold all data moving from src to dst.
// It is designed for copying small amount of data, and src and dst are
// device memory or LDS.
// When copying large amout of data, let's hope compiler will reduce register
// used for the buffer.
template
<
class
SrcDesc
,
class
DstDesc
,
class
SliceLengths
,
class
SrcDimAccessOrder
,
class
DstDimAccessOrder
,
index_t
SrcVectorAccessDim
,
index_t
DstVectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
ThreadwiseGenericTensorSliceCopy_v1r1
{
static
constexpr
index_t
nDim
=
SliceLengths
::
GetSize
();
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v1r1
(
Array
<
index_t
,
nDim
>
src_slice_origin
,
Array
<
index_t
,
nDim
>
dst_slice_origin
)
:
mSrcSliceOrigin
(
src_slice_origin
),
mDstSliceOrigin
(
dst_slice_origin
)
{
static_assert
(
nDim
==
SrcDesc
::
GetNumOfDimension
()
&&
nDim
==
DstDesc
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
SrcDimAccessOrder
::
GetSize
()
&&
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong! # of dimensions not the same"
);
static_assert
(
is_valid_sequence_map
<
SrcDimAccessOrder
>::
value
&&
is_valid_sequence_map
<
DstDimAccessOrder
>::
value
,
"wrong! map is not valid"
);
static_assert
(
SliceLengths
{}[
SrcVectorAccessDim
]
%
SrcDataPerAccess
==
0
&&
SliceLengths
{}[
DstVectorAccessDim
]
%
DstDataPerAccess
==
0
,
"wrong! cannot evenly divide"
);
// check vectorized memory access
constexpr
auto
src_vector_access_dim
=
Number
<
SrcVectorAccessDim
>
{};
constexpr
auto
dst_vector_access_dim
=
Number
<
DstVectorAccessDim
>
{};
static_if
<!
SrcDesc
::
ContainMultipleOriginalDimensions
(
src_vector_access_dim
)
>
{}(
[
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
SrcDesc
{}).
GetStride
(
src_vector_access_dim
)
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
})
.
Else
([
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
SrcDesc
{}).
GetLastOriginalDimensionStride
(
src_vector_access_dim
)
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
static_if
<!
DstDesc
::
ContainMultipleOriginalDimensions
(
dst_vector_access_dim
)
>
{}(
[
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
DstDesc
{}).
GetStride
(
dst_vector_access_dim
)
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
})
.
Else
([
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
DstDesc
{}).
GetLastOriginalDimensionStride
(
dst_vector_access_dim
)
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
}
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v1r1
()
:
ThreadwiseGenericTensorSliceCopy_v1r1
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
{
}
__device__
void
SetSrcSliceOrigin
(
Array
<
index_t
,
nDim
>
src_slice_origin
)
{
mSrcSliceOrigin
=
src_slice_origin
;
}
__device__
void
SetDstSliceOrigin
(
Array
<
index_t
,
nDim
>
dst_slice_origin
)
{
mDstSliceOrigin
=
dst_slice_origin
;
}
template
<
class
TData
>
__device__
void
Run
(
const
TData
*
p_src
,
TData
*
p_dst
)
const
{
constexpr
auto
buffer_desc
=
make_ConstantTensorDescriptor_packed
(
SliceLengths
{});
TData
p_buffer_
[
buffer_desc
.
GetElementSpace
()];
TData
*
p_buffer
=
p_buffer_
;
// copy data from src into buffer
{
using
vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;

            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};

            constexpr auto src_access_lengths = SliceLengths::Modify(
                src_vector_access_dim,
                SliceLengths::Get(src_vector_access_dim) / src_data_per_access);

#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
            static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
                constexpr auto src_data_begin_id = src_access_id.Modify(
                    src_vector_access_dim,
                    src_access_id[src_vector_access_dim] * src_data_per_access);

                const index_t src_offset =
                    SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);

                // load vector from src
                const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);

                // unpack vector into buffer
                static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
                    constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
                        src_vector_access_dim, i);

                    constexpr index_t buffer_offset =
                        buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);

                    p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
                });
            });
#else
            ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
                auto src_data_begin_id = src_access_id;
                src_data_begin_id(src_vector_access_dim) =
                    src_access_id[src_vector_access_dim] * src_data_per_access;

                const index_t src_offset =
                    SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);

                // load vector from src
                const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);

                // unpack vector into buffer
                for(index_t i = 0; i < SrcDataPerAccess; ++i)
                {
                    auto scalar_id                   = make_zero_array<index_t, nDim>();
                    scalar_id(src_vector_access_dim) = i;

                    const index_t buffer_offset =
                        buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);

                    p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
                }
            });
#endif
        }

        // copy data from buffer to dst
        {
            using vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;

            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};

            constexpr auto dst_access_lengths = SliceLengths::Modify(
                dst_vector_access_dim,
                SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);

#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
            static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
                constexpr auto dst_data_begin_id = dst_access_id.Modify(
                    dst_vector_access_dim,
                    dst_access_id[dst_vector_access_dim] * dst_data_per_access);

                vector_t vector_data;

                // pack vector from buffer
                static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
                    constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
                        dst_vector_access_dim, i);

                    constexpr index_t buffer_offset =
                        buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);

                    reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
                });

                const index_t dst_offset =
                    DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);

                // store vector into dst
                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
            });
#else
            ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
                auto dst_data_begin_id = dst_access_id;
                dst_data_begin_id(dst_vector_access_dim) =
                    dst_access_id[dst_vector_access_dim] * dst_data_per_access;

                vector_t vector_data;

                // pack vector from buffer
                for(index_t i = 0; i < DstDataPerAccess; ++i)
                {
                    auto scalar_id                   = make_zero_array<index_t, nDim>();
                    scalar_id(dst_vector_access_dim) = i;

                    const index_t buffer_offset =
                        buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);

                    reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
                }

                const index_t dst_offset =
                    DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);

                // store vector into dst
                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
            });
#endif
        }
    }

    private:
    Array<index_t, nDim> mSrcSliceOrigin;
    Array<index_t, nDim> mDstSliceOrigin;
};
// This threadwise copy allows vector access of src and dst.
// It allows the vector size to be different on src and dst.
// The dimension of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is a register, and
// the other is device memory or LDS.
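As an aside, the comment above describes a copy that uses different vector widths on the read and write side. A minimal standalone sketch of that idea follows; it is not taken from this header, all names (staged_copy, SrcPerAccess, DstPerAccess) are illustrative, and it assumes the copied length is a multiple of the long-vector size.

// Stage data through a "long vector" of length lcm(SrcPerAccess, DstPerAccess),
// so it can be filled with src-sized reads and drained with dst-sized writes.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

template <std::size_t SrcPerAccess, std::size_t DstPerAccess>
void staged_copy(const float* src, float* dst, std::size_t n)
{
    constexpr std::size_t long_vec = std::lcm(SrcPerAccess, DstPerAccess);
    float buffer[long_vec];

    for(std::size_t base = 0; base < n; base += long_vec)
    {
        // fill the buffer with SrcPerAccess-wide chunks
        for(std::size_t i = 0; i < long_vec; i += SrcPerAccess)
            std::copy_n(src + base + i, SrcPerAccess, buffer + i);

        // drain the buffer with DstPerAccess-wide chunks
        for(std::size_t i = 0; i < long_vec; i += DstPerAccess)
            std::copy_n(buffer + i, DstPerAccess, dst + base + i);
    }
}

int main()
{
    std::vector<float> src(8), dst(8);
    for(std::size_t i = 0; i < src.size(); ++i)
        src[i] = float(i);

    staged_copy<4, 2>(src.data(), dst.data(), src.size()); // lcm(4, 2) == 4
    std::printf("%f %f\n", dst[0], dst[7]);
}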
template <typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t VectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v4r2
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = typename TensorCoordinate<SrcDesc>::type;
    using DstCoord = typename TensorCoordinate<DstDesc>::type;

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(const Index& src_slice_origin,
                                                               const Index& dst_slice_origin)
        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
                          nDim == DimAccessOrder::Size(),
                      "wrong! # of dimensions not the same");

        static_assert(is_valid_sequence_map<DimAccessOrder>{}, "wrong! map is not valid");

        static_assert(
            SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
            "wrong! cannot evenly divide");

        // TODO:: sanity-check if vectorized memory access is allowed on src and dst
    }

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2()
        : ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_array<index_t, nDim>(),
                                                make_zero_array<index_t, nDim>())
    {
    }

    __device__ void SetSrcSliceOrigin(SrcCoord src_slice_origin)
    {
        mSrcSliceOrigin = src_slice_origin;
    }

    __device__ void SetDstSliceOrigin(DstCoord dst_slice_origin)
    {
        mDstSliceOrigin = dst_slice_origin;
    }

    // Will do padding check on src data: Read 0 if src data is in padding area.
    // Will do padding check on dst data: No write if dst data is in padding area.
    template <typename SrcData,
              typename DstData,
              address_space_t SrcAddressSpace = address_space_t::generic,
              address_space_t DstAddressSpace = address_space_t::generic>
    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
    {
        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

...
@@ -341,603 +93,442 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

        ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
            [&](auto long_vector_access_id) {
                // data id w.r.t slicing-window
                auto long_vector_data_begin_id = long_vector_access_id;
                long_vector_data_begin_id(vector_access_dim) =
                    long_vector_size * long_vector_access_id[vector_access_dim];

                // buffer to hold a src long-vector
                SrcData p_src_long_vector[long_vector_size];

                // zero out buffer
                for(index_t i = 0; i < long_vector_size; ++i)
                {
                    p_src_long_vector[i] = 0;
                }

                // load data from src to the long-vector buffer
                for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
                {
                    auto scalar_id               = make_zero_array<index_t, nDim>();
                    scalar_id(vector_access_dim) = i * src_data_per_access;

                    const index_t buffer_offset = i * src_data_per_access;

                    const auto src_coord =
                        mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);

                    // Check the src vector's padding situation; only the first data in this
                    // src vector is checked. It is the user's responsibility to make sure all
                    // data in the src vector has the same padding situation.
                    if(src_coord.IsUpperIndexMappedToValidOffset())
                    {
                        static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                            *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                                __buffer_load<SrcData, SrcDataPerAccess>(
                                    p_src, 0, src_coord.GetOffset());
#else
                            *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                                *reinterpret_cast<const src_vector_t*>(
                                    &p_src[src_coord.GetOffset()]);
#endif
                        }).Else([&](auto) {
                            // src can be all kinds of memory-space
                            *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                                *reinterpret_cast<const src_vector_t*>(
                                    &p_src[src_coord.GetOffset()]);
                        });
                    }
                }

                // SrcData to DstData conversion
                DstData p_dst_long_vector[long_vector_size];

                for(index_t i = 0; i < long_vector_size; ++i)
                {
                    p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
                }

                // store data from the long-vector buffer to dst
                for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
                {
                    auto scalar_id               = make_zero_array<index_t, nDim>();
                    scalar_id(vector_access_dim) = i * dst_data_per_access;

                    const index_t buffer_offset = i * dst_data_per_access;

                    const auto dst_coord =
                        mDstSliceOrigin + (long_vector_data_begin_id + scalar_id);

                    // Check the dst vector's padding situation; only the first data in this
                    // dst vector is checked. It is the user's responsibility to make sure all
                    // data in the dst vector has the same padding situation.
                    if(dst_coord.IsUpperIndexMappedToValidOffset())
                    {
                        static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                            __buffer_store<DstData, DstDataPerAccess>(
                                *reinterpret_cast<dst_vector_t*>(
                                    &p_dst_long_vector[buffer_offset]),
                                p_dst,
                                0,
                                dst_coord.GetOffset());
#else
                            *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
                                *reinterpret_cast<dst_vector_t*>(
                                    &p_dst_long_vector[buffer_offset]);
#endif
                        }).Else([&](auto) {
                            // dst can be all kinds of memory-space
                            *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
                                *reinterpret_cast<dst_vector_t*>(
                                    &p_dst_long_vector[buffer_offset]);
                        });
                    }
                }
            });
    }
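As an aside, the Run above stages data in a source-typed buffer, converts it element by element to the destination type, and only writes elements whose destination coordinate is valid. The standalone sketch below mirrors that convert-then-guarded-store step with invented names (convert_and_store, dst_len); it is an analogy, not this library's API.

// Convert a src-typed buffer to the dst type, then skip writes that would land
// in the "padding" region (the analogue of IsUpperIndexMappedToValidOffset()).
#include <cstdio>

template <typename SrcT, typename DstT, int N>
void convert_and_store(const SrcT (&src)[N], DstT* dst, int dst_len)
{
    DstT staging[N];
    for(int i = 0; i < N; ++i)
        staging[i] = static_cast<DstT>(src[i]); // plays the role of type_convert<DstData>

    for(int i = 0; i < N; ++i)
        if(i < dst_len) // no write if the destination element does not exist
            dst[i] = staging[i];
}

int main()
{
    float src[4] = {1.5f, 2.5f, 3.5f, 4.5f};
    int dst[3]   = {};
    convert_and_store(src, dst, 3); // the 4th element is dropped, not written
    std::printf("%d %d %d\n", dst[0], dst[1], dst[2]);
}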
    // Modify Length to 1, if Mask is set to false
    // Used for isolating linear dimension from non-linear dimensions
    template <index_t... Lengths, index_t... Mask>
    __device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
    {
        return Sequence<(Mask ? Lengths : 1)...>{};
    }

    // p_src must be global-memory, p_dst can be any memory-space.
    // User should make sure p_src is a block-invariant pointer, because
    // buffer_load is used for loading from global-memory into the register buffer.
    // Will do padding check on src data: Read 0 if src data is in padding area.
    // Will do padding check on dst data: No write if dst data is in padding area.
    // This version is optimized for address calculation of src tensor
    template <typename SrcData,
              typename DstData,
              address_space_t SrcAddressSpace = address_space_t::generic,
              address_space_t DstAddressSpace = address_space_t::generic>
    __device__ void Run_optimized_src_address_calculation(const SrcData* p_src,
                                                          DstData* p_dst) const
    {
        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};

        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};

        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

        // separate linear dimensions from non-linear dimensions
        constexpr auto src_linear_dim_mask    = SrcDesc::GetLinearDimensionMask();
        constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();

        static_assert(src_linear_dim_mask.At(VectorAccessDim) ||
                          long_vector_size == SrcDataPerAccess,
                      "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
                      "would drop");

        // separate steps into linear and non-linear components, according to src tensor
        constexpr auto linear_long_vector_access_lengths =
            mask_lengths(long_vector_access_lengths, src_linear_dim_mask);

        constexpr auto nonlinear_long_vector_access_lengths =
            mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);

        // loop over src's non-linear dimensions
        ford<decltype(nonlinear_long_vector_access_lengths)>{}(
            [&](auto nonlinear_dim_long_vector_access_id) {

                // calculate step-sizes along src's nonlinear dimensions
                auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
                nonlinear_dim_data_steps(vector_access_dim) =
                    long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];

                // move src coordinate along nonlinear dimensions
                // this coordinate contains run-time per-thread offset
                const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;

                // loop over src's linear dimensions
                ford<decltype(linear_long_vector_access_lengths)>{}(
                    [&](auto linear_dim_long_vector_access_id) {

                        // step-sizes along src's linear dimensions
                        auto linear_dim_data_steps = linear_dim_long_vector_access_id;
                        linear_dim_data_steps(vector_access_dim) =
                            long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];

                        // buffer to hold a long-vector
                        SrcData p_src_long_vector[long_vector_size];

                        // zero out buffer
                        for(index_t i = 0; i < long_vector_size; ++i)
                        {
                            p_src_long_vector[i] = 0;
                        }

                        // Loop over VectorAccessDim, and load data from src to the
                        // long-vector buffer.
                        // If VectorAccessDim is src's linear dimension, then src's
                        // offset-diff due to this looping is known at compile-time. If
                        // VectorAccessDim is src's nonlinear dimension, then src's
                        // offset-diff due to this looping is only known at run-time. For best
                        // performance, VectorAccessDim should be src's linear dimension
                        for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
                        {
                            auto scalar_id               = make_zero_array<index_t, nDim>();
                            scalar_id(vector_access_dim) = i * src_data_per_access;

                            const index_t buffer_offset = i * src_data_per_access;

                            // move src coordinate along linear dimensions
                            const auto src_coord =
                                src_nonlinear_coord + (linear_dim_data_steps + scalar_id);

                            // this is src compile-time offset
                            // TODO: is this good implementation?
                            const index_t src_linear_offset =
                                src_coord.GetOffset() - src_nonlinear_coord.GetOffset();

                            // Check the src vector's padding situation; only the first data in
                            // this src vector is checked. It is the user's responsibility to
                            // make sure all data in the src vector has the same padding
                            // situation.
                            if(src_coord.IsUpperIndexMappedToValidOffset())
                            {
                                static_if<SrcAddressSpace == address_space_t::global>{}(
                                    [&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                                        *reinterpret_cast<src_vector_t*>(
                                            &p_src_long_vector[buffer_offset]) =
                                            __buffer_load<SrcData, SrcDataPerAccess>(
                                                p_src,
                                                src_nonlinear_coord.GetOffset(),
                                                src_linear_offset);
#else
                                        *reinterpret_cast<src_vector_t*>(
                                            &p_src_long_vector[buffer_offset]) =
                                            *reinterpret_cast<const src_vector_t*>(
                                                &p_src[src_nonlinear_coord.GetOffset() +
                                                       src_linear_offset]);
#endif
                                    })
                                    .Else([&](auto) {
                                        *reinterpret_cast<src_vector_t*>(
                                            &p_src_long_vector[buffer_offset]) =
                                            *reinterpret_cast<const src_vector_t*>(
                                                &p_src[src_nonlinear_coord.GetOffset() +
                                                       src_linear_offset]);
                                    });
                            }
                        }

                        // SrcData to DstData conversion
                        DstData p_dst_long_vector[long_vector_size];

                        for(index_t i = 0; i < long_vector_size; ++i)
                        {
                            p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
                        }

                        // store data from the long-vector buffer to dst
                        for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
                        {
                            auto scalar_id               = make_zero_array<index_t, nDim>();
                            scalar_id(vector_access_dim) = i * dst_data_per_access;

                            const index_t buffer_offset = i * dst_data_per_access;

                            // dst offset is calculated here, without explicitly separating into
                            // compile-time and per-thread component
                            const auto dst_coord =
                                mDstSliceOrigin +
                                (nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id);

                            // Check the dst vector's padding situation; only the first data in
                            // this dst vector is checked. It is the user's responsibility to
                            // make sure all data in the dst vector has the same padding
                            // situation.
                            if(dst_coord.IsUpperIndexMappedToValidOffset())
                            {
                                *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
                                    *reinterpret_cast<dst_vector_t*>(
                                        &p_dst_long_vector[buffer_offset]);
                            }
                        }
                    });
            });
    }
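A short aside on the address-calculation split used above: the offset contributed by the "non-linear" dimensions is computed once per outer iteration, while the "linear" dimensions only add an offset that is known up front, so the inner loop avoids re-running the expensive coordinate math. The standalone sketch below illustrates the same idea with made-up names (expensive_offset, inner_stride); it is not the library's coordinate machinery.

#include <cstdio>

int expensive_offset(int row)            // stands in for run-time coordinate math
{
    return row * 17 + (row % 3);         // arbitrary non-affine example
}

int main()
{
    constexpr int inner_stride = 1;      // "linear" dimension: stride known up front
    float data[4 * 20] = {};

    for(int row = 0; row < 4; ++row)     // "non-linear" dimension: offset computed once
    {
        const int base = expensive_offset(row);
        for(int col = 0; col < 8; ++col) // inner loop only adds a cheap delta
            data[base + col * inner_stride] = 1.0f;
    }
    std::printf("%f\n", data[expensive_offset(2) + 5]);
}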
    // p_src could be any memory space, p_dst must be global memory.
    // User should make sure p_dst is a block-invariant pointer, because
    // buffer_store is used for storing data from the register buffer into global-memory.
    // Will do padding check on src data: Read 0 if src data is in padding area.
    // Will do padding check on dst data: No write if dst data is in padding area.
    // This version is optimized for address calculation of dst tensor
    template <typename SrcData,
              typename DstData,
              address_space_t SrcAddressSpace = address_space_t::generic,
              address_space_t DstAddressSpace = address_space_t::generic>
    __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src,
                                                          DstData* p_dst) const
    {
        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};

        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};

        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

        // separate linear dimensions from non-linear dimensions
        constexpr auto dst_linear_dim_mask    = DstDesc::GetLinearDimensionMask();
        constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();

        static_assert(dst_linear_dim_mask.At(VectorAccessDim) ||
                          long_vector_size == DstDataPerAccess,
                      "Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
                      "would drop");

        // separate steps into linear and non-linear components, according to dst tensor
        constexpr auto linear_long_vector_access_lengths =
            mask_lengths(long_vector_access_lengths, dst_linear_dim_mask);

        constexpr auto nonlinear_long_vector_access_lengths =
            mask_lengths(long_vector_access_lengths, dst_nonlinear_dim_mask);

        // loop over dst's non-linear dimensions
        ford<decltype(nonlinear_long_vector_access_lengths)>{}(
            [&](auto nonlinear_dim_long_vector_access_id) {

                // calculate step-sizes along dst's nonlinear dimensions
                auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
                nonlinear_dim_data_steps(vector_access_dim) =
                    long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];

                // move dst coordinate along nonlinear dimensions
                // this coordinate contains run-time per-thread offset
                const auto dst_nonlinear_coord = mDstSliceOrigin + nonlinear_dim_data_steps;

                // loop over dst's linear dimensions
                ford<decltype(linear_long_vector_access_lengths)>{}(
                    [&](auto linear_dim_long_vector_access_id) {

                        // step-sizes along dst's linear dimensions
                        auto linear_dim_data_steps = linear_dim_long_vector_access_id;
                        linear_dim_data_steps(vector_access_dim) =
                            long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];

                        // buffer to hold a long-vector
                        SrcData p_src_long_vector[long_vector_size];

                        // zero out buffer
                        for(index_t i = 0; i < long_vector_size; ++i)
                        {
                            p_src_long_vector[i] = 0;
                        }

                        // Loop over VectorAccessDim, and load data from src to the
                        // long-vector buffer.
                        // If VectorAccessDim is dst's linear dimension, then dst's
                        // offset-diff due to this looping is known at compile-time. If
                        // VectorAccessDim is dst's nonlinear dimension, then dst's
                        // offset-diff due to this looping is only known at run-time. For best
                        // performance, VectorAccessDim should be dst's linear dimension
                        for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
                        {
                            auto scalar_id               = make_zero_array<index_t, nDim>();
                            scalar_id(vector_access_dim) = i * src_data_per_access;

                            const index_t buffer_offset = i * src_data_per_access;

                            // src offset is calculated here, without explicitly separating into
                            // compile-time and per-thread component
                            const auto src_coord =
                                mSrcSliceOrigin +
                                (nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id);

                            // Check the src vector's padding situation; only the first data in
                            // this src vector is checked. It is the user's responsibility to
                            // make sure all data in the src vector has the same padding
                            // situation.
                            if(src_coord.IsUpperIndexMappedToValidOffset())
                            {
                                *reinterpret_cast<src_vector_t*>(
                                    &p_src_long_vector[buffer_offset]) =
                                    *reinterpret_cast<const src_vector_t*>(
                                        &p_src[src_coord.GetOffset()]);
                            }
                        }

                        // SrcData to DstData conversion
                        DstData p_dst_long_vector[long_vector_size];

                        for(index_t i = 0; i < long_vector_size; ++i)
                        {
                            p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
                        }

                        // store data from the long-vector buffer to dst
                        for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
                        {
                            auto scalar_id               = make_zero_array<index_t, nDim>();
                            scalar_id(vector_access_dim) = i * dst_data_per_access;

                            const index_t buffer_offset = i * dst_data_per_access;

                            // move dst coordinate along linear dimensions
                            const auto dst_coord =
                                dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);

                            // this is dst compile-time offset
                            // TODO: is this good implementation?
                            const index_t dst_linear_offset =
                                dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();

                            // Check the dst vector's padding situation; only the first data in
                            // this dst vector is checked. It is the user's responsibility to
                            // make sure all data in the dst vector has the same padding
                            // situation.
                            if(dst_coord.IsUpperIndexMappedToValidOffset())
                            {
                                static_if<DstAddressSpace == address_space_t::global>{}(
                                    [&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                                        __buffer_store<DstData, DstDataPerAccess>(
                                            *reinterpret_cast<dst_vector_t*>(
                                                &p_dst_long_vector[buffer_offset]),
                                            p_dst,
                                            dst_nonlinear_coord.GetOffset(),
                                            dst_linear_offset);
#else
                                        *reinterpret_cast<dst_vector_t*>(
                                            &p_dst[dst_nonlinear_coord.GetOffset() +
                                                   dst_linear_offset]) =
                                            *reinterpret_cast<dst_vector_t*>(
                                                &p_dst_long_vector[buffer_offset]);
#endif
                                    })
                                    .Else([&](auto) {
                                        *reinterpret_cast<dst_vector_t*>(
                                            &p_dst[dst_nonlinear_coord.GetOffset() +
                                                   dst_linear_offset]) =
                                            *reinterpret_cast<dst_vector_t*>(
                                                &p_dst_long_vector[buffer_offset]);
                                    });
                            }
                        }
                    });
            });
    }
    // T can be Sequence or Array
    template <typename T, bool PositiveDirection>
    __device__ void MoveSrcSliceWindow(const T& step_sizes_,
                                       integral_constant<bool, PositiveDirection>)
    {
        const auto step_sizes = to_array(step_sizes_);

        static_if<PositiveDirection>{}([&](auto) {
            mSrcSliceOrigin += step_sizes;
        }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
    }

    template <typename T, bool PositiveDirection>
    __device__ void MoveDstSliceWindow(const T& step_sizes_,
                                       integral_constant<bool, PositiveDirection>)
    {
        const auto step_sizes = to_array(step_sizes_);

        static_if<PositiveDirection>{}([&](auto) {
            mDstSliceOrigin += step_sizes;
        }).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
    }

    private:
    SrcCoord mSrcSliceOrigin;
    DstCoord mDstSliceOrigin;
};

} // namespace ck
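The slice-window move functions above select add or subtract at compile time through an integral_constant tag. A minimal standalone sketch of that pattern follows; the names (move_window) and sizes are illustrative only, and it uses std::integral_constant in place of the library's own tag type.

#include <array>
#include <cstdio>
#include <type_traits>

template <std::size_t N, bool Positive>
void move_window(std::array<int, N>& origin,
                 const std::array<int, N>& step,
                 std::integral_constant<bool, Positive>)
{
    for(std::size_t i = 0; i < N; ++i)
    {
        if constexpr(Positive)
            origin[i] += step[i]; // positive direction: advance the window
        else
            origin[i] -= step[i]; // negative direction: move the window back
    }
}

int main()
{
    std::array<int, 2> origin{10, 20};
    move_window(origin, std::array<int, 2>{1, 2}, std::true_type{});
    move_window(origin, std::array<int, 2>{5, 5}, std::false_type{});
    std::printf("%d %d\n", origin[0], origin[1]); // 6 17
}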
...
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp
0 → 100644
View file @
52423948
#ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_view.hpp"
#include "tensor_coordinate_deprecated.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif
#ifndef CK_USE_AMD_INTRINSIC
#define CK_USE_AMD_INTRINSIC 1
#endif
#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#endif
namespace ck
{
// This threadwise copy allows vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows the order of access to be different on src and dst.
// It uses registers as a buffer to hold all data moving from src to dst.
// It is designed for copying a small amount of data, where src and dst are
// device memory or LDS.
// When copying a large amount of data, the hope is that the compiler will
// reduce the registers used for the buffer.
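The comment above describes a copy that first gathers the whole slice into a local buffer and then scatters it out, with the read and write loops free to traverse the dimensions in different orders. The standalone sketch below illustrates that idea only; sizes and names are made up and it is not this header's implementation.

#include <cstdio>

int main()
{
    constexpr int H = 2, W = 3;
    float src[H][W] = {{0, 1, 2}, {3, 4, 5}};
    float dst[H][W] = {};
    float buffer[H * W];

    // gather: row-major traversal on the src side
    for(int h = 0; h < H; ++h)
        for(int w = 0; w < W; ++w)
            buffer[h * W + w] = src[h][w];

    // scatter: column-major traversal on the dst side
    for(int w = 0; w < W; ++w)
        for(int h = 0; h < H; ++h)
            dst[h][w] = buffer[h * W + w];

    std::printf("%f %f\n", dst[1][2], dst[0][1]); // 5 1
}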
template <typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r1
{
    static constexpr index_t nDim = SliceLengths::GetSize();

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1(
        Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() &&
                          nDim == SrcDimAccessOrder::GetSize() &&
                          nDim == DstDimAccessOrder::GetSize(),
                      "wrong! # of dimensions not the same");

        static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
                          is_valid_sequence_map<DstDimAccessOrder>::value,
                      "wrong! map is not valid");

        static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
                          SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
                      "wrong! cannot evenly divide");

        // check vectorized memory access
        constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
        constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};

        static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
            [&](auto fwd) {
                static_assert((fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 ||
                               SrcDataPerAccess == 1),
                              "wrong! vectorized access is allowed only if stride == 1");
            })
            .Else([&](auto fwd) {
                static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(
                                   src_vector_access_dim) == 1 ||
                               SrcDataPerAccess == 1),
                              "wrong! vectorized access is allowed only if stride == 1");
            });

        static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
            [&](auto fwd) {
                static_assert((fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 ||
                               DstDataPerAccess == 1),
                              "wrong! vectorized access is allowed only if stride == 1");
            })
            .Else([&](auto fwd) {
                static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(
                                   dst_vector_access_dim) == 1 ||
                               DstDataPerAccess == 1),
                              "wrong! vectorized access is allowed only if stride == 1");
            });
    }

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1()
        : ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array<index_t, nDim>(),
                                                make_zero_array<index_t, nDim>())
    {
    }

    __device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
    {
        mSrcSliceOrigin = src_slice_origin;
    }

    __device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
    {
        mDstSliceOrigin = dst_slice_origin;
    }
    template <typename TData>
    __device__ void Run(const TData* p_src, TData* p_dst) const
    {
        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});

        TData p_buffer_[buffer_desc.GetElementSpace()];
        TData* p_buffer = p_buffer_;

        // copy data from src into buffer
        {
            using vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;

            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};

            constexpr auto src_access_lengths = SliceLengths::Modify(
                src_vector_access_dim,
                SliceLengths::Get(src_vector_access_dim) / src_data_per_access);

#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
            static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
                constexpr auto src_data_begin_id = src_access_id.Modify(
                    src_vector_access_dim,
                    src_access_id[src_vector_access_dim] * src_data_per_access);

                const index_t src_offset =
                    SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);

                // load vector from src
                const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);

                // unpack vector into buffer
                static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
                    constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
                        src_vector_access_dim, i);

                    constexpr index_t buffer_offset =
                        buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);

                    p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
                });
            });
#else
            ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
                auto src_data_begin_id = src_access_id;
                src_data_begin_id(src_vector_access_dim) =
                    src_access_id[src_vector_access_dim] * src_data_per_access;

                const index_t src_offset =
                    SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);

                // load vector from src
                const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);

                // unpack vector into buffer
                for(index_t i = 0; i < SrcDataPerAccess; ++i)
                {
                    auto scalar_id                   = make_zero_array<index_t, nDim>();
                    scalar_id(src_vector_access_dim) = i;

                    const index_t buffer_offset =
                        buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);

                    p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
                }
            });
#endif
        }

        // copy data from buffer to dst
        {
            using vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;

            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};

            constexpr auto dst_access_lengths = SliceLengths::Modify(
                dst_vector_access_dim,
                SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);

#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
            static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
                constexpr auto dst_data_begin_id = dst_access_id.Modify(
                    dst_vector_access_dim,
                    dst_access_id[dst_vector_access_dim] * dst_data_per_access);

                vector_t vector_data{};

                // pack vector from buffer
                static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
                    constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
                        dst_vector_access_dim, i);

                    constexpr index_t buffer_offset =
                        buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);

                    reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
                });

                const index_t dst_offset =
                    DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);

                // store vector into dst
                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
            });
#else
            ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
                auto dst_data_begin_id = dst_access_id;
                dst_data_begin_id(dst_vector_access_dim) =
                    dst_access_id[dst_vector_access_dim] * dst_data_per_access;

                vector_t vector_data{};

                // pack vector from buffer
                for(index_t i = 0; i < DstDataPerAccess; ++i)
                {
                    auto scalar_id                   = make_zero_array<index_t, nDim>();
                    scalar_id(dst_vector_access_dim) = i;

                    const index_t buffer_offset =
                        buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);

                    reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
                }

                const index_t dst_offset =
                    DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);

                // store vector into dst
                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
            });
#endif
        }
    }

    private:
    Array<index_t, nDim> mSrcSliceOrigin;
    Array<index_t, nDim> mDstSliceOrigin;
};
// This threadwise copy allows vector access of src and dst.
// It allows the vector size to be different on src and dst.
// The dimensions of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is a register, and
// the other is device memory or LDS.
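The constructor that follows enforces that the slice length along the vector-access dimension is a multiple of lcm(SrcDataPerAccess, DstDataPerAccess); otherwise the last long vector would run past the slice. A standalone illustration of that arithmetic is shown below with example values only; it does not use the library's types.

#include <numeric>

template <int SliceLen, int SrcPerAccess, int DstPerAccess>
struct check_even_division
{
    static constexpr int long_vector_size = std::lcm(SrcPerAccess, DstPerAccess);
    static_assert(SliceLen % long_vector_size == 0, "wrong! cannot evenly divide");
    static constexpr int num_long_vectors = SliceLen / long_vector_size;
};

// e.g. a slice length of 8 with 4-wide loads and 2-wide stores gives lcm = 4,
// so the copy issues 2 long-vector iterations along that dimension.
static_assert(check_even_division<8, 4, 2>::num_long_vectors == 2, "");

int main() { return 0; }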
template <typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t VectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r2
{
    static constexpr index_t nDim = SliceLengths::GetSize();

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2(
        Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
                      "wrong! # of dimensions not the same");

        static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");

        static_assert(
            SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
            "wrong! cannot evenly divide");

        // check vectorized memory access
        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

        static_if<!SrcDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
            static_assert(
                (fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1),
                "wrong! vectorized access is allowed only if stride == 1");
        }).Else([&](auto fwd) {
            static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
                           SrcDataPerAccess == 1),
                          "wrong! vectorized access is allowed only if stride == 1");
        });

        static_if<!DstDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
            static_assert(
                (fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1),
                "wrong! vectorized access is allowed only if stride == 1");
        }).Else([&](auto fwd) {
            static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
                           DstDataPerAccess == 1),
                          "wrong! vectorized access is allowed only if stride == 1");
        });
    }

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2()
        : ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array<index_t, nDim>(),
                                                make_zero_array<index_t, nDim>())
    {
    }

    __device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
    {
        mSrcSliceOrigin = src_slice_origin;
    }

    __device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
    {
        mDstSliceOrigin = dst_slice_origin;
    }
template
<
typename
TData
>
__device__
void
Run
(
const
TData
*
p_src
,
TData
*
p_dst
)
const
{
using
src_vector_t
=
typename
vector_type
<
TData
,
SrcDataPerAccess
>::
MemoryType
;
using
dst_vector_t
=
typename
vector_type
<
TData
,
DstDataPerAccess
>::
MemoryType
;
constexpr
auto
vector_access_dim
=
Number
<
VectorAccessDim
>
{};
constexpr
auto
src_data_per_access
=
Number
<
SrcDataPerAccess
>
{};
constexpr
auto
dst_data_per_access
=
Number
<
DstDataPerAccess
>
{};
constexpr
auto
long_vector_size
=
Number
<
math
::
lcm
(
SrcDataPerAccess
,
DstDataPerAccess
)
>
{};
constexpr
auto
long_vector_access_lengths
=
SliceLengths
::
Modify
(
vector_access_dim
,
SliceLengths
::
Get
(
vector_access_dim
)
/
long_vector_size
);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
static_ford
<
decltype
(
long_vector_access_lengths
),
DimAccessOrder
>
{}([
&
](
auto
long_vector_access_id
)
{
// data id w.r.t slicing-window
constexpr
auto
long_vector_data_begin_id
=
long_vector_access_id
.
Modify
(
vector_access_dim
,
long_vector_access_id
[
vector_access_dim
]
*
long_vector_size
);
// buffer to hold a long-vector
TData
p_long_vector
[
long_vector_size
];
// load data from src to the long-vector buffer
static_for
<
0
,
long_vector_size
/
src_data_per_access
,
1
>
{}([
&
](
auto
i
)
{
constexpr
auto
scalar_id
=
typename
uniform_sequence_gen
<
nDim
,
0
>::
type
{}.
Modify
(
vector_access_dim
,
i
*
src_data_per_access
);
const
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
));
constexpr
index_t
buffer_offset
=
i
*
src_data_per_access
;
*
reinterpret_cast
<
src_vector_t
*>
(
&
p_long_vector
[
buffer_offset
])
=
*
reinterpret_cast
<
const
src_vector_t
*>
(
&
p_src
[
src_offset
]);
});
// store data from the long-vector buffer to dst
static_for
<
0
,
long_vector_size
/
dst_data_per_access
,
1
>
{}([
&
](
auto
i
)
{
constexpr
auto
scalar_id
=
typename
uniform_sequence_gen
<
nDim
,
0
>::
type
{}.
Modify
(
vector_access_dim
,
i
*
dst_data_per_access
);
constexpr
index_t
buffer_offset
=
i
*
dst_data_per_access
;
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
mDstSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
));
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_offset
])
=
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_long_vector
[
buffer_offset
]);
});
});
#else
ford
<
decltype
(
long_vector_access_lengths
),
DimAccessOrder
>
{}(
[
&
](
auto
long_vector_access_id
)
{
// data id w.r.t slicing-window
auto
long_vector_data_begin_id
=
long_vector_access_id
;
long_vector_data_begin_id
(
vector_access_dim
)
=
long_vector_size
*
long_vector_access_id
[
vector_access_dim
];
// buffer to hold a long-vector
TData
p_long_vector
[
long_vector_size
];
// load data from src to the long-vector buffer
for
(
index_t
i
=
0
;
i
<
long_vector_size
/
src_data_per_access
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
vector_access_dim
)
=
i
*
src_data_per_access
;
const
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
));
const
index_t
buffer_offset
=
i
*
src_data_per_access
;
*
reinterpret_cast
<
src_vector_t
*>
(
&
p_long_vector
[
buffer_offset
])
=
*
reinterpret_cast
<
const
src_vector_t
*>
(
&
p_src
[
src_offset
]);
}
// store data from the long-vector buffer to dst
for
(
index_t
i
=
0
;
i
<
long_vector_size
/
dst_data_per_access
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
vector_access_dim
)
=
i
*
dst_data_per_access
;
const
index_t
buffer_offset
=
i
*
dst_data_per_access
;
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
mDstSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
));
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_offset
])
=
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_long_vector
[
buffer_offset
]);
}
});
#endif
}
private:
Array
<
index_t
,
nDim
>
mSrcSliceOrigin
;
Array
<
index_t
,
nDim
>
mDstSliceOrigin
;
};
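// Illustrative usage sketch (not part of this commit): a minimal, hedged example of how
// ThreadwiseGenericTensorSliceCopy_v1r2 might be parameterized for a packed 4x8 float slice,
// vectorizing along dimension 1 with 4-wide loads and 2-wide stores (long-vector = lcm(4, 2) = 4).
// "Desc", "p_in" and "p_out" are hypothetical names, not identifiers defined by this header.
#if 0
__device__ void example_threadwise_copy_v1r2(const float* p_in, float* p_out)
{
    using Desc = decltype(make_ConstantTensorDescriptor_packed(Sequence<4, 8>{}));

    auto copy = ThreadwiseGenericTensorSliceCopy_v1r2<Desc,           // SrcDesc
                                                      Desc,           // DstDesc
                                                      Sequence<4, 8>, // SliceLengths
                                                      Sequence<0, 1>, // DimAccessOrder
                                                      1,              // VectorAccessDim
                                                      4,              // SrcDataPerAccess
                                                      2>{};           // DstDataPerAccess

    // default-constructed: both slice origins are zero
    copy.Run(p_in, p_out);
}
#endif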
// This version uses TensorCoordinate_deprecated.
// This threadwise copy allows vectorized access of src and dst.
// The dimension of vectorized access can be different on src and dst.
// The vector size can be different on src and dst.
// The order of access can be different on src and dst.
// It uses registers as a buffer to hold all data moving from src to dst.
// It is designed for copying a small amount of data, where src and dst are
// device memory or LDS.
// When copying a large amount of data, we rely on the compiler to reduce the
// registers used for the buffer.
template <typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v2r1
{
    static constexpr index_t nDim = SliceLengths::GetSize();

    using Index = MultiIndex<nDim>;

    using SrcCoordinate = typename TensorCoordinate_deprecated<SrcDesc>::type;
    using DstCoordinate = typename TensorCoordinate_deprecated<DstDesc>::type;

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(const Index& src_slice_origin,
                                                               const Index& dst_slice_origin)
        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() &&
                          nDim == SrcDimAccessOrder::GetSize() &&
                          nDim == DstDimAccessOrder::GetSize(),
                      "wrong! # of dimensions not the same");

        static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
                          is_valid_sequence_map<DstDimAccessOrder>::value,
                      "wrong! map is not valid");

        static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
                          SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
                      "wrong! cannot evenly divide");

        // check vectorized memory access
        constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
        constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};

        static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
            [&](auto fwd) {
                static_assert((fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 ||
                               SrcDataPerAccess == 1),
                              "wrong! vectorized access is allowed only if stride == 1");
            })
            .Else([&](auto fwd) {
                static_assert(
                    (fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
                     SrcDataPerAccess == 1),
                    "wrong! vectorized access is allowed only if stride == 1");
            });

        static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
            [&](auto fwd) {
                static_assert((fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 ||
                               DstDataPerAccess == 1),
                              "wrong! vectorized access is allowed only if stride == 1");
            })
            .Else([&](auto fwd) {
                static_assert(
                    (fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
                     DstDataPerAccess == 1),
                    "wrong! vectorized access is allowed only if stride == 1");
            });
    }

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1()
        : ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array<index_t, nDim>(),
                                                make_zero_array<index_t, nDim>())
    {
    }

    __device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
    {
        mSrcSliceOrigin = src_slice_origin;
    }

    __device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
    {
        mDstSliceOrigin = dst_slice_origin;
    }

    template <typename TDesc, class Lengths>
    struct IsolateMergedDimLengths
    {
        template <typename IDim>
        __device__ constexpr index_t operator()(IDim idim) const
        {
            return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1;
        }
    };

    template <typename TData,
              address_space_t SrcAddressSpace = address_space_t::generic,
              address_space_t DstAddressSpace = address_space_t::generic>
    __device__ void Run(const TData* p_src, TData* p_dst) const
    {
        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});

        TData p_buffer_[buffer_desc.GetElementSpace()];
        TData* p_buffer = p_buffer_;

        // copy data from src into buffer
        {
            using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;

            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};

            constexpr auto src_access_lengths = SliceLengths::Modify(
                src_vector_access_dim,
                SliceLengths::Get(src_vector_access_dim) / src_data_per_access);

            // The offset w.r.t. merged dimensions needs to be calculated at run-time, while the
            // offset w.r.t. normal dimensions is known at compile-time.
            // Below is a hack to isolate merged dimension ids from normal dimension ids, so the
            // corresponding offsets can be calculated separately at run-time and compile-time.
            // src_merged_dim_access_lengths has the same value as src_access_lengths on src's
            // merged dimensions, and has value = 1 on normal dimensions;
            // src_normal_dim_access_lengths has the same value as src_access_lengths on src's
            // normal dimensions, and has value = 1 on merged dimensions.
            constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
                nDim, IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};

            constexpr auto src_normal_dim_access_lengths =
                src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;

            ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
                [&](auto src_merged_dim_access_id) {
                    auto src_merged_dim_data_id = src_merged_dim_access_id;
                    src_merged_dim_data_id(src_vector_access_dim) =
                        src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;

                    // offset w.r.t. merged dimensions needs to be computed at run-time
                    const index_t src_merged_offset =
                        (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();

                    ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}(
                        [&](auto src_normal_dim_access_id) {
                            auto src_normal_dim_data_id = src_normal_dim_access_id;
                            src_normal_dim_data_id(src_vector_access_dim) =
                                src_normal_dim_access_id[src_vector_access_dim] *
                                src_data_per_access;

                            // offset w.r.t. normal dimensions is known at compile-time
                            const index_t src_normal_offset =
                                SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);

                            src_vector_t vector_data;

                            // Read vector from src.
                            //   1. The source-code version can take src in any memory space.
                            //   2. The inline-asm versions using global_load or buffer_load can
                            //      only take src from global memory.
                            //
                            // Comment for loading from global memory:
                            // When
                            //   1) using source code, in order for the compiler to emit the
                            //      optimal load instruction, or
                            //   2) using inline asm (global_load or buffer_load), in order for
                            //      the inline asm to be valid,
                            // the following assumptions need to be satisfied:
                            //   1. p_src needs to be block-invariant (assumption)
                            //   2. src_normal_offset must be calculated at compile-time (guaranteed)
                            //   3. src_merged_offset can be a run-time value (no assumption imposed)
                            static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                                vector_data = __buffer_load<TData, SrcDataPerAccess>(
                                    p_src, src_merged_offset, src_normal_offset);
#else
                                vector_data = *reinterpret_cast<const src_vector_t*>(
                                    &p_src[src_normal_offset + src_merged_offset]);
#endif
                            }).Else([&](auto) {
                                // src can be in any memory space
                                vector_data = *reinterpret_cast<const src_vector_t*>(
                                    &p_src[src_normal_offset + src_merged_offset]);
                            });

                            // unpack vector into buffer
                            for(index_t i = 0; i < SrcDataPerAccess; ++i)
                            {
                                auto scalar_id                   = make_zero_array<index_t, nDim>();
                                scalar_id(src_vector_access_dim) = i;

                                const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
                                    src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);

                                p_buffer[buffer_offset] =
                                    reinterpret_cast<const TData*>(&vector_data)[i];
                            }
                        });
                });
        }

        // copy data from buffer into dst
        {
            using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;

            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};

            constexpr auto dst_access_lengths = SliceLengths::Modify(
                dst_vector_access_dim,
                SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);

            constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
                nDim, IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};

            constexpr auto dst_normal_dim_access_lengths =
                dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;

            ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}(
                [&](auto dst_merged_dim_access_id) {
                    auto dst_merged_dim_data_id = dst_merged_dim_access_id;
                    dst_merged_dim_data_id(dst_vector_access_dim) =
                        dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;

                    // offset w.r.t. merged dimensions needs to be computed at run-time
                    const index_t dst_merged_offset =
                        (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();

                    ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}(
                        [&](auto dst_normal_dim_access_id) {
                            auto dst_normal_dim_data_id = dst_normal_dim_access_id;
                            dst_normal_dim_data_id(dst_vector_access_dim) =
                                dst_normal_dim_access_id[dst_vector_access_dim] *
                                dst_data_per_access;

                            dst_vector_t vector_data;

                            // pack vector from buffer
                            for(index_t i = 0; i < DstDataPerAccess; ++i)
                            {
                                auto scalar_id                   = make_zero_array<index_t, nDim>();
                                scalar_id(dst_vector_access_dim) = i;

                                const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
                                    dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);

                                reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
                            }

                            // offset w.r.t. normal dimensions is known at compile-time
                            const index_t dst_normal_offset =
                                DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);

                            // Write vector into dst.
                            //   1. The source-code version can take dst in any memory space.
                            //   2. The inline-asm versions using global_store or buffer_store can
                            //      only take dst from global memory.
                            //
                            // Comment for storing into global memory:
                            // When
                            //   1) using source code, in order for the compiler to emit the
                            //      optimal store instruction, or
                            //   2) using inline asm (global_store or buffer_store), in order for
                            //      the inline asm to be valid,
                            // the following assumptions need to be satisfied:
                            //   1. p_dst needs to be block-invariant (assumption)
                            //   2. dst_normal_offset must be calculated at compile-time (guaranteed)
                            //   3. dst_merged_offset can be a run-time value (no assumption imposed)
                            static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                                __buffer_store<TData, DstDataPerAccess>(
                                    vector_data, p_dst, dst_merged_offset, dst_normal_offset);
#else
                                *reinterpret_cast<dst_vector_t*>(
                                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
#endif
                            }).Else([&](auto) {
                                // dst can be in any memory space
                                *reinterpret_cast<dst_vector_t*>(
                                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
                            });
                        });
                });
        }
    }

    // T can be Sequence or Array
    template <typename T, bool PositiveDirection>
    __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
    {
        static_if<PositiveDirection>{}([&](auto) { mSrcSliceOrigin += step_sizes; })
            .Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
    }

    template <typename T, bool PositiveDirection>
    __device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
    {
        static_if<PositiveDirection>{}([&](auto) { mDstSliceOrigin += step_sizes; })
            .Else([&](auto) { mDstSliceOrigin -= step_sizes; });
    }

    private:
    SrcCoordinate mSrcSliceOrigin;
    DstCoordinate mDstSliceOrigin;
};
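// Illustrative usage sketch (not part of this commit): a hedged example of the intended
// pattern for ThreadwiseGenericTensorSliceCopy_v2r1 in a global-to-LDS pipeline: construct
// with zero slice origins, copy one tile, then advance the src slicing-window and copy the
// next tile. "GlobalDesc", "LdsDesc", "p_global" and "p_lds" are hypothetical placeholders.
#if 0
template <typename GlobalDesc, typename LdsDesc>
__device__ void example_threadwise_copy_v2r1(const float* p_global, float* p_lds)
{
    using Copy = ThreadwiseGenericTensorSliceCopy_v2r1<GlobalDesc,     // SrcDesc
                                                       LdsDesc,        // DstDesc
                                                       Sequence<2, 4>, // SliceLengths
                                                       Sequence<0, 1>, // SrcDimAccessOrder
                                                       Sequence<0, 1>, // DstDimAccessOrder
                                                       1,              // SrcVectorAccessDim
                                                       1,              // DstVectorAccessDim
                                                       4,              // SrcDataPerAccess
                                                       4>;             // DstDataPerAccess

    Copy copy(make_zero_array<index_t, 2>(), make_zero_array<index_t, 2>());

    copy.template Run<float, address_space_t::global>(p_global, p_lds);

    // move the src window by 2 elements along dimension 0 and copy the next tile
    copy.MoveSrcSliceWindow(Sequence<2, 0>{}, integral_constant<bool, true>{});
    copy.template Run<float, address_space_t::global>(p_global, p_lds);
}
#endif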
// This version uses TensorView and TensorCoordinate_deprecated.
template <typename SrcTensor,
          typename DstTensor,
          typename SliceLengths,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v3r1
{
    static constexpr index_t nDim = SrcTensor::GetNumOfDimension();

    using data_type = remove_cv_t<typename SrcTensor::data_type>;

    using SrcCoordinate = typename SrcTensor::coordinate_type;
    using DstCoordinate = typename DstTensor::coordinate_type;

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1(SrcTensor src,
                                                               SrcCoordinate src_slice_origin,
                                                               DstTensor dst,
                                                               DstCoordinate dst_slice_origin)
        : mSrc{src},
          mDst{dst},
          mSrcSlice{src.Slice(src_slice_origin, SliceLengths{})},
          mDstSlice{dst.Slice(dst_slice_origin, SliceLengths{})}
    {
        static_assert(nDim == SrcTensor::GetNumOfDimension() &&
                          nDim == DstTensor::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() &&
                          nDim == SrcDimAccessOrder::GetSize() &&
                          nDim == DstDimAccessOrder::GetSize(),
                      "wrong! # of dimensions not the same");

        static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
                          is_valid_sequence_map<DstDimAccessOrder>::value,
                      "wrong! map is not valid");

        static_assert(is_same<remove_cv_t<typename SrcTensor::data_type>,
                              remove_cv_t<typename DstTensor::data_type>>{},
                      "wrong! type conversion is not supported yet");

        static_assert(decltype(mSrcSlice)::IsVectorizationAllowed(Number<SrcVectorAccessDim>{},
                                                                  Number<SrcDataPerAccess>{}) &&
                          decltype(mDstSlice)::IsVectorizationAllowed(Number<DstVectorAccessDim>{},
                                                                      Number<DstDataPerAccess>{}),
                      "wrong! vectorized access is not allowed");
    }

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1()
        : ThreadwiseGenericTensorSliceCopy_v3r1(
              SrcTensor{}, SrcCoordinate{}, DstTensor{}, DstCoordinate{})
    {
    }

    __device__ void Run() const
    {
        // buffer
        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SrcTensor::GetLengths());

        data_type p_buffer[buffer_desc.GetElementSpace()];

        auto buffer = make_TensorView(buffer_desc, p_buffer);

        // copy data from src into buffer
        {
            using src_vector_t = typename vector_type<data_type, SrcDataPerAccess>::MemoryType;

            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};

            auto src_slice_vectorized =
                mSrcSlice.Vectorize(src_vector_access_dim, src_data_per_access);

            ford<decltype(src_slice_vectorized.GetLengths()), SrcDimAccessOrder>{}(
                [&](auto src_vector_id) {
                    // load vector from src
                    const src_vector_t vector_data = src_slice_vectorized[src_vector_id];

                    // unpack vector into buffer
                    auto src_scalar_id = src_vector_id;
                    src_scalar_id(src_vector_access_dim) *= src_data_per_access;

                    for(index_t i = 0; i < SrcDataPerAccess; ++i)
                    {
                        auto id                   = make_zero_array<index_t, nDim>();
                        id(src_vector_access_dim) = i;

                        buffer(src_scalar_id + id) =
                            reinterpret_cast<const data_type*>(&vector_data)[i];
                    }
                });
        }

        // copy data from buffer into dst
        {
            using dst_vector_t = typename vector_type<data_type, DstDataPerAccess>::MemoryType;

            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};

            auto dst_slice_vectorized =
                mDstSlice.Vectorize(dst_vector_access_dim, dst_data_per_access);

            ford<decltype(dst_slice_vectorized.GetLengths()), DstDimAccessOrder>{}(
                [&](auto dst_vector_id) {
                    dst_vector_t vector_data{};

                    // pack vector from buffer
                    auto dst_scalar_id = dst_vector_id;
                    dst_scalar_id(dst_vector_access_dim) *= dst_data_per_access;

                    for(index_t i = 0; i < DstDataPerAccess; ++i)
                    {
                        auto id                   = make_zero_array<index_t, nDim>();
                        id(dst_vector_access_dim) = i;

                        reinterpret_cast<data_type*>(&vector_data)[i] = buffer[dst_scalar_id + id];
                    }

                    // write vector into dst
                    dst_slice_vectorized(dst_vector_id) = vector_data;
                });
        }
    }

    // T can be Sequence or Array
    template <typename T, bool PositiveDirection>
    __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
    {
        mSrc.MoveSliceWindow(mSrcSlice, step_sizes, integral_constant<bool, PositiveDirection>{});
    }

    template <typename T, bool PositiveDirection>
    __device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
    {
        mDst.MoveSliceWindow(mDstSlice, step_sizes, integral_constant<bool, PositiveDirection>{});
    }

    private:
    using SrcSlice = decltype(SrcTensor{}.Slice(make_zero_array<index_t, nDim>(), SliceLengths{}));
    using DstSlice = decltype(DstTensor{}.Slice(make_zero_array<index_t, nDim>(), SliceLengths{}));

    SrcTensor mSrc;
    DstTensor mDst;

    SrcSlice mSrcSlice;
    DstSlice mDstSlice;
};
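// Illustrative usage sketch (not part of this commit): a hedged example of how
// ThreadwiseGenericTensorSliceCopy_v3r1 might be used. Unlike v1/v2, the tensors are
// passed as TensorView objects at construction time, so Run() takes no pointers.
// "p_src", "p_dst" and the 2x4 descriptors are hypothetical, and the slice origins are
// assumed to be constructible from zero multi-indices.
#if 0
__device__ void example_threadwise_copy_v3r1(float* p_src, float* p_dst)
{
    constexpr auto src_desc = make_ConstantTensorDescriptor_packed(Sequence<2, 4>{});
    constexpr auto dst_desc = make_ConstantTensorDescriptor_packed(Sequence<2, 4>{});

    auto src = make_TensorView(src_desc, p_src);
    auto dst = make_TensorView(dst_desc, p_dst);

    auto copy = ThreadwiseGenericTensorSliceCopy_v3r1<decltype(src),  // SrcTensor
                                                      decltype(dst),  // DstTensor
                                                      Sequence<2, 4>, // SliceLengths
                                                      Sequence<0, 1>, // SrcDimAccessOrder
                                                      Sequence<0, 1>, // DstDimAccessOrder
                                                      1,              // SrcVectorAccessDim
                                                      1,              // DstVectorAccessDim
                                                      4,              // SrcDataPerAccess
                                                      4>              // DstDataPerAccess
        {src, make_zero_array<index_t, 2>(), dst, make_zero_array<index_t, 2>()};

    copy.Run();
}
#endif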
} // namespace ck
#endif
composable_kernel/include/utility/amd_intrinsic.hpp
0 → 100644
View file @ 52423948
#ifndef CK_AMD_INTRINSIC_HPP
#define CK_AMD_INTRINSIC_HPP

#include "vector_type.hpp"

namespace ck {

__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
                                           uint32_t vindex,
                                           uint32_t offset,
                                           bool glc,
                                           bool slc) __asm("llvm.amdgcn.buffer.load");

__device__ vector_type<float, 2>::MemoryType
__llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
                            uint32_t vindex,
                            uint32_t offset,
                            bool glc,
                            bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");

__device__ vector_type<float, 4>::MemoryType
__llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
                            uint32_t vindex,
                            uint32_t offset,
                            bool glc,
                            bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");

__device__ void __llvm_amdgcn_buffer_store(float vdata,
                                           int32x4_t rsrc,
                                           uint32_t vindex,
                                           uint32_t offset,
                                           bool glc,
                                           bool slc) __asm("llvm.amdgcn.buffer.store");

__device__ void __llvm_amdgcn_buffer_storex2(vector_type<float, 2>::MemoryType vdata,
                                             int32x4_t rsrc,
                                             uint32_t vindex,
                                             uint32_t offset,
                                             bool glc,
                                             bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");

__device__ void __llvm_amdgcn_buffer_storex4(vector_type<float, 4>::MemoryType vdata,
                                             int32x4_t rsrc,
                                             uint32_t vindex,
                                             uint32_t offset,
                                             bool glc,
                                             bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");

// buffer_load and buffer_store
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType __buffer_load(
    const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);

template <typename T, index_t VectorSize>
__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
                               T* p_dst_block,
                               uint32_t dst_thread_data_offset,
                               uint32_t dst_const_data_offset);

template <>
__device__ float __buffer_load<float, 1>(const float* p_src_block,
                                         uint32_t src_thread_data_offset,
                                         uint32_t src_const_data_offset)
{
#if 0
    float dst;

    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
    s_waitcnt 0 \n \
    "
                 : "=v"(dst)
                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));

    return dst;
#else
    float dst;

    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    dst = __llvm_amdgcn_buffer_load(
        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);

    return dst;
#endif
}

template <>
__device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(
    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
{
#if 0
    vector_type<float, 2>::MemoryType dst;

    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
    s_waitcnt 0 \n \
    "
                 : "=v"(dst)
                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));

    return dst;
#else
    vector_type<float, 2>::MemoryType dst;

    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    dst = __llvm_amdgcn_buffer_loadx2(
        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);

    return dst;
#endif
}

template <>
__device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(
    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
{
#if 0
    vector_type<float, 4>::MemoryType dst;

    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
    s_waitcnt 0 \n \
    "
                 : "=v"(dst)
                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));

    return dst;
#elif 1
    vector_type<float, 4>::MemoryType dst;

    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    int32x4_t src_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // fill in byte 2
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    dst = __llvm_amdgcn_buffer_loadx4(
        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);

    return dst;
#endif
}

template <>
__device__ void __buffer_store<float, 1>(const float& src,
                                         float* p_dst_block,
                                         uint32_t dst_thread_data_offset,
                                         uint32_t dst_const_data_offset)
{
#if 0
    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    asm volatile("\n \
    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
    "
                 :
                 : "s"(dst_block_setting),
                   "v"(src),
                   "v"(dst_thread_addr_offset),
                   "s"(dst_const_addr_offset));
#else
    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    __llvm_amdgcn_buffer_store(
        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}

template <>
__device__ void __buffer_store<float, 2>(const vector_type<float, 2>::MemoryType& src,
                                         float* p_dst_block,
                                         uint32_t dst_thread_data_offset,
                                         uint32_t dst_const_data_offset)
{
#if 0
    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    asm volatile("\n \
    buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
    "
                 :
                 : "s"(dst_block_setting),
                   "v"(src),
                   "v"(dst_thread_addr_offset),
                   "s"(dst_const_addr_offset));
#else
    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    __llvm_amdgcn_buffer_storex2(
        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}

template <>
__device__ void __buffer_store<float, 4>(const vector_type<float, 4>::MemoryType& src,
                                         float* p_dst_block,
                                         uint32_t dst_thread_data_offset,
                                         uint32_t dst_const_data_offset)
{
#if 0
    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    asm volatile("\n \
    buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
    "
                 :
                 : "s"(dst_block_setting),
                   "v"(src),
                   "v"(dst_thread_addr_offset),
                   "s"(dst_const_addr_offset));
#else
    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // fill in byte 2
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // fill in byte 3
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    __llvm_amdgcn_buffer_storex4(
        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}
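// Illustrative usage sketch (not part of this commit): a hedged example of how the
// __buffer_load / __buffer_store wrappers are meant to be called. The base pointer must be
// block-invariant, and both offsets are given in elements (the wrappers convert them to byte
// offsets internally). "p_global_in", "p_global_out" and the offsets are hypothetical.
#if 0
__device__ void example_buffer_copy_float4(const float* p_global_in, float* p_global_out)
{
    const uint32_t thread_offset    = threadIdx.x * 4; // per-thread element offset (run-time)
    constexpr uint32_t const_offset = 0;               // constant element offset (compile-time)

    // load float4 through the buffer_load intrinsic, then store it back out
    auto v = __buffer_load<float, 4>(p_global_in, thread_offset, const_offset);
    __buffer_store<float, 4>(v, p_global_out, thread_offset, const_offset);
}
#endif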
} // namespace ck
#endif