gaoqiong / composable_kernel / Commits

Commit 38a90b6e (unverified)
Authored Oct 20, 2021 by Chao Liu; committed by GitHub on Oct 20, 2021

Merge pull request #43 from ROCmSoftwarePlatform/develop

Merge develop into master

Parents: 88833bd9, c3018794
Changes: 71; showing 20 changed files with 4308 additions and 10 deletions (+4308 -10)
Files changed:

composable_kernel/include/utility/data_type.hpp (+17 -10)
composable_kernel/include/utility/dynamic_buffer.hpp (+4 -0)
composable_kernel/include/utility/reduction_common.hpp (+53 -0)
composable_kernel/include/utility/reduction_enums.hpp (+66 -0)
composable_kernel/include/utility/reduction_functions_binop.hpp (+100 -0)
composable_kernel/include/utility/reduction_operator.hpp (+419 -0)
composable_kernel/include/utility/static_buffer.hpp (+92 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp (+271 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp (+305 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp (+276 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp (+310 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp (+284 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp (+318 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp (+285 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp (+320 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp (+205 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp (+263 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp (+222 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp (+277 -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp (+221 -0)
composable_kernel/include/utility/data_type.hpp

@@ -1008,20 +1008,27 @@ struct inner_product_with_conversion
 };
 
 template <typename T>
-struct NumericLimits;
+struct NumericLimits
+{
+    __host__ __device__ static constexpr T Min() { return std::numeric_limits<T>::min(); }
+
+    __host__ __device__ static constexpr T Max() { return std::numeric_limits<T>::max(); }
+
+    __host__ __device__ static constexpr T Lowest() { return std::numeric_limits<T>::lowest(); }
+};
 
 template <>
-struct NumericLimits<int32_t>
+struct NumericLimits<half_t>
 {
-    __host__ __device__ static constexpr int32_t Min() { return std::numeric_limits<int32_t>::min(); }
+    static constexpr unsigned short binary_min    = 0x0400;
+    static constexpr unsigned short binary_max    = 0x7BFF;
+    static constexpr unsigned short binary_lowest = 0xFBFF;
 
-    __host__ __device__ static constexpr int32_t Max() { return std::numeric_limits<int32_t>::max(); }
+    __host__ __device__ static constexpr half_t Min() { return as_type<half_t>(binary_min); }
+
+    __host__ __device__ static constexpr half_t Max() { return as_type<half_t>(binary_max); }
+
+    __host__ __device__ static constexpr half_t Lowest() { return as_type<half_t>(binary_lowest); }
 };
 
 } // namespace ck
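The three hex constants in NumericLimits<half_t> are IEEE binary16 bit patterns: 0x0400 is the smallest positive normal value, 0x7BFF the largest finite value, and 0xFBFF its negation. The following host-side sketch is not part of this diff; it is a minimal check that decodes the bit patterns by hand (no FP16 hardware or half type assumed), just to confirm which values the constants encode.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode an IEEE binary16 bit pattern into a double.
double fp16_bits_to_double(uint16_t bits)
{
    const int sign     = (bits >> 15) & 0x1;
    const int exponent = (bits >> 10) & 0x1F;
    const int mantissa = bits & 0x3FF;

    // exponent == 31 (Inf/NaN) is not needed for these three constants
    const double value = (exponent == 0)
                             ? std::ldexp(mantissa / 1024.0, -14)                  // subnormal
                             : std::ldexp(1.0 + mantissa / 1024.0, exponent - 15); // normal

    return sign ? -value : value;
}

int main()
{
    std::printf("0x0400 -> %g\n", fp16_bits_to_double(0x0400)); // 6.10352e-05, smallest normal (Min)
    std::printf("0x7BFF -> %g\n", fp16_bits_to_double(0x7BFF)); // 65504, largest finite (Max)
    std::printf("0xFBFF -> %g\n", fp16_bits_to_double(0xFBFF)); // -65504, most negative finite (Lowest)
}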
composable_kernel/include/utility/dynamic_buffer.hpp (this hunk contains additions only)

@@ -38,6 +38,10 @@ struct DynamicBuffer
        return BufferAddressSpace;
    }

    __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }

    __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }

    template <typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
    ...
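The hunk gives DynamicBuffer a const operator[] for read-only element access alongside the mutable operator() used for writes. The stripped-down type below is a hypothetical stand-in (not the ck::DynamicBuffer API) that only illustrates that access convention.

// Hypothetical MiniBuffer, illustrating read-via-operator[] and write-via-operator().
template <typename T>
struct MiniBuffer
{
    T* p_data_;

    constexpr const T& operator[](int i) const { return p_data_[i]; } // read-only access
    constexpr T& operator()(int i) { return p_data_[i]; }             // mutable access
};

// Usage sketch:
//   float storage[4] = {1, 2, 3, 4};
//   MiniBuffer<float> buf{storage};
//   float x = buf[2];  // read
//   buf(2) = 7.0f;     // write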
composable_kernel/include/utility/reduction_common.hpp (new file, mode 100644)

/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
#ifndef CK_REDUCTION_COMMON_HPP
#define CK_REDUCTION_COMMON_HPP

#include "reduction_enums.hpp"

namespace ck {

struct float_equal_one
{
    template <class T>
    __device__ inline bool operator()(T x)
    {
        return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f);
    };
};

struct float_equal_zero
{
    template <class T>
    __device__ inline bool operator()(T x)
    {
        return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f);
    };
};

}; // end of namespace ck

#endif
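float_equal_one and float_equal_zero test for exact equality written as a pair of inequalities, which avoids a direct floating-point == comparison. The sketch below is not from the commit; it is a host-side analogue showing one plausible use, branching on a beta scaling factor so the destination is not read when beta is exactly zero. The scale_output helper is hypothetical.

// Host-side mirror of the device functor above, plus a hypothetical caller.
struct float_equal_zero_host
{
    template <class T>
    bool operator()(T x) const
    {
        return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f);
    }
};

// Hypothetical epilogue: when beta == 0, the old destination value must not be read
// (it may be uninitialized memory), so the reduced value is written directly.
void scale_output(float beta, float& dst, float reduced)
{
    if(float_equal_zero_host{}(beta))
        dst = reduced;
    else
        dst = reduced + beta * dst;
}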
composable_kernel/include/utility/reduction_enums.hpp (new file, mode 100644)

/* MIT License header, Copyright (c) 2020 Advanced Micro Devices, Inc. (identical to the header above) */
#ifndef CK_REDUCTION_ENUMS_HPP
#define CK_REDUCTION_ENUMS_HPP

namespace ck {

enum class ReduceTensorOp_t
{
    ADD   = 0,
    MUL   = 1,
    MIN   = 2,
    MAX   = 3,
    AMAX  = 4,
    AVG   = 5,
    NORM1 = 6,
    NORM2 = 7,
    // MUL_NO_ZEROS = 8,
};

enum class NanPropagation_t
{
    NOT_PROPAGATE_NAN = 0,
    PROPAGATE_NAN     = 1,
};

enum class ReduceTensorIndices_t
{
    NO_INDICES        = 0,
    FLATTENED_INDICES = 1,
};

enum class IndicesType_t
{
    INDICES_32BIT = 0,
    INDICES_64BIT = 1,
    INDICES_16BIT = 2,
    INDICES_8BIT  = 3,
};

}; // end of namespace ck

#endif
composable_kernel/include/utility/reduction_functions_binop.hpp (new file, mode 100644)

/* MIT License header, Copyright (c) 2020 Advanced Micro Devices, Inc. (identical to the header above) */
#ifndef CK_REDUCTION_FUNCTIONS_BINOP_HPP
#define CK_REDUCTION_FUNCTIONS_BINOP_HPP

#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"

namespace ck {
namespace detail {

static inline __device__ bool isnan(half_t x) { return __hisnan(x); };

template <NanPropagation_t nanPropaOpt, typename opReduce, typename compType>
struct binop_with_nan_check;

template <typename opReduce, typename compType>
struct binop_with_nan_check<NanPropagation_t::NOT_PROPAGATE_NAN, opReduce, compType>
{
    // cppcheck-suppress constParameter
    __device__ static inline void calculate(compType& accuVal, compType currVal)
    {
        opReduce{}(accuVal, currVal);
    };

    // The method is called when the opReduce is indexable and the user asked for indices
    __device__ static inline void
    // cppcheck-suppress constParameter
    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
    {
        bool changed = false;

        opReduce{}(accuVal, currVal, changed);

        if(changed)
            accuIndex = currIndex;
    };
};

template <typename opReduce, typename compType>
struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
{
    __device__ static inline void calculate(compType& accuVal, compType currVal)
    {
        if(isnan(currVal))
            accuVal = currVal;
        else
            opReduce{}(accuVal, currVal);
    };

    // The method is called when the opReduce is indexable and the user asked for indices
    __device__ static inline void
    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
    {
        if(isnan(currVal))
        {
            accuVal   = currVal;
            accuIndex = currIndex;
        }
        else
        {
            bool changed = false;

            opReduce{}(accuVal, currVal, changed);

            if(changed)
                accuIndex = currIndex;
        }
    };
};

}; // namespace detail
}; // end of namespace ck

#endif
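The two specializations differ only in what happens when the incoming element is NaN: NOT_PROPAGATE_NAN simply applies the binary operator, so for Min/Max a NaN never wins (any comparison against NaN is false), while PROPAGATE_NAN overwrites the accumulator with the NaN. This host-side sketch is not the device code above; it is a plain C++ analogue of a Max reduction under both settings, with assumed input values.

#include <cmath>
#include <cstdio>
#include <limits>

struct MaxOp
{
    void operator()(float& a, float b) const
    {
        if(a < b)
            a = b;
    }
};

float reduce_max(const float* x, int n, bool propagate_nan)
{
    float acc = std::numeric_limits<float>::lowest(); // identity element for Max
    for(int i = 0; i < n; ++i)
    {
        if(propagate_nan && std::isnan(x[i]))
            acc = x[i]; // PROPAGATE_NAN path: NaN takes over the accumulator
        else
            MaxOp{}(acc, x[i]); // NOT_PROPAGATE_NAN path: (acc < NaN) is false, NaN is ignored
    }
    return acc;
}

int main()
{
    const float data[3] = {1.0f, NAN, 5.0f};
    std::printf("not-propagate: %f\n", reduce_max(data, 3, false)); // 5.0
    std::printf("propagate:     %f\n", reduce_max(data, 3, true));  // nan
}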
composable_kernel/include/utility/reduction_operator.hpp (new file, mode 100644)

/* MIT License header, Copyright (c) 2020 Advanced Micro Devices, Inc. (identical to the header above) */
#ifndef CK_REDUCTION_OPERATOR_HPP
#define CK_REDUCTION_OPERATOR_HPP

#include "reduction_common.hpp"

namespace ck {

namespace reduce {

// Every binary operator used in reduction is represented by a templated functor class. Each
// functor class must provide at least three members:
// 1) GetReductionZeroVal() -- the interface that returns the "identity element" for the binary
//    operator. The "identity element" is the unique element in the algebraic space that doesn't
//    affect the value of other elements when operated against them; the concept is similar to
//    the zero vector in a vector space
//    (http://pages.cs.wisc.edu/~matthewb/pages/notes/pdf/linearalgebra/VectorSpaces.pdf).
// 2) indexable -- boolean value indicating whether indices of the operated elements can be
//    recorded. Usually, Min/Max operators need to record the indices of elements; for operators
//    like Add/Mul there is no need to record the indices.
// 3) operator() -- the first argument of the operator must be both an input and an output, and
//    the corresponding variable usually stores the accumulated result of many operator() calls;
//    the second argument is only an input. For an indexable binary operator, the second version
//    of operator() has a third argument (an output) indicating whether the accumulated value
//    (the first argument) has changed, in which case the recorded accumulated index also needs
//    to be changed.

template <class T>
struct Add
{
    using dataType = T;

    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };

    __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }

    static constexpr bool indexable = false;
};

template <class T>
struct Mul
{
    using dataType = T;

    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };

    __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }

    static constexpr bool indexable = false;
};

template <class T>
struct Max
{
    using dataType = T;

    __device__ static constexpr T GetReductionZeroVal() { return NumericLimits<T>::Lowest(); };

    __device__ inline constexpr void operator()(T& a, T b) const
    {
        if(a < b)
            a = b;
    }

    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
    {
        if(a < b)
        {
            a       = b;
            changed = true;
        }
    }

    static constexpr bool indexable = true;
};

template <class T>
struct Min
{
    using dataType = T;

    __device__ static constexpr T GetReductionZeroVal() { return NumericLimits<T>::Max(); };

    __device__ inline constexpr void operator()(T& a, T b) const
    {
        if(a > b)
            a = b;
    }

    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
    {
        if(a > b)
        {
            a       = b;
            changed = true;
        }
    }

    static constexpr bool indexable = true;
};

template <class T>
struct AMax
{
    using dataType = T;

    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };

    __device__ inline constexpr void operator()(T& a, T b) const
    {
        if(a < b)
            a = b;
    }

    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
    {
        if(a < b)
        {
            a       = b;
            changed = true;
        }
    }

    static constexpr bool indexable = true;
};

// Unary operators are usually called element-wise before the reduction is executed on the
// elements. They are needed for easy implementation of the reduction types AVG, NRM1, NRM2.

template <class T, bool hasDividing>
struct unary_identic
{
    __device__ unary_identic(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };

    __device__ inline constexpr T operator()(T a) const { return a * type_convert<T>{}(scaler); };

    float scaler = 1.0f;
};

template <class T>
struct unary_identic<T, false>
{
    __device__ unary_identic(const int divider = 1) { (void)divider; };

    __device__ inline constexpr T operator()(T a) const { return a; };
};

template <class T, bool hasDividing>
struct unary_square
{
    __device__ unary_square(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };

    __device__ inline constexpr T operator()(T a) const
    {
        a = a * a;

        return a * type_convert<T>{}(scaler);
    };

    float scaler = 1.0f;
};

template <class T>
struct unary_square<T, false>
{
    __device__ unary_square(const int divider = 1) { (void)divider; };

    __device__ inline constexpr T operator()(T a) const { return a * a; };
};

template <class T, bool hasDividing>
struct unary_abs
{
    __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };

    __device__ inline constexpr T operator()(T a) const
    {
        a = abs(a);

        return a * type_convert<T>{}(scaler);
    };

    float scaler = 1.0f;
};

template <class T>
struct unary_abs<T, false>
{
    __device__ unary_abs(const int divider = 1) { (void)divider; };

    __device__ inline constexpr T operator()(T a) const { return abs(a); };
};

// We know for sure that 4.0 has __habs(), but 3.0 does not have it.
// Let's assume that __habs() exists since 3.5.
#if HIP_PACKAGE_VERSION_FLAT < 3005000000
inline __device__ __half __habs(__half x)
{
    union
    {
        __half half;
        unsigned short u16;
    } val;

    val.half = x;
    val.u16  = val.u16 & 0x7fff;
    return val.half;
}
#endif

template <bool hasDividing>
struct unary_abs<half_t, hasDividing>
{
    __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };

    __device__ inline half_t operator()(half_t a) const
    {
        a = static_cast<half_t>(__habs(a));

        return a * type_convert<half_t>{}(scaler);
    };

    float scaler = 1.0f;
};

template <>
struct unary_abs<half_t, false>
{
    __device__ unary_abs(const int divider = 1) { (void)divider; };

    __device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(__habs(a)); };
};

template <class T>
struct unary_sqrt
{
    __device__ unary_sqrt(const int divider = 1) { (void)divider; };

    __device__ inline T operator()(T a) const { return sqrtf(a); };
};

template <>
struct unary_sqrt<half_t>
{
    __device__ unary_sqrt(const int divider = 1) { (void)divider; };

    __device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(hsqrt(a)); };
};

}; // end of namespace reduce

// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
// respective functor classes. The "GetReductionZeroVal()" interface and the boolean member
// "indexable" are also provided in reduce_binary_operator for easier checking by the
// upper-layer codes in the kernels.

template <typename T, ReduceTensorOp_t op>
struct reduce_binary_operator;

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    static constexpr bool indexable = reduce::Add<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
{
    using opType   = reduce::Mul<T>;
    using dataType = T;

    static constexpr bool indexable = reduce::Mul<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
{
    using opType   = reduce::Min<T>;
    using dataType = T;

    static constexpr bool indexable = reduce::Min<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
{
    using opType   = reduce::Max<T>;
    using dataType = T;

    static constexpr bool indexable = reduce::Max<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
{
    using opType   = reduce::AMax<T>;
    using dataType = T;

    static constexpr bool indexable = reduce::Max<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    static constexpr bool indexable = reduce::Add<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    static constexpr bool indexable = reduce::Add<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    static constexpr bool indexable = reduce::Add<T>::indexable;
};

// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
// functor classes. The two unary functors are called before and after the Reduction is executed,
// respectively.

template <typename T, ReduceTensorOp_t op, bool isFirstReduce, bool isLastReduce>
struct reduce_unary_operator
{
    using preUnaryOp = reduce::unary_identic<T, false>;
    using posUnaryOp = reduce::unary_identic<T, false>;
};

template <typename T, bool isFirstReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, isFirstReduce, true>
{
    using preUnaryOp = reduce::unary_identic<T, false>;
    using posUnaryOp = reduce::unary_identic<T, true>;
};

template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, isLastReduce>
{
    using preUnaryOp = reduce::unary_abs<T, false>;
    using posUnaryOp = reduce::unary_identic<T, false>;
};

template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, isLastReduce>
{
    using preUnaryOp = reduce::unary_abs<T, false>;
    using posUnaryOp = reduce::unary_identic<T, false>;
};

template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
{
    using preUnaryOp = reduce::unary_square<T, false>;
    using posUnaryOp = reduce::unary_identic<T, false>;
};

template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
{
    using preUnaryOp = reduce::unary_square<T, false>;
    using posUnaryOp = reduce::unary_sqrt<T>;
};

template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
{
    using preUnaryOp = reduce::unary_identic<T, false>;
    using posUnaryOp = reduce::unary_sqrt<T>;
};

} // end of namespace ck

#endif
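The header's comments describe the contract the kernels rely on: a pre-unary op applied to each element, a binary op with an identity element, and a post-unary op applied to the accumulated result. The sketch below is not from the commit; it is a host-side analogue, with plain lambdas standing in for reduce::unary_square, reduce::Add and reduce::unary_sqrt, showing how a single-pass NORM2 (reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>) composes the three pieces.

#include <cmath>
#include <cstdio>

int main()
{
    const float x[4] = {3.0f, 0.0f, 4.0f, 0.0f};

    auto pre   = [](float a) { return a * a; };          // stand-in for reduce::unary_square<T, false>
    auto binop = [](float& acc, float b) { acc += b; };  // stand-in for reduce::Add<T>::operator()
    auto post  = [](float a) { return std::sqrt(a); };   // stand-in for reduce::unary_sqrt<T>

    float acc = 0.0f; // Add::GetReductionZeroVal(), the identity element of the binary op
    for(float v : x)
        binop(acc, pre(v));

    std::printf("NORM2 = %f\n", post(acc)); // sqrt(9 + 0 + 16 + 0) = 5.0
}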
composable_kernel/include/utility/static_buffer.hpp (this hunk contains additions only)

@@ -55,6 +55,98 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
    __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; }
};

template <AddressSpaceEnum_t BufferAddressSpace,
          typename T,
          index_t N,
          bool InvalidElementUseNumericalZeroValue>
struct StaticBufferV2 : public StaticallyIndexedArray<T, N>
{
    using type        = T;
    using base        = StaticallyIndexedArray<T, N>;
    using VecBaseType = typename T::d1_t;

    __host__ __device__ static constexpr index_t GetVectorSize()
    {
        return sizeof(typename T::type) / sizeof(VecBaseType);
    }

    static constexpr index_t vector_size = GetVectorSize();

    VecBaseType invalid_element_value_ = VecBaseType{0};
    T invalid_vec_value_               = T{0};

    __host__ __device__ constexpr StaticBufferV2() : base{} {}

    __host__ __device__ constexpr StaticBufferV2(VecBaseType invalid_element_value)
        : base{},
          invalid_vec_value_{invalid_element_value},
          invalid_element_value_{invalid_element_value}
    {
    }

    __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
    {
        return BufferAddressSpace;
    }

    template <index_t I>
    __host__ __device__ constexpr auto& GetVector(Number<I> vec_id)
    {
        return this->At(vec_id);
    }

    template <index_t I>
    __host__ __device__ constexpr const auto& GetVector(Number<I> vec_id) const
    {
        return this->At(vec_id);
    }

    template <index_t I>
    __host__ __device__ constexpr auto& GetElement(Number<I> i, bool)
    {
        constexpr auto vec_id  = Number<i / vector_size>{};
        constexpr auto vec_off = Number<i % vector_size>{};

        return this->At(vec_id).template AsType<VecBaseType>()(vec_off);
    }

    template <index_t I>
    __host__ __device__ constexpr auto GetElement(Number<I> i, bool is_valid_element) const
    {
        constexpr auto vec_id  = Number<i / vector_size>{};
        constexpr auto vec_off = Number<i % vector_size>{};

        if constexpr(InvalidElementUseNumericalZeroValue)
        {
            return is_valid_element ? this->At(vec_id).template AsType<VecBaseType>()[vec_off]
                                    : VecBaseType{0};
        }
        else
        {
            return is_valid_element ? this->At(vec_id).template AsType<VecBaseType>()[vec_off]
                                    : invalid_element_value_;
        }
    }

    template <index_t I>
    __host__ __device__ constexpr auto operator[](Number<I> i) const
    {
        return GetElement(i, true);
    }

    template <index_t I>
    __host__ __device__ constexpr auto& operator()(Number<I> i)
    {
        return GetElement(i, true);
    }

    __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }

    __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; }
};

template <AddressSpaceEnum_t BufferAddressSpace, typename T, index_t N>
__host__ __device__ constexpr auto make_static_buffer(Number<N>)
{
    ...
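StaticBufferV2 stores N vectors of vector_size scalar lanes and exposes a flat element index, which GetElement() splits into a vector index and a lane offset. The example below is not CK code; it only works through that index arithmetic with an assumed vector_size of 4 (for example a float4 of floats).

#include <cstdio>

int main()
{
    constexpr int vector_size = 4; // e.g. sizeof(float4) / sizeof(float)

    // Flat element index i maps to vector vec_id = i / vector_size, lane vec_off = i % vector_size.
    for(int i : {0, 3, 4, 9})
    {
        const int vec_id  = i / vector_size;
        const int vec_off = i % vector_size;
        std::printf("element %d -> vector %d, lane %d\n", i, vec_id, vec_off);
    }
}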
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp (new file, mode 100644)

/* MIT License header, Copyright (c) 2021 Advanced Micro Devices, Inc. (same text as the headers above) */
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(
    int GridSize, int BlkGroupSize,
    int inLength0, int inLength1, int inLength2, int inLength3, int inLength4, int inLength5,
    int inStride0, int inStride1, int inStride2, int inStride3, int inStride4, int inStride5,
    void* __restrict__ ws_global)
{
    (void)GridSize;
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});

    const auto tupleDstLengths = make_tuple(1);
    const auto tupleDstStrides = make_tuple(1);

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    auto dstDesc       = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto one_dim_srcDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(tupleSrcLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    auto src2dDesc = transform_tensor_descriptor(
        one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    constexpr int invariantLen = 1;
    const auto toReduceLen     = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pass_through_transform(invariantLen),
                                                   make_pad_transform(toReduceLen, 0, srcPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(get_thread_local_1d_id() == 0)
        *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
};

template <index_t srcDims>
struct get_ref_desc_types
{
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));

    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(
        transform_tensor_descriptor(ref_src2dDesc,
                                    make_tuple(make_pass_through_transform(ref_invariantLen),
                                               make_pad_transform(ref_toReduceLen, 0, 2)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(
        transform_tensor_descriptor(ref_dstDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                    make_tuple(Sequence<0>{}),
                                    make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dstDesc);
};

using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(
    int origReduceLen, int BlkGroupSize, float alpha,
    const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global,
    const void CONSTANT* ws_global, long ws_buf2_bytes_offset,
    void* __restrict__ indices_global)
{
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
                                                                   srcDataType,
                                                                   dstDataType,
                                                                   compType,
                                                                   decltype(src2dDesc),
                                                                   decltype(dst1dDesc),
                                                                   op,
                                                                   nanPropaOpt,
                                                                   reduceIndicesOpt,
                                                                   true,
                                                                   true,
                                                                   GredAccessesPerThreadInBlock>;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(nullptr),
        static_cast<int* const __restrict__>(indices_global));
};
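The prepare kernel pads the reduced length up to a multiple of copySliceLen = BlockSize * GredAccessesPerThreadInBlock so that every block iteration consumes a full slice. The snippet below is not from the commit; it reruns that arithmetic on the host with illustrative numbers so the padding formula is easy to follow.

#include <cstdio>

int main()
{
    const int BlockSize                    = 256;  // illustrative, not a value from the commit
    const int GredAccessesPerThreadInBlock = 2;    // illustrative
    const int toReduceLen                  = 1000; // illustrative

    const int copySliceLen = BlockSize * GredAccessesPerThreadInBlock; // 512
    const int srcPad =
        ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

    // 1000 is rounded up to the next multiple of 512, i.e. 1024, so srcPad = 24.
    std::printf("copySliceLen = %d, padded length = %d, srcPad = %d\n",
                copySliceLen, toReduceLen + srcPad, srcPad);
}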
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp (new file, mode 100644)

/* MIT License header, Copyright (c) 2021 Advanced Micro Devices, Inc. (same text as the headers above) */
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

constexpr index_t num_toReduceDims  = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;

using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims  = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

static_assert(num_invariantDims > 0, "Not all dimensions are reduced for this kernel !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(
    int GridSize, int BlkGroupSize,
    int inLength0, int inLength1, int inLength2, int inLength3, int inLength4, int inLength5,
    int inStride0, int inStride1, int inStride2, int inStride3, int inStride4, int inStride5,
    int outStride0, int outStride1, int outStride2, int outStride3, int outStride4, int outStride5,
    void* __restrict__ ws_global)
{
    (void)GridSize;
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto toReduceDimLengths  = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
    const auto invariantDimLengths = make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});

    auto src2dDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(invariantDimLengths),
                   make_merge_transform(toReduceDimLengths)),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pass_through_transform(invariantLen),
                                                   make_pad_transform(toReduceLen, 0, srcPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(get_thread_local_1d_id() == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};

template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_toReduceDimLengths =
        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
    static constexpr auto ref_invariantDimLengths =
        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};

    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(
        transform_tensor_descriptor(ref_src2dDesc,
                                    make_tuple(make_pass_through_transform(ref_invariantLen),
                                               make_pad_transform(ref_toReduceLen, 0, 2)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(
        transform_tensor_descriptor(ref_dst1dDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                    make_tuple(Sequence<0>{}),
                                    make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(
    int origReduceLen, int BlkGroupSize, float alpha,
    const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global,
    const void CONSTANT* ws_global, long ws_buf2_bytes_offset,
    void* __restrict__ indices_global)
{
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
                                                                   srcDataType,
                                                                   dstDataType,
                                                                   compType,
                                                                   decltype(src2dDesc),
                                                                   decltype(dst1dDesc),
                                                                   op,
                                                                   nanPropaOpt,
                                                                   reduceIndicesOpt,
                                                                   true,
                                                                   true,
                                                                   GredAccessesPerThreadInBlock>;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(nullptr),
        static_cast<int* const __restrict__>(indices_global));
};
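For the partial-dims case, the prepare kernel merges the leading invariant dimensions into one length and the trailing to-reduce dimensions into another, so every reduction runs on a 2-D [invariantLen, toReduceLen] view. The snippet below is not CK code; it just replays that length arithmetic on the host for an assumed 4-D input reduced over its last two dimensions.

#include <cstdio>

int main()
{
    // Assumed shape: srcDims = 4, num_toReduceDims = 2, so invariantDims = {0,1}, toReduceDims = {2,3}.
    const int srcLengths[4]     = {8, 16, 32, 32}; // e.g. N, C, H, W (illustrative)
    const int num_invariantDims = 2;

    long invariantLen = 1, toReduceLen = 1;
    for(int d = 0; d < 4; ++d)
        (d < num_invariantDims ? invariantLen : toReduceLen) *= srcLengths[d];

    // The merged 2-D view reduced along its second dimension: [8*16, 32*32] = [128, 1024].
    std::printf("src2dDesc lengths: [%ld, %ld]\n", invariantLen, toReduceLen);
}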
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp (new file, mode 100644)

/* MIT License header, Copyright (c) 2021 Advanced Micro Devices, Inc. (same text as the headers above) */
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(
    int GridSize, int BlkGroupSize,
    int inLength0, int inLength1, int inLength2, int inLength3, int inLength4, int inLength5,
    int inStride0, int inStride1, int inStride2, int inStride3, int inStride4, int inStride5,
    void* __restrict__ ws_global)
{
    (void)GridSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});

    const auto tupleDstLengths = make_tuple(1);
    const auto tupleDstStrides = make_tuple(1);

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    auto dstDesc       = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto one_dim_srcDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(tupleSrcLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    auto src2dDesc = transform_tensor_descriptor(
        one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    constexpr int invariantLen = 1;
    const auto toReduceLen     = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
    const index_t reduceSizePerBlock =
        (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
        copySliceLen;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pass_through_transform(invariantLen),
                                                   make_pad_transform(toReduceLen, 0, srcPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(get_thread_local_1d_id() == 0)
        *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
};

template <index_t srcDims>
struct get_ref_desc_types
{
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));

    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(
        transform_tensor_descriptor(ref_src2dDesc,
                                    make_tuple(make_pass_through_transform(ref_invariantLen),
                                               make_pad_transform(ref_toReduceLen, 0, 2)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(
        transform_tensor_descriptor(ref_dstDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                    make_tuple(Sequence<0>{}),
                                    make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dstDesc);
};

using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(
    int origReduceLen, int BlkGroupSize, float alpha,
    const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global,
    const void CONSTANT* ws_global, long ws_buf2_bytes_offset,
    void* __restrict__ indices_global)
{
    (void)p_dst_global;
    (void)indices_global;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
    void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
                                                                    srcDataType,
                                                                    dstDataType,
                                                                    compType,
                                                                    decltype(src2dDesc),
                                                                    decltype(dst1dDesc),
                                                                    op,
                                                                    nanPropaOpt,
                                                                    reduceIndicesOpt,
                                                                    GredAccessesPerThreadInBlock>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::
template
Run
<
RunId
>(
src2dDesc
,
dst1dDesc
,
origReduceLen
,
BlkGroupSize
,
alpha
,
static_cast
<
const
srcDataType
*
const
__restrict__
>
(
p_src_global
),
beta
,
static_cast
<
srcDataType
*
const
__restrict__
>
(
ws_buf1_global
),
static_cast
<
int
*
const
__restrict__
>
(
ws_buf2_global
));
};
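The prepare kernel above sizes the per-block workload with plain integer arithmetic: the reduce length is split across BlkGroupSize cooperating blocks, each share is rounded up to a multiple of copySliceLen = BlockSize * GredAccessesPerThreadInBlock, and the remainder is expressed as right-padding on the reduce dimension. The following is a minimal host-side sketch of that arithmetic only; the concrete numbers (toReduceLen = 10000 and so on) are made-up example values, not anything taken from this commit.

#include <cstdio>

int main()
{
    // Hypothetical example values; in the kernel they come from the tensor
    // lengths and the CK_PARAM_* compile-time tunables.
    const int toReduceLen  = 10000; // product of the reduced dimension lengths
    const int BlkGroupSize = 4;     // blocks cooperating on one output value
    const int BlockSize    = 256;
    const int GredAccessesPerThreadInBlock = 2;

    const int copySliceLen = BlockSize * GredAccessesPerThreadInBlock; // 512

    // Same rounding as in gridwise_generic_reduce_1_prepare: split the reduce
    // length over BlkGroupSize blocks, then round each share up to a multiple
    // of copySliceLen.
    const int reduceSizePerBlock =
        (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
        copySliceLen;

    // Right-padding applied to the reduce dimension when src2d_need_padding is set.
    const int srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;

    printf("copySliceLen=%d reduceSizePerBlock=%d srcPad=%d\n",
           copySliceLen, reduceSizePerBlock, srcPad); // 512 2560 240
    return 0;
}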
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp
0 → 100644
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

constexpr index_t num_toReduceDims  = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;

using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims  = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

static_assert(num_invariantDims > 0, "Not all dimensions are reduced for this kernel !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, int BlkGroupSize,
                                                             int inLength0, int inLength1, int inLength2,
                                                             int inLength3, int inLength4, int inLength5,
                                                             int inStride0, int inStride1, int inStride2,
                                                             int inStride3, int inStride4, int inStride5,
                                                             int outStride0, int outStride1, int outStride2,
                                                             int outStride3, int outStride4, int outStride5,
                                                             void* __restrict__ ws_global)
{
    (void)GridSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto toReduceDimLengths  = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
    const auto invariantDimLengths = make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});

    auto src2dDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(invariantDimLengths), make_merge_transform(toReduceDimLengths)),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
    const index_t reduceSizePerBlock =
        (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) * copySliceLen;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;

        auto src2dDesc_2 = transform_tensor_descriptor(
            src2dDesc,
            make_tuple(make_pass_through_transform(invariantLen), make_pad_transform(toReduceLen, 0, srcPad)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(get_thread_local_1d_id() == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};

template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_toReduceDimLengths =
        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
    static constexpr auto ref_invariantDimLengths =
        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};

    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(transform_tensor_descriptor(
        ref_src2dDesc,
        make_tuple(make_pass_through_transform(ref_invariantLen), make_pad_transform(ref_toReduceLen, 0, 2)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(transform_tensor_descriptor(
        ref_dst1dDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, int BlkGroupSize, float alpha,
                                                     const void* __restrict__ p_src_global, float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)p_dst_global;
    (void)indices_global;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
    void* ws_buf1_global    = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
                                                                    srcDataType,
                                                                    dstDataType,
                                                                    compType,
                                                                    decltype(src2dDesc),
                                                                    decltype(dst1dDesc),
                                                                    op,
                                                                    nanPropaOpt,
                                                                    reduceIndicesOpt,
                                                                    GredAccessesPerThreadInBlock>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(src2dDesc,
                                            dst1dDesc,
                                            origReduceLen,
                                            BlkGroupSize,
                                            alpha,
                                            static_cast<const srcDataType* const __restrict__>(p_src_global),
                                            beta,
                                            static_cast<srcDataType* const __restrict__>(ws_buf1_global),
                                            static_cast<int* const __restrict__>(ws_buf2_global));
};
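In this partial-dims variant the wrapper splits the source dimensions into a leading invariant group and a trailing to-reduce group and merges each group into one axis, giving a 2D (invariantLen, toReduceLen) view of the input and a 1D view of the output. Below is a small stand-alone sketch of what those merged lengths work out to; the 4D shape and the choice of reducing the last two dimensions are assumptions made only for illustration.

#include <cstdio>

int main()
{
    // Hypothetical 4D tensor with the last two dimensions reduced; mirrors how
    // invariantDims = [0, num_invariantDims) and toReduceDims = [num_invariantDims, srcDims)
    // partition the source dimensions in the wrapper above.
    const int srcDims           = 4;
    const int num_toReduceDims  = 2;
    const int num_invariantDims = srcDims - num_toReduceDims;
    const int srcLengths[4]     = {8, 16, 32, 64};

    // make_merge_transform over each group collapses it to one length: the product.
    long long invariantLen = 1, toReduceLen = 1;
    for(int i = 0; i < num_invariantDims; ++i)
        invariantLen *= srcLengths[i];
    for(int i = num_invariantDims; i < srcDims; ++i)
        toReduceLen *= srcLengths[i];

    // src2dDesc becomes a 2D view of shape (invariantLen, toReduceLen); dst1dDesc
    // holds invariantLen elements, one reduced value per invariant index.
    printf("src2d = (%lld, %lld)\n", invariantLen, toReduceLen); // (128, 2048)
    return 0;
}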
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp
0 → 100644
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, int BlkGroupSize,
                                                             int inLength0, int inLength1, int inLength2,
                                                             int inLength3, int inLength4, int inLength5,
                                                             int inStride0, int inStride1, int inStride2,
                                                             int inStride3, int inStride4, int inStride5,
                                                             void* __restrict__ ws_global)
{
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple(1);
    const auto tupleDstStrides = make_tuple(1);

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    auto dstDesc       = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto one_dim_srcDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(tupleSrcLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    auto src2dDesc = transform_tensor_descriptor(
        one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    constexpr int invariantLen = 1;
    const auto toReduceLen     = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = GredThreadBufferLength;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad1 = GridSize * BlockSize - invariantLen;
        const auto srcPad2 =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 = transform_tensor_descriptor(
            src2dDesc,
            make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
                       make_pad_transform(toReduceLen, 0, srcPad2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        const auto dstPad = GridSize * BlockSize - invariantLen;

        auto dst1dDesc_2 = transform_tensor_descriptor(
            dstDesc,
            make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
            make_tuple(Sequence<0>{}),
            make_tuple(Sequence<0>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
    }
};

template <index_t srcDims>
struct get_ref_desc_types
{
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));

    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the DirectThreadWise and DirectWarpWise method
    using refType_src2dDesc_padded_12 = decltype(transform_tensor_descriptor(
        ref_src2dDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2), make_pad_transform(ref_toReduceLen, 0, 2)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dstDesc);
};

using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 = typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded    = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, int BlkGroupSize, float alpha,
                                                     const void* __restrict__ p_src_global, float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
                                                                           srcDataType,
                                                                           dstDataType,
                                                                           compType,
                                                                           decltype(src2dDesc),
                                                                           decltype(dst1dDesc),
                                                                           op,
                                                                           nanPropaOpt,
                                                                           reduceIndicesOpt,
                                                                           true,
                                                                           true,
                                                                           GredThreadBufferLength>;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(src2dDesc,
                                            dst1dDesc,
                                            origReduceLen,
                                            alpha,
                                            static_cast<const srcDataType* const __restrict__>(p_src_global),
                                            beta,
                                            static_cast<dstDataType* const __restrict__>(p_dst_global),
                                            static_cast<const int* const __restrict__>(nullptr),
                                            static_cast<int* const __restrict__>(indices_global));
};
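For the direct thread-wise method each thread owns one output slot and consumes GredThreadBufferLength elements of the reduce dimension per pass, so the prepare kernel pads dimension 0 up to GridSize * BlockSize and dimension 1 up to a multiple of the per-thread slice length. The sketch below replays that padding arithmetic on the host with assumed values (GridSize = 2, toReduceLen = 3000, and so on); it only illustrates the formulas above.

#include <cstdio>

int main()
{
    // Hypothetical sizes; in the wrapper these come from GridSize, BlockSize,
    // CK_PARAM_THREAD_BUFFER_LENGTH and the merged tensor lengths.
    const int GridSize     = 2;
    const int BlockSize    = 256;
    const int invariantLen = 1; // all dimensions reduced -> a single output value
    const int toReduceLen  = 3000;
    const int GredThreadBufferLength = 8;

    const int copySliceLen = GredThreadBufferLength;

    // Pad dim 0 so every launched thread maps to a (possibly padded) output slot,
    // and pad dim 1 up to a multiple of the per-thread slice length.
    const int srcPad1 = GridSize * BlockSize - invariantLen;
    const int srcPad2 =
        ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

    printf("srcPad1=%d srcPad2=%d\n", srcPad1, srcPad2); // 511, 0
    return 0;
}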
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp
0 → 100644
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

constexpr index_t num_toReduceDims  = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;

using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims  = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

static_assert(num_invariantDims > 0, "Not all dimensions are reduced for this kernel !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, int BlkGroupSize,
                                                             int inLength0, int inLength1, int inLength2,
                                                             int inLength3, int inLength4, int inLength5,
                                                             int inStride0, int inStride1, int inStride2,
                                                             int inStride3, int inStride4, int inStride5,
                                                             int outStride0, int outStride1, int outStride2,
                                                             int outStride3, int outStride4, int outStride5,
                                                             void* __restrict__ ws_global)
{
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto toReduceDimLengths  = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
    const auto invariantDimLengths = make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});

    auto src2dDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(invariantDimLengths), make_merge_transform(toReduceDimLengths)),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = GredThreadBufferLength;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad1 = GridSize * BlockSize - invariantLen;
        const auto srcPad2 =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 = transform_tensor_descriptor(
            src2dDesc,
            make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
                       make_pad_transform(toReduceLen, 0, srcPad2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        const auto dstPad = GridSize * BlockSize - invariantLen;

        auto dst1dDesc_2 = transform_tensor_descriptor(
            dst1dDesc,
            make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
            make_tuple(Sequence<0>{}),
            make_tuple(Sequence<0>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
    }
};

template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_toReduceDimLengths =
        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
    static constexpr auto ref_invariantDimLengths =
        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};

    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the DirectThreadWise and DirectWarpWise method
    using refType_src2dDesc_padded_12 = decltype(transform_tensor_descriptor(
        ref_src2dDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2), make_pad_transform(ref_toReduceLen, 0, 2)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(transform_tensor_descriptor(
        ref_dst1dDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, int BlkGroupSize, float alpha,
                                                     const void* __restrict__ p_src_global, float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
                                                                           srcDataType,
                                                                           dstDataType,
                                                                           compType,
                                                                           decltype(src2dDesc),
                                                                           decltype(dst1dDesc),
                                                                           op,
                                                                           nanPropaOpt,
                                                                           reduceIndicesOpt,
                                                                           true,
                                                                           true,
                                                                           GredThreadBufferLength>;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(src2dDesc,
                                            dst1dDesc,
                                            origReduceLen,
                                            alpha,
                                            static_cast<const srcDataType* const __restrict__>(p_src_global),
                                            beta,
                                            static_cast<dstDataType* const __restrict__>(p_dst_global),
                                            static_cast<const int* const __restrict__>(nullptr),
                                            static_cast<int* const __restrict__>(indices_global));
};
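All of these wrappers share one workspace buffer: the *_prepare kernel serializes the source 2D descriptor at offset 0 and the destination 1D descriptor at offset 2048, and the multiblock variants additionally place their partial-value buffer at offset 4096, with an optional indices buffer ws_buf2_bytes_offset bytes beyond that. The struct below is a rough host-side sketch of that layout; the name WorkspaceView and its helper methods are illustrative only and not part of the kernel interface.

#include <cstddef>
#include <cstdio>

// Sketch of the workspace layout implied by the fixed offsets used above:
//   [0, 2048)    serialized src2dDesc written by gridwise_generic_reduce_1_prepare
//   [2048, 4096) serialized dst1dDesc
//   [4096, ...)  ws_buf1 (partial values, multiblock path only), then optionally
//                ws_buf2 (indices) at ws_buf2_bytes_offset past ws_buf1
struct WorkspaceView
{
    char* base;
    void* src2dDesc() const { return base; }
    void* dst1dDesc() const { return base + 2048; }
    void* buf1() const { return base + 4096; }
    void* buf2(long ws_buf2_bytes_offset) const
    {
        return ws_buf2_bytes_offset > 0 ? static_cast<void*>(base + 4096 + ws_buf2_bytes_offset)
                                        : nullptr;
    }
};

int main()
{
    char dummy[8192] = {};
    WorkspaceView ws{dummy};
    printf("offsets: src2dDesc=%td dst1dDesc=%td buf1=%td\n",
           static_cast<char*>(ws.src2dDesc()) - dummy,
           static_cast<char*>(ws.dst1dDesc()) - dummy,
           static_cast<char*>(ws.buf1()) - dummy); // 0 2048 4096
    return 0;
}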
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp
0 → 100644
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, int BlkGroupSize,
                                                             int inLength0, int inLength1, int inLength2,
                                                             int inLength3, int inLength4, int inLength5,
                                                             int inStride0, int inStride1, int inStride2,
                                                             int inStride3, int inStride4, int inStride5,
                                                             void* __restrict__ ws_global)
{
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple(1);
    const auto tupleDstStrides = make_tuple(1);

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    auto dstDesc       = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto one_dim_srcDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(tupleSrcLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    auto src2dDesc = transform_tensor_descriptor(
        one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    constexpr int invariantLen = 1;
    const auto toReduceLen     = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
        const auto srcPad2 =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 = transform_tensor_descriptor(
            src2dDesc,
            make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
                       make_pad_transform(toReduceLen, 0, srcPad2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;

        auto dst1dDesc_2 = transform_tensor_descriptor(
            dstDesc,
            make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
            make_tuple(Sequence<0>{}),
            make_tuple(Sequence<0>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
    }
};

template <index_t srcDims>
struct get_ref_desc_types
{
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));

    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the DirectThreadWise and DirectWarpWise method
    using refType_src2dDesc_padded_12 = decltype(transform_tensor_descriptor(
        ref_src2dDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2), make_pad_transform(ref_toReduceLen, 0, 2)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dstDesc);
};

using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 = typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded    = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, int BlkGroupSize, float alpha,
                                                     const void* __restrict__ p_src_global, float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
                                                                         srcDataType,
                                                                         dstDataType,
                                                                         compType,
                                                                         decltype(src2dDesc),
                                                                         decltype(dst1dDesc),
                                                                         op,
                                                                         nanPropaOpt,
                                                                         reduceIndicesOpt,
                                                                         true,
                                                                         true,
                                                                         GredAccessesPerThreadInWarp>;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(src2dDesc,
                                            dst1dDesc,
                                            origReduceLen,
                                            alpha,
                                            static_cast<const srcDataType* const __restrict__>(p_src_global),
                                            beta,
                                            static_cast<dstDataType* const __restrict__>(p_dst_global),
                                            static_cast<const int* const __restrict__>(nullptr),
                                            static_cast<int* const __restrict__>(indices_global));
};
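The warp-wise method assigns one warp per output value, so the grid supplies GridSize * BlockSize / warpSize reducers and each pass consumes warpSize * GredAccessesPerThreadInWarp elements of the reduce dimension. A short sketch of that mapping arithmetic follows, using assumed values; warpSize is 64 on the gfx9 GPUs composable_kernel typically targets, but here it is only an example number.

#include <cstdio>

int main()
{
    // Hypothetical values standing in for the CK_PARAM_* tunables and tensor lengths.
    const int warpSize     = 64;
    const int GridSize     = 4;
    const int BlockSize    = 256;
    const int invariantLen = 1;
    const int toReduceLen  = 5000;
    const int GredAccessesPerThreadInWarp = 2;

    // One warp produces one output value; each pass a warp consumes copySliceLen elements.
    const int numWarps     = GridSize * BlockSize / warpSize;        // 16
    const int copySliceLen = warpSize * GredAccessesPerThreadInWarp; // 128

    // Padding mirrors the src2d_need_padding branch of the warp-wise prepare kernel.
    const int srcPad1 = numWarps - invariantLen;                     // 15
    const int srcPad2 =
        ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

    printf("numWarps=%d copySliceLen=%d srcPad1=%d srcPad2=%d\n",
           numWarps, copySliceLen, srcPad1, srcPad2);
    return 0;
}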
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp
0 → 100644
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
using
namespace
ck
;
using
srcDataType
=
typename
get_datatype_from_enum
<
static_cast
<
DataTypeEnum_t
>
(
CK_PARAM_SRC_DATATYPE
)
>::
type
;
using
dstDataType
=
typename
get_datatype_from_enum
<
static_cast
<
DataTypeEnum_t
>
(
CK_PARAM_DST_DATATYPE
)
>::
type
;
using
compType
=
typename
get_datatype_from_enum
<
static_cast
<
DataTypeEnum_t
>
(
CK_PARAM_REDUCE_COMPTYPE
)
>::
type
;
constexpr
index_t
BlockSize
=
CK_PARAM_BLOCKSIZE
;
// tunable
constexpr
index_t
srcDims
=
CK_PARAM_IN_DIMS
;
constexpr
index_t
dstDims
=
CK_PARAM_OUT_DIMS
;
constexpr
index_t
num_toReduceDims
=
CK_PARAM_NUM_TOREDUCE_DIMS
;
constexpr
index_t
num_invariantDims
=
srcDims
-
num_toReduceDims
;
using
invariantDims
=
typename
arithmetic_sequence_gen
<
0
,
num_invariantDims
,
1
>::
type
;
using
toReduceDims
=
typename
arithmetic_sequence_gen
<
num_invariantDims
,
srcDims
,
1
>::
type
;
constexpr
ReduceTensorOp_t
op
=
static_cast
<
ReduceTensorOp_t
>
(
CK_PARAM_REDUCE_OP
);
constexpr
NanPropagation_t
nanPropaOpt
=
CK_PARAM_NAN_PROPAGATE
==
0
?
NanPropagation_t
::
NOT_PROPAGATE_NAN
:
NanPropagation_t
::
PROPAGATE_NAN
;
constexpr
ReduceTensorIndices_t
reduceIndicesOpt
=
CK_PARAM_REDUCE_INDICES
==
0
?
ReduceTensorIndices_t
::
NO_INDICES
:
ReduceTensorIndices_t
::
FLATTENED_INDICES
;
constexpr
bool
src2d_need_padding
=
static_cast
<
bool
>
(
CK_PARAM_SRC2D_PADDING
);
constexpr
bool
dst1d_need_padding
=
static_cast
<
bool
>
(
CK_PARAM_DST1D_PADDING
);
static_assert
(
num_invariantDims
>
0
,
"Not all dimensins are reduced for this kernel !!"
);
constexpr
bool
indexable
=
reduce_binary_operator
<
compType
,
op
>::
indexable
;
constexpr
bool
need_indices
=
indexable
&&
(
reduceIndicesOpt
!=
ReduceTensorIndices_t
::
NO_INDICES
);
constexpr
index_t
GredAccessesPerThreadInWarp
=
CK_PARAM_ACCESSES_PER_THREAD_INWARP
;
// tunable
// helper functions using variadic template arguments
template
<
index_t
...
Ns
>
__device__
static
auto
make_tuple_from_array_and_index_seq
(
const
int
*
lengths
,
Sequence
<
Ns
...
>
)
{
return
make_tuple
(
static_cast
<
index_t
>
(
lengths
[
Ns
])...);
};
template
<
index_t
arraySize
>
__device__
static
auto
make_tuple_from_array
(
const
int
*
lengths
,
Number
<
arraySize
>
)
{
static_assert
(
arraySize
>=
1
&&
arraySize
<=
6
,
"The tensor should have 1 to 6 dimensions"
);
constexpr
auto
index_seq
=
typename
arithmetic_sequence_gen
<
0
,
arraySize
,
1
>::
type
{};
return
make_tuple_from_array_and_index_seq
(
lengths
,
index_seq
);
};
template
<
index_t
...
Ns
>
__device__
static
constexpr
auto
make_tuple_from_seq
(
Sequence
<
Ns
...
>
)
{
return
make_tuple
(
Ns
...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int inLength0,
                                                             int inLength1,
                                                             int inLength2,
                                                             int inLength3,
                                                             int inLength4,
                                                             int inLength5,
                                                             int inStride0,
                                                             int inStride1,
                                                             int inStride2,
                                                             int inStride3,
                                                             int inStride4,
                                                             int inStride5,
                                                             int outStride0,
                                                             int outStride1,
                                                             int outStride2,
                                                             int outStride3,
                                                             int outStride4,
                                                             int outStride5,
                                                             void* __restrict__ ws_global)
{
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto toReduceDimLengths  = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
    const auto invariantDimLengths = make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});

    auto src2dDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(invariantDimLengths), make_merge_transform(toReduceDimLengths)),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
        const auto srcPad2 =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;

        auto dst1dDesc_2 =
            transform_tensor_descriptor(dst1dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
    }
};
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_toReduceDimLengths =
        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
    static constexpr auto ref_invariantDimLengths =
        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};

    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the DirectThreadWise and DirectWarpWise method
    using refType_src2dDesc_padded_12 = decltype(
        transform_tensor_descriptor(ref_src2dDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
                                               make_pad_transform(ref_toReduceLen, 0, 2)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(
        transform_tensor_descriptor(ref_dst1dDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                    make_tuple(Sequence<0>{}),
                                    make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
                                                                         srcDataType,
                                                                         dstDataType,
                                                                         compType,
                                                                         decltype(src2dDesc),
                                                                         decltype(dst1dDesc),
                                                                         op,
                                                                         nanPropaOpt,
                                                                         reduceIndicesOpt,
                                                                         true,
                                                                         true,
                                                                         GredAccessesPerThreadInWarp>;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(nullptr),
        static_cast<int* const __restrict__>(indices_global));
};
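The two pad amounts computed in gridwise_generic_reduce_1_prepare follow a simple rule: the invariant length is padded up to one row per warp in the grid (GridSize * BlockSize / warpSize), and the to-reduce length is padded up to a multiple of the per-warp copy slice (warpSize * GredAccessesPerThreadInWarp). A minimal standalone sketch of that arithmetic, using hypothetical example values rather than anything from the kernel source:

#include <cassert>
#include <cstdio>

// Round `len` up to the next multiple of `tile` and return the amount of padding added.
int pad_to_multiple(int len, int tile) { return (len + tile - 1) / tile * tile - len; }

int main()
{
    // Hypothetical example values mirroring the prepare kernel above.
    const int warpSize = 64, BlockSize = 256, GridSize = 8, AccessesPerThreadInWarp = 2;
    const int invariantLen = 30, toReduceLen = 1000;

    const int copySliceLen = warpSize * AccessesPerThreadInWarp;             // 128
    const int srcPad1      = GridSize * BlockSize / warpSize - invariantLen; // pad rows to one per warp
    const int srcPad2      = pad_to_multiple(toReduceLen, copySliceLen);     // pad columns to a full slice

    assert((toReduceLen + srcPad2) % copySliceLen == 0);
    std::printf("srcPad1=%d srcPad2=%d\n", srcPad1, srcPad2); // prints srcPad1=2 srcPad2=24
    return 0;
}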
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp
0 → 100644
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
extern "C" __global__ void
gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global)
{
    (void)GridSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const auto tupleDstLengths = make_tuple(1);
    const auto tupleDstStrides = make_tuple(1);

    auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const index_t invariantLen = dstDesc.GetLength(Number<0>{});
    const index_t toReduceLen  = BlkGroupSize;

    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pass_through_transform(invariantLen),
                                                   make_pad_transform(toReduceLen, 0, srcPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(get_thread_local_1d_id() == 0)
        *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
};
struct get_ref_desc_types
{
    static constexpr auto ref_tupleDstLengths = make_tuple(8);
    static constexpr auto ref_dstDesc =
        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);

    static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{});
    static constexpr index_t ref_toReduceLen  = 8;

    static constexpr auto ref_src2dDesc =
        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dstDesc);

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(
        transform_tensor_descriptor(ref_src2dDesc,
                                    make_tuple(make_pass_through_transform(ref_invariantLen),
                                               make_pad_transform(ref_toReduceLen, 0, 2)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(
        transform_tensor_descriptor(ref_dstDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                    make_tuple(Sequence<0>{}),
                                    make_tuple(Sequence<0>{})));
};

using refType_src2dDesc           = typename get_ref_desc_types::refType_src2dDesc;
using refType_dst1dDesc           = typename get_ref_desc_types::refType_dst1dDesc;
using refType_src2dDesc_padded_34 = typename get_ref_desc_types::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded    = typename get_ref_desc_types::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)p_src_global;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
    void* ws_buf1_global    = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
                                                                   srcDataType,
                                                                   dstDataType,
                                                                   compType,
                                                                   decltype(src2dDesc),
                                                                   decltype(dst1dDesc),
                                                                   op,
                                                                   nanPropaOpt,
                                                                   reduceIndicesOpt,
                                                                   false,
                                                                   true,
                                                                   GredAccessesPerThreadInBlock>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 3 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(ws_buf2_global),
        static_cast<int* const __restrict__>(indices_global));
};
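Both kernels above address the workspace with fixed byte offsets: the serialized source descriptor sits at offset 0, the destination descriptor at offset 2048, and the partial results written by the first call start at offset 4096, with the optional index buffer placed ws_buf2_bytes_offset bytes after that. A small host-side sketch of that layout; the struct and function names here are illustrative, not taken from the source:

// Illustrative view of the workspace layout assumed by the kernels above.
struct WorkspaceView
{
    void* src2d_desc; // offset 0:    serialized src2dDesc
    void* dst1d_desc; // offset 2048: serialized dst1dDesc
    void* ws_buf1;    // offset 4096: partial reduction values from the first call
    void* ws_buf2;    // ws_buf1 + ws_buf2_bytes_offset: flattened indices (optional)
};

inline WorkspaceView make_workspace_view(void* ws_global, long ws_buf2_bytes_offset)
{
    char* base = static_cast<char*>(ws_global);
    WorkspaceView v{};
    v.src2d_desc = base;
    v.dst1d_desc = base + 2048;
    v.ws_buf1    = base + 4096;
    v.ws_buf2    = ws_buf2_bytes_offset > 0 ? base + 4096 + ws_buf2_bytes_offset : nullptr;
    return v;
}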
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp
0 → 100644
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int outLength0,
                                                             int outLength1,
                                                             int outLength2,
                                                             int outLength3,
                                                             int outLength4,
                                                             int outLength5,
                                                             int outStride0,
                                                             int outStride1,
                                                             int outStride2,
                                                             int outStride3,
                                                             int outStride4,
                                                             int outStride5,
                                                             void* __restrict__ ws_global)
{
    (void)GridSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const index_t invariantLen = dst1dDesc.GetLength(Number<0>{});
    const index_t toReduceLen  = BlkGroupSize;

    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pass_through_transform(invariantLen),
                                                   make_pad_transform(toReduceLen, 0, srcPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(get_thread_local_1d_id() == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
template <index_t dstDims>
struct get_ref_desc_types
{
    static constexpr auto ref_tupleDstLengths =
        make_tuple_from_seq(typename uniform_sequence_gen<dstDims, 8>::type{});
    static constexpr auto ref_dstDesc =
        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(ref_tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{});
    static constexpr index_t ref_toReduceLen  = 8;

    static constexpr auto ref_src2dDesc =
        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(
        transform_tensor_descriptor(ref_src2dDesc,
                                    make_tuple(make_pass_through_transform(ref_invariantLen),
                                               make_pad_transform(ref_toReduceLen, 0, 2)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(
        transform_tensor_descriptor(ref_dst1dDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                    make_tuple(Sequence<0>{}),
                                    make_tuple(Sequence<0>{})));
};

using refType_src2dDesc           = typename get_ref_desc_types<dstDims>::refType_src2dDesc;
using refType_dst1dDesc           = typename get_ref_desc_types<dstDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 = typename get_ref_desc_types<dstDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded    = typename get_ref_desc_types<dstDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)p_src_global;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
    void* ws_buf1_global    = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
                                                                   srcDataType,
                                                                   dstDataType,
                                                                   compType,
                                                                   decltype(src2dDesc),
                                                                   decltype(dst1dDesc),
                                                                   op,
                                                                   nanPropaOpt,
                                                                   reduceIndicesOpt,
                                                                   false,
                                                                   true,
                                                                   GredAccessesPerThreadInBlock>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 3 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(ws_buf2_global),
        static_cast<int* const __restrict__>(indices_global));
};
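The single make_merge_transform(tupleDstLengths) used above collapses the whole multi-dimensional output into the one invariant dimension of the destination view, so the merged length is the product of the per-dimension lengths and an element's position is its row-major linear offset. A self-contained sketch of that index arithmetic, for illustration only (this is not the CK descriptor machinery):

#include <array>
#include <cassert>
#include <cstddef>

// Row-major linearization over merged dimensions: this is what collapsing the whole
// output tensor with a single merge transform amounts to for the 1-D destination view.
template <std::size_t N>
std::size_t flatten(const std::array<std::size_t, N>& idx, const std::array<std::size_t, N>& lengths)
{
    std::size_t offset = 0;
    for(std::size_t d = 0; d < N; ++d)
        offset = offset * lengths[d] + idx[d];
    return offset;
}

int main()
{
    const std::array<std::size_t, 3> lengths{4, 5, 6}; // hypothetical output lengths
    std::size_t merged_len = 1;
    for(auto l : lengths)
        merged_len *= l; // merged length is the product of the per-dimension lengths
    assert(merged_len == 120);
    assert((flatten<3>({1, 2, 3}, lengths) == 45)); // 1*30 + 2*6 + 3
    return 0;
}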
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp
0 → 100644
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>; // this could be empty

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
extern "C" __global__ void
gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global)
{
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const auto tupleDstLengths = make_tuple(1);
    const auto tupleDstStrides = make_tuple(1);

    auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const index_t invariantLen = dstDesc.GetLength(Number<0>{});
    const index_t toReduceLen  = BlkGroupSize;

    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));

    constexpr auto copySliceLen = GredThreadBufferLength;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad1 = GridSize * BlockSize - invariantLen;
        const auto srcPad2 =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        const auto dstPad = GridSize * BlockSize - invariantLen;

        auto dst1dDesc_2 =
            transform_tensor_descriptor(dstDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
    }
};
struct get_ref_desc_types
{
    static constexpr auto ref_tupleDstLengths = make_tuple(8);
    static constexpr auto ref_dstDesc =
        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);

    static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{});
    static constexpr index_t ref_toReduceLen  = 8;

    static constexpr auto ref_src2dDesc =
        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dstDesc);

    // used by the DirectThreadWise and DirectWarpWise method
    using refType_src2dDesc_padded_12 = decltype(
        transform_tensor_descriptor(ref_src2dDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
                                               make_pad_transform(ref_toReduceLen, 0, 2)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(
        transform_tensor_descriptor(ref_dstDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                    make_tuple(Sequence<0>{}),
                                    make_tuple(Sequence<0>{})));
};

using refType_src2dDesc           = typename get_ref_desc_types::refType_src2dDesc;
using refType_dst1dDesc           = typename get_ref_desc_types::refType_dst1dDesc;
using refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded    = typename get_ref_desc_types::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)p_src_global;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
    void* ws_buf1_global    = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
                                                                           srcDataType,
                                                                           dstDataType,
                                                                           compType,
                                                                           decltype(src2dDesc),
                                                                           decltype(dst1dDesc),
                                                                           op,
                                                                           nanPropaOpt,
                                                                           reduceIndicesOpt,
                                                                           false,
                                                                           true,
                                                                           GredThreadBufferLength>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 3 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(ws_buf2_global),
        static_cast<int* const __restrict__>(indices_global));
};
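Functionally, this second call reduces, for every output element, the BlkGroupSize partial results that the multi-block first call left in the workspace, and blends the result into the destination. A plain CPU reference of that step for a sum reduction, given only as an illustration of the data flow (the actual kernel is the GridwiseReduction_xy_to_x_direct_threadwise instantiation above; the row-major layout of the partials and the alpha/beta convention are assumptions for this sketch):

#include <cassert>
#include <vector>

// CPU reference for the second reduction call: ws_buf1 is assumed to hold
// invariantLen x blkGroupSize partial sums in row-major order; each output element is
// the sum over its row, scaled by alpha and blended with the previous output via beta.
std::vector<float> reduce_second_call_ref(const std::vector<float>& ws_buf1,
                                          int invariantLen, int blkGroupSize,
                                          float alpha, float beta,
                                          const std::vector<float>& prev_out)
{
    std::vector<float> out(invariantLen);
    for(int i = 0; i < invariantLen; ++i)
    {
        float acc = 0.0f;
        for(int j = 0; j < blkGroupSize; ++j)
            acc += ws_buf1[i * blkGroupSize + j];
        out[i] = alpha * acc + beta * prev_out[i];
    }
    return out;
}

int main()
{
    const std::vector<float> partials{1, 2, 3, 4, 5, 6}; // 2 outputs x 3 block-groups
    const auto out = reduce_second_call_ref(partials, 2, 3, 1.0f, 0.0f, {0, 0});
    assert(out[0] == 6.0f && out[1] == 15.0f);
    return 0;
}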
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp
0 → 100644
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int outLength0,
                                                             int outLength1,
                                                             int outLength2,
                                                             int outLength3,
                                                             int outLength4,
                                                             int outLength5,
                                                             int outStride0,
                                                             int outStride1,
                                                             int outStride2,
                                                             int outStride3,
                                                             int outStride4,
                                                             int outStride5,
                                                             void* __restrict__ ws_global)
{
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const index_t invariantLen = dst1dDesc.GetLength(Number<0>{});
    const index_t toReduceLen  = BlkGroupSize;

    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));

    constexpr auto copySliceLen = GredThreadBufferLength;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad1 = GridSize * BlockSize - invariantLen;
        const auto srcPad2 =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        const auto dstPad = GridSize * BlockSize - invariantLen;

        auto dst1dDesc_2 =
            transform_tensor_descriptor(dst1dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
    }
};
template <index_t dstDims>
struct get_ref_desc_types
{
    static constexpr auto ref_tupleDstLengths =
        make_tuple_from_seq(typename uniform_sequence_gen<dstDims, 8>::type{});
    static constexpr auto ref_dstDesc =
        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(ref_tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{});
    static constexpr index_t ref_toReduceLen  = 8;

    static constexpr auto ref_src2dDesc =
        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);

    // used by the DirectThreadWise and DirectWarpWise method
    using refType_src2dDesc_padded_12 = decltype(
        transform_tensor_descriptor(ref_src2dDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
                                               make_pad_transform(ref_toReduceLen, 0, 2)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(
        transform_tensor_descriptor(ref_dst1dDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                    make_tuple(Sequence<0>{}),
                                    make_tuple(Sequence<0>{})));
};

using refType_src2dDesc           = typename get_ref_desc_types<dstDims>::refType_src2dDesc;
using refType_dst1dDesc           = typename get_ref_desc_types<dstDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 = typename get_ref_desc_types<dstDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded    = typename get_ref_desc_types<dstDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)p_src_global;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
    void* ws_buf1_global    = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
                                                                           srcDataType,
                                                                           dstDataType,
                                                                           compType,
                                                                           decltype(src2dDesc),
                                                                           decltype(dst1dDesc),
                                                                           op,
                                                                           nanPropaOpt,
                                                                           reduceIndicesOpt,
                                                                           false,
                                                                           true,
                                                                           GredThreadBufferLength>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 3 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(ws_buf2_global),
        static_cast<int* const __restrict__>(indices_global));
};
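CK_PARAM_SRC2D_PADDING and CK_PARAM_DST1D_PADDING are compile-time switches, so the host has to decide before compiling the wrapper whether the flattened lengths fit the chosen tiling exactly. A minimal sketch of what that host-side decision could look like for the thread-wise second call; both the helper name and the exact criterion are assumptions made for illustration, not code from this commit:

// Host-side sketch: decide whether the padded descriptor variants are required for the
// thread-wise second call. The helper name and the exact criterion are assumptions.
struct PaddingFlags
{
    bool src2d_need_padding;
    bool dst1d_need_padding;
};

inline PaddingFlags choose_padding_threadwise(int gridSize, int blockSize, int invariantLen,
                                              int blkGroupSize, int threadBufferLength)
{
    PaddingFlags flags{};
    // one reduction row per thread: pad the invariant length when the grid over-covers it
    flags.dst1d_need_padding = (gridSize * blockSize != invariantLen);
    // the to-reduce length (= blkGroupSize here) must fill whole per-thread buffers
    flags.src2d_need_padding =
        flags.dst1d_need_padding || (blkGroupSize % threadBufferLength != 0);
    return flags;
}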
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp
0 → 100644
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
extern "C" __global__ void
gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global)
{
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const auto tupleDstLengths = make_tuple(1);
    const auto tupleDstStrides = make_tuple(1);

    auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const index_t invariantLen = dstDesc.GetLength(Number<0>{});
    const index_t toReduceLen  = BlkGroupSize;

    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));

    constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
        const auto srcPad2 =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;

        auto dst1dDesc_2 =
            transform_tensor_descriptor(dstDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
    }
};
struct get_ref_desc_types
{
    static constexpr auto ref_tupleDstLengths = make_tuple(8);
    static constexpr auto ref_dstDesc =
        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);

    static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{});
    static constexpr index_t ref_toReduceLen  = 8;

    static constexpr auto ref_src2dDesc =
        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dstDesc);

    // used by the DirectThreadWise and DirectWarpWise method
    using refType_src2dDesc_padded_12 = decltype(
        transform_tensor_descriptor(ref_src2dDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
                                               make_pad_transform(ref_toReduceLen, 0, 2)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(
        transform_tensor_descriptor(ref_dstDesc,
                                    make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                    make_tuple(Sequence<0>{}),
                                    make_tuple(Sequence<0>{})));
};

using refType_src2dDesc           = typename get_ref_desc_types::refType_src2dDesc;
using refType_dst1dDesc           = typename get_ref_desc_types::refType_dst1dDesc;
using refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded    = typename get_ref_desc_types::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)p_src_global;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
    void* ws_buf1_global    = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
                                                                         srcDataType,
                                                                         dstDataType,
                                                                         compType,
                                                                         decltype(src2dDesc),
                                                                         decltype(dst1dDesc),
                                                                         op,
                                                                         nanPropaOpt,
                                                                         reduceIndicesOpt,
                                                                         false,
                                                                         true,
                                                                         GredAccessesPerThreadInWarp>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 3 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(ws_buf2_global),
        static_cast<int* const __restrict__>(indices_global));
};
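The three second-call variants in this commit differ mainly in how many reduction rows one launch covers: the thread-wise kernel assigns one row per thread, the warp-wise kernel one row per warp, and the block-wise kernel one row per block (which is why only the first two pad invariantLen, to GridSize * BlockSize and GridSize * BlockSize / warpSize respectively, while the block-wise prepare passes it through unchanged). A tiny illustrative helper capturing that mapping; it is not part of the source:

#include <cassert>

// Rows of the 2-D reduction view covered by one launch, per mapping strategy.
enum class Mapping { ThreadWise, WarpWise, BlockWise };

inline int rows_covered(Mapping m, int gridSize, int blockSize, int warpSize = 64)
{
    switch(m)
    {
    case Mapping::ThreadWise: return gridSize * blockSize;            // one row per thread
    case Mapping::WarpWise:   return gridSize * blockSize / warpSize; // one row per warp
    case Mapping::BlockWise:  return gridSize;                        // one row per block
    }
    return 0;
}

int main()
{
    assert(rows_covered(Mapping::ThreadWise, 8, 256) == 2048);
    assert(rows_covered(Mapping::WarpWise, 8, 256) == 32);
    assert(rows_covered(Mapping::BlockWise, 8, 256) == 8);
    return 0;
}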