gaoqiong / composable_kernel / Commits / 963e4a71

Commit 963e4a71 (unverified), authored Oct 27, 2022 by rocking5566; committed by GitHub on Oct 27, 2022.

Merge branch 'develop' into conv_quant_int8

Parents: ad29b25b, 0ee3aea1
Changes: 193
Showing 20 changed files with 580 additions and 351 deletions (+580, -351).
example/12_reduce/reduce_blockwise.cpp (+5, -1)
example/12_reduce/reduce_blockwise_impl.hpp (+16, -14)
example/12_reduce/reduce_blockwise_two_call.cpp (+26, -26)
example/12_reduce/reduce_example_common.hpp (+7, -6)
example/12_reduce/reduce_multiblock_atomic_add.cpp (+5, -1)
example/12_reduce/reduce_multiblock_atomic_add_impl.hpp (+16, -14)
include/ck/tensor_operation/gpu/device/device_reduce.hpp (+19, -13)
include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp (+43, -37)
include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp (+36, -30)
include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp (+139, -56)
library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp (+10, -6)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp (+74, -21)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp (+10, -67)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp (+0, -59)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp (+27, -0)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp (+31, -0)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp (+27, -0)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp (+31, -0)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp (+31, -0)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp (+27, -0)
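Most of the diffs below follow one pattern: the DeviceReduce interface and its callers move from runtime-sized std::vector arguments to fixed-size std::array arguments, with Rank and NumReduceDim promoted to compile-time template parameters. The sketch below only illustrates the resulting calling convention; ReduceArgsSketch and the example values are placeholders, not declarations from this commit.

// Illustrative sketch of the array-based argument shape implied by this change.
#include <array>
#include <cstdint>

using index_t = int32_t;

template <index_t Rank, index_t NumReduceDim>
struct ReduceArgsSketch
{
    // Same rule the commit introduces: a full reduction still keeps one output dim.
    static constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;

    std::array<index_t, Rank> inLengths;
    std::array<index_t, Rank> inStrides;
    std::array<index_t, NumOutDim> outLengths;
    std::array<index_t, NumOutDim> outStrides;
    std::array<int, NumReduceDim> reduceDims;
};

int main()
{
    // A rank-4 tensor reduced over its last two dimensions -> 2 output dimensions.
    ReduceArgsSketch<4, 2> args{};
    args.inLengths  = {16, 64, 32, 10};
    args.reduceDims = {2, 3};
    return 0;
}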
example/12_reduce/reduce_blockwise.cpp

@@ -140,6 +140,10 @@ bool reduce_blockwise_test(bool do_verification,
         if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
             return;

+        std::array<int, ShapeType::NumReduceDim_> arrReduceDims;
+
+        std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
+
         result = reduce_blockwise_impl<InOutDataType,
                                        AccDataType,
                                        ReduceOpId,
@@ -147,7 +151,7 @@ bool reduce_blockwise_test(bool do_verification,
                                        ShapeType::NumReduceDim_,
                                        PropagateNan,
                                        OutputIndex>(
-            do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta);
+            do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta);

         matched = true;
     });
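The pattern added here — a runtime size check followed by std::copy into a fixed-size array — is how the examples in this commit bridge user-supplied std::vector data to the new std::array-based interface. A self-contained sketch of that pattern (names and values are illustrative, not the exact example code):

#include <algorithm>
#include <array>
#include <iostream>
#include <vector>

constexpr int NumReduceDim = 2; // stands in for ShapeType::NumReduceDim_

int main()
{
    std::vector<int> reduceDims{3, 4}; // runtime input, e.g. parsed from the command line

    // Bail out (the examples simply return) when the runtime count does not match
    // the compile-time NumReduceDim of the selected shape.
    if(reduceDims.size() != NumReduceDim)
        return 1;

    std::array<int, NumReduceDim> arrReduceDims{};
    std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());

    std::cout << arrReduceDims[0] << ", " << arrReduceDims[1] << std::endl;
    return 0;
}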
example/12_reduce/reduce_blockwise_impl.hpp

@@ -30,7 +30,7 @@ int reduce_blockwise_impl(bool do_verification,
                           int init_method,
                           bool time_kernel,
                           const std::vector<size_t>& inLengths,
-                          const std::vector<int>& reduceDims,
+                          const std::array<int, NumReduceDim>& reduceDims,
                           float alpha,
                           float beta)
@@ -38,6 +38,8 @@ int reduce_blockwise_impl(bool do_verification,
     using namespace ck;
     using namespace ck::tensor_operation::device;

+    constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
+
     constexpr bool op_support_indices =
         (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
          ReduceOpId == ReduceTensorOp::AMAX);
@@ -143,7 +145,7 @@ int reduce_blockwise_impl(bool do_verification,
     std::vector<size_t> outLengths;

-    std::vector<int> invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
+    auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);

     if(invariantDims.empty())
         outLengths.push_back(1);
@@ -256,22 +258,22 @@ int reduce_blockwise_impl(bool do_verification,
                       acc_elementwise_op);
     };

-    std::vector<ck::index_t> i_inLengths;
-    std::vector<ck::index_t> i_inStrides;
-    std::vector<ck::index_t> i_outLengths;
-    std::vector<ck::index_t> i_outStrides;
+    std::array<index_t, Rank> arrInLengths;
+    std::array<index_t, Rank> arrInStrides;
+    std::array<index_t, NumOutDim> arrOutLengths;
+    std::array<index_t, NumOutDim> arrOutStrides;

-    i_inLengths.assign(inLengths.begin(), inLengths.end());
-    i_inStrides.assign(inStrides.begin(), inStrides.end());
-    i_outLengths.assign(outLengths.begin(), outLengths.end());
-    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin());
+    std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin());
+    std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
+    std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());

     auto reduce = DeviceReduceInstance{};

-    auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
-                                                   i_inStrides,
-                                                   i_outLengths,
-                                                   i_outStrides,
+    auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
+                                                   arrInStrides,
+                                                   arrOutLengths,
+                                                   arrOutStrides,
                                                    reduceDims,
                                                    alpha,
                                                    beta,
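The newly introduced NumOutDim constant handles the corner case where every dimension is reduced: the output then still carries one (length-1) dimension rather than zero. A small sketch of the rule and the matching outLengths handling, under assumed example ranks:

#include <array>
#include <cassert>

template <int Rank, int NumReduceDim>
constexpr int num_out_dim()
{
    // Same rule as the added constant: a full reduction keeps a single output dim.
    return (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
}

int main()
{
    static_assert(num_out_dim<4, 2>() == 2, "partial reduction keeps the invariant dims");
    static_assert(num_out_dim<3, 3>() == 1, "full reduction still has one output dim");

    // When all dims are reduced, the example code pushes a single output length of 1.
    std::array<int, num_out_dim<3, 3>()> outLengths{1};
    assert(outLengths[0] == 1);
    return 0;
}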
example/12_reduce/reduce_blockwise_two_call.cpp

@@ -90,15 +90,15 @@ static bool time_kernel;
 int main(int argc, char* argv[])
 {
     // used by the device reduction
-    const std::vector<int> reduceDims_1    = {4};
-    const std::vector<int> invariantDims_1 = {0, 1, 2, 3};
+    const std::array<int, 1> reduceDims_1 = {4};
+    // const std::array<int, 4> invariantDims_1 = {0, 1, 2, 3};

-    const std::vector<int> reduceDims_2    = {3};
-    const std::vector<int> invariantDims_2 = {0, 1, 2};
+    const std::array<int, 1> reduceDims_2 = {3};
+    // const std::array<int, 3> invariantDims_2 = {0, 1, 2};

     // used by the host reduction
-    const std::vector<int> reduceDims    = {3, 4};
-    const std::vector<int> invariantDims = {0, 1, 2};
+    const std::array<int, 2> reduceDims    = {3, 4};
+    const std::array<int, 3> invariantDims = {0, 1, 2};

     const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
@@ -214,26 +214,26 @@ int main(int argc, char* argv[])
                       acc_elementwise_op);
     };

-    std::vector<ck::index_t> i_inLengths_1;
-    std::vector<ck::index_t> i_inStrides_1;
-    std::vector<ck::index_t> i_inLengths_2;
-    std::vector<ck::index_t> i_inStrides_2;
-    std::vector<ck::index_t> i_outLengths;
-    std::vector<ck::index_t> i_outStrides;
+    std::array<index_t, 5> arrInLengths_1;
+    std::array<index_t, 5> arrInStrides_1;
+    std::array<index_t, 4> arrInLengths_2;
+    std::array<index_t, 4> arrInStrides_2;
+    std::array<index_t, 3> arrOutLengths;
+    std::array<index_t, 3> arrOutStrides;

-    i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end());
-    i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end());
-    i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end());
-    i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end());
-    i_outLengths.assign(outLengths.begin(), outLengths.end());
-    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    std::copy(inLengths_1.begin(), inLengths_1.end(), arrInLengths_1.begin());
+    std::copy(inStrides_1.begin(), inStrides_1.end(), arrInStrides_1.begin());
+    std::copy(inLengths_2.begin(), inLengths_2.end(), arrInLengths_2.begin());
+    std::copy(inStrides_2.begin(), inStrides_2.end(), arrInStrides_2.begin());
+    std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
+    std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());

     auto reduce_1 = DeviceReduceInstance_1{};

-    auto argument_ptr_1 = reduce_1.MakeArgumentPointer(i_inLengths_1,
-                                                       i_inStrides_1,
-                                                       i_inLengths_2,
-                                                       i_inStrides_2,
+    auto argument_ptr_1 = reduce_1.MakeArgumentPointer(arrInLengths_1,
+                                                       arrInStrides_1,
+                                                       arrInLengths_2,
+                                                       arrInStrides_2,
                                                        reduceDims_1,
                                                        1.0f,
                                                        0.0f,
@@ -255,10 +255,10 @@ int main(int argc, char* argv[])
     auto reduce_2 = DeviceReduceInstance_2{};

-    auto argument_ptr_2 = reduce_2.MakeArgumentPointer(i_inLengths_2,
-                                                       i_inStrides_2,
-                                                       i_outLengths,
-                                                       i_outStrides,
+    auto argument_ptr_2 = reduce_2.MakeArgumentPointer(arrInLengths_2,
+                                                       arrInStrides_2,
+                                                       arrOutLengths,
+                                                       arrOutStrides,
                                                        reduceDims_2,
                                                        alpha,
                                                        beta,
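For reference, the shapes this two-call example works with (taken from its constants): the 5-D input {64, 320, 80, 4, 128} is first reduced over dimension 4, producing a 4-D intermediate, which is then reduced over dimension 3. A minimal sketch of that shape bookkeeping only; drop_dim is a hypothetical helper, not part of the example:

#include <cstddef>
#include <iostream>
#include <vector>

// Remove one dimension from a length vector.
std::vector<size_t> drop_dim(const std::vector<size_t>& lengths, int dim)
{
    std::vector<size_t> out;
    for(int i = 0; i < static_cast<int>(lengths.size()); ++i)
        if(i != dim)
            out.push_back(lengths[i]);
    return out;
}

int main()
{
    const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};

    const auto inLengths_2 = drop_dim(inLengths_1, 4); // {64, 320, 80, 4} after the first reduce
    const auto outLengths  = drop_dim(inLengths_2, 3); // {64, 320, 80} after the second reduce

    std::cout << outLengths.size() << " output dims" << std::endl; // prints 3
    return 0;
}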
example/12_reduce/reduce_example_common.hpp

@@ -5,11 +5,10 @@
 #include "ck/ck.hpp"

-template <ck::index_t Rank, ck::index_t NumReduceDim>
-std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
+template <int Rank, int NumReduceDim>
+static inline std::array<int, Rank - NumReduceDim>
+get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
 {
-    assert(NumReduceDim == reduceDims.size());
-
     int reduceFlag = 0;

     // flag the bits for the reduceDims
@@ -18,13 +17,15 @@ std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
         reduceFlag |= 1 << reduceDims[i];
     };

-    std::vector<int> invariantDims;
+    std::array<int, Rank - NumReduceDim> invariantDims;

     // collect invariant dimensions
+    int dim = 0;
     for(int i = 0; i < Rank; i++)
         if((reduceFlag & (1 << i)) == 0)
         {
-            invariantDims.push_back(i);
+            invariantDims[dim] = i;
+            dim++;
         };

     return invariantDims;
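As rewritten above, get_invariant_dims flags the reduced dimensions in a bit mask and fills a fixed-size array with whatever is left. A self-contained version of the same idea, checked against the constants used by the two-call example (reducing {3, 4} out of rank 5 leaves {0, 1, 2}); the _sketch suffix marks it as an illustration rather than the header itself:

#include <array>
#include <cassert>

template <int Rank, int NumReduceDim>
std::array<int, Rank - NumReduceDim>
get_invariant_dims_sketch(const std::array<int, NumReduceDim>& reduceDims)
{
    int reduceFlag = 0;

    // flag the bits for the reduced dimensions
    for(int i = 0; i < NumReduceDim; i++)
        reduceFlag |= 1 << reduceDims[i];

    std::array<int, Rank - NumReduceDim> invariantDims{};

    // collect the dimensions whose bit is not set
    int dim = 0;
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) == 0)
        {
            invariantDims[dim] = i;
            dim++;
        }

    return invariantDims;
}

int main()
{
    const auto inv = get_invariant_dims_sketch<5, 2>({3, 4});
    assert(inv[0] == 0 && inv[1] == 1 && inv[2] == 2);
    return 0;
}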
example/12_reduce/reduce_multiblock_atomic_add.cpp

@@ -138,13 +138,17 @@ bool reduce_multiblock_atomic_add_test(bool do_verification,
         if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
             return;

+        std::array<int, ShapeType::NumReduceDim_> a_reduceDims;
+
+        std::copy(reduceDims.begin(), reduceDims.end(), a_reduceDims.begin());
+
         result = reduce_multiblock_atomic_add_impl<InOutDataType,
                                                    AccDataType,
                                                    ReduceOpId,
                                                    ShapeType::Rank_,
                                                    ShapeType::NumReduceDim_,
                                                    PropagateNan>(
-            do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta);
+            do_verification, init_method, time_kernel, inLengths, a_reduceDims, alpha, beta);

         matched = true;
     });
example/12_reduce/reduce_multiblock_atomic_add_impl.hpp

@@ -29,7 +29,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
                                       int init_method,
                                       bool time_kernel,
                                       const std::vector<size_t>& inLengths,
-                                      const std::vector<int>& reduceDims,
+                                      const std::array<int, NumReduceDim>& reduceDims,
                                       float alpha,
                                       float beta)
@@ -37,6 +37,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
     using namespace ck;
     using namespace ck::tensor_operation::device;

+    constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
+
     constexpr bool op_support_atomic_add =
         (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG);
@@ -84,7 +86,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
     std::vector<size_t> outLengths;

-    std::vector<int> invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
+    auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);

     if(invariantDims.empty())
         outLengths.push_back(1);
@@ -169,22 +171,22 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
                       acc_elementwise_op);
     };

-    std::vector<ck::index_t> i_inLengths;
-    std::vector<ck::index_t> i_inStrides;
-    std::vector<ck::index_t> i_outLengths;
-    std::vector<ck::index_t> i_outStrides;
+    std::array<index_t, Rank> arrInLengths;
+    std::array<index_t, Rank> arrInStrides;
+    std::array<index_t, NumOutDim> arrOutLengths;
+    std::array<index_t, NumOutDim> arrOutStrides;

-    i_inLengths.assign(inLengths.begin(), inLengths.end());
-    i_inStrides.assign(inStrides.begin(), inStrides.end());
-    i_outLengths.assign(outLengths.begin(), outLengths.end());
-    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin());
+    std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin());
+    std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
+    std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());

     auto reduce = DeviceReduceInstance{};

-    auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
-                                                   i_inStrides,
-                                                   i_outLengths,
-                                                   i_outStrides,
+    auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
+                                                   arrInStrides,
+                                                   arrOutLengths,
+                                                   arrOutStrides,
                                                    reduceDims,
                                                    alpha,
                                                    beta,
include/ck/tensor_operation/gpu/device/device_reduce.hpp

@@ -3,27 +3,30 @@
 #pragma once

-#include <vector>
+#include <array>
 #include <memory>
-#include <iostream>

-#include "ck/utility/common_header.hpp"
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {

-template <typename InElementwiseOperation, typename AccElementwiseOperation>
+template <index_t Rank,
+          index_t NumReduceDim,
+          typename InElementwiseOperation,
+          typename AccElementwiseOperation>
 struct DeviceReduce : public BaseOperator
 {
+    static constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
+
     virtual std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<index_t> inLengths,
-                        const std::vector<index_t> inStrides,
-                        const std::vector<index_t> outLengths,
-                        const std::vector<index_t> outStrides,
-                        const std::vector<int> reduceDims,
+    MakeArgumentPointer(const std::array<index_t, Rank> inLengths,
+                        const std::array<index_t, Rank> inStrides,
+                        const std::array<index_t, NumOutDim> outLengths,
+                        const std::array<index_t, NumOutDim> outStrides,
+                        const std::array<int, NumReduceDim> reduceDims,
                         float alpha,
                         float beta,
                         const void* in_dev,
@@ -36,9 +39,12 @@ struct DeviceReduce : public BaseOperator
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

-template <typename InElementwiseOperation, typename AccElementwiseOperation>
-using DeviceReducePtr =
-    std::unique_ptr<DeviceReduce<InElementwiseOperation, AccElementwiseOperation>>;
+template <index_t Rank,
+          index_t NumReduceDim,
+          typename InElementwiseOperation,
+          typename AccElementwiseOperation>
+using DeviceReducePtr = std::unique_ptr<
+    DeviceReduce<Rank, NumReduceDim, InElementwiseOperation, AccElementwiseOperation>>;

 } // namespace device
 } // namespace tensor_operation
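With Rank and NumReduceDim now part of the base class, every derived device operation and every DeviceReducePtr alias carries the tensor rank in its type, and NumOutDim can be derived once in the base. A reduced sketch of that structure follows; it is not the actual CK declaration, which also carries the elementwise-operation types and the full argument list:

#include <array>
#include <cstdint>
#include <memory>

using index_t = int32_t;

struct BaseOperatorSketch { virtual ~BaseOperatorSketch() = default; };

template <index_t Rank, index_t NumReduceDim>
struct DeviceReduceSketch : public BaseOperatorSketch
{
    static constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;

    // The real interface also takes strides, reduceDims, alpha/beta and device pointers.
    virtual void MakeArgument(const std::array<index_t, Rank>& inLengths,
                              const std::array<index_t, NumOutDim>& outLengths) = 0;
};

template <index_t Rank, index_t NumReduceDim>
using DeviceReduceSketchPtr = std::unique_ptr<DeviceReduceSketch<Rank, NumReduceDim>>;

int main() { return 0; } // compile-only sketch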
include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp

@@ -5,9 +5,8 @@
 #include <iostream>
 #include <sstream>
+#include <array>

-#include "ck/utility/common_header.hpp"
-#include "ck/utility/reduction_operator.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
@@ -41,7 +40,8 @@ template <typename InDataType,
           index_t InSrcVectorDim,
           index_t InSrcVectorSize,
           index_t OutDstVectorSize>
-struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
+struct DeviceReduceMultiBlock
+    : public DeviceReduce<Rank, NumReduceDim, InElementwiseOperation, AccElementwiseOperation>
 {
     static_assert(Rank <= 6, "Bigger Rank size is not supported!");
     static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
@@ -58,8 +58,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;

-    static constexpr index_t numSrcDim = Rank;
-    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
+    static constexpr index_t NumSrcDim = Rank;
+    static constexpr index_t NumDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
     static constexpr bool reduceAllDim = (NumInvariantDim == 0);

     // So far, only AtomicAdd is considered, other Atomic Operation like AtomicMax can be added
@@ -81,13 +81,15 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
     static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

-    static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths,
-                                    const std::vector<index_t>& inStrides,
+    static auto MakeSrc2dDescriptor(const std::array<index_t, Rank>& inLengths,
+                                    const std::array<index_t, Rank>& inStrides,
                                     int blkGroupSize,
                                     int numBlockTileIteration)
     {
-        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
-        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
+        const auto tupleSrcLengths =
+            generate_tuple([&](auto I) { return inLengths[I]; }, Number<Rank>{});
+        const auto tupleSrcStrides =
+            generate_tuple([&](auto I) { return inStrides[I]; }, Number<Rank>{});

         const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
@@ -97,7 +99,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
         const auto one_dim_inDesc = transform_tensor_descriptor(
             inDesc,
             make_tuple(make_merge_transform(tupleSrcLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}),
             make_tuple(Sequence<0>{}));

         return transform_tensor_descriptor(one_dim_inDesc,
@@ -111,10 +113,10 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
         using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
         using ReduceDims    = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

-        const auto reduceDimLengths    = make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
-        const auto invariantDimLengths = make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
+        const auto reduceDimLengths = generate_tuple(
+            [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number<NumReduceDim>{});
+        const auto invariantDimLengths =
+            generate_tuple([&](auto I) { return inLengths[I]; }, Number<NumInvariantDim>{});

         return transform_tensor_descriptor(
             inDesc,
@@ -143,18 +145,20 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
         return (in_grid_desc_m_k_padded);
     };

-    static auto MakeDst1dDescriptor(const std::vector<index_t>& outLengths,
-                                    const std::vector<index_t>& outStrides)
+    static auto MakeDst1dDescriptor(const std::array<index_t, NumDstDim>& outLengths,
+                                    const std::array<index_t, NumDstDim>& outStrides)
     {
-        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
-        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
+        const auto tupleDstLengths =
+            generate_tuple([&](auto I) { return outLengths[I]; }, Number<NumDstDim>{});
+        const auto tupleDstStrides =
+            generate_tuple([&](auto I) { return outStrides[I]; }, Number<NumDstDim>{});

         auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

         auto out_grid_desc_m = transform_tensor_descriptor(
             outDesc,
             make_tuple(make_merge_transform(tupleDstLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}),
             make_tuple(Sequence<0>{}));

         const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
@@ -170,18 +174,20 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
         return (out_grid_desc_m_padded);
     };

-    static auto MakeDst1dDescriptorForBufferSet(const std::vector<index_t>& outLengths,
-                                                const std::vector<index_t>& outStrides)
+    static auto MakeDst1dDescriptorForBufferSet(const std::array<index_t, NumDstDim>& outLengths,
+                                                const std::array<index_t, NumDstDim>& outStrides)
     {
-        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
-        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
+        const auto tupleDstLengths =
+            generate_tuple([&](auto I) { return outLengths[I]; }, Number<NumDstDim>{});
+        const auto tupleDstStrides =
+            generate_tuple([&](auto I) { return outStrides[I]; }, Number<NumDstDim>{});

         auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

         auto out_grid_desc_m = transform_tensor_descriptor(
             outDesc,
             make_tuple(make_merge_transform(tupleDstLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}),
             make_tuple(Sequence<0>{}));

         const auto length = out_grid_desc_m.GetLength(Number<0>{});
@@ -198,11 +204,11 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<index_t> inLengths,
-                 const std::vector<index_t> inStrides,
-                 const std::vector<index_t> outLengths,
-                 const std::vector<index_t> outStrides,
-                 const std::vector<int> reduceDims,
+        Argument(const std::array<index_t, Rank> inLengths,
+                 const std::array<index_t, Rank> inStrides,
+                 const std::array<index_t, NumDstDim> outLengths,
+                 const std::array<index_t, NumDstDim> outStrides,
+                 const std::array<int, NumReduceDim> reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,
@@ -272,10 +278,10 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
                 math::integer_least_multiple(invariant_total_length, BlockSize) / BlockSize;
         }

-        std::vector<index_t> inLengths_;
-        std::vector<index_t> inStrides_;
-        std::vector<index_t> outLengths_;
-        std::vector<index_t> outStrides_;
+        std::array<index_t, Rank> inLengths_;
+        std::array<index_t, Rank> inStrides_;
+        std::array<index_t, NumDstDim> outLengths_;
+        std::array<index_t, NumDstDim> outStrides_;

         AccDataType alpha_;
         AccDataType beta_;
@@ -459,11 +465,11 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
     };

     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<index_t> inLengths,
-                        const std::vector<index_t> inStrides,
-                        const std::vector<index_t> outLengths,
-                        const std::vector<index_t> outStrides,
-                        const std::vector<int> reduceDims,
+    MakeArgumentPointer(const std::array<index_t, Rank> inLengths,
+                        const std::array<index_t, Rank> inStrides,
+                        const std::array<index_t, NumDstDim> outLengths,
+                        const std::array<index_t, NumDstDim> outStrides,
+                        const std::array<int, NumReduceDim> reduceDims,
                         float alpha,
                         float beta,
                         const void* in_dev,
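The descriptor helpers above now build their length/stride tuples element by element from the std::array inputs via generate_tuple with an index lambda, instead of make_tuple_from_array. The sketch below shows the same idea in portable C++ using std::index_sequence; it is an analogy for what the CK utility does, not the utility itself:

#include <array>
#include <cstddef>
#include <tuple>
#include <utility>

// Build a std::tuple whose I-th element is f(I), for I = 0..N-1.
template <typename F, std::size_t... Is>
auto generate_tuple_sketch(F&& f, std::index_sequence<Is...>)
{
    return std::make_tuple(f(std::integral_constant<std::size_t, Is>{})...);
}

int main()
{
    std::array<int, 3> lengths{8, 16, 32};

    // Analogous to: generate_tuple([&](auto I) { return inLengths[I]; }, Number<Rank>{});
    auto tupleLengths =
        generate_tuple_sketch([&](auto I) { return lengths[I]; }, std::make_index_sequence<3>{});

    static_assert(std::tuple_size<decltype(tupleLengths)>::value == 3, "one element per dim");
    return std::get<0>(tupleLengths) == 8 ? 0 : 1;
}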
include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp

@@ -5,6 +5,7 @@
 #include <iostream>
 #include <sstream>
+#include <array>

 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
@@ -34,7 +35,8 @@ template <typename InDataType,
           index_t InSrcVectorDim,
           index_t InSrcVectorSize,
           index_t OutDstVectorSize>
-struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
+struct DeviceReduceThreadWise
+    : public DeviceReduce<Rank, NumReduceDim, InElementwiseOperation, AccElementwiseOperation>
 {
     static_assert(Rank <= 6, "Bigger Rank size is not supported!");
@@ -49,18 +51,20 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccE
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;

-    static constexpr index_t numSrcDim = Rank;
-    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
+    static constexpr index_t NumSrcDim = Rank;
+    static constexpr index_t NumDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
     static constexpr bool reduceAllDim = (NumInvariantDim == 0);

     static constexpr index_t M_BlockTileSize = BlockSize * MThreadSliceSize;
     static constexpr index_t K_BlockTileSize = 1 * KThreadSliceSize;

-    static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths,
-                                    const std::vector<index_t>& inStrides)
+    static auto MakeSrc2dDescriptor(const std::array<index_t, Rank>& inLengths,
+                                    const std::array<index_t, Rank>& inStrides)
     {
-        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
-        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
+        const auto tupleSrcLengths =
+            generate_tuple([&](auto I) { return inLengths[I]; }, Number<Rank>{});
+        const auto tupleSrcStrides =
+            generate_tuple([&](auto I) { return inStrides[I]; }, Number<Rank>{});

         const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
@@ -70,7 +74,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccE
         const auto one_dim_inDesc = transform_tensor_descriptor(
             inDesc,
             make_tuple(make_merge_transform(tupleSrcLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}),
             make_tuple(Sequence<0>{}));

         return transform_tensor_descriptor(one_dim_inDesc,
@@ -84,10 +88,10 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccE
         using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
         using ReduceDims    = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

-        const auto reduceDimLengths    = make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
-        const auto invariantDimLengths = make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
+        const auto reduceDimLengths = generate_tuple(
+            [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number<NumReduceDim>{});
+        const auto invariantDimLengths =
+            generate_tuple([&](auto I) { return inLengths[I]; }, Number<NumInvariantDim>{});

         return transform_tensor_descriptor(
             inDesc,
@@ -116,18 +120,20 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccE
         return (in_grid_desc_m_k_padded);
     };

-    static auto MakeDst1dDescriptor(const std::vector<index_t>& outLengths,
-                                    const std::vector<index_t>& outStrides)
+    static auto MakeDst1dDescriptor(const std::array<index_t, NumDstDim>& outLengths,
+                                    const std::array<index_t, NumDstDim>& outStrides)
     {
-        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
-        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
+        const auto tupleDstLengths =
+            generate_tuple([&](auto I) { return outLengths[I]; }, Number<NumDstDim>{});
+        const auto tupleDstStrides =
+            generate_tuple([&](auto I) { return outStrides[I]; }, Number<NumDstDim>{});

         auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

         auto out_grid_desc_m = transform_tensor_descriptor(
             outDesc,
             make_tuple(make_merge_transform(tupleDstLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}),
             make_tuple(Sequence<0>{}));

         const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
@@ -145,11 +151,11 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccE
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<index_t> inLengths,
-                 const std::vector<index_t> inStrides,
-                 const std::vector<index_t> outLengths,
-                 const std::vector<index_t> outStrides,
-                 const std::vector<int> reduceDims,
+        Argument(const std::array<index_t, Rank> inLengths,
+                 const std::array<index_t, Rank> inStrides,
+                 const std::array<index_t, NumDstDim> outLengths,
+                 const std::array<index_t, NumDstDim> outStrides,
+                 const std::array<int, NumReduceDim> reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,
@@ -187,10 +193,10 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccE
                 M_BlockTileSize;
         }

-        std::vector<index_t> inLengths_;
-        std::vector<index_t> inStrides_;
-        std::vector<index_t> outLengths_;
-        std::vector<index_t> outStrides_;
+        std::array<index_t, Rank> inLengths_;
+        std::array<index_t, Rank> inStrides_;
+        std::array<index_t, NumDstDim> outLengths_;
+        std::array<index_t, NumDstDim> outStrides_;

         AccDataType alpha_;
         AccDataType beta_;
@@ -321,11 +327,11 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccE
     };

     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<index_t> inLengths,
-                        const std::vector<index_t> inStrides,
-                        const std::vector<index_t> outLengths,
-                        const std::vector<index_t> outStrides,
-                        const std::vector<int> reduceDims,
+    MakeArgumentPointer(const std::array<index_t, Rank> inLengths,
+                        const std::array<index_t, Rank> inStrides,
+                        const std::array<index_t, NumDstDim> outLengths,
+                        const std::array<index_t, NumDstDim> outStrides,
+                        const std::array<int, NumReduceDim> reduceDims,
                         float alpha,
                         float beta,
                         const void* in_dev,
include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp

@@ -8,12 +8,9 @@
 #include "ck/utility/reduction_operator.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
-#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
 #include "ck/tensor_operation/gpu/device/device_softmax.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
@@ -50,29 +47,80 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
     virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }

-    // Used for freeloading of some handy functions from DeviceReduceMultiBlock
-    using Reduction = DeviceReduceMultiBlock<InDataType,
-                                             AccDataType,
-                                             OutDataType,
-                                             Rank,
-                                             NumReduceDim,
-                                             reduce::Add,
-                                             InElementwiseOp,
-                                             AccElementwiseOp,
-                                             InMemoryDataOperationEnum::Set,
-                                             false, // PropagateNan
-                                             false, // OutputIndex
-                                             false, // HaveIndexInputIfOutputIndex
-                                             BlockSize,
-                                             MThreadClusterSize,
-                                             KThreadClusterSize,
-                                             MThreadSliceSize,
-                                             KThreadSliceSize,
-                                             InSrcVectorDim,
-                                             InSrcVectorSize,
-                                             1>; // OutDstVectorSize
-
-    using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1));
+    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
+
+    static constexpr index_t NumSrcDim = Rank;
+    static constexpr index_t NumDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
+    static constexpr bool reduceAllDim = (NumInvariantDim == 0);
+
+    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
+    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
+
+    static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths,
+                                    const std::vector<index_t>& inStrides,
+                                    int blkGroupSize,
+                                    int numBlockTileIteration)
+    {
+        const auto tupleSrcLengths =
+            generate_tuple([&](auto I) { return inLengths[I]; }, Number<Rank>{});
+        const auto tupleSrcStrides =
+            generate_tuple([&](auto I) { return inStrides[I]; }, Number<Rank>{});
+
+        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
+
+        const auto in_grid_desc_m_k = [&]() {
+            if constexpr(reduceAllDim)
+            {
+                const auto one_dim_inDesc = transform_tensor_descriptor(
+                    inDesc,
+                    make_tuple(make_merge_transform(tupleSrcLengths)),
+                    make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}),
+                    make_tuple(Sequence<0>{}));
+
+                return transform_tensor_descriptor(
+                    one_dim_inDesc,
+                    make_tuple(make_unmerge_transform(
+                        make_tuple(1, one_dim_inDesc.GetLength(Number<0>{})))),
+                    make_tuple(Sequence<0>{}),
+                    make_tuple(Sequence<0, 1>{}));
+            }
+            else
+            {
+                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
+                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
+
+                const auto reduceDimLengths = generate_tuple(
+                    [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number<NumReduceDim>{});
+                const auto invariantDimLengths =
+                    generate_tuple([&](auto I) { return inLengths[I]; }, Number<NumInvariantDim>{});
+
+                return transform_tensor_descriptor(
+                    inDesc,
+                    make_tuple(make_merge_transform(invariantDimLengths),
+                               make_merge_transform(reduceDimLengths)),
+                    make_tuple(InvariantDims{}, ReduceDims{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+            }
+        }();
+
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
+
+        const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration;
+        const auto inPad_M =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;
+
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        return (in_grid_desc_m_k_padded);
+    };
+
+    using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1));

     using GridwiseSoftmaxGeneric = GridwiseSoftmax_mk_to_mk<InDataType,
                                                             OutDataType,
@@ -102,7 +150,7 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
                                                             OutDstVectorSize,
                                                             true>;

-    struct Argument : public Reduction::Argument
+    struct Argument : public BaseArgument
     {
         Argument(const std::vector<index_t> inLengths,
                  const std::vector<index_t> inStrides,
@@ -113,42 +161,60 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
                  OutDataType* out_dev,
                  InElementwiseOp in_elementwise_op,
                  AccElementwiseOp acc_elementwise_op)
-            : Reduction::Argument(inLengths,
-                                  inStrides,
-                                  {},
-                                  {},
-                                  reduceDims,
-                                  0.0f, // alpha
-                                  0.0f, // beta
-                                  in_dev,
-                                  nullptr,
-                                  out_dev,
-                                  nullptr,
-                                  in_elementwise_op,
-                                  acc_elementwise_op),
-              // FIXME: The base class DeviceReduceMultiBlock::Argument only supports alpha/beta of
-              // float32 precision. Make it support any data type so the fields can be removed.
-              alpha_(alpha),
-              beta_(beta)
+            : alpha_{alpha},
+              beta_{beta},
+              in_dev_{in_dev},
+              out_dev_{out_dev},
+              in_elementwise_op_{in_elementwise_op},
+              acc_elementwise_op_{acc_elementwise_op}
         {
-            // std::cout << "blkGroupSize= " << this->blkGroupSize
-            //           << ", numBlockTileIteration= " << this->numBlockTileIteration
-            //           << ", gridSize=" << this->gridSize
-            //           << ", invariant_total_length=" << this->invariant_total_length <<
-            //           std::endl;
+            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
+            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
+
+            long_index_t invariant_total_length;
+            long_index_t reduce_total_length;
+
+            std::tie(invariant_total_length, reduce_total_length) =
+                get_2d_lengths<Rank, NumReduceDim>(inLengths_);
+
+            if constexpr(NumInvariantDim == 0)
+                invariant_lowest_length_ = 1;
+            else
+                invariant_lowest_length_ = inLengths_[NumInvariantDim - 1];
+
+            blkGroupSize          = 1;
+            numBlockTileIteration = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize;
+
+            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
+                       M_BlockTileSize * blkGroupSize;
         }

+        std::vector<index_t> inLengths_;
+        std::vector<index_t> inStrides_;
+
         AccDataType alpha_;
         AccDataType beta_;

+        const InDataType* in_dev_;
+        OutDataType* out_dev_;
+
+        InElementwiseOp in_elementwise_op_;
+        AccElementwiseOp acc_elementwise_op_;
+
+        index_t invariant_lowest_length_;
+
+        int blkGroupSize;
+        int numBlockTileIteration;
+        size_t gridSize;
     };

     struct Invoker : public BaseInvoker
     {
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            const auto in_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
+            const auto in_grid_desc_m_k = DeviceSoftmaxImpl::MakeSrc2dDescriptor(
                 arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
-            const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
+            const auto out_grid_desc_m_k = DeviceSoftmaxImpl::MakeSrc2dDescriptor(
                 arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);

             bool sweep_once =
@@ -195,15 +261,32 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
         {
             const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);

-            if(!Reduction::IsSupportedArgument(p_arg_))
+            if constexpr(InSrcVectorDim == 0)
             {
-                return false;
+                if constexpr(NumInvariantDim == 0)
+                {
+                    return false;
+                }
+                else
+                {
+                    if(p_arg_->inStrides_[NumInvariantDim - 1] != 1)
+                        return false;
+
+                    if(p_arg_->invariant_lowest_length_ % InSrcVectorSize != 0)
+                        return false;
+                };
             }
-
-            if(p_arg_->inLengths_[Rank - 1] % OutDstVectorSize != 0)
+            else
             {
-                return false;
+                if(p_arg_->inStrides_[Rank - 1] != 1)
+                    return false;
+
+                if(p_arg_->inLengths_[Rank - 1] % InSrcVectorSize != 0)
+                    return false;
+            };
+
+            if(p_arg_->invariant_lowest_length_ % OutDstVectorSize != 0)
+                return false;

             return true;
         };
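DeviceSoftmaxImpl now computes its own launch parameters in the Argument constructor instead of inheriting them from DeviceReduceMultiBlock::Argument. The grid-size arithmetic added above reduces to a pair of integer round-ups; a standalone sketch with illustrative problem and tile sizes (the values are assumptions, not taken from the commit):

#include <cstdint>
#include <iostream>

using long_index_t = int64_t;

// Round x up to the next multiple of m (same role as math::integer_least_multiple).
long_index_t integer_least_multiple(long_index_t x, long_index_t m) { return ((x + m - 1) / m) * m; }

int main()
{
    // Illustrative values: a 2048 x 1000 problem with 128-wide M tiles and 64-deep K tiles.
    const long_index_t invariant_total_length = 2048;
    const long_index_t reduce_total_length    = 1000;
    const long_index_t M_BlockTileSize        = 128;
    const long_index_t K_BlockTileSize        = 64;

    const int blkGroupSize = 1;
    const long_index_t numBlockTileIteration =
        (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; // 16 K-tile iterations

    const long_index_t gridSize =
        integer_least_multiple(invariant_total_length, M_BlockTileSize) / M_BlockTileSize *
        blkGroupSize; // 16 blocks

    std::cout << "numBlockTileIteration=" << numBlockTileIteration << ", gridSize=" << gridSize
              << std::endl;
    return 0;
}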
library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp

@@ -3,7 +3,10 @@
 #pragma once

-#include <cstdlib>
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/tuple.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -11,11 +14,12 @@ namespace device {
 namespace instance {

 // aliasing, for commonly used data type
 using F64  = double;
 using F32  = float;
 using F16  = ck::half_t;
 using BF16 = ck::bhalf_t;
-using INT32 = int32_t;
+using I8  = int8_t;
+using I32 = int32_t;

 using Empty_Tuple = ck::Tuple<>;
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp

@@ -3,24 +3,77 @@
 #pragma once

-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp"
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp"
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
View file @
963e4a71
...
@@ -5,6 +5,8 @@
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"

 namespace ck {
...
@@ -63,33 +65,20 @@ using reduce_configuration_2_instances_blockwise = std::tuple<
     >;
 #endif

-template <ReduceTensorOp ReduceOpId>
-using deviceReduceBlockWisePtrType = DeviceReducePtr<
-    typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation,
-    typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation>;
-
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp ReduceOpId,
+          typename ReduceOperation,
+          typename InElementwiseOp,
+          typename AccElementwiseOp,
           bool PropagateNan,
-          bool UseIndex>
+          bool OutputIndex>
 void add_device_reduce_instance_blockwise(
-    std::vector<deviceReduceBlockWisePtrType<ReduceOpId>>& device_op_instances)
+    std::vector<DeviceReducePtr<Rank, NumReduceDim, InElementwiseOp, AccElementwiseOp>>&
+        device_op_instances)
 {
-    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
-    using InElementwiseOperation =
-        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
-    using AccElementwiseOperation =
-        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
-
-    constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
-         ReduceOpId == ReduceTensorOp::AMAX);
-    constexpr bool OutputIndex = Indexable && UseIndex;
-
     static_for<0, std::tuple_size<reduce_configuration_1_instances_blockwise>::value, 1>{}(
         [&](auto i) {
             using cfg1 = remove_cvref_t<decltype(
...
@@ -107,8 +96,8 @@ void add_device_reduce_instance_blockwise(
                 Rank,
                 NumReduceDim,
                 ReduceOperation,
-                InElementwiseOperation,
-                AccElementwiseOperation,
+                InElementwiseOp,
+                AccElementwiseOp,
                 InMemoryDataOperationEnum::Set,
                 PropagateNan,
                 OutputIndex,
...
@@ -128,52 +117,6 @@ void add_device_reduce_instance_blockwise(
         });
 };

-#define ADD_BLOCKWISE_INST_BY_TYPE(                                                       \
-    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim)             \
-    template void add_device_reduce_instance_blockwise<inT,                               \
-                                                       compT,                             \
-                                                       outT,                              \
-                                                       Rank,                              \
-                                                       NumReduceDim,                      \
-                                                       ReduceOpId,                        \
-                                                       PropagateNan,                      \
-                                                       UseIndex>(                         \
-        std::vector<deviceReduceBlockWisePtrType<ReduceOpId>> & device_op_instances)
-
-#define ADD_BLOCKWISE_INST_BY_ID(                                                         \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                 \
-    ADD_BLOCKWISE_INST_BY_TYPE(inT,                                                       \
-                               compT,                                                     \
-                               outT,                                                      \
-                               static_cast<ReduceTensorOp>(ReduceOpId),                   \
-                               static_cast<bool>(NanOpt),                                 \
-                               static_cast<bool>(IndicesOpt),                             \
-                               Rank,                                                      \
-                               NumReduceDim)
-
-#define ADD_BLOCKWISE_INST_REF_BY_TYPE(                                                   \
-    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim)             \
-    extern template void add_device_reduce_instance_blockwise<inT,                        \
-                                                              compT,                      \
-                                                              outT,                       \
-                                                              Rank,                       \
-                                                              NumReduceDim,               \
-                                                              ReduceOpId,                 \
-                                                              PropagateNan,               \
-                                                              UseIndex>(                  \
-        std::vector<deviceReduceBlockWisePtrType<ReduceOpId>> & device_op_instances)
-
-#define ADD_BLOCKWISE_INST_REF_BY_ID(                                                     \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                 \
-    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT,                                                   \
-                                   compT,                                                 \
-                                   outT,                                                  \
-                                   static_cast<ReduceTensorOp>(ReduceOpId),               \
-                                   static_cast<bool>(NanOpt),                             \
-                                   static_cast<bool>(IndicesOpt),                         \
-                                   Rank,                                                  \
-                                   NumReduceDim)
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
...
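The net effect of this hunk is that callers now spell out the reduction operation and elementwise operations as types instead of passing a ReduceTensorOp id plus a raw UseIndex flag. A minimal before/after sketch of a call site, using the bf16 MIN-with-index case that device_reduce_instance_blockwise_b16_f32_b16_min.hpp declares further down (note the old overload also took a differently parameterised DeviceReducePtr vector, keyed only on the elementwise ops):

std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>> instances;

// Old signature (reconstructed from the deleted code above): numeric-style op
// id and a UseIndex flag, with Indexable/OutputIndex derived inside the body.
// add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3,
//                                      ReduceTensorOp::MIN, false, true>(instances);

// New signature: concrete ReduceOperation and elementwise ops, plus the
// already-resolved OutputIndex flag.
add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(
    instances);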
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
deleted
100644 → 0
View file @
ad29b25b
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
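For readers cross-checking the deleted file against its per-operation replacements below: the fourth macro argument was a numeric ReduceTensorOp id. The mapping implied by the "// for ..." comments above, written out as a reader's-aid sketch (not the library's actual enum from ck/utility/reduction_enums.hpp; other enumerators are omitted):

// Sketch only - inferred from the comments in the deleted file above.
enum class ReduceOpIdUsedAbove : int
{
    ADD   = 0, // becomes ReduceAdd  + PassThrough/PassThrough in the new headers
    MIN   = 2, // becomes ReduceMin  + PassThrough/PassThrough
    MAX   = 3, // becomes ReduceMax  + PassThrough/PassThrough
    AMAX  = 4, // becomes ReduceAMax + UnaryAbs/PassThrough
    AVG   = 5, // becomes ReduceAdd  + PassThrough/UnaryDivide
    NORM2 = 7, // becomes ReduceAdd  + UnarySquare/UnarySqrt
};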
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
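These extern template declarations only promise that the instantiations exist; exactly one translation unit in the instance library must still provide the matching explicit instantiation definitions, otherwise users of this header fail at link time. A sketch of such a definition for the first ADD declaration above (the corresponding .cpp file is not part of the pages shown here, so its name and exact contents are assumed):

// Hypothetical instance-library source file providing the definition promised
// by device_reduce_instance_blockwise_b16_f32_b16_add.hpp.
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit instantiation definition matching the first extern declaration above.
template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck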
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 3, UnarySquare, UnarySqrt>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 4, UnarySquare, UnarySqrt>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 1, UnarySquare, UnarySqrt>>&);
extern template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<2, 1, UnarySquare, UnarySqrt>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck