Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
d0b49a14
Commit
d0b49a14
authored
Oct 28, 2022
by
Qianfeng Zhang
Browse files
Merge branch 'develop' into bnorm_bwd_pr
parents
29026b0e
87fd1152
Changes
602
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
94 additions
and
81 deletions
+94
-81
example/01_gemm/gemm_xdl_fp16.cpp
example/01_gemm/gemm_xdl_fp16.cpp
+2
-2
example/01_gemm/gemm_xdl_fp64.cpp
example/01_gemm/gemm_xdl_fp64.cpp
+1
-1
example/01_gemm/gemm_xdl_int4.cpp
example/01_gemm/gemm_xdl_int4.cpp
+1
-1
example/01_gemm/gemm_xdl_int8.cpp
example/01_gemm/gemm_xdl_int8.cpp
+1
-1
example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
+2
-2
example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+1
-1
example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+1
-1
example/04_gemm_add_add_fastgelu/common.hpp
example/04_gemm_add_add_fastgelu/common.hpp
+1
-1
example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
+1
-1
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+1
-1
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
+1
-1
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
+1
-1
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+1
-1
example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+1
-1
example/12_reduce/reduce_blockwise.cpp
example/12_reduce/reduce_blockwise.cpp
+5
-1
example/12_reduce/reduce_blockwise_impl.hpp
example/12_reduce/reduce_blockwise_impl.hpp
+17
-15
example/12_reduce/reduce_blockwise_two_call.cpp
example/12_reduce/reduce_blockwise_two_call.cpp
+27
-27
example/12_reduce/reduce_example_common.hpp
example/12_reduce/reduce_example_common.hpp
+7
-6
example/12_reduce/reduce_multiblock_atomic_add.cpp
example/12_reduce/reduce_multiblock_atomic_add.cpp
+5
-1
example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
+17
-15
No files found.
example/01_gemm/gemm_xdl_fp16.cpp
View file @
d0b49a14
...
@@ -3,8 +3,8 @@
...
@@ -3,8 +3,8 @@
#include "common.hpp"
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_xdl_cshuffle.hpp"
using
ADataType
=
ck
::
half_t
;
using
ADataType
=
ck
::
half_t
;
using
BDataType
=
ck
::
half_t
;
using
BDataType
=
ck
::
half_t
;
...
...
example/01_gemm/gemm_xdl_fp64.cpp
View file @
d0b49a14
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "common.hpp"
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_xdl.hpp"
using
ADataType
=
double
;
using
ADataType
=
double
;
using
BDataType
=
double
;
using
BDataType
=
double
;
...
...
example/01_gemm/gemm_xdl_int4.cpp
View file @
d0b49a14
...
@@ -7,7 +7,7 @@
...
@@ -7,7 +7,7 @@
#include "common.hpp"
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_xdl_cshuffle.hpp"
using
ADataType
=
ck
::
int4_t
;
using
ADataType
=
ck
::
int4_t
;
using
BDataType
=
ck
::
int4_t
;
using
BDataType
=
ck
::
int4_t
;
...
...
example/01_gemm/gemm_xdl_int8.cpp
View file @
d0b49a14
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "common.hpp"
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_xdl_cshuffle.hpp"
using
ADataType
=
int8_t
;
using
ADataType
=
int8_t
;
using
BDataType
=
int8_t
;
using
BDataType
=
int8_t
;
...
...
example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
View file @
d0b49a14
...
@@ -3,8 +3,8 @@
...
@@ -3,8 +3,8 @@
#include "common.hpp"
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_skip_b_lds.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_xdl_skip_b_lds.hpp"
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
...
...
example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
View file @
d0b49a14
...
@@ -8,7 +8,7 @@
...
@@ -8,7 +8,7 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
...
example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
View file @
d0b49a14
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
...
example/04_gemm_add_add_fastgelu/common.hpp
View file @
d0b49a14
...
@@ -12,7 +12,7 @@
...
@@ -12,7 +12,7 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/data_type.hpp"
...
...
example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
View file @
d0b49a14
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp"
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...
...
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
View file @
d0b49a14
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp"
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...
...
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
View file @
d0b49a14
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp"
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...
...
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
View file @
d0b49a14
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp"
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...
...
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
View file @
d0b49a14
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp"
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...
...
example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
View file @
d0b49a14
...
@@ -12,7 +12,7 @@
...
@@ -12,7 +12,7 @@
#include <vector>
#include <vector>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
...
...
example/12_reduce/reduce_blockwise.cpp
View file @
d0b49a14
...
@@ -140,6 +140,10 @@ bool reduce_blockwise_test(bool do_verification,
...
@@ -140,6 +140,10 @@ bool reduce_blockwise_test(bool do_verification,
if
(
ShapeType
::
Rank_
!=
inLengths
.
size
()
||
ShapeType
::
NumReduceDim_
!=
reduceDims
.
size
())
if
(
ShapeType
::
Rank_
!=
inLengths
.
size
()
||
ShapeType
::
NumReduceDim_
!=
reduceDims
.
size
())
return
;
return
;
std
::
array
<
int
,
ShapeType
::
NumReduceDim_
>
arrReduceDims
;
std
::
copy
(
reduceDims
.
begin
(),
reduceDims
.
end
(),
arrReduceDims
.
begin
());
result
=
reduce_blockwise_impl
<
InOutDataType
,
result
=
reduce_blockwise_impl
<
InOutDataType
,
AccDataType
,
AccDataType
,
ReduceOpId
,
ReduceOpId
,
...
@@ -147,7 +151,7 @@ bool reduce_blockwise_test(bool do_verification,
...
@@ -147,7 +151,7 @@ bool reduce_blockwise_test(bool do_verification,
ShapeType
::
NumReduceDim_
,
ShapeType
::
NumReduceDim_
,
PropagateNan
,
PropagateNan
,
OutputIndex
>
(
OutputIndex
>
(
do_verification
,
init_method
,
time_kernel
,
inLengths
,
r
educeDims
,
alpha
,
beta
);
do_verification
,
init_method
,
time_kernel
,
inLengths
,
arrR
educeDims
,
alpha
,
beta
);
matched
=
true
;
matched
=
true
;
});
});
...
...
example/12_reduce/reduce_blockwise_impl.hpp
View file @
d0b49a14
...
@@ -8,7 +8,7 @@
...
@@ -8,7 +8,7 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_reduce_multiblock.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -30,7 +30,7 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -30,7 +30,7 @@ int reduce_blockwise_impl(bool do_verification,
int
init_method
,
int
init_method
,
bool
time_kernel
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
int
>&
reduceDims
,
const
std
::
array
<
int
,
NumReduceDim
>&
reduceDims
,
float
alpha
,
float
alpha
,
float
beta
)
float
beta
)
...
@@ -38,6 +38,8 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -38,6 +38,8 @@ int reduce_blockwise_impl(bool do_verification,
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tensor_operation
::
device
;
using
namespace
ck
::
tensor_operation
::
device
;
constexpr
index_t
NumOutDim
=
(
Rank
-
NumReduceDim
==
0
)
?
1
:
Rank
-
NumReduceDim
;
constexpr
bool
op_support_indices
=
constexpr
bool
op_support_indices
=
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
...
@@ -143,7 +145,7 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -143,7 +145,7 @@ int reduce_blockwise_impl(bool do_verification,
std
::
vector
<
size_t
>
outLengths
;
std
::
vector
<
size_t
>
outLengths
;
std
::
vector
<
int
>
invariantDims
=
get_invariant_dims
<
Rank
,
NumReduceDim
>
(
reduceDims
);
auto
invariantDims
=
get_invariant_dims
<
Rank
,
NumReduceDim
>
(
reduceDims
);
if
(
invariantDims
.
empty
())
if
(
invariantDims
.
empty
())
outLengths
.
push_back
(
1
);
outLengths
.
push_back
(
1
);
...
@@ -256,22 +258,22 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -256,22 +258,22 @@ int reduce_blockwise_impl(bool do_verification,
acc_elementwise_op
);
acc_elementwise_op
);
};
};
std
::
vector
<
ck
::
index_t
>
i_i
nLengths
;
std
::
array
<
index_t
,
Rank
>
arrI
nLengths
;
std
::
vector
<
ck
::
index_t
>
i_i
nStrides
;
std
::
array
<
index_t
,
Rank
>
arrI
nStrides
;
std
::
vector
<
ck
::
index_t
>
i_o
utLengths
;
std
::
array
<
index_t
,
NumOutDim
>
arrO
utLengths
;
std
::
vector
<
ck
::
index_t
>
i_o
utStrides
;
std
::
array
<
index_t
,
NumOutDim
>
arrO
utStrides
;
i_inLengths
.
assign
(
inLengths
.
begin
(),
inLengths
.
end
());
std
::
copy
(
inLengths
.
begin
(),
inLengths
.
end
()
,
arrInLengths
.
begin
()
);
i_inStrides
.
assign
(
inStrides
.
begin
(),
inStrides
.
end
());
std
::
copy
(
inStrides
.
begin
(),
inStrides
.
end
()
,
arrInStrides
.
begin
()
);
i_outLengths
.
assign
(
outLengths
.
begin
(),
outLengths
.
end
());
std
::
copy
(
outLengths
.
begin
(),
outLengths
.
end
()
,
arrOutLengths
.
begin
()
);
i_outStrides
.
assign
(
outStrides
.
begin
(),
outStrides
.
end
());
std
::
copy
(
outStrides
.
begin
(),
outStrides
.
end
()
,
arrOutStrides
.
begin
()
);
auto
reduce
=
DeviceReduceInstance
{};
auto
reduce
=
DeviceReduceInstance
{};
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
i_i
nLengths
,
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
arrI
nLengths
,
i_i
nStrides
,
arrI
nStrides
,
i_o
utLengths
,
arrO
utLengths
,
i_o
utStrides
,
arrO
utStrides
,
reduceDims
,
reduceDims
,
alpha
,
alpha
,
beta
,
beta
,
...
...
example/12_reduce/reduce_blockwise_two_call.cpp
View file @
d0b49a14
...
@@ -11,7 +11,7 @@
...
@@ -11,7 +11,7 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_reduce_multiblock.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -90,15 +90,15 @@ static bool time_kernel;
...
@@ -90,15 +90,15 @@ static bool time_kernel;
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
// used by the device reduction
// used by the device reduction
const
std
::
vector
<
int
>
reduceDims_1
=
{
4
};
const
std
::
array
<
int
,
1
>
reduceDims_1
=
{
4
};
const
std
::
vector
<
int
>
invariantDims_1
=
{
0
,
1
,
2
,
3
};
//
const std::
array
<int
, 4
> invariantDims_1 = {0, 1, 2, 3};
const
std
::
vector
<
int
>
reduceDims_2
=
{
3
};
const
std
::
array
<
int
,
1
>
reduceDims_2
=
{
3
};
const
std
::
vector
<
int
>
invariantDims_2
=
{
0
,
1
,
2
};
//
const std::
array
<int
, 3
> invariantDims_2 = {0, 1, 2};
// used by the host reduction
// used by the host reduction
const
std
::
vector
<
int
>
reduceDims
=
{
3
,
4
};
const
std
::
array
<
int
,
2
>
reduceDims
=
{
3
,
4
};
const
std
::
vector
<
int
>
invariantDims
=
{
0
,
1
,
2
};
const
std
::
array
<
int
,
3
>
invariantDims
=
{
0
,
1
,
2
};
const
std
::
vector
<
size_t
>
inLengths_1
=
{
64
,
320
,
80
,
4
,
128
};
const
std
::
vector
<
size_t
>
inLengths_1
=
{
64
,
320
,
80
,
4
,
128
};
...
@@ -214,26 +214,26 @@ int main(int argc, char* argv[])
...
@@ -214,26 +214,26 @@ int main(int argc, char* argv[])
acc_elementwise_op
);
acc_elementwise_op
);
};
};
std
::
vector
<
ck
::
index_t
>
i_i
nLengths_1
;
std
::
array
<
index_t
,
5
>
arrI
nLengths_1
;
std
::
vector
<
ck
::
index_t
>
i_i
nStrides_1
;
std
::
array
<
index_t
,
5
>
arrI
nStrides_1
;
std
::
vector
<
ck
::
index_t
>
i_i
nLengths_2
;
std
::
array
<
index_t
,
4
>
arrI
nLengths_2
;
std
::
vector
<
ck
::
index_t
>
i_i
nStrides_2
;
std
::
array
<
index_t
,
4
>
arrI
nStrides_2
;
std
::
vector
<
ck
::
index_t
>
i_o
utLengths
;
std
::
array
<
index_t
,
3
>
arrO
utLengths
;
std
::
vector
<
ck
::
index_t
>
i_o
utStrides
;
std
::
array
<
index_t
,
3
>
arrO
utStrides
;
i_inLengths_1
.
assign
(
inLengths_1
.
begin
(),
inLengths_1
.
end
());
std
::
copy
(
inLengths_1
.
begin
(),
inLengths_1
.
end
()
,
arrInLengths_1
.
begin
()
);
i_inStrides_1
.
assign
(
inStrides_1
.
begin
(),
inStrides_1
.
end
());
std
::
copy
(
inStrides_1
.
begin
(),
inStrides_1
.
end
()
,
arrInStrides_1
.
begin
()
);
i_inLengths_2
.
assign
(
inLengths_2
.
begin
(),
inLengths_2
.
end
());
std
::
copy
(
inLengths_2
.
begin
(),
inLengths_2
.
end
()
,
arrInLengths_2
.
begin
()
);
i_inStrides_2
.
assign
(
inStrides_2
.
begin
(),
inStrides_2
.
end
());
std
::
copy
(
inStrides_2
.
begin
(),
inStrides_2
.
end
()
,
arrInStrides_2
.
begin
()
);
i_outLengths
.
assign
(
outLengths
.
begin
(),
outLengths
.
end
());
std
::
copy
(
outLengths
.
begin
(),
outLengths
.
end
()
,
arrOutLengths
.
begin
()
);
i_outStrides
.
assign
(
outStrides
.
begin
(),
outStrides
.
end
());
std
::
copy
(
outStrides
.
begin
(),
outStrides
.
end
()
,
arrOutStrides
.
begin
()
);
auto
reduce_1
=
DeviceReduceInstance_1
{};
auto
reduce_1
=
DeviceReduceInstance_1
{};
auto
argument_ptr_1
=
reduce_1
.
MakeArgumentPointer
(
i_i
nLengths_1
,
auto
argument_ptr_1
=
reduce_1
.
MakeArgumentPointer
(
arrI
nLengths_1
,
i_i
nStrides_1
,
arrI
nStrides_1
,
i_i
nLengths_2
,
arrI
nLengths_2
,
i_i
nStrides_2
,
arrI
nStrides_2
,
reduceDims_1
,
reduceDims_1
,
1.0
f
,
1.0
f
,
0.0
f
,
0.0
f
,
...
@@ -255,10 +255,10 @@ int main(int argc, char* argv[])
...
@@ -255,10 +255,10 @@ int main(int argc, char* argv[])
auto
reduce_2
=
DeviceReduceInstance_2
{};
auto
reduce_2
=
DeviceReduceInstance_2
{};
auto
argument_ptr_2
=
reduce_2
.
MakeArgumentPointer
(
i_i
nLengths_2
,
auto
argument_ptr_2
=
reduce_2
.
MakeArgumentPointer
(
arrI
nLengths_2
,
i_i
nStrides_2
,
arrI
nStrides_2
,
i_o
utLengths
,
arrO
utLengths
,
i_o
utStrides
,
arrO
utStrides
,
reduceDims_2
,
reduceDims_2
,
alpha
,
alpha
,
beta
,
beta
,
...
...
example/12_reduce/reduce_example_common.hpp
View file @
d0b49a14
...
@@ -5,11 +5,10 @@
...
@@ -5,11 +5,10 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
template
<
ck
::
index_t
Rank
,
ck
::
index_t
NumReduceDim
>
template
<
int
Rank
,
int
NumReduceDim
>
std
::
vector
<
int
>
get_invariant_dims
(
const
std
::
vector
<
int
>&
reduceDims
)
static
inline
std
::
array
<
int
,
Rank
-
NumReduceDim
>
get_invariant_dims
(
const
std
::
array
<
int
,
NumReduceDim
>&
reduceDims
)
{
{
assert
(
NumReduceDim
==
reduceDims
.
size
());
int
reduceFlag
=
0
;
int
reduceFlag
=
0
;
// flag the bits for the reduceDims
// flag the bits for the reduceDims
...
@@ -18,13 +17,15 @@ std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
...
@@ -18,13 +17,15 @@ std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
reduceFlag
|=
1
<<
reduceDims
[
i
];
reduceFlag
|=
1
<<
reduceDims
[
i
];
};
};
std
::
vector
<
int
>
invariantDims
;
std
::
array
<
int
,
Rank
-
NumReduceDim
>
invariantDims
;
// collect invariant dimensions
// collect invariant dimensions
int
dim
=
0
;
for
(
int
i
=
0
;
i
<
Rank
;
i
++
)
for
(
int
i
=
0
;
i
<
Rank
;
i
++
)
if
((
reduceFlag
&
(
1
<<
i
))
==
0
)
if
((
reduceFlag
&
(
1
<<
i
))
==
0
)
{
{
invariantDims
.
push_back
(
i
);
invariantDims
[
dim
]
=
i
;
dim
++
;
};
};
return
invariantDims
;
return
invariantDims
;
...
...
example/12_reduce/reduce_multiblock_atomic_add.cpp
View file @
d0b49a14
...
@@ -138,13 +138,17 @@ bool reduce_multiblock_atomic_add_test(bool do_verification,
...
@@ -138,13 +138,17 @@ bool reduce_multiblock_atomic_add_test(bool do_verification,
if
(
ShapeType
::
Rank_
!=
inLengths
.
size
()
||
ShapeType
::
NumReduceDim_
!=
reduceDims
.
size
())
if
(
ShapeType
::
Rank_
!=
inLengths
.
size
()
||
ShapeType
::
NumReduceDim_
!=
reduceDims
.
size
())
return
;
return
;
std
::
array
<
int
,
ShapeType
::
NumReduceDim_
>
a_reduceDims
;
std
::
copy
(
reduceDims
.
begin
(),
reduceDims
.
end
(),
a_reduceDims
.
begin
());
result
=
reduce_multiblock_atomic_add_impl
<
InOutDataType
,
result
=
reduce_multiblock_atomic_add_impl
<
InOutDataType
,
AccDataType
,
AccDataType
,
ReduceOpId
,
ReduceOpId
,
ShapeType
::
Rank_
,
ShapeType
::
Rank_
,
ShapeType
::
NumReduceDim_
,
ShapeType
::
NumReduceDim_
,
PropagateNan
>
(
PropagateNan
>
(
do_verification
,
init_method
,
time_kernel
,
inLengths
,
reduceDims
,
alpha
,
beta
);
do_verification
,
init_method
,
time_kernel
,
inLengths
,
a_
reduceDims
,
alpha
,
beta
);
matched
=
true
;
matched
=
true
;
});
});
...
...
example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
View file @
d0b49a14
...
@@ -8,7 +8,7 @@
...
@@ -8,7 +8,7 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_reduce_multiblock.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -29,7 +29,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -29,7 +29,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
int
init_method
,
int
init_method
,
bool
time_kernel
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
int
>&
reduceDims
,
const
std
::
array
<
int
,
NumReduceDim
>&
reduceDims
,
float
alpha
,
float
alpha
,
float
beta
)
float
beta
)
...
@@ -37,6 +37,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -37,6 +37,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tensor_operation
::
device
;
using
namespace
ck
::
tensor_operation
::
device
;
constexpr
index_t
NumOutDim
=
(
Rank
-
NumReduceDim
==
0
)
?
1
:
Rank
-
NumReduceDim
;
constexpr
bool
op_support_atomic_add
=
constexpr
bool
op_support_atomic_add
=
(
ReduceOpId
==
ReduceTensorOp
::
ADD
||
ReduceOpId
==
ReduceTensorOp
::
AVG
);
(
ReduceOpId
==
ReduceTensorOp
::
ADD
||
ReduceOpId
==
ReduceTensorOp
::
AVG
);
...
@@ -84,7 +86,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -84,7 +86,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
std
::
vector
<
size_t
>
outLengths
;
std
::
vector
<
size_t
>
outLengths
;
std
::
vector
<
int
>
invariantDims
=
get_invariant_dims
<
Rank
,
NumReduceDim
>
(
reduceDims
);
auto
invariantDims
=
get_invariant_dims
<
Rank
,
NumReduceDim
>
(
reduceDims
);
if
(
invariantDims
.
empty
())
if
(
invariantDims
.
empty
())
outLengths
.
push_back
(
1
);
outLengths
.
push_back
(
1
);
...
@@ -169,22 +171,22 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -169,22 +171,22 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
acc_elementwise_op
);
acc_elementwise_op
);
};
};
std
::
vector
<
ck
::
index_t
>
i_i
nLengths
;
std
::
array
<
index_t
,
Rank
>
arrI
nLengths
;
std
::
vector
<
ck
::
index_t
>
i_i
nStrides
;
std
::
array
<
index_t
,
Rank
>
arrI
nStrides
;
std
::
vector
<
ck
::
index_t
>
i_o
utLengths
;
std
::
array
<
index_t
,
NumOutDim
>
arrO
utLengths
;
std
::
vector
<
ck
::
index_t
>
i_o
utStrides
;
std
::
array
<
index_t
,
NumOutDim
>
arrO
utStrides
;
i_inLengths
.
assign
(
inLengths
.
begin
(),
inLengths
.
end
());
std
::
copy
(
inLengths
.
begin
(),
inLengths
.
end
()
,
arrInLengths
.
begin
()
);
i_inStrides
.
assign
(
inStrides
.
begin
(),
inStrides
.
end
());
std
::
copy
(
inStrides
.
begin
(),
inStrides
.
end
()
,
arrInStrides
.
begin
()
);
i_outLengths
.
assign
(
outLengths
.
begin
(),
outLengths
.
end
());
std
::
copy
(
outLengths
.
begin
(),
outLengths
.
end
()
,
arrOutLengths
.
begin
()
);
i_outStrides
.
assign
(
outStrides
.
begin
(),
outStrides
.
end
());
std
::
copy
(
outStrides
.
begin
(),
outStrides
.
end
()
,
arrOutStrides
.
begin
()
);
auto
reduce
=
DeviceReduceInstance
{};
auto
reduce
=
DeviceReduceInstance
{};
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
i_i
nLengths
,
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
arrI
nLengths
,
i_i
nStrides
,
arrI
nStrides
,
i_o
utLengths
,
arrO
utLengths
,
i_o
utStrides
,
arrO
utStrides
,
reduceDims
,
reduceDims
,
alpha
,
alpha
,
beta
,
beta
,
...
...
Prev
1
2
3
4
5
6
…
31
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment