gaoqiong / composable_kernel_ROCM · Commits · b17ce193

Commit b17ce193, authored Apr 11, 2024 by root
init for reduce_threadwise multi_d
parent 381d44aa

Showing 5 changed files with 1242 additions and 0 deletions (+1242, -0)
Changed files:
- example/12_reduce/CMakeLists.txt (+1, -0)
- example/12_reduce/reduce_threadwise.cpp (+272, -0)
- example/12_reduce/reduce_threadwise_impl.hpp (+303, -0)
- include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp (+409, -0)
- include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise_multi_d.hpp (+257, -0)
example/12_reduce/CMakeLists.txt (view file @ b17ce193)
add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
add_example_executable(example_reduce_threadwise reduce_threadwise.cpp)
add_example_executable(example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp)
add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp)
example/12_reduce/reduce_threadwise.cpp (new file, mode 100644, view file @ b17ce193)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>

#include "ck/utility/reduction_enums.hpp"

#include "reduce_threadwise_impl.hpp"
#include "reduce_example_common.hpp"

using namespace ck;
using namespace ck::tensor_operation::device;

static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       {"verify", required_argument, nullptr, 'v'},
                                       {"help", no_argument, nullptr, '?'},
                                       {nullptr, 0, nullptr, 0}};

class SimpleAppArgs
{
    private:
    int option_index = 0;

    public:
    std::vector<size_t> inLengths = {16, 64, 32, 16};
    std::vector<int> reduceDims   = {0, 1, 2};
    std::vector<float> scales     = {1.0f, 0.0f};

    bool do_verification = true;
    int data_type        = 1;
    int init_method      = 2;
    bool time_kernel     = true;

    public:
    void show_usage(const char* cmd)
    {
        std::cout << "Usage of " << cmd << std::endl;
        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
                  << std::endl;
        std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions"
                  << std::endl;
        std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
                     "comparing with the host-based reduction"
                  << std::endl;
        std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64, 7: int4)"
                  << std::endl;
        std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer "
                     "value, 3=decimal value)"
                  << std::endl;
        std::cout << "Arg3 -- time kernel (0=no, 1=yes)" << std::endl;
    };
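    // Example invocation (added for illustration only -- the binary name comes from the
    // CMakeLists.txt entry above, and the trailing positional arguments are
    // <data_type> <init_method> <time_kernel> as documented in show_usage()):
    //
    //   ./example_reduce_threadwise -D 16,64,32,960 -R 0 -v 1 1 2 1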
    int processArgs(int argc, char* argv[])
    {
        using ck::host_common::getTypeValuesFromString;

        int ch;

        while(1)
        {
            ch = getopt_long(argc, argv, "D:R:v:l:", long_options, &option_index);
            if(ch == -1)
                break;
            switch(ch)
            {
            case 'D':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                inLengths = getTypeValuesFromString<size_t>(optarg);
                break;
            case 'R':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                reduceDims = getTypeValuesFromString<int>(optarg);
                break;
            case 'v':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                do_verification = static_cast<bool>(std::atoi(optarg));
                break;
            case '?':
                if(std::string(long_options[option_index].name) == "help")
                {
                    show_usage(argv[0]);
                    return (-1);
                };
                break;
            default: show_usage(argv[0]); return (-1);
            };
        };

        if(optind + 3 > argc)
        {
            throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");
        };

        data_type   = std::atoi(argv[optind++]);
        init_method = std::atoi(argv[optind++]);
        time_kernel = static_cast<bool>(std::atoi(argv[optind]));

        if(scales.empty())
        {
            scales.push_back(1.0f);
            scales.push_back(0.0f);
        };

        return (0);
    };
};
template <typename InOutDataType,
          typename AccDataType,
          ReduceTensorOp ReduceOpId,
          index_t PropagateNan,
          index_t OutputIndex>
bool reduce_threadwise_test(bool do_verification,
                            int init_method,
                            bool time_kernel,
                            const std::vector<size_t>& inLengths,
                            const std::vector<int>& reduceDims,
                            float alpha,
                            float beta)
{
    bool matched = false;
    int result   = 0;

    const auto tuple_object = reduce_shape_instances{};

    static_for<0, std::tuple_size<reduce_shape_instances>::value, 1>{}([&](auto i) {
        if(matched)
            return;

        using ShapeType = remove_cvref_t<decltype(std::get<i>(tuple_object))>;

        if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
            return;

        std::array<int, ShapeType::NumReduceDim_> arrReduceDims;

        ck::ranges::copy(reduceDims, arrReduceDims.begin());

        result = reduce_threadwise_impl<InOutDataType,
                                        AccDataType,
                                        ReduceOpId,
                                        ShapeType::Rank_,
                                        ShapeType::NumReduceDim_,
                                        PropagateNan,
                                        OutputIndex>(
            do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta);

        matched = true;
    });

    return (result == 0) ? true : false;
};
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG;
constexpr bool PropagateNan         = true;
constexpr bool OutputIndex          = false;

int main(int argc, char* argv[])
{
    bool pass = true;

    if(argc > 1)
    {
        SimpleAppArgs arg;

        if(arg.processArgs(argc, argv) < 0)
            return (-1);

        if(arg.data_type == 0)
        {
            pass =
                reduce_threadwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                    arg.do_verification,
                    arg.init_method,
                    arg.time_kernel,
                    arg.inLengths,
                    arg.reduceDims,
                    arg.scales[0],
                    arg.scales[1]);
        }
        else if(arg.data_type == 1)
        {
            pass = reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                arg.do_verification,
                arg.init_method,
                arg.time_kernel,
                arg.inLengths,
                arg.reduceDims,
                arg.scales[0],
                arg.scales[1]);
        }
#if 0
        else if(arg.data_type == 3)
        {
            pass = reduce_threadwise_test<int8_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                arg.do_verification,
                arg.init_method,
                arg.time_kernel,
                arg.inLengths,
                arg.reduceDims,
                arg.scales[0],
                arg.scales[1]);
        }
        else if(arg.data_type == 5)
        {
            pass =
                reduce_threadwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                    arg.do_verification,
                    arg.init_method,
                    arg.time_kernel,
                    arg.inLengths,
                    arg.reduceDims,
                    arg.scales[0],
                    arg.scales[1]);
        }
        else if(arg.data_type == 6)
        {
            pass = reduce_threadwise_test<double, double, ReduceOpId, PropagateNan, OutputIndex>(
                arg.do_verification,
                arg.init_method,
                arg.time_kernel,
                arg.inLengths,
                arg.reduceDims,
                arg.scales[0],
                arg.scales[1]);
        }
#endif
    }
    else
    {
        // for testing half_t
        pass = pass &&
               reduce_threadwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                   true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);

        // for testing float
        pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                   true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);

        // for testing double
        pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                   true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);

        // for testing bhalf_t
        pass = pass &&
               reduce_threadwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                   true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);

#if 0
        // for testing int8_t
        pass =
            pass && reduce_threadwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
                true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);

        // for testing 3D input
        pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                   true, 2, true, {16, 64, 960}, {0}, 1.0f, 0.0f);

        // for testing 5D input
        pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                   true, 2, true, {16, 64, 32, 2, 960}, {0}, 1.0f, 0.0f);
#endif
    }

    return (pass ? 0 : 1);
};
example/12_reduce/reduce_threadwise_impl.hpp (new file, mode 100644, view file @ b17ce193)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>

#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
//#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp"

#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"

#include "reduce_example_common.hpp"

template <typename InOutDataType,
          typename AccDataType,
          ck::ReduceTensorOp ReduceOpId,
          ck::index_t Rank,
          ck::index_t NumReduceDim,
          bool PropagateNan,
          bool OutputIndex>
int reduce_threadwise_impl(bool do_verification,
                           int init_method,
                           bool time_kernel,
                           const std::vector<size_t>& inLengths,
                           const std::array<int, NumReduceDim>& reduceDims,
                           float alpha,
                           float beta)
{
    using namespace ck;
    using namespace ck::tensor_operation::device;

    constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;

    constexpr bool op_support_indices =
        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
         ReduceOpId == ReduceTensorOp::AMAX);

    constexpr bool invalid_reduce_1 = OutputIndex && !op_support_indices;

    // 1) If InOutDataType is half_t, must use half_t as AccDataType for indexable reduction
    //    operations
    // 2) If InOutDataType is half_t, must use float as AccDataType for non-indexable reduction
    //    operations
    constexpr bool invalid_reduce_2 =
        std::is_same<InOutDataType, half_t>::value &&
        ((!op_support_indices && !std::is_same<AccDataType, float>::value) ||
         (op_support_indices && !std::is_same<AccDataType, half_t>::value));

    // 1) If InOutDataType is float, must use float as AccDataType for indexable reduction
    //    operations
    constexpr bool invalid_reduce_3 =
        std::is_same<InOutDataType, float>::value &&
        (op_support_indices && !std::is_same<AccDataType, float>::value);

    // 1) If InOutDataType is int8_t or int4_t, must use int8_t as AccDataType for indexable
    //    reduction operations
    // 2) If InOutDataType is int8_t or int4_t, must use int32_t as AccDataType for non-indexable
    //    reduction operations
    constexpr bool invalid_reduce_4 =
        std::is_same<InOutDataType, int8_t>::value &&
        ((!op_support_indices && !std::is_same<AccDataType, int32_t>::value) ||
         (op_support_indices && !std::is_same<AccDataType, int8_t>::value));

    // 1) If InOutDataType is int8_t or int4_t, the supported operation must be either indexable
    //    operations or ADD/AVG
    constexpr bool invalid_reduce_5 = std::is_same<InOutDataType, int8_t>::value &&
                                      (!op_support_indices && ReduceOpId != ReduceTensorOp::ADD &&
                                       ReduceOpId != ReduceTensorOp::AVG);

    // 1) If InOutDataType is bhalf_t, must use float as AccDataType for all reduction operations
    constexpr bool invalid_reduce_6 =
        std::is_same<InOutDataType, bhalf_t>::value && !std::is_same<AccDataType, float>::value;

    constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 ||
                                     invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6);

    if constexpr(invalid_reduce)
    {
        std::cerr << "The reduction setting is invalid, exiting!" << std::endl;
        return (-1);
    };
    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
    using InElementwiseOperation =
        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
    using AccElementwiseOperation =
        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;

    using InOutDataTypeInDevice = InOutDataType;

    using DeviceReduceInstance =
        ck::tensor_operation::device::DeviceReduceThreadWiseMultiD<InOutDataTypeInDevice,
                                                                   AccDataType,
                                                                   InOutDataTypeInDevice,
                                                                   Rank,
                                                                   NumReduceDim,
                                                                   ReduceOperation,
                                                                   InElementwiseOperation,
                                                                   AccElementwiseOperation,
                                                                   PropagateNan,
                                                                   OutputIndex,
                                                                   false,
                                                                   false, // HaveIndexInputIfOutputIndex
                                                                   256,   // BlockSize
                                                                   4,     // MThreadSliceSize
                                                                   1,     // KThreadSliceSize
                                                                   0,     // InSrcVectorDim
                                                                   1,     // InSrcVectorSize
                                                                   1>;    // OutDstVectorSize
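    // Worked sizing note (added for illustration, not part of this commit): with this
    // configuration, DeviceReduceThreadWiseMultiD uses
    // M_BlockTileSize = BlockSize * MThreadSliceSize = 256 * 4 = 1024 output elements per
    // workgroup, and each thread walks its reduce axis in chunks of KThreadSliceSize = 1
    // element per iteration.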
    Tensor<InOutDataType> in(inLengths);

    std::vector<size_t> outLengths;

    auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);

    if(invariantDims.empty())
        outLengths.push_back(1);
    else
        for(auto dim : invariantDims)
            outLengths.push_back(inLengths[dim]);

    Tensor<InOutDataType> out_ref(outLengths);
    Tensor<InOutDataType> out(outLengths);
    Tensor<int> out_indices_ref(outLengths);
    Tensor<int> out_indices(outLengths);

    auto inStrides  = in.mDesc.GetStrides();
    auto outStrides = out.mDesc.GetStrides();

    size_t invariant_total_length = out.mDesc.GetElementSize();
    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;

    std::size_t num_thread = 1;

    if(do_verification)
    {
        switch(init_method)
        {
        case 0: break;
        case 1:
            in.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
            break;
        case 2:
            in.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
            break;
        default:
            in.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
                                            num_thread);
        }

        if(beta != 0.0f)
            for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
                out.mData[i] = out_ref.mData[i];
    };

    // these buffers are usually provided by the user application
    DeviceMem in_dev(sizeof(InOutDataTypeInDevice) * in.mDesc.GetElementSpaceSize());
    DeviceMem out_dev(sizeof(InOutDataTypeInDevice) * out.mDesc.GetElementSpaceSize());

    in_dev.ToDevice(in.mData.data());

    if(beta != 0.0f)
    {
        out_dev.ToDevice(out.mData.data());
    };

    size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;

    DeviceMem out_index_dev(indicesSizeInBytes);

    InElementwiseOperation in_elementwise_op;
    AccElementwiseOperation acc_elementwise_op;

    std::tie(in_elementwise_op, acc_elementwise_op) =
        reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
            static_cast<int32_t>(reduce_total_length));

    std::array<index_t, Rank> arrInLengths;
    std::array<index_t, Rank> arrInStrides;
    std::array<index_t, NumOutDim> arrOutLengths;
    std::array<index_t, NumOutDim> arrOutStrides;

    ck::ranges::copy(inLengths, arrInLengths.begin());
    ck::ranges::copy(inStrides, arrInStrides.begin());
    ck::ranges::copy(outLengths, arrOutLengths.begin());
    ck::ranges::copy(outStrides, arrOutStrides.begin());

    if(do_verification)
    {
        using ReferenceReduceInstance =
            ck::tensor_operation::host::ReferenceReduce<InOutDataType,
                                                        AccDataType,
                                                        InOutDataType,
                                                        Rank,
                                                        NumReduceDim,
                                                        ReduceOperation,
                                                        InElementwiseOperation,
                                                        AccElementwiseOperation,
                                                        PropagateNan,
                                                        OutputIndex>;

        auto reduce_ref = ReferenceReduceInstance{};

        auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths,
                                                               arrInStrides,
                                                               arrOutLengths,
                                                               arrOutStrides,
                                                               reduceDims,
                                                               static_cast<double>(alpha),
                                                               static_cast<double>(beta),
                                                               in.mData.data(),
                                                               nullptr,
                                                               out_ref.mData.data(),
                                                               out_indices_ref.mData.data(),
                                                               in_elementwise_op,
                                                               acc_elementwise_op);

        if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
        {
            std::cout << "The runtime parameters are not supported by the reduce reference, "
                         "exiting!"
                      << std::endl;

            return (false);
        };

        auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();

        invoker_ptr_ref->Run(argument_ptr_ref.get());
    };
    auto reduce = DeviceReduceInstance{};

    auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
                                                   arrInStrides,
                                                   arrOutLengths,
                                                   arrOutStrides,
                                                   reduceDims,
                                                   static_cast<double>(alpha),
                                                   static_cast<double>(beta),
                                                   in_dev.GetDeviceBuffer(),
                                                   nullptr,
                                                   out_dev.GetDeviceBuffer(),
                                                   out_index_dev.GetDeviceBuffer(),
                                                   in_elementwise_op,
                                                   acc_elementwise_op);

    if(!reduce.IsSupportedArgument(argument_ptr.get()))
    {
        std::cerr << "The runtime parameters are not supported by the DeviceReduce instance, "
                     "exiting!"
                  << std::endl;

        return (-2);
    };

    std::string reduce_name = reduce.GetTypeString();

    auto invoker_ptr = reduce.MakeInvokerPointer();

    float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

    std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
                            invariant_total_length * sizeof(InOutDataType);

    float gb_per_sec = num_bytes / 1.E6 / avg_time;

    std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
              << std::endl;

    bool pass = true;

    if(do_verification)
    {
        out_dev.FromDevice(out.mData.data());
        pass = pass && ck::utils::check_err(out, out_ref);

        if(OutputIndex)
        {
            out_index_dev.FromDevice(out_indices.mData.data());
            pass = pass && ck::utils::check_err(out_indices, out_indices_ref);
        };
    };

    return (pass ? 0 : 1);
}
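For reference, the helper above can also be driven directly, without the option-parsing front end of reduce_threadwise.cpp. A minimal sketch (the shape, reduce dimension, and scale values below are arbitrary illustrations mirroring the defaults in reduce_threadwise.cpp, not something added by this commit):

    // Hypothetical direct use of reduce_threadwise_impl: AVG-reduce dimension 0 of a
    // 16 x 64 x 32 x 960 fp32 tensor, verify against the host reference, and time the kernel.
    #include <array>
    #include <vector>
    #include "reduce_threadwise_impl.hpp"

    int run_fp32_avg_example()
    {
        const std::vector<size_t> inLengths = {16, 64, 32, 960};
        const std::array<int, 1> reduceDims = {0};

        return reduce_threadwise_impl<float,                   // InOutDataType
                                      float,                   // AccDataType
                                      ck::ReduceTensorOp::AVG, // ReduceOpId
                                      4,                       // Rank
                                      1,                       // NumReduceDim
                                      true,                    // PropagateNan
                                      false>                   // OutputIndex
            (true /*do_verification*/, 2 /*init_method*/, true /*time_kernel*/,
             inLengths, reduceDims, 1.0f /*alpha*/, 0.0f /*beta*/);
    }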
include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp (new file, mode 100644, view file @ b17ce193)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>
#include <array>

#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise_multi_d.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          index_t Rank,
          index_t NumReduceDim,
          typename ReduceOperation,
          typename InElementwiseOperation,
          typename AccElementwiseOperation,
          bool PropagateNan,
          bool OutputIndex,
          bool TransformIndexKtoGlobal,
          bool HaveIndexInputIfOutputIndex,
          index_t BlockSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct DeviceReduceThreadWiseMultiD : public DeviceReduce<InDataType,
                                                          AccDataType,
                                                          OutDataType,
                                                          Rank,
                                                          NumReduceDim,
                                                          ReduceOperation,
                                                          InElementwiseOperation,
                                                          AccElementwiseOperation,
                                                          PropagateNan,
                                                          OutputIndex>
{
    static_assert(Rank <= 6, "Bigger Rank size is not supported!");

    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
                      (MThreadSliceSize % OutDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    using IndexDataType = int32_t;

    static constexpr bool HaveIndexInput = OutputIndex && HaveIndexInputIfOutputIndex;

    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;

    static constexpr index_t NumSrcDim = Rank;
    static constexpr index_t NumDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
    static constexpr bool reduceAllDim = (NumInvariantDim == 0);

    static constexpr index_t M_BlockTileSize = BlockSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = 1 * KThreadSliceSize;
    static auto MakeSrc2dDescriptor(const std::array<index_t, Rank>& inLengths,
                                    const std::array<index_t, Rank>& inStrides)
    {
        const auto tupleSrcLengths =
            generate_tuple([&](auto I) { return inLengths[I]; }, Number<Rank>{});
        const auto tupleSrcStrides =
            generate_tuple([&](auto I) { return inStrides[I]; }, Number<Rank>{});

        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);

        const auto in_grid_desc_m_k = [&]() {
            if constexpr(reduceAllDim)
            {
                const auto one_dim_inDesc = transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(tupleSrcLengths)),
                    make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}),
                    make_tuple(Sequence<0>{}));

                return transform_tensor_descriptor(
                    one_dim_inDesc,
                    make_tuple(make_unmerge_transform(
                        make_tuple(1, one_dim_inDesc.GetLength(Number<0>{})))),
                    make_tuple(Sequence<0>{}),
                    make_tuple(Sequence<0, 1>{}));
            }
            else
            {
                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

                const auto reduceDimLengths = generate_tuple(
                    [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number<NumReduceDim>{});
                const auto invariantDimLengths =
                    generate_tuple([&](auto I) { return inLengths[I]; }, Number<NumInvariantDim>{});

                return transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(invariantDimLengths),
                               make_merge_transform(reduceDimLengths)),
                    make_tuple(InvariantDims{}, ReduceDims{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));
            }
        }();

        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});

        const auto inPad_M =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto inPad_K =
            math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;

        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
            in_grid_desc_m_k,
            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
                       make_right_pad_transform(reduceLength, inPad_K)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (in_grid_desc_m_k_padded);
    };
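    // Worked example (added for illustration; the numbers come from the example program's
    // default configuration, not from this header): reducing dimension 0 of a {16, 64, 32, 960}
    // input merges the invariant dimensions into M = 64 * 32 * 960 = 1966080 and the reduce
    // dimensions into K = 16. With M_BlockTileSize = 1024 and K_BlockTileSize = 1, both lengths
    // are already multiples of their tile sizes, so inPad_M = inPad_K = 0 and the padded
    // descriptor equals the merged one.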
    static auto MakeDst1dDescriptor(const std::array<index_t, NumDstDim>& outLengths,
                                    const std::array<index_t, NumDstDim>& outStrides)
    {
        const auto tupleDstLengths =
            generate_tuple([&](auto I) { return outLengths[I]; }, Number<NumDstDim>{});
        const auto tupleDstStrides =
            generate_tuple([&](auto I) { return outStrides[I]; }, Number<NumDstDim>{});

        auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

        auto out_grid_desc_m = transform_tensor_descriptor(
            outDesc,
            make_tuple(make_merge_transform(tupleDstLengths)),
            make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}),
            make_tuple(Sequence<0>{}));

        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});

        const auto outPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;

        auto out_grid_desc_m_padded = transform_tensor_descriptor(
            out_grid_desc_m,
            make_tuple(make_right_pad_transform(invariantLength, outPad)),
            make_tuple(Sequence<0>{}),
            make_tuple(Sequence<0>{}));
        return (out_grid_desc_m_padded);
    };
    struct Argument : public BaseArgument
    {
        Argument(const std::array<index_t, Rank> inLengths,
                 const std::array<index_t, Rank> inStrides,
                 const std::array<index_t, NumDstDim> outLengths,
                 const std::array<index_t, NumDstDim> outStrides,
                 const std::array<int, NumReduceDim> reduceDims,
                 double alpha,
                 double beta,
                 const InDataType* in_dev,
                 OutDataType* out_dev,
                 IndexDataType* out_index_dev,
                 const InElementwiseOperation in_elementwise_op,
                 const AccElementwiseOperation acc_elementwise_op)
            : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
              out_dev_{out_dev},
              out_index_dev_{out_index_dev},
              in_elementwise_op_{in_elementwise_op},
              acc_elementwise_op_{acc_elementwise_op}
        {
            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);

            alpha_ = type_convert<AccDataType>(alpha);
            beta_  = type_convert<AccDataType>(beta);

            std::tie(invariant_total_length, reduce_total_length) =
                get_2d_lengths<Rank, NumReduceDim>(inLengths_);

            if constexpr(NumInvariantDim == 0)
                invariant_lowest_length = 1;
            else
                invariant_lowest_length = inLengths_[NumInvariantDim - 1];

            reduce_lowest_length = inLengths_[Rank - 1];

            numBlockTileIteration = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize;

            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                       M_BlockTileSize;
        }

        std::array<index_t, Rank> inLengths_;
        std::array<index_t, Rank> inStrides_;
        std::array<index_t, NumDstDim> outLengths_;
        std::array<index_t, NumDstDim> outStrides_;

        AccDataType alpha_;
        AccDataType beta_;

        const InDataType* in_dev_;
        OutDataType* out_dev_;
        IndexDataType* out_index_dev_;

        InElementwiseOperation in_elementwise_op_;
        AccElementwiseOperation acc_elementwise_op_;

        index_t invariant_lowest_length;
        index_t reduce_lowest_length;
        long_index_t invariant_total_length;
        long_index_t reduce_total_length;

        int numBlockTileIteration;
        size_t gridSize;
    };
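    // Grid-sizing illustration (added; continues the worked example above): for the same
    // {16, 64, 32, 960} input reduced over dimension 0, invariant_total_length = 1966080 and
    // reduce_total_length = 16, so gridSize = 1966080 / 1024 = 1920 workgroups and
    // numBlockTileIteration = (16 + 1 - 1) / 1 = 16 per-thread iterations along K.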
    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k =
                DeviceReduceThreadWiseMultiD::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
            const auto out_grid_desc_m =
                DeviceReduceThreadWiseMultiD::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_);

            const auto ds_grid_desc_m = generate_tuple(
                [&](auto i) {
                    ignore = i;
                    return DeviceReduceThreadWiseMultiD::MakeDst1dDescriptor(arg.outLengths_,
                                                                             arg.outStrides_);
                },
                Number<1>{});

            using InGridDesc_M_K = decltype(in_grid_desc_m_k);
            using OutGridDesc_M  = decltype(out_grid_desc_m);
            using DsGridDesc_M   = decltype(ds_grid_desc_m);

            float avg_time = 0;

            using Add = tensor_operation::element_wise::Add;

            using GridwiseReduce =
                GridwiseReduction_mk_to_m_threadwise_multi_d<InDataType,
                                                             Tuple<OutDataType>,
                                                             OutDataType,
                                                             AccDataType,
                                                             InGridDesc_M_K,
                                                             DsGridDesc_M,
                                                             OutGridDesc_M,
                                                             ReduceOperation,
                                                             InElementwiseOperation,
                                                             Add,
                                                             InMemoryDataOperationEnum::Set,
                                                             PropagateNan,
                                                             BlockSize,
                                                             MThreadSliceSize,
                                                             KThreadSliceSize,
                                                             InSrcVectorDim,
                                                             InSrcVectorSize,
                                                             OutDstVectorSize>;

            using DsGridPointer = typename GridwiseReduce::DsGridPointer;

            const auto kernel = kernel_reduce_threadwise_multi_d<GridwiseReduce,
                                                                 InDataType,
                                                                 OutDataType,
                                                                 AccDataType,
                                                                 InGridDesc_M_K,
                                                                 DsGridDesc_M,
                                                                 OutGridDesc_M,
                                                                 InElementwiseOperation,
                                                                 Add,
                                                                 DsGridPointer>;

            DsGridPointer p_ds_grid_;

            avg_time = launch_and_time_kernel(stream_config,
                                              kernel,
                                              dim3(arg.gridSize),
                                              dim3(BlockSize),
                                              0,
                                              in_grid_desc_m_k,
                                              ds_grid_desc_m,
                                              out_grid_desc_m,
                                              arg.in_elementwise_op_,
                                              Add{},
                                              arg.in_dev_,
                                              p_ds_grid_,
                                              arg.out_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        };
    };
    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if constexpr(InSrcVectorDim == 0)
        {
            if constexpr(NumInvariantDim == 0)
            {
                return (false);
            }
            else
            {
                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
                    return (false);

                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
                    return (false);
            };
        }
        else
        {
            if(pArg->inStrides_[Rank - 1] != 1)
                return (false);

            if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
                return (false);
        };

        // To improve
        if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
            return (false);

        std::cerr << "reduce_total_length = " << pArg->reduce_total_length
                  << " KThreadSliceSize = " << KThreadSliceSize << std::endl;

        // cases with big reduce_total_length should be handled by Blockwise kernel
        if(pArg->reduce_total_length / KThreadSliceSize >= 32)
            return (false);

        return (true);
    };
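    // Note on the last check (added for illustration): with KThreadSliceSize = 1, as in the
    // example's DeviceReduceInstance, this kernel only accepts reduce_total_length < 32;
    // e.g. the example shape {16, 64, 32, 960} with reduceDims {0} (reduce length 16) passes,
    // while reducing the 960-long dimension would be rejected here and left to the blockwise
    // kernel.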
    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::array<index_t, Rank> inLengths,
                        const std::array<index_t, Rank> inStrides,
                        const std::array<index_t, NumDstDim> outLengths,
                        const std::array<index_t, NumDstDim> outStrides,
                        const std::array<int, NumReduceDim> reduceDims,
                        double alpha,
                        double beta,
                        const void* in_dev,
                        const void* in_index_dev,
                        void* out_dev,
                        void* out_index_dev,
                        const InElementwiseOperation in_elementwise_op,
                        const AccElementwiseOperation acc_elementwise_op) override
    {
        (void)in_index_dev;

        return std::make_unique<Argument>(inLengths,
                                          inStrides,
                                          outLengths,
                                          outStrides,
                                          reduceDims,
                                          alpha,
                                          beta,
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          static_cast<IndexDataType*>(out_index_dev),
                                          in_elementwise_op,
                                          acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceThreadWiseMultiD<" << BlockSize << ",";
        str << "M_C" << BlockSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << 1 << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize
            << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise_multi_d.hpp (new file, mode 100644, view file @ b17ce193)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_common.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/tuple_helper.hpp"

namespace ck {

template <typename GridwiseReduction,
          typename InDataType,
          typename OutDataType,
          typename AccDataType,
          typename InGridDesc_M_K,
          typename DsGridDesc_M,
          typename OutGridDesc_M,
          typename InElementwiseOperation,
          typename OutElementwiseOperation,
          typename DsGridPointer>
__global__ void
kernel_reduce_threadwise_multi_d(const InGridDesc_M_K in_grid_desc_m_k,
                                 const DsGridDesc_M ds_grid_desc_m,
                                 const OutGridDesc_M out_grid_desc_m,
                                 const InElementwiseOperation in_elementwise_op,
                                 const OutElementwiseOperation out_elementwise_op,
                                 const InDataType* const __restrict__ p_in_value_global,
                                 const DsGridPointer p_ds_value_global,
                                 OutDataType* const __restrict__ p_out_value_global)
{
    GridwiseReduction::Run(in_grid_desc_m_k,
                           ds_grid_desc_m,
                           out_grid_desc_m,
                           in_elementwise_op,
                           out_elementwise_op,
                           p_in_value_global,
                           p_ds_value_global,
                           p_out_value_global);
}
template <typename InDataType,
          typename DsDataType,
          typename OutDataType,
          typename AccDataType,
          typename InGridDesc_M_K,
          typename DsGridDesc_M,
          typename OutGridDesc_M,
          typename ReduceOperation,
          typename InElementwiseOperation,
          typename OutElementwiseOperation,
          InMemoryDataOperationEnum OutMemoryDataOperation,
          bool PropagateNan,
          index_t BlockSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct GridwiseReduction_mk_to_m_threadwise_multi_d
{
    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
                      (MThreadSliceSize % OutDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    using ThreadBufferDimAccessOrder =
        typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type;

    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
    using ThreadReduceDstDesc_M =
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

    using PassThrough = tensor_operation::element_wise::PassThrough;

    static constexpr auto I0 = Number<0>{};

    static constexpr index_t NumDTensor = DsDataType::Size();

    // ck::Tuple<const D0DataType*, const D1DataType*, ...>
    static constexpr auto MakeDsGridPointer()
    {
        return generate_tuple(
            [&](auto i) {
                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;

                return static_cast<const DDataType*>(nullptr);
            },
            Number<NumDTensor>{});
    }

    using DsGridPointer = decltype(MakeDsGridPointer());
    __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
                               const DsGridDesc_M& ds_grid_desc_m,
                               const OutGridDesc_M& out_grid_desc_m,
                               const InElementwiseOperation& in_elementwise_op,
                               const OutElementwiseOperation& out_elementwise_op,
                               const InDataType* const __restrict__ p_in_value_global,
                               const DsGridPointer p_ds_grid,
                               OutDataType* const __restrict__ p_out_value_global)
    {
        using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
                                                     ThreadReduceSrcDesc_M_K,
                                                     ThreadReduceDstDesc_M,
                                                     ReduceOperation,
                                                     PropagateNan>;

        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();

        const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_value_global,
            in_grid_desc_m_k.GetElementSpaceSize(),
            ReduceOperation::template GetIdentityValue<InDataType>());

        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_value_global, out_grid_desc_m.GetElementSpaceSize());

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            in_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; });

        const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

        using ThreadBufferLengths         = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

        auto threadwise_src_val_load =
            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                             AccDataType,
                                             InGridDesc_M_K,
                                             decltype(thread_buffer_desc),
                                             ThreadBufferLengths,
                                             ThreadBufferDimAccessOrder,
                                             InSrcVectorDim,
                                             InSrcVectorSize,
                                             1,
                                             false>(
                in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));

        constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize);

        index_t reducedLength = 0;
        do
        {
            threadwise_src_val_load.Run(in_grid_desc_m_k,
                                        in_global_val_buf,
                                        thread_buffer_desc,
                                        make_tuple(I0, I0),
                                        in_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                // do element-wise pre-reduction operation
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));
                    in_elementwise_op(in_thread_buf(Number<offset>{}),
                                      in_thread_buf(Number<offset>{}));
                });
            });

            ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf);

            threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            reducedLength += KThreadSliceSize;
        } while(reducedLength < toReduceLength);
        constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};

        auto ds_thread_buf = generate_tuple(
            [&](auto I) {
                using DataTypePointer = remove_cvref_t<decltype(DsGridPointer{}[I])>;
                using DataType        = remove_cv_t<remove_pointer_t<DataTypePointer>>;

                return StaticBuffer<AddressSpaceEnum::Vgpr, DataType, MThreadSliceSize, true>{};
            },
            Number<NumDTensor>{});

        auto ds_global_buf = generate_tuple(
            [&](auto I) {
                // static_assert(ds_grid_desc_m[I].GetNumOfDimension() == 1, "");
                return make_dynamic_buffer<AddressSpaceEnum::Global>(
                    p_ds_grid[I], ds_grid_desc_m[I].GetElementSpaceSize());
            },
            Number<NumDTensor>{});

        auto ds_global_load = generate_tuple(
            [&](auto I) {
                using DataTypePointer = remove_cvref_t<decltype(DsGridPointer{}[I])>;
                using DataType        = remove_cv_t<remove_pointer_t<DataTypePointer>>;

                return ThreadwiseTensorSliceTransfer_v2<DataType,
                                                        DataType,
                                                        decltype(ds_grid_desc_m[I]),
                                                        decltype(reduced_data_desc),
                                                        Sequence<MThreadSliceSize>, // SliceLengths
                                                        Sequence<0>, // DimAccessOrder
                                                        0,           // SrcVectorDim
                                                        OutDstVectorSize,
                                                        1, // SrcScalarStrideInVector
                                                        false>{
                    ds_grid_desc_m[I], make_multi_index(thread_global_1d_id * MThreadSliceSize)};
            },
            Number<NumDTensor>{});

        static_for<0, NumDTensor, 1>{}([&](auto I) {
            ds_global_load(I).Run(ds_grid_desc_m[I],
                                  ds_global_buf[I],
                                  reduced_data_desc,
                                  make_tuple(I0),
                                  ds_thread_buf(I));
        });

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            const auto c_ds_buf_refs = concat_tuple_of_reference(
                tie(accu_value_buf[I]),
                generate_tie(
                    [&](auto Id) -> const auto& { return ds_thread_buf[Id][I]; },
                    Number<NumDTensor>{}));

            unpack2(out_elementwise_op, tie(accu_value_buf(I)), c_ds_buf_refs);
        });
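        // Illustration (added): with a single D tensor and OutElementwiseOperation = Add, as
        // instantiated by DeviceReduceThreadWiseMultiD::Invoker, the unpack2 call above roughly
        // amounts to, for each slice element I,
        //     accu_value_buf(I) = accu_value_buf[I] + ds_thread_buf[0][I];
        // i.e. the threadwise reduction result is combined element-wise with the extra D input
        // before being written out below.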
        auto threadwise_dst_store =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               OutDataType,
                                               decltype(reduced_data_desc),
                                               OutGridDesc_M,
                                               PassThrough,
                                               Sequence<MThreadSliceSize>,
                                               Sequence<0>,
                                               0,
                                               OutDstVectorSize,
                                               OutMemoryDataOperation,
                                               1,
                                               false>(
                out_grid_desc_m,
                make_multi_index(thread_global_1d_id * MThreadSliceSize),
                PassThrough{});

        threadwise_dst_store.Run(
            reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, dst_global_buf);
    }
};

} // namespace ck