gaoqiong / composable_kernel_ROCM / Commits

Commit e72c0c43, authored Mar 26, 2022 by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

Parents: d714fa15, 313bbea5

Changes: 262
Showing 20 changed files with 745 additions and 540 deletions.
include/ck/utility/tensor_space_filling_curve.hpp  +13 -0
library/include/ck/library/host_tensor/device.hpp  +2 -8
library/include/ck/library/host_tensor/host_generic_reduction.hpp  +0 -424
library/include/ck/library/host_tensor/host_reduce_util.hpp  +39 -38
library/include/ck/library/host_tensor/host_reduction.hpp  +402 -0
library/include/ck/library/host_tensor/host_tensor.hpp  +38 -42
library/include/ck/library/host_tensor/host_tensor_generator.hpp  +2 -18
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp  +61 -6
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp  +13 -0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp  +0 -1
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp  +60 -0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp  +6 -0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp  +3 -0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp  +9 -0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp  +3 -0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp  +9 -0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp  +31 -0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp  +47 -0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp  +1 -3
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp  +6 -0
include/ck/utility/tensor_space_filling_curve.hpp
...
@@ -37,6 +37,10 @@ struct SpaceFillingCurve
    __host__ __device__ static constexpr index_t GetNumOfAccess()
    {
        static_assert(TensorLengths::Size() == ScalarsPerAccess::Size());
        static_assert(TensorLengths{} % ScalarsPerAccess{} ==
                      typename uniform_sequence_gen<TensorLengths::Size(), 0>::type{});

        return reduce_on_sequence(TensorLengths{}, math::multiplies{}, Number<1>{}) /
               ScalarPerVector;
    }
...
@@ -140,6 +144,15 @@ struct SpaceFillingCurve
        }();

        return idx_md;
    }

    // FIXME: rename this function
    template <index_t AccessIdx1d>
    static __device__ __host__ constexpr auto GetIndexTupleOfNumber(Number<AccessIdx1d>)
    {
        constexpr auto idx = GetIndex(Number<AccessIdx1d>{});

        return generate_tuple([&](auto i) { return Number<idx[i]>{}; }, Number<nDim>{});
    }
};

} // namespace ck
...
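As a sanity check on the GetNumOfAccess() arithmetic added above, the sketch below (not part of the commit; the tile shape and vector width are made-up values) reproduces the same product-divided-by-vector-width computation with plain integers.

#include <cstdio>

int main()
{
    // Hypothetical 4x8 access tile and a vector width of 4 scalars per access.
    const int tensor_lengths[2] = {4, 8};
    const int scalar_per_vector = 4;

    int product = 1;
    for(int l : tensor_lengths)
        product *= l;

    // Same arithmetic as GetNumOfAccess(): product of the lengths / ScalarPerVector.
    std::printf("num_of_access = %d\n", product / scalar_per_vector); // prints 8
    return 0;
}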
library/include/ck/library/host_tensor/device.hpp
...
@@ -13,8 +13,10 @@ struct DeviceMem
    DeviceMem() = delete;
    DeviceMem(std::size_t mem_size);
    void* GetDeviceBuffer();
    std::size_t GetBufferSize();
    void ToDevice(const void* p);
    void FromDevice(void* p);
    void SetZero();
    ~DeviceMem();

    void* mpDeviceBuf;
...
@@ -48,7 +50,6 @@ template <typename... Args, typename F>
float launch_and_time_kernel(F kernel,
                             int nrepeat,
                             dim3 grid_dim,
                             dim3 block_dim,
                             std::size_t lds_byte,
                             Args... args)
{
#if 1
    KernelTimer timer;

    printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d}\n",
...
@@ -78,13 +79,6 @@ float launch_and_time_kernel(
    timer.End();

    // std::this_thread::sleep_for (std::chrono::microseconds(10));

    return timer.GetElapsedTime() / nrepeat;
#else
    launch_kernel(kernel, grid_dim, block_dim, lds_byte, args...);

    return 0;
#endif
}
#endif
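The DeviceMem interface declared above can be exercised with a minimal host-side round trip. The sketch below is not part of the commit and assumes device.hpp and the HIP runtime are available; it only uses the methods declared in the diff.

#include <cstddef>
#include <vector>

void round_trip(const std::vector<float>& host_in, std::vector<float>& host_out)
{
    const std::size_t byte_size = host_in.size() * sizeof(float);

    DeviceMem buf(byte_size);        // allocate byte_size bytes of device memory
    buf.ToDevice(host_in.data());    // host -> device copy
    buf.FromDevice(host_out.data()); // device -> host copy
    // buf's destructor releases the device allocation when it goes out of scope
}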
library/include/ck/library/host_tensor/host_generic_reduction.hpp
deleted 100644 → 0
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_GENERIC_REDUCTION_HPP_
#define HOST_GENERIC_REDUCTION_HPP_
#include <vector>
#include <functional>
#include <limits>
#include <type_traits>
#include <cassert>
#include <cmath>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
using float16 = half_float::half;

namespace ck {
namespace host_reduce {

template <typename T>
static void get_all_indexes(const std::vector<T>& dimLengths,
                            int dim,
                            std::vector<std::vector<T>>& indexes)
{
    if(dim < dimLengths.size())
    {
        std::vector<std::vector<T>> updated_indexes;

        if(dim == 0)
        {
            assert(indexes.size() == 0);
            assert(dimLengths[dim] > 0);
            for(T i = 0; i < dimLengths[dim]; i++)
            {
                std::vector<T> index = {i};

                updated_indexes.push_back(index);
            };
        }
        else
        {
            // go through all the current indexes
            for(const auto& index : indexes)
                for(T i = 0; i < dimLengths[dim]; i++)
                {
                    auto index_new = index;
                    index_new.push_back(i);

                    updated_indexes.push_back(index_new);
                };
        };

        // update to the indexes (output)
        indexes = updated_indexes;

        // further to construct the indexes from the updated status
        get_all_indexes(dimLengths, dim + 1, indexes);
    };
};

template <typename T>
static T get_offset_from_index(const std::vector<T>& strides, const std::vector<T>& index)
{
    T offset = 0;

    assert(strides.size() == index.size());

    for(int i = 0; i < index.size(); i++)
        offset += strides[i] * static_cast<T>(index[i]);

    return (offset);
};

template <typename T>
static inline T get_flatten_offset(const std::vector<T>& lengths, const std::vector<T>& index)
{
    T offset = 0;

    assert(lengths.size() == index.size() && lengths.size() > 0);

    int len  = lengths.size();
    T stride = 1;

    // for len==1, the loop is not executed
    for(int i = len - 1; i > 0; i--)
    {
        offset += stride * static_cast<T>(index[i]);
        stride *= lengths[i];
    };

    offset += stride * static_cast<T>(index[0]);

    return (offset);
};

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          ck::ReduceTensorOp_t ReduceOpId,
          bool PropagateNan,
          bool NeedIndices>
class ReductionHost
{
    public:
    ReductionHost() = default;

    ReductionHost(HostTensorDescriptor& inDesc,
                  HostTensorDescriptor& outDesc,
                  const std::vector<int>& invariantDims_,
                  const std::vector<int>& toReduceDims_)
    {
        this->inLengths  = to_int_vector(inDesc.GetLengths());
        this->outLengths = to_int_vector(outDesc.GetLengths());
        this->inStrides  = to_int_vector(inDesc.GetStrides());
        this->outStrides = to_int_vector(outDesc.GetStrides());

        this->invariantDims = invariantDims_;
        this->toReduceDims  = toReduceDims_;

        assert(this->inLengths.size() == this->outLengths.size());
        assert(!this->toReduceDims.empty());

        for(const auto dim : this->invariantDims)
            this->invariantLengths.push_back(this->inLengths[dim]);

        for(const auto dim : this->toReduceDims)
            toReduceLengths.push_back(this->inLengths[dim]);

        this->reduceAllDims = this->invariantDims.empty();
    };

    ~ReductionHost(){};

    void Run(float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
    {
        if constexpr(NeedIndices)
            RunImpl_with_indices(alpha, in_data, beta, out_data, indices);
        else
            RunImpl_no_indices(alpha, in_data, beta, out_data);
    };

    private:
    std::vector<int> inLengths;
    std::vector<int> outLengths;
    std::vector<int> inStrides;
    std::vector<int> outStrides;

    std::vector<int> invariantLengths;
    std::vector<int> toReduceLengths;

    std::vector<int> invariantDims;
    std::vector<int> toReduceDims;

    bool reduceAllDims;

    void RunImpl_with_indices(
        float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
    {
        using ck::host_reduce::binop_with_nan_check;
        using ck::host_reduce::binop_with_nan_check2;
        using ck::host_reduce::float_equal_one;
        using ck::host_reduce::float_equal_zero;
        using ck::host_reduce::PosUnaryOpFn;
        using ck::host_reduce::PreUnaryOpFn;
        using ck::host_reduce::ReduceOpFn2;
        using ck::host_reduce::ReduceOpZeroVal;

        auto opReduce = ReduceOpFn2<AccDataType, ReduceOpId>();

        int divider = 1;
        for(int i = 0; i < toReduceLengths.size(); i++)
            divider *= toReduceLengths[i];

        auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
        auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);

        if(reduceAllDims)
        {
            std::vector<std::vector<int>> indexes_1;

            get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space

            auto accuVal  = ReduceOpZeroVal<AccDataType, ReduceOpId>();
            int accuIndex = 0;

            // go through indexes of the invariant dimensions
            for(const auto& src_index : indexes_1)
            {
                auto src_offset = get_offset_from_index(this->inStrides, src_index);

                auto currVal = static_cast<AccDataType>(in_data[src_offset]);

                // unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is actually
                // done
                PreUnaryOp(currVal);

                auto currIndex = get_flatten_offset(inLengths, src_index);
                binop_with_nan_check2<AccDataType, PropagateNan>(
                    opReduce, accuVal, currVal, accuIndex, currIndex);
            };

            // scale the accumulated value
            if(!float_equal_one(alpha))
                accuVal *= static_cast<AccDataType>(alpha);

            // scale the prior dst value and add it to the accumulated value
            if(!float_equal_zero(beta))
                accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);

            // store the reduced value to dst location
            out_data[0] = static_cast<OutDataType>(accuVal);
            indices[0]  = accuIndex;
        }
        else
        {
            std::vector<std::vector<int>> indexes_1, indexes_2;

            get_all_indexes(this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
            get_all_indexes(this->toReduceLengths, 0, indexes_2);  // generate the toReduce indexes space

            // go through indexes of the invariant dimensions
            for(const auto& index_1 : indexes_1)
            {
                std::vector<int> src_index;
                std::vector<int> dst_index;

                src_index.resize(this->inLengths.size());

                // generate the part of src index belonging to invariant dims
                for(int k = 0; k < invariantDims.size(); k++)
                    src_index[invariantDims[k]] = index_1[k];

                for(int k = 0; k < invariantDims.size(); k++)
                    dst_index.push_back(index_1[k]);

                int dst_offset = get_offset_from_index(this->outStrides, dst_index);

                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
                int accuIndex       = 0;

                // go through indexes of the toReduce dimensions
                for(const auto& index_2 : indexes_2)
                {
                    // generate the part of src index belonging to toReduce dims
                    for(int k = 0; k < toReduceDims.size(); k++)
                        src_index[toReduceDims[k]] = index_2[k];

                    auto src_offset = get_offset_from_index(this->inStrides, src_index);

                    auto currVal = static_cast<AccDataType>(in_data[src_offset]);

                    // unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is
                    // actually done
                    PreUnaryOp(currVal);

                    auto currIndex = get_flatten_offset(toReduceLengths, index_2);
                    binop_with_nan_check2<AccDataType, PropagateNan>(
                        opReduce, accuVal, currVal, accuIndex, currIndex);
                };

                // scale the accumulated value
                if(!float_equal_one(alpha))
                    accuVal *= static_cast<AccDataType>(alpha);

                // scale the prior dst value and add it to the accumulated value
                if(!float_equal_zero(beta))
                    accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
                               static_cast<AccDataType>(beta);

                // store the reduced value to dst location
                out_data[dst_offset] = static_cast<OutDataType>(accuVal);
                indices[dst_offset]  = accuIndex;
            };
        };
    }; // end of RunImpl_with_indices()

    void RunImpl_no_indices(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
    {
        using ck::host_reduce::binop_with_nan_check;
        using ck::host_reduce::binop_with_nan_check2;
        using ck::host_reduce::float_equal_one;
        using ck::host_reduce::float_equal_zero;
        using ck::host_reduce::PosUnaryOpFn;
        using ck::host_reduce::PreUnaryOpFn;
        using ck::host_reduce::ReduceOpFn;
        using ck::host_reduce::ReduceOpZeroVal;

        auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();

        int divider = 1;
        for(int i = 0; i < toReduceLengths.size(); i++)
            divider *= toReduceLengths[i];

        auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
        auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);

        if(reduceAllDims)
        {
            std::vector<std::vector<int>> indexes_1;

            get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space

            auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();

            // go through indexes of the invariant dimensions
            for(const auto& src_index : indexes_1)
            {
                auto src_offset = get_offset_from_index(this->inStrides, src_index);

                auto currVal = static_cast<AccDataType>(in_data[src_offset]);

                PreUnaryOp(currVal);

                binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
            };

            PosUnaryOp(accuVal);

            // scale the accumulated value
            if(!float_equal_one(alpha))
                accuVal *= static_cast<AccDataType>(alpha);

            // scale the prior dst value and add it to the accumulated value
            if(!float_equal_zero(beta))
                accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);

            // store the reduced value to dst location
            out_data[0] = static_cast<OutDataType>(accuVal);
        }
        else
        {
            std::vector<std::vector<int>> indexes_1, indexes_2;

            get_all_indexes(this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
            get_all_indexes(this->toReduceLengths, 0, indexes_2);  // generate the toReduce indexes space

            // go through indexes of the invariant dimensions
            for(const auto& index_1 : indexes_1)
            {
                std::vector<int> src_index;
                std::vector<int> dst_index;

                src_index.resize(this->inLengths.size());

                for(int k = 0; k < invariantDims.size(); k++)
                    dst_index.push_back(index_1[k]);

                int dst_offset = get_offset_from_index(this->outStrides, dst_index);

                // generate the part of src index belonging to invariant dims
                for(int k = 0; k < invariantDims.size(); k++)
                    src_index[invariantDims[k]] = index_1[k];

                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();

                // go through indexes of the toReduce dimensions
                for(const auto& index_2 : indexes_2)
                {
                    // generate the part of src index belonging to toReduce dims
                    for(int k = 0; k < toReduceDims.size(); k++)
                        src_index[toReduceDims[k]] = index_2[k];

                    auto src_offset = get_offset_from_index(this->inStrides, src_index);

                    auto currVal = static_cast<AccDataType>(in_data[src_offset]);

                    PreUnaryOp(currVal);

                    binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
                };

                PosUnaryOp(accuVal);

                // scale the accumulated value
                if(!float_equal_one(alpha))
                    accuVal *= static_cast<AccDataType>(alpha);

                // scale the prior dst value and add it to the accumulated value
                if(!float_equal_zero(beta))
                    accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
                               static_cast<AccDataType>(beta);

                // store the reduced value to dst location
                out_data[dst_offset] = static_cast<OutDataType>(accuVal);
            };
        };
    }; // end of RunImpl_no_indices()
};

}; // end of namespace host_reduce
}; // end of namespace ck
#endif
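The file removed above flattens a multi-dimensional index into a linear offset with a right-to-left running stride (get_flatten_offset). A standalone restatement of that arithmetic, with made-up lengths and index values, is shown below; it is illustrative and not part of the commit.

#include <cassert>
#include <cstdio>
#include <vector>

static int flatten_offset(const std::vector<int>& lengths, const std::vector<int>& index)
{
    assert(lengths.size() == index.size() && !lengths.empty());

    int offset = 0;
    int stride = 1;

    // walk from the innermost dimension outwards, accumulating strides
    for(int i = static_cast<int>(lengths.size()) - 1; i > 0; i--)
    {
        offset += stride * index[i];
        stride *= lengths[i];
    }
    offset += stride * index[0];

    return offset;
}

int main()
{
    // For lengths {2, 3, 4}, index {1, 2, 3} flattens to 1*12 + 2*4 + 3 = 23.
    std::printf("%d\n", flatten_offset({2, 3, 4}, {1, 2, 3}));
    return 0;
}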
library/include/ck/library/host_tensor/host_reduce_util.hpp
...
@@ -66,22 +66,22 @@ static inline bool float_equal_zero(half_float::half x)
    return x == static_cast<half_float::half>(0.0f);
};

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline std::function<void(compType&)> PreUnaryOpFn(int)
+template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{
    using std::abs;

    if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1)
    {
-        return ([&](compType& a_) { a_ = abs(a_); });
+        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
    {
-        return ([&](compType& a_) { a_ = a_ * a_; });
+        return ([&](AccDataType& a_) { a_ = a_ * a_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
    {
-        return ([&](compType& a_) { a_ = abs(a_); });
+        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else
    {
...
@@ -90,23 +90,23 @@ __host__ static inline std::function<void(compType&)> PreUnaryOpFn(int)
        // ReduceTensorOp_t::MUL:
        // ReduceTensorOp_t::MIN:
        // ReduceTensorOp_t::MAX:
-        return ([&](compType&) {});
+        return ([&](AccDataType&) {});
    };
};

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline std::function<void(compType&)> PosUnaryOpFn(int divider)
+template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+__host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
{
    using std::sqrt;

    if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
    {
-        return ([&](compType& a_) { a_ = sqrt(a_); });
+        return ([&](AccDataType& a_) { a_ = sqrt(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG)
    {
-        return ([&, divider](compType& a_) {
-            a_ = a_ / static_cast<compType>(static_cast<float>(divider));
-        });
+        return ([&, divider](AccDataType& a_) {
+            a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));
+        });
    }
    else
...
@@ -117,44 +117,44 @@ __host__ static inline std::function<void(compType&)> PosUnaryOpFn(int divider)
        // ReduceTensorOp_t::MIN:
        // ReduceTensorOp_t::MAX:
        // ReduceTensorOp_t::AMAX:
-        return ([&](compType&) {});
+        return ([&](AccDataType&) {});
    }
};

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline std::function<void(compType&, compType)> ReduceOpFn()
+template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+__host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
{
    if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG ||
                 ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2)
    {
-        return ([&](compType& a_, compType b_) { a_ = a_ + b_; });
+        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
    {
-        return ([&](compType& a_, compType b_) { a_ = a_ * b_; });
+        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
    {
-        return ([&](compType& a_, compType b_) {
+        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ > b_)
                a_ = b_;
        });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
    {
-        return ([&](compType& a_, compType b_) {
+        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ < b_)
                a_ = b_;
        });
    }
};

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline std::function<void(compType&, compType, bool& changed)> ReduceOpFn2()
+template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+__host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
{
    if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
    {
-        return ([&](compType& a_, compType b_, bool& changed) {
+        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ > b_)
            {
                a_ = b_;
...
@@ -166,7 +166,7 @@ __host__ static inline std::function<void(compType&, compType, bool& changed)> R
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
    {
-        return ([&](compType& a_, compType b_, bool& changed) {
+        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ < b_)
            {
                a_ = b_;
...
@@ -183,28 +183,28 @@ __host__ static inline std::function<void(compType&, compType, bool& changed)> R
        // ReduceTensorOp_t::AVG:
        // ReduceTensorOp_t::NORM1:
        // ReduceTensorOp_t::NORM2:
-        return (std::function<void(compType&, compType, bool&)>{});
+        return (std::function<void(AccDataType&, AccDataType, bool&)>{});
    };
};

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline compType ReduceOpZeroVal()
+template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+__host__ static inline AccDataType ReduceOpZeroVal()
{
    if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
    {
-        return (static_cast<compType>(1.0f));
+        return (static_cast<AccDataType>(1.0f));
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
    {
-        return (std::numeric_limits<compType>::max());
+        return (std::numeric_limits<AccDataType>::max());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX)
    {
-        return (std::numeric_limits<compType>::lowest());
+        return (std::numeric_limits<AccDataType>::lowest());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
    {
-        return (static_cast<compType>(0.0f));
+        return (static_cast<AccDataType>(0.0f));
    }
    else
    {
...
@@ -212,14 +212,15 @@ __host__ static inline compType ReduceOpZeroVal()
        // ReduceTensorOp_t::AVG
        // ReduceTensorOp_t::NORM1
        // ReduceTensorOp_t::NORM2
-        return (static_cast<compType>(0.0f));
+        return (static_cast<AccDataType>(0.0f));
    };
};

-template <typename compType, bool PropagateNan>
-__host__ static inline void binop_with_nan_check(std::function<void(compType&, compType)> opReduce,
-                                                 compType& accuVal,
-                                                 compType currVal)
+template <typename AccDataType, bool PropagateNan>
+__host__ static inline void binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
+                                                 AccDataType& accuVal,
+                                                 AccDataType currVal)
{
    using std::isnan;
...
@@ -236,11 +237,11 @@ __host__ static inline void binop_with_nan_check(std::function<void(compType&, c
    };
};

-template <typename compType, bool PropagateNan>
+template <typename AccDataType, bool PropagateNan>
__host__ static inline void
-binop_with_nan_check2(std::function<void(compType&, compType, bool&)> opReduce,
-                      compType& accuVal,
-                      compType currVal,
+binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
+                      AccDataType& accuVal,
+                      AccDataType currVal,
                      int& accuIndex,
                      int currIndex)
{
...
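The pre-/post-unary hooks above exist so that one ADD-style accumulation loop can implement several reductions: NORM2 squares each element before accumulating and takes a square root afterwards, while AVG divides the accumulated sum by the element count. The standalone snippet below (not part of the commit; sample values are made up) spells out that composition.

#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> data{3.0f, 4.0f};

    float norm2_acc = 0.0f;
    float avg_acc   = 0.0f;

    for(float v : data)
    {
        norm2_acc += v * v; // pre-op for NORM2: a_ = a_ * a_, then ADD-style accumulation
        avg_acc += v;       // AVG has no pre-op, plain accumulation
    }

    float norm2 = std::sqrt(norm2_acc);                      // post-op for NORM2: a_ = sqrt(a_)
    float avg   = avg_acc / static_cast<float>(data.size()); // post-op for AVG: divide by the element count

    std::printf("NORM2 = %f, AVG = %f\n", norm2, avg); // 5.0 and 3.5
    return 0;
}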
library/include/ck/library/host_tensor/host_reduction.hpp
0 → 100644
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_REDUCTION_HPP_
#define HOST_REDUCTION_HPP_
#include <vector>
#include <array>
#include <functional>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
#include "host_tensor.hpp"
#include "data_type.hpp"
template <int NDim>
static void get_all_indexes(const std::array<size_t, NDim>& dimLengths,
                            std::vector<std::array<size_t, NDim>>& indexes)
{
    static_assert(NDim >= 1, "NDim >= 1 is required to use this function!");

    if constexpr(NDim == 1)
    {
        for(size_t i = 0; i < dimLengths[0]; i++)
        {
            std::array<size_t, 1> index{i};

            indexes.push_back(index);
        };
    }
    else
    {
        std::array<size_t, NDim - 1> partial_dim_lengths;

        for(int i = 0; i < NDim - 1; i++)
            partial_dim_lengths[i] = dimLengths[i + 1];

        std::vector<std::array<size_t, NDim - 1>> partial_indexes;

        get_all_indexes<NDim - 1>(partial_dim_lengths, partial_indexes);

        for(size_t i = 0; i < dimLengths[0]; i++)
            for(const auto& index : partial_indexes)
            {
                std::array<size_t, NDim> extIndex;

                extIndex[0] = i;

                for(int k = 0; k < NDim - 1; k++)
                    extIndex[k + 1] = index[k];

                indexes.push_back(extIndex);
            };
    };
};

template <int NDim>
static size_t get_offset_from_index(const std::array<size_t, NDim>& strides,
                                    const std::array<size_t, NDim>& index)
{
    size_t offset = 0;

    for(int i = 0; i < NDim; i++)
        offset += strides[i] * index[i];

    return (offset);
};

template <int NDim>
static size_t get_offset_from_index(const std::vector<size_t>& strides,
                                    const std::array<size_t, NDim>& index)
{
    size_t offset = 0;

    for(int i = 0; i < NDim; i++)
        offset += strides[i] * index[i];

    return (offset);
};

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          ck::ReduceTensorOp_t ReduceOpId,
          int Rank,
          int NumReduceDim,
          bool PropagateNan,
          bool NeedIndices>
struct ReductionHost
{
    using IndexDataType = int32_t;

    static constexpr int NumInvariantDim = Rank - NumReduceDim;

    std::vector<size_t> outStrides;
    std::vector<int> invariantDims;
    std::vector<int> reduceDims;

    IndexDataType divider;

    std::function<void(AccDataType&)> preUnaryOp;
    std::function<void(AccDataType&)> posUnaryOp;

    std::array<size_t, NumReduceDim> reduceLengths;
    std::array<size_t, NumReduceDim> reduceStrides;
    std::array<size_t, NumInvariantDim> invariantLengths;
    std::array<size_t, NumInvariantDim> invariantStrides;

    std::vector<std::array<size_t, NumReduceDim>> reduce_dim_indexes;
    std::vector<std::array<size_t, NumInvariantDim>> invariant_dim_indexes;

    ReductionHost(HostTensorDescriptor& inDesc,
                  HostTensorDescriptor& outDesc,
                  const std::vector<int>& invariantDims_,
                  const std::vector<int>& reduceDims_)
    {
        using ck::host_reduce::PosUnaryOpFn;
        using ck::host_reduce::PreUnaryOpFn;

        // this->outLengths = to_int_vector(outDesc.GetLengths());
        this->outStrides = outDesc.GetStrides();

        this->invariantDims = invariantDims_;
        this->reduceDims    = reduceDims_;

        int product = 1;

        for(int i = 0; i < NumReduceDim; i++)
        {
            reduceLengths[i] = inDesc.GetLengths()[reduceDims[i]];
            reduceStrides[i] = inDesc.GetStrides()[reduceDims[i]];
            product *= inDesc.GetLengths()[reduceDims[i]];
        };

        divider = product;

        for(int i = 0; i < NumInvariantDim; i++)
        {
            invariantLengths[i] = inDesc.GetLengths()[invariantDims[i]];
            invariantStrides[i] = inDesc.GetStrides()[invariantDims[i]];
        };

        reduce_dim_indexes.clear();
        get_all_indexes<NumReduceDim>(reduceLengths, reduce_dim_indexes);

        if constexpr(NumInvariantDim > 0)
        {
            invariant_dim_indexes.clear();
            get_all_indexes<NumInvariantDim>(invariantLengths, invariant_dim_indexes);
        };

        preUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
        posUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
    };

    void Run(float alpha,
             const InDataType* in_data,
             float beta,
             OutDataType* out_data,
             IndexDataType* out_indices)
    {
        if constexpr(NeedIndices)
        {
            RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
        }
        else
        {
            RunImpl_no_index(alpha, in_data, beta, out_data);
        };
    };

    void RunImpl_with_index(float alpha,
                            const InDataType* in_data,
                            float beta,
                            OutDataType* out_data,
                            IndexDataType* out_indices)
    {
        using ck::type_convert;
        using ck::host_reduce::binop_with_nan_check2;
        using ck::host_reduce::float_equal_one;
        using ck::host_reduce::float_equal_zero;
        using ck::host_reduce::ReduceOpFn2;
        using ck::host_reduce::ReduceOpZeroVal;

        auto opReduce2 = ReduceOpFn2<AccDataType, ReduceOpId>();

        if constexpr(NumInvariantDim == 0)
        {
            AccDataType accuVal     = ReduceOpZeroVal<AccDataType, ReduceOpId>();
            IndexDataType accuIndex = 0;

            for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
            {
                auto offset_reduce =
                    get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);

                auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

                preUnaryOp(currVal);

                auto currIndex = i;

                binop_with_nan_check2<AccDataType, PropagateNan>(
                    opReduce2, accuVal, currVal, accuIndex, currIndex);
            };

            posUnaryOp(accuVal);

            if(!float_equal_one(alpha))
                accuVal *= type_convert<AccDataType>(alpha);

            if(!float_equal_zero(beta))
                accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);

            out_data[0]    = type_convert<OutDataType>(accuVal);
            out_indices[0] = accuIndex;
        }
        else
        {
            auto thread_reduce_func = [&](auto invariant_index) {
                AccDataType accuVal     = ReduceOpZeroVal<AccDataType, ReduceOpId>();
                IndexDataType accuIndex = 0;

                auto offset_invariant =
                    get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);

                for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
                {
                    auto offset_reduce =
                        get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);

                    auto currVal = type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

                    preUnaryOp(currVal);

                    auto currIndex = i;

                    binop_with_nan_check2<AccDataType, PropagateNan>(
                        opReduce2, accuVal, currVal, accuIndex, currIndex);
                };

                posUnaryOp(accuVal);

                if(!float_equal_one(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);

                auto dst_offset =
                    get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);

                if(!float_equal_zero(beta))
                    accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                               type_convert<AccDataType>(beta);

                out_data[dst_offset]    = type_convert<OutDataType>(accuVal);
                out_indices[dst_offset] = accuIndex;
            };

            std::size_t num_thread = std::thread::hardware_concurrency();
            std::size_t work_per_thread =
                (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

            std::vector<joinable_thread> threads(num_thread);

            for(std::size_t it = 0; it < num_thread; ++it)
            {
                std::size_t iw_begin = it * work_per_thread;
                std::size_t iw_end =
                    std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());

                auto f = [=] {
                    for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                    {
                        thread_reduce_func(invariant_dim_indexes[iw]);
                    }
                };

                threads[it] = joinable_thread(f);
            }
        };
    };

    void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
    {
        using ck::type_convert;
        using ck::host_reduce::binop_with_nan_check;
        using ck::host_reduce::float_equal_one;
        using ck::host_reduce::float_equal_zero;
        using ck::host_reduce::ReduceOpFn;
        using ck::host_reduce::ReduceOpZeroVal;

        auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();

        if constexpr(NumInvariantDim == 0)
        {
            AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();

            for(const auto& reduce_index : reduce_dim_indexes)
            {
                auto offset_reduce =
                    get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);

                auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

                preUnaryOp(currVal);

                binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
            };

            posUnaryOp(accuVal);

            if(!float_equal_one(alpha))
                accuVal *= type_convert<AccDataType>(alpha);

            if(!float_equal_zero(beta))
                accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);

            out_data[0] = type_convert<OutDataType>(accuVal);
        }
        else
        {
            auto thread_reduce_func = [&](auto invariant_index) {
                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();

                auto offset_invariant =
                    get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);

                for(const auto& reduce_index : reduce_dim_indexes)
                {
                    auto offset_reduce =
                        get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);

                    auto currVal = type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

                    preUnaryOp(currVal);

                    binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
                };

                posUnaryOp(accuVal);

                if(!float_equal_one(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);

                auto dst_offset =
                    get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);

                if(!float_equal_zero(beta))
                    accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                               type_convert<AccDataType>(beta);

                out_data[dst_offset] = type_convert<OutDataType>(accuVal);
            };

            std::size_t num_thread = std::thread::hardware_concurrency();
            std::size_t work_per_thread =
                (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

            std::vector<joinable_thread> threads(num_thread);

            for(std::size_t it = 0; it < num_thread; ++it)
            {
                std::size_t iw_begin = it * work_per_thread;
                std::size_t iw_end =
                    std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());

                auto f = [=] {
                    for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                    {
                        thread_reduce_func(invariant_dim_indexes[iw]);
                    }
                };

                threads[it] = joinable_thread(f);
            }
        };
    };
};
#endif
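A hypothetical way to drive the new ReductionHost is sketched below. It is not part of the commit: the tensor shape, strides and the choice of AVG are made up, while the constructor and Run() signatures follow the header above, and host_reduction.hpp is assumed to be included.

#include <cstddef>
#include <vector>

void host_reference_avg(const float* in, float* out)
{
    // Reduce dimensions {1, 2, 3} of a rank-4 tensor, keep dimension {0}.
    HostTensorDescriptor inDesc(std::vector<std::size_t>{8, 4, 16, 16},
                                std::vector<std::size_t>{1024, 256, 16, 1});
    HostTensorDescriptor outDesc(std::vector<std::size_t>{8}, std::vector<std::size_t>{1});

    ReductionHost<float,                     // InDataType
                  float,                     // AccDataType
                  float,                     // OutDataType
                  ck::ReduceTensorOp_t::AVG, // ReduceOpId
                  4,                         // Rank
                  3,                         // NumReduceDim
                  false,                     // PropagateNan
                  false>                     // NeedIndices
        hostReduce(inDesc, outDesc, /*invariantDims=*/{0}, /*reduceDims=*/{1, 2, 3});

    // out_indices may be null because NeedIndices is false in this instantiation.
    hostReduce.Run(/*alpha=*/1.0f, in, /*beta=*/0.0f, out, /*out_indices=*/nullptr);
}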
library/include/ck/library/host_tensor/host_tensor.hpp
...
@@ -40,20 +40,6 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
    return os;
}

-typedef enum
-{
-    Half  = 0,
-    Float = 1,
-} DataType_t;
-
-template <typename T>
-struct DataType;
-
-template <>
-struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
-{
-};

template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
...
@@ -312,48 +298,58 @@ HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> s
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);

+#if 1
+// FIXME: remove
+float bf16_to_f32_(ck::bhalf_t src_val);
+
+// FIXME: remove
+void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst);
+#endif

template <typename T>
-void check_error(const Tensor<T>& ref, const Tensor<T>& result)
+float check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
-    float error     = 0;
-    float max_diff  = -1;
-    float ref_value = 0, result_value = 0;
+    float l1_error       = 0;
+    float linf_error     = -1;
+    float linf_rel_error = -1;
+    float linf_ref_value = 0, linf_result_value = 0;
+    float linf_rel_ref_value = 0, linf_rel_result_value = 0;

-    if constexpr(std::is_same<ck::bhalf_t, T>::value)
+    constexpr float eps = 1e-10;
+
+    for(int i = 0; i < ref.mData.size(); ++i)
    {
-        for(int i = 0; i < ref.mData.size(); ++i)
+        float ref_v    = ck::type_convert<float>(ref.mData[i]);
+        float result_v = ck::type_convert<float>(result.mData[i]);
+        float diff     = std::abs(ref_v - result_v);
+        float rel_diff = diff / std::max(std::abs(ref_v), eps);
+
+        l1_error += diff;
+
+        if(linf_error < diff)
        {
-            error += std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
-            float diff = std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
-            if(max_diff < diff)
-            {
-                max_diff     = diff;
-                ref_value    = bf16_to_f32_(ref.mData[i]);
-                result_value = bf16_to_f32_(result.mData[i]);
-            }
+            linf_error        = diff;
+            linf_ref_value    = ref_v;
+            linf_result_value = result_v;
        }
-    }
-    else
-    {
-        for(int i = 0; i < ref.mData.size(); ++i)
+
+        if(linf_rel_error < rel_diff)
        {
-            error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
-            float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-            if(max_diff < diff)
-            {
-                max_diff     = diff;
-                ref_value    = ref.mData[i];
-                result_value = result.mData[i];
-            }
+            linf_rel_error        = rel_diff;
+            linf_rel_ref_value    = ref_v;
+            linf_rel_result_value = result_v;
        }
    }

-    std::cout << "error: " << error << std::endl;
-    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
+    std::cout << "Absolute Error L1 Norm (sum of abs diff): " << l1_error << std::endl;
+    std::cout << "Absolute Error L-inf Norm (max abs diff): " << linf_error << ", ref "
+              << linf_ref_value << ", result " << linf_result_value << std::endl;
+    std::cout << "Relative Error L-inf Norm (max relative abs diff): " << linf_rel_error << ", ref "
+              << linf_rel_ref_value << ", result " << linf_rel_result_value << std::endl;
+
+    return linf_error;
}

template <typename T>
...
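The reworked check_error() above reports three metrics instead of a single accumulated error: the L1 norm of the absolute difference, the L-inf norm of the absolute difference, and the L-inf norm of the relative difference (guarded by a small eps). The standalone snippet below, with made-up sample vectors and not part of the commit, computes the same three quantities.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> ref{1.0f, 2.0f, 4.0f};
    std::vector<float> result{1.1f, 1.9f, 4.4f};

    constexpr float eps = 1e-10f;

    float l1 = 0, linf = -1, linf_rel = -1;

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        float diff     = std::abs(ref[i] - result[i]);          // absolute error of one element
        float rel_diff = diff / std::max(std::abs(ref[i]), eps); // relative error, eps avoids div-by-zero

        l1 += diff;                          // L1 norm: sum of absolute differences
        linf     = std::max(linf, diff);     // L-inf norm: largest absolute difference
        linf_rel = std::max(linf_rel, rel_diff); // L-inf norm of the relative difference
    }

    std::printf("L1 = %f, L-inf = %f, relative L-inf = %f\n", l1, linf, linf_rel);
    return 0;
}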
library/include/ck/library/host_tensor/host_tensor_generator.hpp
...
@@ -93,8 +93,8 @@ struct GeneratorTensor_2<int8_t>
template <typename T>
struct GeneratorTensor_3
{
-    T min_value = 0;
-    T max_value = 1;
+    float min_value = 0;
+    float max_value = 1;

    template <typename... Is>
    T operator()(Is...)
...
@@ -122,22 +122,6 @@ struct GeneratorTensor_3<ck::bhalf_t>
    }
};

-template <>
-struct GeneratorTensor_3<int8_t>
-{
-    float min_value = 0;
-    float max_value = 1;
-
-    template <typename... Is>
-    int8_t operator()(Is...)
-    {
-        int8_t min_tmp = static_cast<int8_t>(min_value);
-        int8_t max_tmp = static_cast<int8_t>(max_value);
-
-        return (std::rand() % (max_tmp - min_tmp)) + min_tmp;
-    }
-};

struct GeneratorTensor_Checkboard
{
    template <typename... Ts>
...
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
...
@@ -14,9 +14,9 @@ namespace host {
//
// @brief      Reference implementation for forward convolution.
//
-// @paragraph  Supported tensor layouts. Input tensor supports NCHiWi data layout.
-//             Weights tensor supports KCYX data layout. Output tensor supports
-//             NKHoWo data layout.
+// @paragraph  Supports both NCHW as well as NHWC formats (and their respective
+//             counterparts for weight and output) as long as tensor descriptor
+//             lengths is in NCHW.
//
// @tparam     InDataType      Input tensor data type.
// @tparam     WeiDataType     Weights tensor data type.
...
@@ -100,9 +100,9 @@ struct ReferenceConvFwd : public device::BaseOperator
                        float v_wei;

-                        arg.in_element_op_(v_in, static_cast<const float>(arg.input_(n, c, wi)));
+                        arg.in_element_op_(v_in, ck::type_convert<float>(arg.input_(n, c, wi)));
-                        arg.wei_element_op_(v_wei, static_cast<const float>(arg.weight_(k, c, x)));
+                        arg.wei_element_op_(v_wei, ck::type_convert<float>(arg.weight_(k, c, x)));

                        v_acc += v_in * v_wei;
                    }
...
@@ -112,7 +112,7 @@ struct ReferenceConvFwd : public device::BaseOperator
                float v_out;

                arg.out_element_op_(v_out, v_acc);
-                arg.output_(n, k, wo) = v_out;
+                arg.output_(n, k, wo) = ck::type_convert<OutDataType>(v_out);
            };

            make_ParallelTensorFunctor(f_ncw,
...
@@ -169,6 +169,61 @@ struct ReferenceConvFwd : public device::BaseOperator
                return 0;
            }
            else if constexpr(NumDimSpatial == 3)
            {
                auto f_nchw = [&](auto n, auto k, auto d_o, auto ho, auto wo) {
                    float v_acc = 0;

                    for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
                    {
                        for(int z = 0; z < arg.weight_.mDesc.GetLengths()[2]; ++z)
                        {
                            int di = d_o * arg.conv_strides_[0] + z * arg.conv_dilations_[0] -
                                     arg.in_left_pads_[0];
                            for(int y = 0; y < arg.weight_.mDesc.GetLengths()[3]; ++y)
                            {
                                int hi = ho * arg.conv_strides_[1] + y * arg.conv_dilations_[1] -
                                         arg.in_left_pads_[1];
                                for(int x = 0; x < arg.weight_.mDesc.GetLengths()[4]; ++x)
                                {
                                    int wi = wo * arg.conv_strides_[2] + x * arg.conv_dilations_[2] -
                                             arg.in_left_pads_[2];
                                    if(di >= 0 && di < arg.input_.mDesc.GetLengths()[2] &&
                                       hi >= 0 && hi < arg.input_.mDesc.GetLengths()[3] &&
                                       wi >= 0 && wi < arg.input_.mDesc.GetLengths()[4])
                                    {
                                        float v_in;
                                        float v_wei;

                                        arg.in_element_op_(
                                            v_in, ck::type_convert<float>(arg.input_(n, c, di, hi, wi)));
                                        arg.wei_element_op_(
                                            v_wei, ck::type_convert<float>(arg.weight_(k, c, z, y, x)));

                                        v_acc += v_in * v_wei;
                                    }
                                }
                            }
                        }
                    }

                    float v_out;

                    arg.out_element_op_(v_out, v_acc);
                    arg.output_(n, k, d_o, ho, wo) = ck::type_convert<OutDataType>(v_out);
                };

                make_ParallelTensorFunctor(f_nchw,
                                           arg.output_.mDesc.GetLengths()[0],
                                           arg.output_.mDesc.GetLengths()[1],
                                           arg.output_.mDesc.GetLengths()[2],
                                           arg.output_.mDesc.GetLengths()[3],
                                           arg.output_.mDesc.GetLengths()[4])(
                    std::thread::hardware_concurrency());

                return 0;
            }
        }

        float Run(const device::BaseArgument* p_arg, int) override
...
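The new 3D branch above maps every output coordinate back to input coordinates with in = out * stride + tap * dilation - left_pad per spatial axis, and skips taps that land in the implicit zero padding. The standalone snippet below (not part of the commit; the sizes are made up) walks one output column through that arithmetic.

#include <cstdio>

int main()
{
    const int stride = 2, dilation = 1, left_pad = 1, in_length = 8;

    const int wo = 0;          // first output column
    for(int x = 0; x < 3; ++x) // 3-wide filter
    {
        int wi = wo * stride + x * dilation - left_pad; // same formula as the loop body above
        if(wi >= 0 && wi < in_length)
            std::printf("tap x=%d reads input wi=%d\n", x, wi);
        else
            std::printf("tap x=%d falls in the padding (wi=%d), skipped\n", x, wi);
    }
    return 0;
}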
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp
...
@@ -6,23 +6,36 @@
#include "device_reduce_instance_blockwise_f32_f32_f32.hpp"
#include "device_reduce_instance_blockwise_f32_f64_f32.hpp"
#include "device_reduce_instance_blockwise_f64_f64_f64.hpp"
#include "device_reduce_instance_blockwise_i8_i8_i8.hpp"
#include "device_reduce_instance_blockwise_i8_i32_i8.hpp"
#include "device_reduce_instance_blockwise_b16_f32_b16.hpp"
#include "device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp"
#include "device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp"
#include "device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp"
#include "device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp"
#include "device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp"
#include "device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp"
#include "device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp"
#include "device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp"
#include "device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp"
#include "device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp"
#include "device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp"
#include "device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp"
#include "device_reduce_instance_threadwise_f16_f16_f16.hpp"
#include "device_reduce_instance_threadwise_f16_f32_f16.hpp"
#include "device_reduce_instance_threadwise_f32_f32_f32.hpp"
#include "device_reduce_instance_threadwise_f32_f64_f32.hpp"
#include "device_reduce_instance_threadwise_f64_f64_f64.hpp"
#include "device_reduce_instance_threadwise_i8_i8_i8.hpp"
#include "device_reduce_instance_threadwise_i8_i32_i8.hpp"
#include "device_reduce_instance_threadwise_b16_f32_b16.hpp"
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
...
@@ -17,7 +17,6 @@ using reduce_configuration_2_instances_blockwise = std::tuple<
    ReductionConfiguration_2<0, 2, 2, 2, 1>,
    ReductionConfiguration_2<0, 1, 1, 2, 1>,
    ReductionConfiguration_2<1, 2, 1, 1, 2>,
    ReductionConfiguration_2<1, 2, 2, 1, 2>,
    ReductionConfiguration_2<0, 1, 1, 3, 1>,
    ReductionConfiguration_2<1, 1, 1, 1, 3>
    // clang-format on
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
0 → 100644
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
...
@@ -13,21 +13,27 @@ namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
// clang-format on
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp
...
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
// clang-format on
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
View file @
e72c0c43
...
...
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
// clang-format on
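The last two columns (Rank, NumReduceDim) describe the shape coverage of each instance: the (4, 3) instances handle a rank-4 input where three of the four dimensions are reduced away, while (4, 4) reduces everything to a scalar. A standalone sketch, not library code, of how the surviving lengths fall out of such a pair; the helper name and the NCHW-style example are illustrative assumptions:

// Standalone illustration: given input lengths and the dimensions being
// reduced, collect the lengths that survive the reduction.
#include <algorithm>
#include <cstdio>
#include <vector>

std::vector<int> invariant_lengths(const std::vector<int>& in_lengths,
                                   const std::vector<int>& reduce_dims)
{
    std::vector<int> out;
    for(int d = 0; d < static_cast<int>(in_lengths.size()); ++d)
        if(std::find(reduce_dims.begin(), reduce_dims.end(), d) == reduce_dims.end())
            out.push_back(in_lengths[d]);
    return out;
}

int main()
{
    const std::vector<int> in{16, 64, 32, 32}; // Rank = 4 (e.g. NCHW)
    const std::vector<int> dims{1, 2, 3};      // NumReduceDim = 3
    for(int len : invariant_lengths(in, dims))
        std::printf("%d ", len);               // prints: 16
    std::printf("\n");
    return 0;
}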
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
// clang-format on
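Relative to the f32_f32_f32 file above, the only change in this file's instances is the AccDataType column: the same float-in/float-out reductions are also provided with double-precision accumulation. A standalone illustration, not library code, of why a wider accumulator can matter for a NORM2-style sum of squares:

// Standalone illustration: accumulating many small squared terms in float
// loses precision that a double accumulator retains.
#include <cmath>
#include <cstdio>

int main()
{
    const int n  = 1 << 24;
    float acc_f  = 0.0f;
    double acc_d = 0.0;
    for(int i = 0; i < n; ++i)
    {
        const float x = 1e-3f;
        acc_f += x * x;                 // float accumulation
        acc_d += double(x) * double(x); // double accumulation
    }
    std::printf("float  accumulation: norm2 = %.7f\n", std::sqrt(acc_f));
    std::printf("double accumulation: norm2 = %.7f\n", std::sqrt(acc_d));
    return 0;
}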
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
// clang-format on
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp
0 → 100644
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
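This new file pairs int8_t inputs and outputs with an int32_t accumulator, and only the ADD and AVG operations are instantiated here; the MIN/MAX/AMAX instances follow in the i8_i8_i8 file below, where no widening is needed. A standalone illustration, not library code, of why summing int8 values in an int8 accumulator is not workable:

// Standalone illustration: an int8 accumulator wraps around almost
// immediately, while an int32 accumulator holds the exact sum.
#include <cstdint>
#include <cstdio>

int main()
{
    int8_t acc8   = 0;
    int32_t acc32 = 0;
    for(int i = 0; i < 100; ++i)
    {
        const int8_t x = 100;
        acc8  = static_cast<int8_t>(acc8 + x); // wraps modulo 2^8 in practice
        acc32 += x;                            // exact
    }
    std::printf("int8  accumulator: %d\n", static_cast<int>(acc8));
    std::printf("int32 accumulator: %d\n", acc32);
    return 0;
}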
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp
0 → 100644
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
@@ -15,9 +15,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
// clang-format off
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
ReductionConfiguration_2<1, 2, 1, 1, 2>,
ReductionConfiguration_2<1, 2, 2, 1, 2>,
ReductionConfiguration_2<1, 1, 1, 1, 3>,
ReductionConfiguration_2<1, 1, 2, 1, 3>,
ReductionConfiguration_2<1, 1, 1, 1, 3>
// clang-format on
>;
#else
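Each entry in the tuple above bundles the five compile-time tuning parameters named in the comment (InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, MThreadSliceSize, KThreadSliceSize). A minimal sketch of what such a parameter carrier could look like; this is an assumed stand-in for illustration, not the library's actual ReductionConfiguration_2 definition:

// Illustrative stand-in for a compile-time reduction-tuning configuration.
template <int InSrcVectorDim_,
          int InSrcVectorSize_,
          int OutDstVectorSize_,
          int MThreadSliceSize_,
          int KThreadSliceSize_>
struct ReductionConfigurationSketch
{
    static constexpr int InSrcVectorDim   = InSrcVectorDim_;
    static constexpr int InSrcVectorSize  = InSrcVectorSize_;
    static constexpr int OutDstVectorSize = OutDstVectorSize_;
    static constexpr int MThreadSliceSize = MThreadSliceSize_;
    static constexpr int KThreadSliceSize = KThreadSliceSize_;
};

// Read in these terms, the first entry above would be spelled
// ReductionConfigurationSketch<1, 2, 1, 1, 2>.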
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
@@ -13,21 +13,27 @@ namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
// clang-format on