Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
d55080e9
"src/regex.cpp" did not exist on "0d5a97bffedc62ccdabc3268a5d2a9a4a2b3b8ab"
Commit
d55080e9
authored
Apr 21, 2022
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/develop' into improve_pipeline
parents
7610e049
860e291c
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
105 additions
and
131 deletions
+105
-131
example/12_reduce/reduce_blockwise.cpp
example/12_reduce/reduce_blockwise.cpp
+5
-13
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
...e/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+11
-9
include/ck/utility/math_v2.hpp
include/ck/utility/math_v2.hpp
+53
-3
include/ck/utility/reduction_common.hpp
include/ck/utility/reduction_common.hpp
+2
-2
library/include/ck/library/host_tensor/host_reduce_util.hpp
library/include/ck/library/host_tensor/host_reduce_util.hpp
+7
-30
library/include/ck/library/host_tensor/host_reduction.hpp
library/include/ck/library/host_tensor/host_reduction.hpp
+13
-12
profiler/include/profile_reduce_impl.hpp
profiler/include/profile_reduce_impl.hpp
+5
-12
test/reduce/reduce_no_index.cpp
test/reduce/reduce_no_index.cpp
+4
-25
test/reduce/reduce_with_index.cpp
test/reduce/reduce_with_index.cpp
+5
-25
No files found.
example/12_reduce/reduce_blockwise.cpp
View file @
d55080e9
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
#include <initializer_list>
#include <initializer_list>
#include <cstdlib>
#include <cstdlib>
#include <getopt.h>
#include <getopt.h>
#include <half.hpp>
#include "check_err.hpp"
#include "check_err.hpp"
#include "config.hpp"
#include "config.hpp"
...
@@ -27,10 +26,6 @@ using InDataType = ck::half_t;
...
@@ -27,10 +26,6 @@ using InDataType = ck::half_t;
using
OutDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
using
AccDataType
=
float
;
using
AccDataType
=
float
;
using
HostInDataType
=
half_float
::
half
;
using
HostOutDataType
=
half_float
::
half
;
using
HostAccDataType
=
float
;
constexpr
int
Rank
=
4
;
constexpr
int
Rank
=
4
;
constexpr
int
NumReduceDim
=
3
;
constexpr
int
NumReduceDim
=
3
;
...
@@ -306,9 +301,9 @@ int main(int argc, char* argv[])
...
@@ -306,9 +301,9 @@ int main(int argc, char* argv[])
if
(
args
.
do_verification
)
if
(
args
.
do_verification
)
{
{
ReductionHost
<
Host
InDataType
,
ReductionHost
<
InDataType
,
Host
AccDataType
,
AccDataType
,
Host
OutDataType
,
OutDataType
,
ReduceOpId
,
ReduceOpId
,
Rank
,
Rank
,
NumReduceDim
,
NumReduceDim
,
...
@@ -316,11 +311,8 @@ int main(int argc, char* argv[])
...
@@ -316,11 +311,8 @@ int main(int argc, char* argv[])
NeedIndices
>
NeedIndices
>
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
.
Run
(
alpha
,
hostReduce
.
Run
(
reinterpret_cast
<
const
HostInDataType
*>
(
in
.
mData
.
data
()),
alpha
,
in
.
mData
.
data
(),
beta
,
out_ref
.
mData
.
data
(),
out_indices_ref
.
mData
.
data
());
beta
,
reinterpret_cast
<
HostOutDataType
*>
(
out_ref
.
mData
.
data
()),
out_indices_ref
.
mData
.
data
());
};
};
const
auto
i_inLengths
=
to_int_vector
(
args
.
inLengths
);
const
auto
i_inLengths
=
to_int_vector
(
args
.
inLengths
);
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
View file @
d55080e9
...
@@ -37,6 +37,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -37,6 +37,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
static
constexpr
auto
xdlops_gemm
=
XdlopsGemm
<
FloatAB
,
MPerXDL
,
NPerXDL
,
KPack
>
{};
static
constexpr
auto
xdlops_gemm
=
XdlopsGemm
<
FloatAB
,
MPerXDL
,
NPerXDL
,
KPack
>
{};
static
constexpr
index_t
KPerThread
=
KPerBlock
/
xdlops_gemm
.
K0PerXdlops
;
static
constexpr
index_t
MWaves
=
MPerBlock
/
(
MRepeat
*
MPerXDL
);
static
constexpr
index_t
MWaves
=
MPerBlock
/
(
MRepeat
*
MPerXDL
);
static
constexpr
index_t
NWaves
=
NPerBlock
/
(
NRepeat
*
NPerXDL
);
static
constexpr
index_t
NWaves
=
NPerBlock
/
(
NRepeat
*
NPerXDL
);
...
@@ -69,7 +71,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -69,7 +71,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
const
auto
xdlops_a_idx
=
xdlops_gemm
.
CalculateAThreadOriginDataIndex
();
const
auto
xdlops_a_idx
=
xdlops_gemm
.
CalculateAThreadOriginDataIndex
();
return
make_tuple
(
0
,
waveId_m
,
xdlops_a_idx
[
I1
],
Number
<
KPack
>
{}
*
xdlops_a_idx
[
I0
]);
return
make_tuple
(
0
,
waveId_m
,
xdlops_a_idx
[
I1
],
KPerThread
*
xdlops_a_idx
[
I0
]);
}
}
__device__
static
auto
CalculateBThreadOriginDataIndex
()
__device__
static
auto
CalculateBThreadOriginDataIndex
()
...
@@ -80,7 +82,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -80,7 +82,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
const
auto
xdlops_b_idx
=
xdlops_gemm
.
CalculateBThreadOriginDataIndex
();
const
auto
xdlops_b_idx
=
xdlops_gemm
.
CalculateBThreadOriginDataIndex
();
return
make_tuple
(
0
,
waveId_n
,
xdlops_b_idx
[
I1
],
Number
<
KPack
>
{}
*
xdlops_b_idx
[
I0
]);
return
make_tuple
(
0
,
waveId_n
,
xdlops_b_idx
[
I1
],
KPerThread
*
xdlops_b_idx
[
I0
]);
}
}
template
<
index_t
m0
,
index_t
n0
,
index_t
xdlops_i
,
index_t
blk_i
>
template
<
index_t
m0
,
index_t
n0
,
index_t
xdlops_i
,
index_t
blk_i
>
...
@@ -271,7 +273,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -271,7 +273,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
make_tuple
(
I0
,
I0
,
I0
,
I0
),
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_buf
);
b_thread_buf
);
static_for
<
0
,
KPer
Block
,
KPack
*
xdlops_gemm
.
K0PerXdlops
>
{}([
&
](
auto
k
)
{
static_for
<
0
,
KPer
Thread
,
KPack
>
{}([
&
](
auto
k
)
{
vector_type
<
FloatAB
,
KPack
>
a_thread_vec
;
vector_type
<
FloatAB
,
KPack
>
a_thread_vec
;
vector_type
<
FloatAB
,
KPack
>
b_thread_vec
;
vector_type
<
FloatAB
,
KPack
>
b_thread_vec
;
...
@@ -298,13 +300,13 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -298,13 +300,13 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
}
}
private:
private:
// A[M0, M1, M2, KPer
Block
]
// A[M0, M1, M2, KPer
Thread
]
static
constexpr
auto
a_thread_desc_
=
static
constexpr
auto
a_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
I1
,
I1
,
Number
<
KPer
Block
>
{}));
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
I1
,
I1
,
Number
<
KPer
Thread
>
{}));
// B[N0, N1, N2, KPer
Block
]
// B[N0, N1, N2, KPer
Thread
]
static
constexpr
auto
b_thread_desc_
=
static
constexpr
auto
b_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
I1
,
I1
,
Number
<
KPer
Block
>
{}));
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
I1
,
I1
,
Number
<
KPer
Thread
>
{}));
// C[M, N, NumRegXdlops]
// C[M, N, NumRegXdlops]
static
constexpr
auto
c_thread_desc_
=
make_naive_tensor_descriptor_packed
(
static
constexpr
auto
c_thread_desc_
=
make_naive_tensor_descriptor_packed
(
...
@@ -314,7 +316,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -314,7 +316,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
FloatAB
,
FloatAB
,
decltype
(
a_block_desc_m0_m1_m2_k
),
decltype
(
a_block_desc_m0_m1_m2_k
),
decltype
(
a_thread_desc_
),
decltype
(
a_thread_desc_
),
Sequence
<
1
,
1
,
1
,
KPer
Block
>
,
Sequence
<
1
,
1
,
1
,
KPer
Thread
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
3
,
3
,
A_K1
,
A_K1
,
...
@@ -324,7 +326,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -324,7 +326,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
FloatAB
,
FloatAB
,
decltype
(
b_block_desc_n0_n1_n2_k
),
decltype
(
b_block_desc_n0_n1_n2_k
),
decltype
(
b_thread_desc_
),
decltype
(
b_thread_desc_
),
Sequence
<
1
,
1
,
1
,
KPer
Block
>
,
Sequence
<
1
,
1
,
1
,
KPer
Thread
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
3
,
3
,
B_K1
,
B_K1
,
...
...
include/ck/utility/math_v2.hpp
View file @
d55080e9
#ifndef CK_MATH_V2_HPP
#ifndef CK_MATH_V2_HPP
#define CK_MATH_V2_HPP
#define CK_MATH_V2_HPP
#include <cmath>
#include "data_type.hpp"
#include "data_type.hpp"
#include "half.hpp"
namespace
ck
{
namespace
ck
{
namespace
math
{
namespace
math
{
static
inline
__device__
half_t
abs
(
half_t
x
)
{
return
__habs
(
x
);
};
static
inline
__host__
float
abs
(
float
x
)
{
return
std
::
abs
(
x
);
};
static
inline
__device__
half_t
sqrtf
(
half_t
x
)
{
return
hsqrt
(
x
);
};
static
inline
__device__
bool
isnan
(
half_t
x
)
{
return
__hisnan
(
x
);
};
static
inline
__host__
double
abs
(
double
x
)
{
return
std
::
abs
(
x
);
};
static
inline
__host__
int8_t
abs
(
int8_t
x
)
{
int8_t
sgn
=
x
>>
(
8
-
1
);
return
(
x
^
sgn
)
-
sgn
;
};
static
inline
__host__
int32_t
abs
(
int32_t
x
)
{
int32_t
sgn
=
x
>>
(
32
-
1
);
return
(
x
^
sgn
)
-
sgn
;
};
static
inline
__host__
half_t
abs
(
half_t
x
)
{
half_float
::
half
xx
=
*
reinterpret_cast
<
half_float
::
half
*>
(
&
x
);
half_float
::
half
abs_xx
=
half_float
::
abs
(
xx
);
half_t
abs_x
=
*
reinterpret_cast
<
half_t
*>
(
&
abs_xx
);
return
abs_x
;
};
static
inline
__host__
float
isnan
(
float
x
)
{
return
std
::
isnan
(
x
);
};
static
inline
__host__
double
isnan
(
double
x
)
{
return
std
::
isnan
(
x
);
};
static
inline
__host__
int8_t
isnan
(
int8_t
x
)
{
(
void
)
x
;
return
false
;
};
static
inline
__host__
int32_t
isnan
(
int32_t
x
)
{
(
void
)
x
;
return
false
;
};
static
inline
__host__
bool
isnan
(
half_t
x
)
{
half_float
::
half
xx
=
*
reinterpret_cast
<
half_float
::
half
*>
(
&
x
);
return
half_float
::
isnan
(
xx
);
};
}
// namespace math
}
// namespace math
}
// namespace ck
}
// namespace ck
...
...
include/ck/utility/reduction_common.hpp
View file @
d55080e9
...
@@ -33,7 +33,7 @@ namespace ck {
...
@@ -33,7 +33,7 @@ namespace ck {
struct
float_equal_one
struct
float_equal_one
{
{
template
<
class
T
>
template
<
class
T
>
__device__
inline
bool
operator
()(
T
x
)
__host__
__device__
inline
bool
operator
()(
T
x
)
{
{
return
x
<=
static_cast
<
T
>
(
1.0
f
)
and
x
>=
static_cast
<
T
>
(
1.0
f
);
return
x
<=
static_cast
<
T
>
(
1.0
f
)
and
x
>=
static_cast
<
T
>
(
1.0
f
);
};
};
...
@@ -42,7 +42,7 @@ struct float_equal_one
...
@@ -42,7 +42,7 @@ struct float_equal_one
struct
float_equal_zero
struct
float_equal_zero
{
{
template
<
class
T
>
template
<
class
T
>
__device__
inline
bool
operator
()(
T
x
)
__host__
__device__
inline
bool
operator
()(
T
x
)
{
{
return
x
<=
static_cast
<
T
>
(
0.0
f
)
and
x
>=
static_cast
<
T
>
(
0.0
f
);
return
x
<=
static_cast
<
T
>
(
0.0
f
)
and
x
>=
static_cast
<
T
>
(
0.0
f
);
};
};
...
...
library/include/ck/library/host_tensor/host_reduce_util.hpp
View file @
d55080e9
...
@@ -26,7 +26,6 @@
...
@@ -26,7 +26,6 @@
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP
#include <half.hpp>
#include <limits>
#include <limits>
#include <cmath>
#include <cmath>
#include <cassert>
#include <cassert>
...
@@ -34,6 +33,8 @@
...
@@ -34,6 +33,8 @@
#include <string>
#include <string>
#include "reduction_enums.hpp"
#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "math_v2.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -42,34 +43,10 @@ namespace host_reduce {
...
@@ -42,34 +43,10 @@ namespace host_reduce {
using
ck
::
NanPropagation
;
using
ck
::
NanPropagation
;
using
ck
::
ReduceTensorOp
;
using
ck
::
ReduceTensorOp
;
template
<
typename
T
>
static
inline
bool
float_equal_one
(
T
);
static
inline
bool
float_equal_one
(
float
x
)
{
return
x
==
1.0
f
;
};
static
inline
bool
float_equal_one
(
double
x
)
{
return
x
==
1.0
;
};
static
inline
bool
float_equal_one
(
half_float
::
half
x
)
{
return
x
==
static_cast
<
half_float
::
half
>
(
1.0
f
);
};
template
<
typename
T
>
static
inline
bool
float_equal_zero
(
T
x
);
static
inline
bool
float_equal_zero
(
float
x
)
{
return
x
==
0.0
f
;
};
static
inline
bool
float_equal_zero
(
double
x
)
{
return
x
==
0.0
;
};
static
inline
bool
float_equal_zero
(
half_float
::
half
x
)
{
return
x
==
static_cast
<
half_float
::
half
>
(
0.0
f
);
};
template
<
typename
AccDataType
,
ReduceTensorOp
ReduceOpId
>
template
<
typename
AccDataType
,
ReduceTensorOp
ReduceOpId
>
__host__
static
inline
std
::
function
<
void
(
AccDataType
&
)
>
PreUnaryOpFn
(
int
)
__host__
static
inline
std
::
function
<
void
(
AccDataType
&
)
>
PreUnaryOpFn
(
int
)
{
{
using
std
::
abs
;
using
ck
::
math
::
abs
;
if
constexpr
(
ReduceOpId
==
ReduceTensorOp
::
NORM1
)
if
constexpr
(
ReduceOpId
==
ReduceTensorOp
::
NORM1
)
{
{
...
@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal()
...
@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal()
}
}
else
if
constexpr
(
ReduceOpId
==
ReduceTensorOp
::
MIN
)
else
if
constexpr
(
ReduceOpId
==
ReduceTensorOp
::
MIN
)
{
{
return
(
std
::
n
umeric
_l
imits
<
AccDataType
>::
m
ax
());
return
(
ck
::
N
umeric
L
imits
<
AccDataType
>::
M
ax
());
}
}
else
if
constexpr
(
ReduceOpId
==
ReduceTensorOp
::
MAX
)
else
if
constexpr
(
ReduceOpId
==
ReduceTensorOp
::
MAX
)
{
{
return
(
std
::
n
umeric
_l
imits
<
AccDataType
>::
l
owest
());
return
(
ck
::
N
umeric
L
imits
<
AccDataType
>::
L
owest
());
}
}
else
if
constexpr
(
ReduceOpId
==
ReduceTensorOp
::
AMAX
)
else
if
constexpr
(
ReduceOpId
==
ReduceTensorOp
::
AMAX
)
{
{
...
@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
...
@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
AccDataType
&
accuVal
,
AccDataType
&
accuVal
,
AccDataType
currVal
)
AccDataType
currVal
)
{
{
using
std
::
isnan
;
using
ck
::
math
::
isnan
;
if
constexpr
(
!
PropagateNan
)
if
constexpr
(
!
PropagateNan
)
{
{
...
@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
...
@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
int
&
accuIndex
,
int
&
accuIndex
,
int
currIndex
)
int
currIndex
)
{
{
using
std
::
isnan
;
using
ck
::
math
::
isnan
;
if
constexpr
(
!
PropagateNan
)
if
constexpr
(
!
PropagateNan
)
{
{
...
...
library/include/ck/library/host_tensor/host_reduction.hpp
View file @
d55080e9
...
@@ -32,6 +32,7 @@
...
@@ -32,6 +32,7 @@
#include <functional>
#include <functional>
#include "reduction_enums.hpp"
#include "reduction_enums.hpp"
#include "reduction_common.hpp"
#include "host_reduce_util.hpp"
#include "host_reduce_util.hpp"
#include "host_tensor.hpp"
#include "host_tensor.hpp"
#include "data_type.hpp"
#include "data_type.hpp"
...
@@ -196,10 +197,10 @@ struct ReductionHost
...
@@ -196,10 +197,10 @@ struct ReductionHost
OutDataType
*
out_data
,
OutDataType
*
out_data
,
IndexDataType
*
out_indices
)
IndexDataType
*
out_indices
)
{
{
using
ck
::
float_equal_one
;
using
ck
::
float_equal_zero
;
using
ck
::
type_convert
;
using
ck
::
type_convert
;
using
ck
::
host_reduce
::
binop_with_nan_check2
;
using
ck
::
host_reduce
::
binop_with_nan_check2
;
using
ck
::
host_reduce
::
float_equal_one
;
using
ck
::
host_reduce
::
float_equal_zero
;
using
ck
::
host_reduce
::
ReduceOpFn2
;
using
ck
::
host_reduce
::
ReduceOpFn2
;
using
ck
::
host_reduce
::
ReduceOpZeroVal
;
using
ck
::
host_reduce
::
ReduceOpZeroVal
;
...
@@ -227,10 +228,10 @@ struct ReductionHost
...
@@ -227,10 +228,10 @@ struct ReductionHost
posUnaryOp
(
accuVal
);
posUnaryOp
(
accuVal
);
if
(
!
float_equal_one
(
alpha
))
if
(
!
float_equal_one
{}
(
alpha
))
accuVal
*=
type_convert
<
AccDataType
>
(
alpha
);
accuVal
*=
type_convert
<
AccDataType
>
(
alpha
);
if
(
!
float_equal_zero
(
beta
))
if
(
!
float_equal_zero
{}
(
beta
))
accuVal
+=
type_convert
<
AccDataType
>
(
out_data
[
0
])
*
type_convert
<
AccDataType
>
(
beta
);
accuVal
+=
type_convert
<
AccDataType
>
(
out_data
[
0
])
*
type_convert
<
AccDataType
>
(
beta
);
out_data
[
0
]
=
type_convert
<
OutDataType
>
(
accuVal
);
out_data
[
0
]
=
type_convert
<
OutDataType
>
(
accuVal
);
...
@@ -263,13 +264,13 @@ struct ReductionHost
...
@@ -263,13 +264,13 @@ struct ReductionHost
posUnaryOp
(
accuVal
);
posUnaryOp
(
accuVal
);
if
(
!
float_equal_one
(
alpha
))
if
(
!
float_equal_one
{}
(
alpha
))
accuVal
*=
type_convert
<
AccDataType
>
(
alpha
);
accuVal
*=
type_convert
<
AccDataType
>
(
alpha
);
auto
dst_offset
=
auto
dst_offset
=
get_offset_from_index
<
NumInvariantDim
>
(
outStrides
,
invariant_index
);
get_offset_from_index
<
NumInvariantDim
>
(
outStrides
,
invariant_index
);
if
(
!
float_equal_zero
(
beta
))
if
(
!
float_equal_zero
{}
(
beta
))
accuVal
+=
type_convert
<
AccDataType
>
(
out_data
[
dst_offset
])
*
accuVal
+=
type_convert
<
AccDataType
>
(
out_data
[
dst_offset
])
*
type_convert
<
AccDataType
>
(
beta
);
type_convert
<
AccDataType
>
(
beta
);
...
@@ -303,10 +304,10 @@ struct ReductionHost
...
@@ -303,10 +304,10 @@ struct ReductionHost
void
RunImpl_no_index
(
float
alpha
,
const
InDataType
*
in_data
,
float
beta
,
OutDataType
*
out_data
)
void
RunImpl_no_index
(
float
alpha
,
const
InDataType
*
in_data
,
float
beta
,
OutDataType
*
out_data
)
{
{
using
ck
::
float_equal_one
;
using
ck
::
float_equal_zero
;
using
ck
::
type_convert
;
using
ck
::
type_convert
;
using
ck
::
host_reduce
::
binop_with_nan_check
;
using
ck
::
host_reduce
::
binop_with_nan_check
;
using
ck
::
host_reduce
::
float_equal_one
;
using
ck
::
host_reduce
::
float_equal_zero
;
using
ck
::
host_reduce
::
ReduceOpFn
;
using
ck
::
host_reduce
::
ReduceOpFn
;
using
ck
::
host_reduce
::
ReduceOpZeroVal
;
using
ck
::
host_reduce
::
ReduceOpZeroVal
;
...
@@ -330,10 +331,10 @@ struct ReductionHost
...
@@ -330,10 +331,10 @@ struct ReductionHost
posUnaryOp
(
accuVal
);
posUnaryOp
(
accuVal
);
if
(
!
float_equal_one
(
alpha
))
if
(
!
float_equal_one
{}
(
alpha
))
accuVal
*=
type_convert
<
AccDataType
>
(
alpha
);
accuVal
*=
type_convert
<
AccDataType
>
(
alpha
);
if
(
!
float_equal_zero
(
beta
))
if
(
!
float_equal_zero
{}
(
beta
))
accuVal
+=
type_convert
<
AccDataType
>
(
out_data
[
0
])
*
type_convert
<
AccDataType
>
(
beta
);
accuVal
+=
type_convert
<
AccDataType
>
(
out_data
[
0
])
*
type_convert
<
AccDataType
>
(
beta
);
out_data
[
0
]
=
type_convert
<
OutDataType
>
(
accuVal
);
out_data
[
0
]
=
type_convert
<
OutDataType
>
(
accuVal
);
...
@@ -361,13 +362,13 @@ struct ReductionHost
...
@@ -361,13 +362,13 @@ struct ReductionHost
posUnaryOp
(
accuVal
);
posUnaryOp
(
accuVal
);
if
(
!
float_equal_one
(
alpha
))
if
(
!
float_equal_one
{}
(
alpha
))
accuVal
*=
type_convert
<
AccDataType
>
(
alpha
);
accuVal
*=
type_convert
<
AccDataType
>
(
alpha
);
auto
dst_offset
=
auto
dst_offset
=
get_offset_from_index
<
NumInvariantDim
>
(
outStrides
,
invariant_index
);
get_offset_from_index
<
NumInvariantDim
>
(
outStrides
,
invariant_index
);
if
(
!
float_equal_zero
(
beta
))
if
(
!
float_equal_zero
{}
(
beta
))
accuVal
+=
type_convert
<
AccDataType
>
(
out_data
[
dst_offset
])
*
accuVal
+=
type_convert
<
AccDataType
>
(
out_data
[
dst_offset
])
*
type_convert
<
AccDataType
>
(
beta
);
type_convert
<
AccDataType
>
(
beta
);
...
...
profiler/include/profile_reduce_impl.hpp
View file @
d55080e9
...
@@ -380,13 +380,9 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -380,13 +380,9 @@ void profile_reduce_impl_impl(bool do_verification,
if
(
do_verification
)
if
(
do_verification
)
{
{
using
HostInDataType
=
typename
type_mapping
<
InDataType
>::
OutType
;
ReductionHost
<
InDataType
,
using
HostOutDataType
=
typename
type_mapping
<
OutDataType
>::
OutType
;
AccDataType
,
using
HostAccDataType
=
typename
type_mapping
<
AccDataType
>::
OutType
;
OutDataType
,
ReductionHost
<
HostInDataType
,
HostAccDataType
,
HostOutDataType
,
ReduceOpId
,
ReduceOpId
,
Rank
,
Rank
,
NumReduceDim
,
NumReduceDim
,
...
@@ -394,11 +390,8 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -394,11 +390,8 @@ void profile_reduce_impl_impl(bool do_verification,
NeedIndices
>
NeedIndices
>
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
.
Run
(
alpha
,
hostReduce
.
Run
(
reinterpret_cast
<
const
HostInDataType
*>
(
in
.
mData
.
data
()),
alpha
,
in
.
mData
.
data
(),
beta
,
out_ref
.
mData
.
data
(),
out_indices_ref
.
mData
.
data
());
beta
,
reinterpret_cast
<
HostOutDataType
*>
(
out_ref
.
mData
.
data
()),
out_indices_ref
.
mData
.
data
());
};
};
const
auto
i_inLengths
=
to_int_vector
(
inLengths
);
const
auto
i_inLengths
=
to_int_vector
(
inLengths
);
...
...
test/reduce/reduce_no_index.cpp
View file @
d55080e9
...
@@ -37,19 +37,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
...
@@ -37,19 +37,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
return
invariantDims
;
return
invariantDims
;
};
};
// map the data type used by the GPU kernels to the corresponding type used by the host codes
template
<
typename
InType
>
struct
type_mapping
{
using
OutType
=
InType
;
};
template
<
>
struct
type_mapping
<
ck
::
half_t
>
{
using
OutType
=
half_float
::
half
;
};
constexpr
int
Rank
=
4
;
constexpr
int
Rank
=
4
;
constexpr
ReduceTensorOp
ReduceOpId
=
ReduceTensorOp
::
AVG
;
constexpr
ReduceTensorOp
ReduceOpId
=
ReduceTensorOp
::
AVG
;
...
@@ -226,13 +213,9 @@ bool test_reduce_no_index_impl(int init_method,
...
@@ -226,13 +213,9 @@ bool test_reduce_no_index_impl(int init_method,
bool
result
=
true
;
bool
result
=
true
;
using
HostInDataType
=
typename
type_mapping
<
InDataType
>::
OutType
;
ReductionHost
<
InDataType
,
using
HostOutDataType
=
typename
type_mapping
<
OutDataType
>::
OutType
;
AccDataType
,
using
HostAccDataType
=
typename
type_mapping
<
AccDataType
>::
OutType
;
OutDataType
,
ReductionHost
<
HostInDataType
,
HostAccDataType
,
HostOutDataType
,
ReduceOpId
,
ReduceOpId
,
Rank
,
Rank
,
NumReduceDim
,
NumReduceDim
,
...
@@ -240,11 +223,7 @@ bool test_reduce_no_index_impl(int init_method,
...
@@ -240,11 +223,7 @@ bool test_reduce_no_index_impl(int init_method,
NeedIndices
>
NeedIndices
>
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
.
Run
(
alpha
,
hostReduce
.
Run
(
alpha
,
in
.
mData
.
data
(),
beta
,
out_ref
.
mData
.
data
(),
nullptr
);
reinterpret_cast
<
const
HostInDataType
*>
(
in
.
mData
.
data
()),
beta
,
reinterpret_cast
<
HostOutDataType
*>
(
out_ref
.
mData
.
data
()),
nullptr
);
const
auto
i_inLengths
=
to_int_vector
(
inLengths
);
const
auto
i_inLengths
=
to_int_vector
(
inLengths
);
const
auto
i_inStrides
=
to_int_vector
(
inStrides
);
const
auto
i_inStrides
=
to_int_vector
(
inStrides
);
...
...
test/reduce/reduce_with_index.cpp
View file @
d55080e9
...
@@ -36,19 +36,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
...
@@ -36,19 +36,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
return
invariantDims
;
return
invariantDims
;
};
};
// map the data type used by the GPU kernels to the corresponding type used by the host codes
template
<
typename
InType
>
struct
type_mapping
{
using
OutType
=
InType
;
};
template
<
>
struct
type_mapping
<
ck
::
half_t
>
{
using
OutType
=
half_float
::
half
;
};
constexpr
int
Rank
=
4
;
constexpr
int
Rank
=
4
;
constexpr
ReduceTensorOp
ReduceOpId
=
ReduceTensorOp
::
AMAX
;
constexpr
ReduceTensorOp
ReduceOpId
=
ReduceTensorOp
::
AMAX
;
...
@@ -209,13 +196,9 @@ bool test_reduce_with_index_impl(int init_method,
...
@@ -209,13 +196,9 @@ bool test_reduce_with_index_impl(int init_method,
bool
result
=
true
;
bool
result
=
true
;
using
HostInDataType
=
typename
type_mapping
<
InDataType
>::
OutType
;
ReductionHost
<
InDataType
,
using
HostOutDataType
=
typename
type_mapping
<
OutDataType
>::
OutType
;
AccDataType
,
using
HostAccDataType
=
typename
type_mapping
<
AccDataType
>::
OutType
;
OutDataType
,
ReductionHost
<
HostInDataType
,
HostAccDataType
,
HostOutDataType
,
ReduceOpId
,
ReduceOpId
,
Rank
,
Rank
,
NumReduceDim
,
NumReduceDim
,
...
@@ -223,11 +206,8 @@ bool test_reduce_with_index_impl(int init_method,
...
@@ -223,11 +206,8 @@ bool test_reduce_with_index_impl(int init_method,
NeedIndices
>
NeedIndices
>
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
.
Run
(
alpha
,
hostReduce
.
Run
(
reinterpret_cast
<
const
HostInDataType
*>
(
in
.
mData
.
data
()),
alpha
,
in
.
mData
.
data
(),
beta
,
out_ref
.
mData
.
data
(),
out_indices_ref
.
mData
.
data
());
beta
,
reinterpret_cast
<
HostOutDataType
*>
(
out_ref
.
mData
.
data
()),
out_indices_ref
.
mData
.
data
());
const
auto
i_inLengths
=
to_int_vector
(
inLengths
);
const
auto
i_inLengths
=
to_int_vector
(
inLengths
);
const
auto
i_inStrides
=
to_int_vector
(
inStrides
);
const
auto
i_inStrides
=
to_int_vector
(
inStrides
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment